2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -60,7 +60,7 @@
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_num_threads

logger = init_logger(__name__)
2 changes: 1 addition & 1 deletion tests/lora/test_add_lora.py
@@ -12,7 +12,7 @@
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators

MODEL_PATH = "zai-org/chatglm3-6b"
LORA_RANK = 64
2 changes: 1 addition & 1 deletion tests/models/multimodal/generation/test_common.py
@@ -17,7 +17,7 @@
)

from vllm.platforms import current_platform
-from vllm.utils.functools import identity
+from vllm.utils.func_utils import identity

from ....conftest import (
IMAGE_ASSETS,
Original file line number Diff line number Diff line change
@@ -25,7 +25,7 @@
from transformers.video_utils import VideoMetadata

from vllm.logprobs import SampleLogprobs
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of

from .....conftest import HfRunner, ImageAsset, ImageTestAssets
from .types import RunnerOutput
2 changes: 1 addition & 1 deletion tests/models/multimodal/processing/test_tensor_schema.py
@@ -34,7 +34,7 @@
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype

from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
2 changes: 1 addition & 1 deletion tests/utils_/test_async_utils.py
@@ -5,7 +5,7 @@

import pytest

-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators


async def _mock_async_iterator(idx: int):
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values


@pytest.mark.parametrize(
2 changes: 1 addition & 1 deletion tests/utils_/test_func_utils.py
@@ -4,7 +4,7 @@

import pytest

-from vllm.utils.functools import deprecate_kwargs, supports_kw
+from vllm.utils.func_utils import deprecate_kwargs, supports_kw

from ..utils import error_on_warning

25 changes: 25 additions & 0 deletions tests/utils_/test_hashing.py
@@ -0,0 +1,25 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import pickle

import pytest

from vllm.utils.hashing import sha256


@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
def test_sha256(input: tuple):
digest = sha256(input)
assert digest is not None
assert isinstance(digest, bytes)
assert digest != b""

input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
assert digest == hashlib.sha256(input_bytes).digest()

    # hashing the same input again returns the same digest
assert digest == sha256(input)

    # hashing a different input returns a different digest
assert digest != sha256(input + (1,))
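
Read on its own, the test above pins down the contract of vllm.utils.hashing.sha256: serialize the object with pickle at the highest protocol, then return the raw SHA-256 digest. The sketch below only illustrates that inferred contract, it is not the actual vLLM implementation, and the name sha256_sketch is hypothetical:

import hashlib
import pickle
from typing import Any


def sha256_sketch(obj: Any) -> bytes:
    # Serialize with the highest pickle protocol, then return the raw digest,
    # mirroring the equality asserted in test_sha256 above.
    serialized = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
    return hashlib.sha256(serialized).digest()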
63 changes: 63 additions & 0 deletions tests/utils_/test_mem_utils.py
@@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm_test_utils.monitor import monitor

from vllm.utils.mem_utils import MemorySnapshot, memory_profiling

from ..utils import create_new_process_for_each_test


@create_new_process_for_each_test()
def test_memory_profiling():
# Fake out some model loading + inference memory usage to test profiling
# Memory used by other processes will show up as cuda usage outside of torch
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary

lib = CudaRTLibrary()
# 512 MiB allocation outside of this instance
handle1 = lib.cudaMalloc(512 * 1024 * 1024)

baseline_snapshot = MemorySnapshot()

# load weights

weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)

weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB

def measure_current_non_torch():
free, total = torch.cuda.mem_get_info()
current_used = total - free
current_torch = torch.cuda.memory_reserved()
current_non_torch = current_used - current_torch
return current_non_torch

with (
memory_profiling(
baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
) as result,
monitor(measure_current_non_torch) as monitored_values,
):
# make a memory spike, 1 GiB
spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
del spike

        # Allocate an extra 256 MiB of non-torch memory (simulating NCCL)
handle2 = lib.cudaMalloc(256 * 1024 * 1024)

        # This value is exact: the only non-torch memory increase inside
        # the context is the 256 MiB allocated just above.
measured_diff = monitored_values.values[-1] - monitored_values.values[0]
assert measured_diff == 256 * 1024 * 1024

    # Check that the memory usage is within 5% of the expected values.
    # The 5% tolerance accounts for the CUDA runtime: its allocations
    # cannot be controlled at byte granularity, which introduces a small
    # error (<10 MiB in practice).
non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
assert abs(non_torch_ratio - 1) <= 0.05
assert result.torch_peak_increase == 1024 * 1024 * 1024
del weights
lib.cudaFree(handle1)
lib.cudaFree(handle2)
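
The call pattern exercised above also doubles as usage documentation: memory_profiling takes a MemorySnapshot baseline plus the known weights footprint in bytes, and after the block exposes torch_peak_increase and non_torch_increase. A minimal usage sketch under those assumptions (requires a CUDA device, as in the test; sizes and field semantics are inferred from the assertions above rather than from separate documentation):

import torch

from vllm.utils.mem_utils import MemorySnapshot, memory_profiling

baseline = MemorySnapshot()  # capture GPU memory state before the workload

# ~256 MiB of float32 "weights" (64 * 1024 * 1024 elements * 4 bytes)
weights = torch.randn(64, 1024, 1024, device="cuda", dtype=torch.float32)
weights_bytes = weights.numel() * weights.element_size()

with memory_profiling(
    baseline_snapshot=baseline, weights_memory=weights_bytes
) as result:
    # transient activations that should show up as torch peak usage
    activations = torch.randn(32, 1024, 1024, device="cuda", dtype=torch.float32)
    del activations

print(result.torch_peak_increase)  # inferred: peak torch memory growth in the block
print(result.non_torch_increase)   # inferred: GPU memory growth outside torch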
104 changes: 104 additions & 0 deletions tests/utils_/test_torch_utils.py
@@ -0,0 +1,104 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch

from vllm.utils.torch_utils import (
common_broadcastable_dtype,
current_stream,
is_lossless_cast,
)


@pytest.mark.parametrize(
("src_dtype", "tgt_dtype", "expected_result"),
[
# Different precision_levels
(torch.bool, torch.int8, True),
(torch.bool, torch.float16, True),
(torch.bool, torch.complex32, True),
(torch.int64, torch.bool, False),
(torch.int64, torch.float16, True),
(torch.int64, torch.complex32, True),
(torch.float64, torch.bool, False),
(torch.float64, torch.int8, False),
(torch.float64, torch.complex32, True),
(torch.complex128, torch.bool, False),
(torch.complex128, torch.int8, False),
(torch.complex128, torch.float16, False),
# precision_level=0
(torch.bool, torch.bool, True),
# precision_level=1
(torch.int8, torch.int16, True),
(torch.int16, torch.int8, False),
(torch.uint8, torch.int8, False),
(torch.int8, torch.uint8, False),
# precision_level=2
(torch.float16, torch.float32, True),
(torch.float32, torch.float16, False),
(torch.bfloat16, torch.float32, True),
(torch.float32, torch.bfloat16, False),
# precision_level=3
(torch.complex32, torch.complex64, True),
(torch.complex64, torch.complex32, False),
],
)
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result


@pytest.mark.parametrize(
("dtypes", "expected_result"),
[
([torch.bool], torch.bool),
([torch.bool, torch.int8], torch.int8),
([torch.bool, torch.int8, torch.float16], torch.float16),
([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501
],
)
def test_common_broadcastable_dtype(dtypes, expected_result):
assert common_broadcastable_dtype(dtypes) == expected_result


def test_current_stream_multithread():
import threading

if not torch.cuda.is_available():
pytest.skip("CUDA not available")

main_default_stream = torch.cuda.current_stream()
child_stream = torch.cuda.Stream()

thread_stream_ready = threading.Event()
thread_can_exit = threading.Event()

def child_thread_func():
with torch.cuda.stream(child_stream):
thread_stream_ready.set()
thread_can_exit.wait(timeout=10)

child_thread = threading.Thread(target=child_thread_func)
child_thread.start()

try:
assert thread_stream_ready.wait(timeout=5), (
"Child thread failed to enter stream context in time"
)

main_current_stream = current_stream()

assert main_current_stream != child_stream, (
"Main thread's current_stream was contaminated by child thread"
)
assert main_current_stream == main_default_stream, (
"Main thread's current_stream is not the default stream"
)

# Notify child thread it can exit
thread_can_exit.set()

finally:
# Ensure child thread exits properly
child_thread.join(timeout=5)
if child_thread.is_alive():
pytest.fail("Child thread failed to exit properly")
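
Taken together, the parametrized cases above imply a simple rule: dtypes are ordered by precision level (bool < integer < floating point < complex); casting up a level counts as lossless, casting down does not, and within a level the target must cover the source's range and resolution. The sketch below illustrates only that inferred rule, not the actual vllm.utils.torch_utils code; the *_sketch names are hypothetical, and it assumes torch.iinfo/torch.finfo accept all dtypes involved (including torch.complex32, whose finfo describes the underlying real dtype):

import torch


def _precision_level(dtype: torch.dtype) -> int:
    # Ordering implied by the parametrized cases: bool < int < float < complex.
    if dtype == torch.bool:
        return 0
    if dtype.is_complex:
        return 3
    if dtype.is_floating_point:
        return 2
    return 1  # remaining dtypes are integers


def is_lossless_cast_sketch(src: torch.dtype, tgt: torch.dtype) -> bool:
    if src == tgt:
        return True
    src_level, tgt_level = _precision_level(src), _precision_level(tgt)
    if src_level != tgt_level:
        # Moving up a precision level is treated as lossless, moving down is not.
        return src_level < tgt_level
    if src_level == 1:
        # Integers: the target's range must contain the source's range.
        s, t = torch.iinfo(src), torch.iinfo(tgt)
        return t.min <= s.min and s.max <= t.max
    # Floats and complex: the target needs at least the source's range (max)
    # and resolution (eps); finfo of a complex dtype describes its real part.
    s, t = torch.finfo(src), torch.finfo(tgt)
    return t.max >= s.max and t.eps <= s.eps


def common_broadcastable_dtype_sketch(dtypes):
    # Choose the dtype that every listed dtype can be cast to without loss,
    # matching the expectations in test_common_broadcastable_dtype above.
    return next(t for t in dtypes if all(is_lossless_cast_sketch(s, t) for s in dtypes))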