2 changes: 1 addition & 1 deletion tests/conftest.py
@@ -60,7 +60,7 @@
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.utils import maybe_model_redirect
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_num_threads

logger = init_logger(__name__)
2 changes: 1 addition & 1 deletion tests/lora/test_add_lora.py
@@ -12,7 +12,7 @@
from vllm.inputs import TextPrompt
from vllm.lora.request import LoRARequest
from vllm.sampling_params import SamplingParams
-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators

MODEL_PATH = "zai-org/chatglm3-6b"
LORA_RANK = 64
2 changes: 1 addition & 1 deletion tests/models/multimodal/generation/test_common.py
@@ -17,7 +17,7 @@
)

from vllm.platforms import current_platform
-from vllm.utils.functools import identity
+from vllm.utils.func_utils import identity

from ....conftest import (
IMAGE_ASSETS,
Original file line number Diff line number Diff line change
@@ -25,7 +25,7 @@
from transformers.video_utils import VideoMetadata

from vllm.logprobs import SampleLogprobs
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of

from .....conftest import HfRunner, ImageAsset, ImageTestAssets
from .types import RunnerOutput
2 changes: 1 addition & 1 deletion tests/models/multimodal/processing/test_tensor_schema.py
@@ -34,7 +34,7 @@
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
-from vllm.utils.collections import is_list_of
+from vllm.utils.collection_utils import is_list_of
from vllm.utils.torch_utils import set_default_torch_dtype

from ...registry import _MULTIMODAL_EXAMPLE_MODELS, HF_EXAMPLE_MODELS
2 changes: 1 addition & 1 deletion tests/utils_/test_async_utils.py
@@ -5,7 +5,7 @@

import pytest

-from vllm.utils.asyncio import merge_async_iterators
+from vllm.utils.async_utils import merge_async_iterators


async def _mock_async_iterator(idx: int):
Original file line number Diff line number Diff line change
@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest

-from vllm.utils.collections import swap_dict_values
+from vllm.utils.collection_utils import swap_dict_values


@pytest.mark.parametrize(
2 changes: 1 addition & 1 deletion tests/utils_/test_func_utils.py
@@ -4,7 +4,7 @@

import pytest

-from vllm.utils.functools import deprecate_kwargs, supports_kw
+from vllm.utils.func_utils import deprecate_kwargs, supports_kw

from ..utils import error_on_warning

25 changes: 25 additions & 0 deletions tests/utils_/test_hashing.py
@@ -0,0 +1,25 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import hashlib
import pickle

import pytest

from vllm.utils.hashing import sha256


@pytest.mark.parametrize("input", [(), ("abc",), (None,), (None, bool, [1, 2, 3])])
def test_sha256(input: tuple):
digest = sha256(input)
assert digest is not None
assert isinstance(digest, bytes)
assert digest != b""

input_bytes = pickle.dumps(input, protocol=pickle.HIGHEST_PROTOCOL)
assert digest == hashlib.sha256(input_bytes).digest()

    # hashing the same input again returns the same digest
assert digest == sha256(input)

    # hashing a different input returns a different digest
assert digest != sha256(input + (1,))
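
Read on its own, the test above pins down the contract of vllm.utils.hashing.sha256: serialize the object with pickle at the highest protocol, then return the raw SHA-256 digest. The sketch below only illustrates that inferred contract, it is not the actual vLLM implementation, and the name sha256_sketch is hypothetical:

import hashlib
import pickle
from typing import Any


def sha256_sketch(obj: Any) -> bytes:
    # Serialize with the highest pickle protocol, then return the raw digest,
    # mirroring the equality asserted in test_sha256 above.
    serialized = pickle.dumps(obj, protocol=pickle.HIGHEST_PROTOCOL)
    return hashlib.sha256(serialized).digest()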
63 changes: 63 additions & 0 deletions tests/utils_/test_mem_utils.py
@@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import torch
from vllm_test_utils.monitor import monitor

from vllm.utils.mem_utils import MemorySnapshot, memory_profiling

from ..utils import create_new_process_for_each_test


@create_new_process_for_each_test()
def test_memory_profiling():
# Fake out some model loading + inference memory usage to test profiling
# Memory used by other processes will show up as cuda usage outside of torch
from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary

lib = CudaRTLibrary()
# 512 MiB allocation outside of this instance
handle1 = lib.cudaMalloc(512 * 1024 * 1024)

baseline_snapshot = MemorySnapshot()

# load weights

weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)

weights_memory = 128 * 1024 * 1024 * 4 # 512 MiB

def measure_current_non_torch():
free, total = torch.cuda.mem_get_info()
current_used = total - free
current_torch = torch.cuda.memory_reserved()
current_non_torch = current_used - current_torch
return current_non_torch

with (
memory_profiling(
baseline_snapshot=baseline_snapshot, weights_memory=weights_memory
) as result,
monitor(measure_current_non_torch) as monitored_values,
):
# make a memory spike, 1 GiB
spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
del spike

        # Allocate an extra 256 MiB of non-torch memory (simulating NCCL)
handle2 = lib.cudaMalloc(256 * 1024 * 1024)

        # This value is exact: the only non-torch memory increase inside
        # the context is the 256 MiB allocated just above.
measured_diff = monitored_values.values[-1] - monitored_values.values[0]
assert measured_diff == 256 * 1024 * 1024

    # Check that the memory usage is within 5% of the expected values.
    # The 5% tolerance accounts for the CUDA runtime: its allocations
    # cannot be controlled at byte granularity, which introduces a small
    # error (<10 MiB in practice).
non_torch_ratio = result.non_torch_increase / (256 * 1024 * 1024) # noqa
assert abs(non_torch_ratio - 1) <= 0.05
assert result.torch_peak_increase == 1024 * 1024 * 1024
del weights
lib.cudaFree(handle1)
lib.cudaFree(handle2)
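
The call pattern exercised above also doubles as usage documentation: memory_profiling takes a MemorySnapshot baseline plus the known weights footprint in bytes, and after the block exposes torch_peak_increase and non_torch_increase. A minimal usage sketch under those assumptions (requires a CUDA device, as in the test; sizes and field semantics are inferred from the assertions above rather than from separate documentation):

import torch

from vllm.utils.mem_utils import MemorySnapshot, memory_profiling

baseline = MemorySnapshot()  # capture GPU memory state before the workload

# ~256 MiB of float32 "weights" (64 * 1024 * 1024 elements * 4 bytes)
weights = torch.randn(64, 1024, 1024, device="cuda", dtype=torch.float32)
weights_bytes = weights.numel() * weights.element_size()

with memory_profiling(
    baseline_snapshot=baseline, weights_memory=weights_bytes
) as result:
    # transient activations that should show up as torch peak usage
    activations = torch.randn(32, 1024, 1024, device="cuda", dtype=torch.float32)
    del activations

print(result.torch_peak_increase)  # inferred: peak torch memory growth in the block
print(result.non_torch_increase)   # inferred: GPU memory growth outside torch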
104 changes: 104 additions & 0 deletions tests/utils_/test_torch_utils.py
@@ -0,0 +1,104 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch

from vllm.utils.torch_utils import (
common_broadcastable_dtype,
current_stream,
is_lossless_cast,
)


@pytest.mark.parametrize(
("src_dtype", "tgt_dtype", "expected_result"),
[
# Different precision_levels
(torch.bool, torch.int8, True),
(torch.bool, torch.float16, True),
(torch.bool, torch.complex32, True),
(torch.int64, torch.bool, False),
(torch.int64, torch.float16, True),
(torch.int64, torch.complex32, True),
(torch.float64, torch.bool, False),
(torch.float64, torch.int8, False),
(torch.float64, torch.complex32, True),
(torch.complex128, torch.bool, False),
(torch.complex128, torch.int8, False),
(torch.complex128, torch.float16, False),
# precision_level=0
(torch.bool, torch.bool, True),
# precision_level=1
(torch.int8, torch.int16, True),
(torch.int16, torch.int8, False),
(torch.uint8, torch.int8, False),
(torch.int8, torch.uint8, False),
# precision_level=2
(torch.float16, torch.float32, True),
(torch.float32, torch.float16, False),
(torch.bfloat16, torch.float32, True),
(torch.float32, torch.bfloat16, False),
# precision_level=3
(torch.complex32, torch.complex64, True),
(torch.complex64, torch.complex32, False),
],
)
def test_is_lossless_cast(src_dtype, tgt_dtype, expected_result):
assert is_lossless_cast(src_dtype, tgt_dtype) == expected_result


@pytest.mark.parametrize(
("dtypes", "expected_result"),
[
([torch.bool], torch.bool),
([torch.bool, torch.int8], torch.int8),
([torch.bool, torch.int8, torch.float16], torch.float16),
([torch.bool, torch.int8, torch.float16, torch.complex32], torch.complex32), # noqa: E501
],
)
def test_common_broadcastable_dtype(dtypes, expected_result):
assert common_broadcastable_dtype(dtypes) == expected_result


def test_current_stream_multithread():
import threading

if not torch.cuda.is_available():
pytest.skip("CUDA not available")

main_default_stream = torch.cuda.current_stream()
child_stream = torch.cuda.Stream()

thread_stream_ready = threading.Event()
thread_can_exit = threading.Event()

def child_thread_func():
with torch.cuda.stream(child_stream):
thread_stream_ready.set()
thread_can_exit.wait(timeout=10)

child_thread = threading.Thread(target=child_thread_func)
child_thread.start()

try:
assert thread_stream_ready.wait(timeout=5), (
"Child thread failed to enter stream context in time"
)

main_current_stream = current_stream()

assert main_current_stream != child_stream, (
"Main thread's current_stream was contaminated by child thread"
)
assert main_current_stream == main_default_stream, (
"Main thread's current_stream is not the default stream"
)

# Notify child thread it can exit
thread_can_exit.set()

finally:
# Ensure child thread exits properly
child_thread.join(timeout=5)
if child_thread.is_alive():
pytest.fail("Child thread failed to exit properly")
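
Taken together, the parametrized cases above imply a simple rule: dtypes are ordered by precision level (bool < integer < floating point < complex); casting up a level counts as lossless, casting down does not, and within a level the target must cover the source's range and resolution. The sketch below illustrates only that inferred rule, not the actual vllm.utils.torch_utils code; the *_sketch names are hypothetical, and it assumes torch.iinfo/torch.finfo accept all dtypes involved (including torch.complex32, whose finfo describes the underlying real dtype):

import torch


def _precision_level(dtype: torch.dtype) -> int:
    # Ordering implied by the parametrized cases: bool < int < float < complex.
    if dtype == torch.bool:
        return 0
    if dtype.is_complex:
        return 3
    if dtype.is_floating_point:
        return 2
    return 1  # remaining dtypes are integers


def is_lossless_cast_sketch(src: torch.dtype, tgt: torch.dtype) -> bool:
    if src == tgt:
        return True
    src_level, tgt_level = _precision_level(src), _precision_level(tgt)
    if src_level != tgt_level:
        # Moving up a precision level is treated as lossless, moving down is not.
        return src_level < tgt_level
    if src_level == 1:
        # Integers: the target's range must contain the source's range.
        s, t = torch.iinfo(src), torch.iinfo(tgt)
        return t.min <= s.min and s.max <= t.max
    # Floats and complex: the target needs at least the source's range (max)
    # and resolution (eps); finfo of a complex dtype describes its real part.
    s, t = torch.finfo(src), torch.finfo(tgt)
    return t.max >= s.max and t.eps <= s.eps


def common_broadcastable_dtype_sketch(dtypes):
    # Choose the dtype that every listed dtype can be cast to without loss,
    # matching the expectations in test_common_broadcastable_dtype above.
    return next(t for t in dtypes if all(is_lossless_cast_sketch(s, t) for s in dtypes))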