Merged
2 changes: 1 addition & 1 deletion tests/kernels/core/test_uva.py
@@ -3,7 +3,7 @@
 import pytest
 import torch

-from vllm.utils import is_uva_available
+from vllm.utils.platform_utils import is_uva_available
 from vllm.utils.torch_utils import get_cuda_view_from_cpu_tensor

 CUDA_DEVICES = [f"cuda:{i}" for i in range(1 if torch.cuda.device_count() == 1 else 2)]
2 changes: 1 addition & 1 deletion tests/v1/logits_processors/test_correctness.py
@@ -21,7 +21,7 @@
 from vllm.config import VllmConfig
 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.sample.logits_processor import (
     BatchUpdate,
     BatchUpdateBuilder,
2 changes: 1 addition & 1 deletion tests/v1/sample/test_sampler.py
@@ -7,7 +7,7 @@

 from tests.v1.sample.utils import create_allowed_token_ids
 from vllm.platforms import current_platform
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import make_tensor_with_pad
 from vllm.v1.sample.logits_processor import LogitsProcessors
 from vllm.v1.sample.metadata import SamplingMetadata
2 changes: 1 addition & 1 deletion tests/v1/worker/test_gpu_input_batch.py
@@ -10,7 +10,7 @@

 from vllm.platforms import current_platform
 from vllm.sampling_params import SamplingParams
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import make_tensor_with_pad
 from vllm.v1.pool.metadata import PoolingMetadata
 from vllm.v1.sample.logits_processor import LogitsProcessors
2 changes: 1 addition & 1 deletion vllm/device_allocator/cumem.py
@@ -18,7 +18,7 @@
 import torch

 from vllm.logger import init_logger
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available

 logger = init_logger(__name__)

2 changes: 1 addition & 1 deletion vllm/lora/lora_weights.py
@@ -8,7 +8,7 @@
 import torch.types

 from vllm.lora.peft_helper import PEFTHelper
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available


 class LoRALayerWeights:
2 changes: 1 addition & 1 deletion vllm/lora/models.py
@@ -31,8 +31,8 @@
 from vllm.model_executor.models.interfaces import is_pooling_model
 from vllm.model_executor.models.module_mapping import MultiModelKeys
 from vllm.model_executor.models.utils import PPMissingLayer, WeightsMapper
-from vllm.utils import is_pin_memory_available
 from vllm.utils.cache import LRUCache
+from vllm.utils.platform_utils import is_pin_memory_available

 logger = init_logger(__name__)

2 changes: 1 addition & 1 deletion vllm/model_executor/model_loader/utils.py
@@ -26,7 +26,7 @@
     try_create_mm_pooling_model_cls,
 )
 from vllm.model_executor.models.interfaces import SupportsQuant, supports_multimodal
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available

 logger = init_logger(__name__)

2 changes: 1 addition & 1 deletion vllm/model_executor/models/qwen2_5_vl.py
@@ -72,7 +72,7 @@
 from vllm.multimodal.parse import MultiModalDataItems
 from vllm.multimodal.processing import PromptReplacement, PromptUpdate
 from vllm.sequence import IntermediateTensors
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import (
4 changes: 2 additions & 2 deletions vllm/model_executor/models/utils.py
@@ -22,8 +22,8 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import NestedTensors
 from vllm.sequence import IntermediateTensors
-from vllm.utils import (
-    cdiv,
+from vllm.utils import cdiv
+from vllm.utils.platform_utils import (
     is_pin_memory_available,
     is_uva_available,
 )
2 changes: 1 addition & 1 deletion vllm/usage/usage_lib.py
@@ -21,7 +21,7 @@
 import vllm.envs as envs
 from vllm.connections import global_http_connection
 from vllm.logger import init_logger
-from vllm.utils import cuda_get_device_properties
+from vllm.utils.platform_utils import cuda_get_device_properties
 from vllm.utils.torch_utils import cuda_device_count_stateless
 from vllm.version import __version__ as VLLM_VERSION

94 changes: 3 additions & 91 deletions vllm/utils/__init__.py
@@ -5,7 +5,6 @@
 import datetime
 import enum
 import getpass
-import importlib.util
 import inspect
 import json
 import multiprocessing
@@ -28,12 +27,8 @@
     _ArgumentGroup,
 )
 from collections import defaultdict
-from collections.abc import (
-    Callable,
-    Sequence,
-)
-from concurrent.futures.process import ProcessPoolExecutor
-from functools import cache, partial, wraps
+from collections.abc import Callable
+from functools import partial, wraps
 from typing import TYPE_CHECKING, Any, TypeVar

 import cloudpickle
@@ -45,6 +40,7 @@
 import vllm.envs as envs
 from vllm.logger import enable_trace_function_call, init_logger
 from vllm.ray.lazy_utils import is_in_ray_actor
+from vllm.utils.platform_utils import cuda_is_initialized, xpu_is_initialized

 _DEPRECATED_MAPPINGS = {
     "cprofile": "profiling",
@@ -170,21 +166,6 @@ def round_down(x: int, y: int) -> int:
     return (x // y) * y


-@cache
-def is_pin_memory_available() -> bool:
-    from vllm.platforms import current_platform
-
-    return current_platform.is_pin_memory_available()
-
-
-@cache
-def is_uva_available() -> bool:
-    """Check if Unified Virtual Addressing (UVA) is available."""
-    # UVA requires pinned memory.
-    # TODO: Add more requirements for UVA if needed.
-    return is_pin_memory_available()
-
-
 # TODO: This function can be removed if transformer_modules classes are
 # serialized by value when communicating between processes
 def init_cached_hf_modules() -> None:
@@ -217,35 +198,6 @@ def enable_trace_function_call_for_thread(vllm_config: VllmConfig) -> None:
     enable_trace_function_call(log_path)


-def cuda_is_initialized() -> bool:
-    """Check if CUDA is initialized."""
-    if not torch.cuda._is_compiled():
-        return False
-    return torch.cuda.is_initialized()
-
-
-def xpu_is_initialized() -> bool:
-    """Check if XPU is initialized."""
-    if not torch.xpu._is_compiled():
-        return False
-    return torch.xpu.is_initialized()
-
-
-def cuda_get_device_properties(
-    device, names: Sequence[str], init_cuda=False
-) -> tuple[Any, ...]:
-    """Get specified CUDA device property values without initializing CUDA in
-    the current process."""
-    if init_cuda or cuda_is_initialized():
-        props = torch.cuda.get_device_properties(device)
-        return tuple(getattr(props, name) for name in names)
-
-    # Run in subprocess to avoid initializing CUDA as a side effect.
-    mp_ctx = multiprocessing.get_context("fork")
-    with ProcessPoolExecutor(max_workers=1, mp_context=mp_ctx) as executor:
-        return executor.submit(cuda_get_device_properties, device, names, True).result()
-
-
 def weak_bind(
     bound_method: Callable[..., Any],
 ) -> Callable[..., None]:
@@ -1048,46 +1000,6 @@ def check_use_alibi(model_config: ModelConfig) -> bool:
     )


-@cache
-def _has_module(module_name: str) -> bool:
-    """Return True if *module_name* can be found in the current environment.
-
-    The result is cached so that subsequent queries for the same module incur
-    no additional overhead.
-    """
-    return importlib.util.find_spec(module_name) is not None
-
-
-def has_pplx() -> bool:
-    """Whether the optional `pplx_kernels` package is available."""
-
-    return _has_module("pplx_kernels")
-
-
-def has_deep_ep() -> bool:
-    """Whether the optional `deep_ep` package is available."""
-
-    return _has_module("deep_ep")
-
-
-def has_deep_gemm() -> bool:
-    """Whether the optional `deep_gemm` package is available."""
-
-    return _has_module("deep_gemm")
-
-
-def has_triton_kernels() -> bool:
-    """Whether the optional `triton_kernels` package is available."""
-
-    return _has_module("triton_kernels")
-
-
-def has_tilelang() -> bool:
-    """Whether the optional `tilelang` package is available."""
-
-    return _has_module("tilelang")
-
-
 def length_from_prompt_token_ids_or_embeds(
     prompt_token_ids: list[int] | None,
     prompt_embeds: torch.Tensor | None,
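Read together with the new module below, the hunks above imply the following call-site migration for code that previously imported these helpers from vllm.utils (illustrative sketch only):

# Old location (removed from vllm/utils/__init__.py above):
# from vllm.utils import is_pin_memory_available, is_uva_available

# New location (added in vllm/utils/platform_utils.py below):
from vllm.utils.platform_utils import is_pin_memory_available, is_uva_available

print(is_pin_memory_available(), is_uva_available())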
54 changes: 54 additions & 0 deletions vllm/utils/platform_utils.py
@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import multiprocessing
+from collections.abc import Sequence
+from concurrent.futures.process import ProcessPoolExecutor
+from functools import cache
+from typing import Any
+
+import torch
+
+
+def cuda_is_initialized() -> bool:
+    """Check if CUDA is initialized."""
+    if not torch.cuda._is_compiled():
+        return False
+    return torch.cuda.is_initialized()
+
+
+def xpu_is_initialized() -> bool:
+    """Check if XPU is initialized."""
+    if not torch.xpu._is_compiled():
+        return False
+    return torch.xpu.is_initialized()
+
+
+def cuda_get_device_properties(
+    device, names: Sequence[str], init_cuda=False
+) -> tuple[Any, ...]:
+    """Get specified CUDA device property values without initializing CUDA in
+    the current process."""
+    if init_cuda or cuda_is_initialized():
+        props = torch.cuda.get_device_properties(device)
+        return tuple(getattr(props, name) for name in names)
+
+    # Run in subprocess to avoid initializing CUDA as a side effect.
+    mp_ctx = multiprocessing.get_context("fork")
Contributor (high):

Hardcoding the multiprocessing start method to "fork" is not portable and can cause issues on platforms like macOS or Windows, where fork is unavailable or unsafe. It's better to use the centralized get_mp_context utility for consistency and safety.

To avoid a circular dependency (since vllm.utils imports this file), I recommend moving get_mp_context and its helper _maybe_force_spawn from vllm/utils/__init__.py to this file (vllm/utils/hardware_utils.py). This change aligns with the goal of this PR to centralize hardware-related utilities.

After moving the functions, you can change this line to use get_mp_context(). You'll also need to:

  1. Add the necessary imports (os, vllm.envs, vllm.logger, vllm.ray.lazy_utils) to vllm/utils/hardware_utils.py.
  2. Update vllm/utils/__init__.py to import get_mp_context from its new location.

Suggested change:
-    mp_ctx = multiprocessing.get_context("fork")
+    mp_ctx = get_mp_context()

Contributor Author:

I don't think get_mp_context and _maybe_force_spawn are hardware utility functions. I think I can do a follow-up to move these into their own multiprocessor.py util file and have that import the hardware_util functions.
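For illustration only, a minimal sketch of the portable-context idea discussed above (the helper name below is hypothetical and not part of this PR):

import multiprocessing


def _portable_mp_context():
    # Hypothetical stand-in for the centralized get_mp_context() mentioned
    # above: prefer "fork" where the platform supports it, otherwise fall
    # back to "spawn" (e.g. on Windows).
    methods = multiprocessing.get_all_start_methods()
    return multiprocessing.get_context("fork" if "fork" in methods else "spawn")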

+    with ProcessPoolExecutor(max_workers=1, mp_context=mp_ctx) as executor:
+        return executor.submit(cuda_get_device_properties, device, names, True).result()
+
+
+@cache
+def is_pin_memory_available() -> bool:
+    from vllm.platforms import current_platform
+
+    return current_platform.is_pin_memory_available()
+
+
+@cache
+def is_uva_available() -> bool:
+    """Check if Unified Virtual Addressing (UVA) is available."""
+    # UVA requires pinned memory.
+    # TODO: Add more requirements for UVA if needed.
+    return is_pin_memory_available()
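A short usage sketch for the relocated helpers, assuming only the module path and signatures introduced in this diff (the property names are standard torch.cuda device-property attributes):

from vllm.utils.platform_utils import (
    cuda_get_device_properties,
    is_pin_memory_available,
    is_uva_available,
)

# Cached platform queries.
pin_memory = is_pin_memory_available()
uva = is_uva_available()

# Reads device properties without initializing CUDA in this process; falls
# back to a forked subprocess when CUDA has not been initialized yet.
name, total_memory = cuda_get_device_properties(0, ("name", "total_memory"))
print(name, total_memory, pin_memory, uva)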
3 changes: 2 additions & 1 deletion vllm/v1/attention/backends/flashinfer.py
@@ -34,12 +34,13 @@
 )
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
-from vllm.utils import cdiv, is_pin_memory_available
+from vllm.utils import cdiv
 from vllm.utils.flashinfer import (
     can_use_trtllm_attention,
     flashinfer_disable_q_quantization,
     use_trtllm_attention,
 )
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.utils import (
     AttentionCGSupport,
     AttentionMetadataBuilder,
2 changes: 1 addition & 1 deletion vllm/v1/kv_offload/worker/cpu_gpu.py
@@ -7,7 +7,7 @@
 from vllm import _custom_ops as ops
 from vllm.attention import AttentionBackend
 from vllm.logger import init_logger
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec
 from vllm.v1.kv_offload.worker.worker import (
     OffloadingHandler,
2 changes: 1 addition & 1 deletion vllm/v1/pool/metadata.py
@@ -5,7 +5,7 @@
 import torch

 from vllm.pooling_params import PoolingParams
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available

 pin_memory = is_pin_memory_available()

2 changes: 1 addition & 1 deletion vllm/v1/sample/ops/penalties.py
@@ -4,7 +4,7 @@
 import torch

 from vllm.model_executor.layers.utils import apply_penalties
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import make_tensor_with_pad


2 changes: 1 addition & 1 deletion vllm/v1/sample/sampler.py
@@ -6,7 +6,7 @@
 import torch.nn as nn

 from vllm.config.model import LogprobsMode
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.outputs import LogprobsTensors, SamplerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
 from vllm.v1.sample.ops.bad_words import apply_bad_words
2 changes: 1 addition & 1 deletion vllm/v1/spec_decode/eagle.py
@@ -24,7 +24,7 @@
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.platforms import current_platform
-from vllm.utils import is_pin_memory_available
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata
 from vllm.v1.attention.backends.tree_attn import (
     TreeAttentionMetadata,
2 changes: 1 addition & 1 deletion vllm/v1/worker/gpu_model_runner.py
@@ -72,13 +72,13 @@
 from vllm.utils import (
     cdiv,
     check_use_alibi,
-    is_pin_memory_available,
     length_from_prompt_token_ids_or_embeds,
     round_up,
 )
 from vllm.utils.jsontree import json_map_leaves
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.mem_utils import DeviceMemoryProfiler
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.utils.torch_utils import (
     get_dtype_size,
     kv_cache_dtype_str_to_dtype,
3 changes: 2 additions & 1 deletion vllm/v1/worker/tpu_model_runner.py
@@ -53,7 +53,8 @@
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.sequence import IntermediateTensors
 from vllm.tasks import GenerationTask, PoolingTask, SupportedTask
-from vllm.utils import LayerBlockType, cdiv, is_pin_memory_available, prev_power_of_2
+from vllm.utils import LayerBlockType, cdiv, prev_power_of_2
+from vllm.utils.platform_utils import is_pin_memory_available
 from vllm.v1.attention.backends.pallas import (
     TPU_STR_DTYPE_TO_TORCH_DTYPE,
     PallasAttentionBackend,