From 975b6c8400c6084804ed6a7cfe4bf50da58fbcef Mon Sep 17 00:00:00 2001
From: Cody Yu
Date: Thu, 6 Mar 2025 10:40:54 -0800
Subject: [PATCH] fix

Signed-off-by: Cody Yu
---
 vllm/engine/arg_utils.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 0d285acd15f3..baa075bc5e35 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1440,8 +1440,19 @@ def _override_v1_engine_args(self, usage_context: UsageContext) -> None:
         # When no user override, set the default values based on the usage
         # context.
         # Use different default values for different hardware.
-        from vllm.platforms import current_platform
-        device_name = current_platform.get_device_name().lower()
+
+        # Try to query the device name on the current platform. If it fails,
+        # it may be because the platform that imports vLLM is not the same
+        # as the platform that vLLM is running on (e.g. the case of scaling
+        # vLLM with Ray) and has no GPUs. In this case we use the default
+        # values for non-H100/H200 GPUs.
+        try:
+            from vllm.platforms import current_platform
+            device_name = current_platform.get_device_name().lower()
+        except Exception:
+            # This is only used to set default_max_num_batched_tokens
+            device_name = "no-device"
+
         if "h100" in device_name or "h200" in device_name:
             # For H100 and H200, we use larger default values.
             default_max_num_batched_tokens = {