From 975b6c8400c6084804ed6a7cfe4bf50da58fbcef Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Thu, 6 Mar 2025 10:40:54 -0800
Subject: [PATCH] Fall back to non-H100/H200 defaults when device name query fails

Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
---
 vllm/engine/arg_utils.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 0d285acd15f3..baa075bc5e35 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1440,8 +1440,19 @@ def _override_v1_engine_args(self, usage_context: UsageContext) -> None:
         # When no user override, set the default values based on the usage
         # context.
         # Use different default values for different hardware.
-        from vllm.platforms import current_platform
-        device_name = current_platform.get_device_name().lower()
+
+        # Try to query the device name on the current platform. A failure
+        # may mean that the machine importing vLLM is not the machine that
+        # vLLM will run on and has no GPUs itself (e.g. when scaling vLLM
+        # with Ray). In that case, fall back to the default values used
+        # for non-H100/H200 GPUs.
+        try:
+            from vllm.platforms import current_platform
+            device_name = current_platform.get_device_name().lower()
+        except Exception:
+            # device_name is only used to set default_max_num_batched_tokens
+            device_name = "no-device"
+
         if "h100" in device_name or "h200" in device_name:
             # For H100 and H200, we use larger default values.
             default_max_num_batched_tokens = {