@@ -776,7 +776,7 @@ def get_vllm_port() -> int | None:
     # If set, the OpenAI API server will stay alive even after the underlying
     # AsyncLLMEngine errors and stops serving requests
     "VLLM_KEEP_ALIVE_ON_ENGINE_DEATH": lambda: bool(
-        os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", 0)
+        int(os.getenv("VLLM_KEEP_ALIVE_ON_ENGINE_DEATH", "0"))
     ),
     # If the env var VLLM_ALLOW_LONG_MAX_MODEL_LEN is set, it allows
     # the user to specify a max sequence length greater than
@@ -1313,7 +1313,9 @@ def get_vllm_port() -> int | None:
     ),
     # If set, it means we pre-downloaded cubin files and flashinfer will
     # read the cubin files directly.
-    "VLLM_HAS_FLASHINFER_CUBIN": lambda: os.getenv("VLLM_HAS_FLASHINFER_CUBIN", False),
+    "VLLM_HAS_FLASHINFER_CUBIN": lambda: bool(
+        int(os.getenv("VLLM_HAS_FLASHINFER_CUBIN", "0"))
+    ),
     # Supported options:
     # - "flashinfer-cudnn": use flashinfer cudnn GEMM backend
     # - "flashinfer-trtllm": use flashinfer trtllm GEMM backend
@@ -1449,8 +1451,8 @@ def get_vllm_port() -> int | None:
     # top 5 collected objects
     "VLLM_GC_DEBUG": lambda: os.getenv("VLLM_GC_DEBUG", ""),
     # Disables parallel execution of shared_experts via separate cuda stream
-    "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: os.getenv(
-        "VLLM_DISABLE_SHARED_EXPERTS_STREAM", False
+    "VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: bool(
+        int(os.getenv("VLLM_DISABLE_SHARED_EXPERTS_STREAM", "0"))
     ),
     # Format for saving torch.compile cache artifacts
     # - "binary": saves as binary file
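
Note on the pattern being fixed: os.getenv returns a string whenever the variable is set, and any non-empty string, including "0", is truthy. So the old bool(os.getenv(NAME, 0)) form reports True whenever the variable is set at all, even when the user sets it to "0"; the bool(int(...)) form actually parses the value. A minimal sketch of the difference, using a hypothetical SOME_BOOL_FLAG variable rather than the real vLLM names:

import os

# User explicitly disables the flag.
os.environ["SOME_BOOL_FLAG"] = "0"

# Old pattern: truthiness of the raw string, so "0" still reads as enabled.
old_style = bool(os.getenv("SOME_BOOL_FLAG", 0))          # True

# New pattern: parse the string, then convert, which respects the "0".
new_style = bool(int(os.getenv("SOME_BOOL_FLAG", "0")))   # False

# Unset variable: both forms evaluate to False, but the new form keeps the
# default as a string so int() always receives parseable input.
del os.environ["SOME_BOOL_FLAG"]
unset = bool(int(os.getenv("SOME_BOOL_FLAG", "0")))       # False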