Merged
4 changes: 4 additions & 0 deletions vllm/config.py
@@ -1095,6 +1095,10 @@ def metrics_info(self):
         return {key: str(value) for key, value in self.__dict__.items()}
 
     def _verify_args(self) -> None:
+        if self.cpu_offload_gb < 0:
+            raise ValueError("CPU offload space must be non-negative"
+                             f", but got {self.cpu_offload_gb}")
+
         if self.gpu_memory_utilization > 1.0:
             raise ValueError(
                 "GPU memory utilization must be less than 1.0. Got "
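Worth noting: this check previously lived in create_engine_config as an assert (removed further down in this PR). Promoting it to a ValueError in _verify_args means the check survives `python -O` (which strips asserts) and surfaces as a proper configuration error. A minimal standalone sketch of the pattern, with the enclosing class reduced to just the two validated fields (the real class in vllm/config.py carries many more):

# Minimal sketch of the _verify_args pattern above; the real class in
# vllm/config.py has many more fields, elided here.
class CacheConfigSketch:

    def __init__(self, cpu_offload_gb: float,
                 gpu_memory_utilization: float) -> None:
        self.cpu_offload_gb = cpu_offload_gb
        self.gpu_memory_utilization = gpu_memory_utilization
        self._verify_args()

    def _verify_args(self) -> None:
        if self.cpu_offload_gb < 0:
            raise ValueError("CPU offload space must be non-negative"
                             f", but got {self.cpu_offload_gb}")
        if self.gpu_memory_utilization > 1.0:
            raise ValueError(
                "GPU memory utilization must be less than 1.0. Got "
                f"{self.gpu_memory_utilization}.")


# A negative value now fails with ValueError even under `python -O`,
# where the old assert would have been stripped:
try:
    CacheConfigSketch(cpu_offload_gb=-1, gpu_memory_utilization=0.9)
except ValueError as e:
    print(e)  # CPU offload space must be non-negative, but got -1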
65 changes: 25 additions & 40 deletions vllm/engine/arg_utils.py
@@ -1062,6 +1062,17 @@ def from_cli_args(cls, args: argparse.Namespace):
         return engine_args
 
     def create_model_config(self) -> ModelConfig:
+        # gguf file needs a specific model loader and doesn't use hf_repo
+        if check_gguf_file(self.model):
+            self.quantization = self.load_format = "gguf"
+
+        # NOTE: This is to allow model loading from S3 in CI
+        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
+                and self.model in MODELS_ON_S3
+                and self.load_format == LoadFormat.AUTO):  # noqa: E501
+            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
+            self.load_format = LoadFormat.RUNAI_STREAMER

@khluu (Collaborator) commented on Feb 24, 2025:
    Can we move this outside and before create_model_config gets called, since this is changing both the model name and load_format (which gets referenced later on in create_load_config)?

PR author (Collaborator) replied:
    Oops, merged before I saw this. Will fix in a follow-up.

@khluu (Collaborator) commented on the "# noqa: E501" line:
    nit: Is the noqa needed? The line length looks fine to me.
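To make the reviewer's ordering concern concrete: this branch mutates both self.model and self.load_format in place, so whatever runs afterwards (including create_load_config) sees the rewritten values. A standalone sketch of the redirect logic; the bucket value and model list below are hypothetical stand-ins for vLLM's MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3, and envs.VLLM_CI_USE_S3:

from enum import Enum

# Hypothetical stand-ins for the constants referenced in the diff.
VLLM_CI_USE_S3 = True
MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"  # assumed value
MODELS_ON_S3 = {"meta-llama/Llama-3.2-1B"}              # assumed contents


class LoadFormat(str, Enum):
    AUTO = "auto"
    RUNAI_STREAMER = "runai_streamer"


def maybe_redirect_to_s3(model: str,
                         load_format: LoadFormat) -> tuple[str, LoadFormat]:
    """Rewrite (model, load_format) so CI pulls weights from S3."""
    if (VLLM_CI_USE_S3 and model in MODELS_ON_S3
            and load_format == LoadFormat.AUTO):
        return f"{MODEL_WEIGHTS_S3_BUCKET}/{model}", LoadFormat.RUNAI_STREAMER
    return model, load_format


model, load_format = maybe_redirect_to_s3("meta-llama/Llama-3.2-1B",
                                          LoadFormat.AUTO)
print(model)        # s3://vllm-ci-model-weights/meta-llama/Llama-3.2-1B
print(load_format)  # LoadFormat.RUNAI_STREAMER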

         return ModelConfig(
             model=self.model,
             task=self.task,
Expand Down Expand Up @@ -1101,26 +1112,6 @@ def create_model_config(self) -> ModelConfig:
)

     def create_load_config(self) -> LoadConfig:
-        return LoadConfig(
-            load_format=self.load_format,
-            download_dir=self.download_dir,
-            model_loader_extra_config=self.model_loader_extra_config,
-            ignore_patterns=self.ignore_patterns,
-        )
-
-    def create_engine_config(self,
-                             usage_context: Optional[UsageContext] = None
-                             ) -> VllmConfig:
-        from vllm.platforms import current_platform
-        current_platform.pre_register_and_update()
-
-        if envs.VLLM_USE_V1:
-            self._override_v1_engine_args(usage_context)
-
-        # gguf file needs a specific model loader and doesn't use hf_repo
-        if check_gguf_file(self.model):
-            self.quantization = self.load_format = "gguf"
-
         # bitsandbytes quantization needs a specific model loader
         # so we make sure the quant method and the load format are consistent
         if (self.quantization == "bitsandbytes" or
@@ -1137,19 +1128,23 @@ def create_engine_config(self,
"BitsAndBytes load format and QLoRA adapter only support "
f"'bitsandbytes' quantization, but got {self.quantization}")

assert self.cpu_offload_gb >= 0, (
"CPU offload space must be non-negative"
f", but got {self.cpu_offload_gb}")
return LoadConfig(
load_format=self.load_format,
download_dir=self.download_dir,
model_loader_extra_config=self.model_loader_extra_config,
ignore_patterns=self.ignore_patterns,
)

device_config = DeviceConfig(device=self.device)
def create_engine_config(self,
usage_context: Optional[UsageContext] = None
) -> VllmConfig:
from vllm.platforms import current_platform
current_platform.pre_register_and_update()

# NOTE: This is to allow model loading from S3 in CI
if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
and self.model in MODELS_ON_S3
and self.load_format == LoadFormat.AUTO): # noqa: E501
self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
self.load_format = LoadFormat.RUNAI_STREAMER
if envs.VLLM_USE_V1:
self._override_v1_engine_args(usage_context)

device_config = DeviceConfig(device=self.device)
model_config = self.create_model_config()

if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
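The net effect of this hunk: the quantization/load-format consistency check moves out of create_engine_config and into create_load_config itself, so every caller building a LoadConfig through this path gets validated. A condensed sketch of the new control flow, with the bitsandbytes check abbreviated to its core invariant and the config reduced to two fields (both simplifications are mine, not the diff's):

from dataclasses import dataclass
from typing import Optional


@dataclass
class LoadConfigSketch:
    # Reduced stand-in for vLLM's LoadConfig; most fields elided.
    load_format: str
    download_dir: Optional[str] = None


def create_load_config(quantization: Optional[str], load_format: str,
                       download_dir: Optional[str] = None) -> LoadConfigSketch:
    # Abbreviated version of the consistency check the diff moves here:
    # bitsandbytes quantization and the bitsandbytes load format must be
    # used together, never one without the other.
    if (quantization == "bitsandbytes") != (load_format == "bitsandbytes"):
        raise ValueError(
            "bitsandbytes quantization requires the bitsandbytes load "
            f"format (and vice versa); got quantization={quantization!r}, "
            f"load_format={load_format!r}")
    return LoadConfigSketch(load_format=load_format, download_dir=download_dir)


# Validation now happens wherever a LoadConfig is created, not only
# inside create_engine_config:
cfg = create_load_config(quantization="bitsandbytes",
                         load_format="bitsandbytes")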
@@ -1281,16 +1276,6 @@ def create_engine_config(self,
             if speculative_config is None \
                 else speculative_config.num_lookahead_slots
 
-        if not self.use_v2_block_manager:
-            logger.warning(
-                "[DEPRECATED] Block manager v1 has been removed, "
-                "and setting --use-v2-block-manager to True or False has "
-                "no effect on vLLM behavior. Please remove "
-                "--use-v2-block-manager in your engine argument. "
-                "If your use case is not supported by "
-                "SelfAttnBlockSpaceManager (i.e. block manager v2),"
-                " please file an issue with detailed information.")
-
         scheduler_config = SchedulerConfig(
             runner_type=model_config.runner_type,
             max_num_batched_tokens=self.max_num_batched_tokens,