diff --git a/vllm/config.py b/vllm/config.py
index 6bcf34c3cff9..0f6d67a682f1 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1095,6 +1095,10 @@ def metrics_info(self):
         return {key: str(value) for key, value in self.__dict__.items()}
 
     def _verify_args(self) -> None:
+        if self.cpu_offload_gb < 0:
+            raise ValueError("CPU offload space must be non-negative"
+                             f", but got {self.cpu_offload_gb}")
+
         if self.gpu_memory_utilization > 1.0:
             raise ValueError(
                 "GPU memory utilization must be less than 1.0. Got "
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index bab7cfe2aa3a..8378a116a6d4 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1062,6 +1062,17 @@ def from_cli_args(cls, args: argparse.Namespace):
         return engine_args
 
     def create_model_config(self) -> ModelConfig:
+        # gguf file needs a specific model loader and doesn't use hf_repo
+        if check_gguf_file(self.model):
+            self.quantization = self.load_format = "gguf"
+
+        # NOTE: This is to allow model loading from S3 in CI
+        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
+                and self.model in MODELS_ON_S3
+                and self.load_format == LoadFormat.AUTO):  # noqa: E501
+            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
+            self.load_format = LoadFormat.RUNAI_STREAMER
+
         return ModelConfig(
             model=self.model,
             task=self.task,
@@ -1101,26 +1112,6 @@ def create_model_config(self) -> ModelConfig:
         )
 
     def create_load_config(self) -> LoadConfig:
-        return LoadConfig(
-            load_format=self.load_format,
-            download_dir=self.download_dir,
-            model_loader_extra_config=self.model_loader_extra_config,
-            ignore_patterns=self.ignore_patterns,
-        )
-
-    def create_engine_config(self,
-                             usage_context: Optional[UsageContext] = None
-                             ) -> VllmConfig:
-        from vllm.platforms import current_platform
-        current_platform.pre_register_and_update()
-
-        if envs.VLLM_USE_V1:
-            self._override_v1_engine_args(usage_context)
-
-        # gguf file needs a specific model loader and doesn't use hf_repo
-        if check_gguf_file(self.model):
-            self.quantization = self.load_format = "gguf"
-
         # bitsandbytes quantization needs a specific model loader
         # so we make sure the quant method and the load format are consistent
         if (self.quantization == "bitsandbytes" or
@@ -1137,19 +1128,23 @@ def create_engine_config(self,
                 "BitsAndBytes load format and QLoRA adapter only support "
                 f"'bitsandbytes' quantization, but got {self.quantization}")
 
-        assert self.cpu_offload_gb >= 0, (
-            "CPU offload space must be non-negative"
-            f", but got {self.cpu_offload_gb}")
+        return LoadConfig(
+            load_format=self.load_format,
+            download_dir=self.download_dir,
+            model_loader_extra_config=self.model_loader_extra_config,
+            ignore_patterns=self.ignore_patterns,
+        )
 
-        device_config = DeviceConfig(device=self.device)
+    def create_engine_config(self,
+                             usage_context: Optional[UsageContext] = None
+                             ) -> VllmConfig:
+        from vllm.platforms import current_platform
+        current_platform.pre_register_and_update()
 
-        # NOTE: This is to allow model loading from S3 in CI
-        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
-                and self.model in MODELS_ON_S3
-                and self.load_format == LoadFormat.AUTO):  # noqa: E501
-            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
-            self.load_format = LoadFormat.RUNAI_STREAMER
+        if envs.VLLM_USE_V1:
+            self._override_v1_engine_args(usage_context)
 
+        device_config = DeviceConfig(device=self.device)
         model_config = self.create_model_config()
 
         if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
@@ -1281,16 +1276,6 @@ def create_engine_config(self,
             if speculative_config is None \
                 else speculative_config.num_lookahead_slots
 
-        if not self.use_v2_block_manager:
-            logger.warning(
-                "[DEPRECATED] Block manager v1 has been removed, "
-                "and setting --use-v2-block-manager to True or False has "
-                "no effect on vLLM behavior. Please remove "
-                "--use-v2-block-manager in your engine argument. "
-                "If your use case is not supported by "
-                "SelfAttnBlockSpaceManager (i.e. block manager v2),"
-                " please file an issue with detailed information.")
-
         scheduler_config = SchedulerConfig(
             runner_type=model_config.runner_type,
             max_num_batched_tokens=self.max_num_batched_tokens,
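
A minimal sketch (not part of the patch) of the user-visible change in vllm/config.py: a negative cpu_offload_gb is now rejected with a ValueError during config verification, rather than tripping the old assert later in create_engine_config(). The keyword arguments below are assumptions based on CacheConfig's constructor at this revision.

    # Sketch only; assumes CacheConfig accepts these keywords at this revision.
    from vllm.config import CacheConfig

    try:
        CacheConfig(block_size=16,
                    gpu_memory_utilization=0.9,
                    swap_space=4,
                    cache_dtype="auto",
                    cpu_offload_gb=-1)  # negative value rejected in _verify_args
    except ValueError as err:
        print(err)  # "CPU offload space must be non-negative, but got -1"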