[Misc] Clean Up EngineArgs.create_engine_config #13734
Changes from all commits
```diff
@@ -1062,6 +1062,17 @@ def from_cli_args(cls, args: argparse.Namespace):
         return engine_args

     def create_model_config(self) -> ModelConfig:
+        # gguf file needs a specific model loader and doesn't use hf_repo
+        if check_gguf_file(self.model):
+            self.quantization = self.load_format = "gguf"
+
+        # NOTE: This is to allow model loading from S3 in CI
+        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
```
Collaborator: can we move this outside and before

Collaborator (Author): oops, merged before I saw this. will fix in follow up
```diff
+                and self.model in MODELS_ON_S3
+                and self.load_format == LoadFormat.AUTO):  # noqa: E501
```
Collaborator: nit: is noqa needed? Line length looks fine to me
```diff
+            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
+            self.load_format = LoadFormat.RUNAI_STREAMER
+
         return ModelConfig(
             model=self.model,
             task=self.task,
```
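The new GGUF branch keys off check_gguf_file, a helper named in the diff. For reference, here is a minimal standalone sketch of what such a check does; this reimplementation is illustrative, not vLLM's actual code:

```python
from pathlib import Path

def looks_like_gguf(model: str) -> bool:
    """Illustrative stand-in for check_gguf_file: treat a local file as
    GGUF if it has a .gguf suffix or starts with the GGUF magic bytes."""
    path = Path(model)
    if not path.is_file():
        # HF repo ids like "facebook/opt-125m" are not local files,
        # so they fall through to the normal hf_repo loading path.
        return False
    if path.suffix == ".gguf":
        return True
    with path.open("rb") as f:
        return f.read(4) == b"GGUF"
```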
```diff
@@ -1101,26 +1112,6 @@ def create_model_config(self) -> ModelConfig:
         )

     def create_load_config(self) -> LoadConfig:
-        return LoadConfig(
-            load_format=self.load_format,
-            download_dir=self.download_dir,
-            model_loader_extra_config=self.model_loader_extra_config,
-            ignore_patterns=self.ignore_patterns,
-        )
-
-    def create_engine_config(self,
-                             usage_context: Optional[UsageContext] = None
-                             ) -> VllmConfig:
-        from vllm.platforms import current_platform
-        current_platform.pre_register_and_update()
-
-        if envs.VLLM_USE_V1:
-            self._override_v1_engine_args(usage_context)
-
-        # gguf file needs a specific model loader and doesn't use hf_repo
-        if check_gguf_file(self.model):
-            self.quantization = self.load_format = "gguf"
-
         # bitsandbytes quantization needs a specific model loader
         # so we make sure the quant method and the load format are consistent
         if (self.quantization == "bitsandbytes" or
```
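Together with the next hunk, create_load_config now owns both the bitsandbytes consistency check and the LoadConfig construction, so load-format validation happens in one place. A hedged usage sketch, assuming the check keeps its pre-PR semantics (the model id and field values are examples, not part of this PR):

```python
from vllm.engine.arg_utils import EngineArgs

# Consistent: quantization and load format agree on bitsandbytes,
# so the check passes and a LoadConfig is returned.
args = EngineArgs(model="facebook/opt-125m",
                  quantization="bitsandbytes",
                  load_format="bitsandbytes")
load_config = args.create_load_config()

# Inconsistent: bitsandbytes quantization with the default "auto" load
# format should now fail here, before any weights are touched.
bad_args = EngineArgs(model="facebook/opt-125m",
                      quantization="bitsandbytes")
# bad_args.create_load_config()  # raises ValueError
```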
```diff
@@ -1137,19 +1128,23 @@ def create_engine_config(self,
                 "BitsAndBytes load format and QLoRA adapter only support "
                 f"'bitsandbytes' quantization, but got {self.quantization}")

-        assert self.cpu_offload_gb >= 0, (
-            "CPU offload space must be non-negative"
-            f", but got {self.cpu_offload_gb}")
+        return LoadConfig(
+            load_format=self.load_format,
+            download_dir=self.download_dir,
+            model_loader_extra_config=self.model_loader_extra_config,
+            ignore_patterns=self.ignore_patterns,
+        )

-        device_config = DeviceConfig(device=self.device)
+    def create_engine_config(self,
+                             usage_context: Optional[UsageContext] = None
+                             ) -> VllmConfig:
+        from vllm.platforms import current_platform
+        current_platform.pre_register_and_update()

-        # NOTE: This is to allow model loading from S3 in CI
-        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
-                and self.model in MODELS_ON_S3
-                and self.load_format == LoadFormat.AUTO):  # noqa: E501
-            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
-            self.load_format = LoadFormat.RUNAI_STREAMER
+        if envs.VLLM_USE_V1:
+            self._override_v1_engine_args(usage_context)

+        device_config = DeviceConfig(device=self.device)
         model_config = self.create_model_config()

         if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
```
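The S3 branch removed from create_engine_config here is the same one added to create_model_config in the first hunk. Reduced to a rule (and dropping the env-var and AsyncEngineArgs guards for brevity), it rewrites an allow-listed model id to its S3 mirror and switches the loader. A self-contained sketch with stand-in values for the MODELS_ON_S3 and MODEL_WEIGHTS_S3_BUCKET constants referenced in the diff:

```python
# Stand-in values; the real constants live elsewhere in vLLM's CI plumbing.
MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
MODELS_ON_S3 = {"meta-llama/Llama-3.2-1B"}

def redirect_to_s3(model: str, load_format: str) -> tuple[str, str]:
    """Mirror of the CI branch in the diff: only allow-listed models with
    the default "auto" load format are redirected to the S3 bucket and
    loaded via the RunAI streamer."""
    if model in MODELS_ON_S3 and load_format == "auto":
        return f"{MODEL_WEIGHTS_S3_BUCKET}/{model}", "runai_streamer"
    return model, load_format

# Unlisted models and explicit load formats pass through untouched.
assert redirect_to_s3("facebook/opt-125m", "auto") == ("facebook/opt-125m", "auto")
```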
```diff
@@ -1281,16 +1276,6 @@ def create_engine_config(self,
             if speculative_config is None \
             else speculative_config.num_lookahead_slots

-        if not self.use_v2_block_manager:
-            logger.warning(
-                "[DEPRECATED] Block manager v1 has been removed, "
-                "and setting --use-v2-block-manager to True or False has "
-                "no effect on vLLM behavior. Please remove "
-                "--use-v2-block-manager in your engine argument. "
-                "If your use case is not supported by "
-                "SelfAttnBlockSpaceManager (i.e. block manager v2),"
-                " please file an issue with detailed information.")
-
         scheduler_config = SchedulerConfig(
             runner_type=model_config.runner_type,
             max_num_batched_tokens=self.max_num_batched_tokens,
```