42 | 42 |                                              try_get_generation_config, uses_mrope)
43 | 43 |  from vllm.transformers_utils.s3_utils import S3Model
44 | 44 |  from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
45 |    | -from vllm.utils import (GiB_bytes, LayerBlockType, cuda_device_count_stateless,
   | 45 | +from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
   | 46 | +                        MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
   | 47 | +                        POOLING_MODEL_MAX_NUM_BATCHED_TOKENS, GiB_bytes,
   | 48 | +                        LayerBlockType, cuda_device_count_stateless,
46 | 49 |                          get_cpu_memory, get_open_port, is_torch_equal_or_newer,
47 | 50 |                          random_uuid, resolve_obj_by_qualname)
48 | 51 |
64 | 67 |
65 | 68 |  ConfigT = TypeVar("ConfigT", bound=ConfigType)
66 | 69 |
67 |    | -# This value is chosen to have a balance between ITL and TTFT. Note it is
68 |    | -# not optimized for throughput.
69 |    | -_DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
70 |    | -_POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
71 |    | -_MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120
72 |    | -
73 | 70 |  TaskOption = Literal["auto", "generate", "embedding", "embed", "classify",
74 | 71 |                       "score", "reward", "transcription"]
75 | 72 |
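
The renamed constants are now imported from `vllm.utils` rather than defined in `config.py`. The `utils.py` side of the move is not shown in this diff; assuming it simply carries over the values and the ITL/TTFT comment removed above, the counterpart definitions would look roughly like this sketch:

```python
# Presumed counterpart in vllm/utils.py (not part of the diff shown here).
# Values and comment carried over from the definitions removed from config.py.

# This value is chosen to have a balance between ITL and TTFT. Note it is
# not optimized for throughput.
DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120
```
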
@@ -2074,28 +2071,28 @@ def __post_init__(self) -> None:
2074 | 2071 |                      # so we don't reject sequences on account of a short
2075 | 2072 |                      # max_num_batched_tokens.
2076 | 2073 |                      self.max_num_batched_tokens = max(
2077 |      | -                        self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS)
     | 2074 | +                        self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
2078 | 2075 |                  else:
2079 | 2076 |                      self.max_num_batched_tokens = (
2080 |      | -                        _DEFAULT_MAX_NUM_BATCHED_TOKENS)
     | 2077 | +                        DEFAULT_MAX_NUM_BATCHED_TOKENS)
2081 | 2078 |              else:
2082 | 2079 |                  # If max_model_len is too short, use
2083 |      | -                # _DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
     | 2080 | +                # DEFAULT_MAX_NUM_BATCHED_TOKENS as the default value
2084 | 2081 |                  # for higher throughput.
2085 | 2082 |                  self.max_num_batched_tokens = max(
2086 |      | -                    self.max_model_len, _DEFAULT_MAX_NUM_BATCHED_TOKENS)
     | 2083 | +                    self.max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
2087 | 2084 |
2088 | 2085 |              if self.runner_type == "pooling":
2089 | 2086 |                  # Choose specific value for higher throughput
2090 | 2087 |                  self.max_num_batched_tokens = max(
2091 | 2088 |                      self.max_num_batched_tokens,
2092 |      | -                    _POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
     | 2089 | +                    POOLING_MODEL_MAX_NUM_BATCHED_TOKENS,
2093 | 2090 |                  )
2094 | 2091 |              if self.is_multimodal_model:
2095 | 2092 |                  # The value needs to be at least the number of multimodal tokens
2096 | 2093 |                  self.max_num_batched_tokens = max(
2097 | 2094 |                      self.max_num_batched_tokens,
2098 |      | -                    _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
     | 2095 | +                    MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS,
2099 | 2096 |                  )
2100 | 2097 |
2101 | 2098 |              # When using default settings,
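
The hunk above only swaps in the renamed constants; the logic that picks a default `max_num_batched_tokens` when the user does not set one is unchanged. As a standalone illustration of that logic (not the vLLM code itself: the function name and boolean flags below are invented stand-ins for `enable_chunked_prefill`, `num_scheduler_steps > 1`, `runner_type == "pooling"` and `is_multimodal_model`, and the constant values come from the definitions shown earlier):

```python
DEFAULT_MAX_NUM_BATCHED_TOKENS = 2048
POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120


def default_max_num_batched_tokens(max_model_len: int,
                                   chunked_prefill: bool,
                                   multi_step: bool,
                                   is_pooling: bool,
                                   is_multimodal: bool) -> int:
    """Sketch of how the hunk above resolves the default token budget."""
    if chunked_prefill and not multi_step:
        # Plain chunked prefill can cap each batch at the default budget.
        tokens = DEFAULT_MAX_NUM_BATCHED_TOKENS
    else:
        # Otherwise make sure a full-length prompt is never rejected.
        tokens = max(max_model_len, DEFAULT_MAX_NUM_BATCHED_TOKENS)
    # Pooling and multimodal models raise the budget to their own minimums.
    if is_pooling:
        tokens = max(tokens, POOLING_MODEL_MAX_NUM_BATCHED_TOKENS)
    if is_multimodal:
        tokens = max(tokens, MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS)
    return tokens


# A pooling model with a short context still gets the larger budget:
assert default_max_num_batched_tokens(512, False, False, True, False) == 32768
# Chunked prefill on a long-context generative model stays at 2048:
assert default_max_num_batched_tokens(32768, True, False, False, False) == 2048
```
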
@@ -4316,18 +4313,6 @@ def __post_init__(self):
4316 | 4313 |                  "full_cuda_graph is not supported with "
4317 | 4314 |                  "cascade attention. Disabling cascade attention.")
4318 | 4315 |              self.model_config.disable_cascade_attn = True
4319 |      | -
4320 |      | -        if self.model_config and self.model_config.use_mla and \
4321 |      | -            not (current_platform.is_cuda() or current_platform.is_rocm()):
4322 |      | -            logger.info(
4323 |      | -                "MLA is enabled on a non-GPU platform; forcing chunked "
4324 |      | -                "prefill and prefix caching to be disabled.")
4325 |      | -            self.scheduler_config.enable_chunked_prefill = False
4326 |      | -            self.scheduler_config.chunked_prefill_enabled = False
4327 |      | -            self.scheduler_config.max_num_batched_tokens = max(
4328 |      | -                self.scheduler_config.max_model_len,
4329 |      | -                _DEFAULT_MAX_NUM_BATCHED_TOKENS)
4330 |      | -
4331 | 4316 |          if self.cache_config is not None:
4332 | 4317 |              self.cache_config.enable_prefix_caching = False
4333 | 4318 |