
Commit a8652f4

Enable CUDA graph support for llama 3.2 vision (#14917)
Signed-off-by: Matt Ritter <[email protected]>
1 parent 2f726b2 commit a8652f4

File tree: 3 files changed, +1 -13 lines


tests/models/encoder_decoder/vision_language/test_mllama.py

Lines changed: 0 additions & 4 deletions
@@ -215,7 +215,6 @@ def _run_test(
                      max_num_seqs=2,
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
-                     enforce_eager=True,
                      limit_mm_per_prompt={"image": _LIMIT_IMAGE_PER_PROMPT
                                           }) as vllm_model:
         vllm_outputs_per_image = [
@@ -425,7 +424,6 @@ def test_bnb_regression(
         dtype=dtype,
         max_model_len=4096,
         max_num_seqs=2,
-        enforce_eager=True,
         quantization="bitsandbytes",
         load_format="bitsandbytes",
     )
@@ -481,7 +479,6 @@ def test_explicit_implicit_prompt(
         max_model_len=4096,
         max_num_seqs=2,
         tensor_parallel_size=1,
-        enforce_eager=True,
     )
     sampling_params = SamplingParams(
         temperature=0,
@@ -513,7 +510,6 @@ def test_regression(vllm_runner, image_assets, model, dtype, max_tokens,
                     max_model_len=4096,
                     max_num_seqs=2,
                     tensor_parallel_size=1,
-                    enforce_eager=True,
                     limit_mm_per_prompt={"image":
                                          _LIMIT_IMAGE_PER_PROMPT}) as vllm_model:
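
Context for these deletions: enforce_eager=True tells vLLM to skip CUDA graph capture and run the model eagerly, and until this commit it was effectively mandatory for mllama. Removing it from the fixtures lets the tests exercise the newly supported CUDA graph path by default. The flag itself is unchanged; a minimal sketch of opting back into eager mode through the public LLM API (model id illustrative):

    from vllm import LLM

    # enforce_eager=True still disables CUDA graph capture on request,
    # e.g. for debugging; it is simply no longer forced for mllama.
    llm = LLM(model="meta-llama/Llama-3.2-11B-Vision-Instruct",
              max_model_len=4096,
              max_num_seqs=2,
              enforce_eager=True)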

vllm/config.py

Lines changed: 0 additions & 8 deletions
@@ -670,14 +670,6 @@ def _verify_cuda_graph(self) -> None:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)
 
-        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['mllama']
-        if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH
-                and not self.enforce_eager):
-            logger.warning(
-                "CUDA graph is not supported for %s yet, fallback to the eager "
-                "mode.", self.hf_config.model_type)
-            self.enforce_eager = True
-
     def _verify_bnb_config(self) -> None:
         """
         The current version of bitsandbytes (0.44.0) with 8-bit models does not
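
The deleted guard silently overrode the user's configuration: for any checkpoint whose hf_config.model_type was in the one-entry blocklist, _verify_cuda_graph logged "CUDA graph is not supported for mllama yet, fallback to the eager mode." and forced enforce_eager = True. With the guard gone, mllama takes the same path as every other architecture. A minimal sketch of the post-commit default, assuming only the public LLM constructor (model id illustrative):

    from vllm import LLM

    # No eager fallback is applied any more; decode for Llama 3.2 vision
    # runs under captured CUDA graphs by default, like any other model.
    llm = LLM(model="meta-llama/Llama-3.2-11B-Vision-Instruct",
              max_model_len=4096,
              max_num_seqs=2)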

vllm/model_executor/models/mllama.py

Lines changed: 1 addition & 1 deletion
@@ -1368,7 +1368,7 @@ def forward(
             full_text_row_masked_out_mask = (
                 attn_metadata.encoder_seq_lens_tensor
                 != 0).reshape(-1, 1).to(input_ids.device)
-            skip_cross_attention = max(attn_metadata.encoder_seq_lens) == 0
+            skip_cross_attention = attn_metadata.max_encoder_seq_len == 0
 
         # For image-present prefill.
         else:
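
The functional piece of the commit is this one-line change: both expressions decide whether cross-attention can be skipped because no sequence in the batch carries encoder (image) context, but the old form reduced the per-sequence Python list encoder_seq_lens on every forward pass, whereas the new form reads the scalar max_encoder_seq_len that the attention metadata already tracks, presumably because a host-side max() over per-sequence lengths does not fit CUDA graph capture the way the precomputed scalar does. A hypothetical toy (invented stand-in class, not vLLM's real metadata) showing the two forms answer the same question:

    from dataclasses import dataclass, field

    @dataclass
    class ToyAttnMetadata:
        # Stand-in for vLLM's attention metadata; the real class carries
        # both the per-sequence lengths and the precomputed maximum.
        encoder_seq_lens: list = field(default_factory=list)
        max_encoder_seq_len: int = 0

    meta = ToyAttnMetadata(encoder_seq_lens=[0, 0, 7], max_encoder_seq_len=7)

    old_skip = max(meta.encoder_seq_lens) == 0  # pre-commit expression
    new_skip = meta.max_encoder_seq_len == 0    # post-commit expression
    assert old_skip == new_skip                 # both False: one sequence has image context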
