docs/design/cuda_graphs.md (3 changes: 2 additions & 1 deletion)
@@ -177,8 +177,9 @@ The following table lists backends that support full CUDA Graphs at the time of
 | FlashAttention v3 | `ALWAYS` | has unified routine for both batches, so `FULL` mode is good |
 | Triton Attention | `ALWAYS` | prefer `FULL_AND_PIECEWISE` since it has different kernels for prefill/mixed and pure decode batches |
 | AITER FlashAttention | `UNIFORM_BATCH`| |
-| FlashInfer | `UNIFORM_SINGLE_TOKEN_DECODE` | |
+| FlashInfer | `UNIFORM_SINGLE_TOKEN_DECODE` | Will be set to `UNIFORM_BATCH` when using TRTLLM attention on Blackwell |
 | FlashMLA | `UNIFORM_BATCH` | |
+| FlashInferMLA | `UNIFORM_BATCH` | |
 | AITER MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
 | CUTLASS MLA | `UNIFORM_SINGLE_TOKEN_DECODE` | |
 | Mamba attention| `UNIFORM_SINGLE_TOKEN_DECODE` | |
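
As a quick orientation for readers of the table above, the sketch below shows how a backend advertises one of these support levels in code. It is a self-contained, illustrative stand-in: the enum members mirror the mode names in the table, but these minimal class definitions are not vLLM's actual `AttentionCGSupport` / `AttentionMetadataBuilder` implementations.

```python
# Illustrative stand-in only: names mirror vLLM's CUDA-graph support levels
# from the table above, but these minimal definitions are not the real ones.
import enum
from typing import ClassVar


class AttentionCGSupport(enum.Enum):
    NEVER = enum.auto()
    UNIFORM_SINGLE_TOKEN_DECODE = enum.auto()  # graphs only for 1-token decode
    UNIFORM_BATCH = enum.auto()  # uniform multi-token (e.g. spec-decode) batches
    ALWAYS = enum.auto()  # one routine handles any batch shape


class AttentionMetadataBuilder:
    # The base class declares a conservative default at class scope.
    cudagraph_support: ClassVar[AttentionCGSupport] = AttentionCGSupport.NEVER


class SomeBackendMetadataBuilder(AttentionMetadataBuilder):
    # Each backend overrides the class attribute with its own support level.
    cudagraph_support: ClassVar[AttentionCGSupport] = (
        AttentionCGSupport.UNIFORM_BATCH
    )
```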
vllm/v1/attention/backends/flashinfer.py (8 changes: 6 additions & 2 deletions)
@@ -3,7 +3,6 @@
 """Attention layer with FlashInfer."""
 
 from dataclasses import dataclass
-from typing import ClassVar
 
 import numpy as np
 import torch
@@ -272,7 +271,9 @@


 class FlashInferMetadataBuilder(AttentionMetadataBuilder[FlashInferMetadata]):
-    cudagraph_support: ClassVar[AttentionCGSupport] = (
+    # When using TRTLLM attention with cudagraphs, we can use UNIFORM_BATCH
+    # mode. This will be overridden in the initializer if supported.
+    cudagraph_support: AttentionCGSupport = (

[GitHub Actions / pre-commit] Check failure on line 276 in vllm/v1/attention/backends/flashinfer.py:
Cannot override class variable (previously declared on base class "AttentionMetadataBuilder") with instance variable [misc]
         AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE
     )
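
The pre-commit failure above is mypy's `[misc]` check: once a base class annotates an attribute as `ClassVar`, a subclass may not re-declare it as a plain (instance-assignable) variable. A minimal repro with throwaway names, assuming the base class `AttentionMetadataBuilder` still carries the `ClassVar` annotation:

```python
from typing import ClassVar


class Base:
    support: ClassVar[int] = 0


class Child(Base):
    # mypy: Cannot override class variable (previously declared on base
    # class "Base") with instance variable  [misc]
    support: int = 1
```

Given the later `self.cudagraph_support = ...` assignment in this PR, one plausible resolution is to drop `ClassVar` from the base-class annotation as well, so that both the subclass re-declaration and the per-instance assignment type-check.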

@@ -354,7 +355,10 @@
         else:
             self.q_data_type = self.model_config.dtype
 
+        # If using trtllm attention, we can support uniform_batch speculative decoding
         self._init_reorder_batch_threshold(1, supports_spec_as_decode=can_use_trtllm)
+        if can_use_trtllm:
+            self.cudagraph_support = AttentionCGSupport.UNIFORM_BATCH
 
         self._cascade_wrapper = None  # Wrapper for cascade attention

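Taken together, the two hunks change `cudagraph_support` from a fixed class constant into a class-level default that an individual builder instance upgrades when TRTLLM attention is usable. A stripped-down sketch of that pattern follows; the `can_use_trtllm` flag and the builder's constructor shape are assumptions for illustration, not the real `FlashInferMetadataBuilder` signature:

```python
import enum


class AttentionCGSupport(enum.Enum):  # illustrative stand-in, as above
    UNIFORM_SINGLE_TOKEN_DECODE = enum.auto()
    UNIFORM_BATCH = enum.auto()


class FlashInferLikeBuilder:
    # Class-level default (no ClassVar), so instances may legally shadow it.
    cudagraph_support = AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE

    def __init__(self, can_use_trtllm: bool) -> None:
        if can_use_trtllm:
            # TRTLLM attention can replay graphs for uniform multi-token
            # (speculative-decode) batches, so advertise the stronger level.
            self.cudagraph_support = AttentionCGSupport.UNIFORM_BATCH


# The instance attribute shadows the class default only when upgraded.
assert (FlashInferLikeBuilder(False).cudagraph_support
        is AttentionCGSupport.UNIFORM_SINGLE_TOKEN_DECODE)
assert (FlashInferLikeBuilder(True).cudagraph_support
        is AttentionCGSupport.UNIFORM_BATCH)
```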