Commits (27 total; this diff shows changes from 19 commits):

0860de4  First working version (simondanielsson, Oct 14, 2025)
7b41ac4  Merge remote-tracking branch 'upstream/main' into feature/gdn-apc (simondanielsson, Oct 14, 2025)
538c9a0  Update type hints in gdn_attn (simondanielsson, Oct 14, 2025)
3fffae0  [DCP] Support Decode Context Parallel (DCP) for GQA with FlashAttenti… (FENP, Oct 14, 2025)
76ac0fa  Enable cudagraphs support [skip ci] (simondanielsson, Oct 14, 2025)
1d3afe0  Merge remote-tracking branch 'upstream/main' into feature/gdn-apc (simondanielsson, Oct 14, 2025)
795ed51  Fix long() -> long [skip ci] (simondanielsson, Oct 14, 2025)
044990c  Add defensive programming asserts (simondanielsson, Oct 14, 2025)
68ca70f  Allocate metadata buffer by chunk count rather than block count, and … (simondanielsson, Oct 16, 2025)
fe8f0b7  Return hidden state when return_intermediate_states is passed, ignori… (simondanielsson, Oct 16, 2025)
ac226e8  Inline _reshape_intermediate_states in the fla chunk kernel wrapper (simondanielsson, Oct 16, 2025)
f975260  Add more explanatory comments in FLA's chunk.py (simondanielsson, Oct 16, 2025)
e74f67d  Improve logging (simondanielsson, Oct 16, 2025)
f177a1f  Add GDN model to APC tests (simondanielsson, Oct 16, 2025)
552ba6f  Add helpful comments in hard-to-understand areas (simondanielsson, Oct 16, 2025)
30b1ea0  Merge remote-tracking branch 'upstream/main' into feature/gdn-apc (simondanielsson, Oct 16, 2025)
2ab062d  Improve way to set chunk_size=64 for GDN (simondanielsson, Oct 16, 2025)
4837a11  Revert KV cache memory limit in test (simondanielsson, Oct 16, 2025)
3a88844  Merge remote-tracking branch 'upstream/main' into feature/gdn-apc (simondanielsson, Oct 16, 2025)
b58362a  Add dynamic counting of decode chunks, rather than static value (simondanielsson, Oct 16, 2025)
ccda04e  Add plot (simondanielsson, Oct 17, 2025)
03aa33c  Remove plot (simondanielsson, Oct 17, 2025)
9896ba4  Merge remote-tracking branch 'upstream/main' into feature/gdn-apc (simondanielsson, Nov 1, 2025)
46406f1  Remove extra trailing comma (simondanielsson, Nov 1, 2025)
dbb4fe3  Move hardcoded chunk size to GDN attn metadata builder (simondanielsson, Nov 1, 2025)
efd451b  Remove extra newline (simondanielsson, Nov 1, 2025)
bfa6ffc  Merge remote-tracking branch 'upstream/main' into feature/gdn-apc (simondanielsson, Nov 1, 2025)
15 changes: 8 additions & 7 deletions tests/models/language/generation/test_hybrid.py
@@ -27,6 +27,8 @@
# "yujiepan/mamba2-codestral-v0.1-tiny-random",
]

GDN_MODELS = ["tiny-random/qwen3-next-moe"]

HYBRID_MODELS = [
"ai21labs/Jamba-tiny-dev",
"pfnet/plamo-2-1b",
@@ -35,8 +37,7 @@
"ibm-granite/granite-4.0-tiny-preview",
"tiiuae/Falcon-H1-0.5B-Base",
"LiquidAI/LFM2-1.2B",
"tiny-random/qwen3-next-moe",
]
] + GDN_MODELS

FULL_CUDA_GRAPH_MODELS = [
"ai21labs/Jamba-tiny-dev",
@@ -380,7 +381,7 @@ def _get_vLLM_output(
return outs, vllm_model


@pytest.mark.parametrize("model", [HYBRID_MODELS[3]])
@pytest.mark.parametrize("model", [HYBRID_MODELS[3]] + GDN_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("n_repetitions", [2])
# If num_logprobs is set to -1, then the stringent version
@@ -446,7 +447,7 @@ def test_apc_single_prompt(
)


@pytest.mark.parametrize("model", [HYBRID_MODELS[3]])
@pytest.mark.parametrize("model", [HYBRID_MODELS[3]] + GDN_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("n_repetitions", [2])
# If num_logprobs is set to -1, then the stringent version
@@ -528,7 +529,7 @@ def test_apc_single_prompt_block_align_alignment(
)


@pytest.mark.parametrize("model", [HYBRID_MODELS[3]])
@pytest.mark.parametrize("model", [HYBRID_MODELS[3]] + GDN_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("n_repetitions", [2])
# If num_logprobs is set to -1, then the stringent version
@@ -595,7 +596,7 @@ def test_apc_multiple_prompts_all_cached_outputs(
)


@pytest.mark.parametrize("model", [HYBRID_MODELS[3]])
@pytest.mark.parametrize("model", [HYBRID_MODELS[3]] + GDN_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("n_repetitions", [2])
# If num_logprobs is set to -1, then the stringent version
@@ -679,7 +680,7 @@ def test_apc_multiple_prompts_block_align_alignment(
)


@pytest.mark.parametrize("model", [HYBRID_MODELS[3]])
@pytest.mark.parametrize("model", [HYBRID_MODELS[3]] + GDN_MODELS)
@pytest.mark.parametrize("max_tokens", [64])
@pytest.mark.parametrize("n_repetitions", [2])
# If num_logprobs is set to -1, then the stringent version
5 changes: 5 additions & 0 deletions vllm/config/model.py
@@ -1505,6 +1505,11 @@ def get_mamba_chunk_size(self) -> int | None:
if chunk_size is None:
# used by e.g. Mamba2, NemotronH, Zamba
chunk_size = getattr(self.hf_text_config, "chunk_size", None)
if chunk_size is None and self.hf_text_config.model_type == "qwen3_next":
# Fallback for Qwen3-Next. 64 is a hardcoded value in the GDN kernel.
# https://github.com/fla-org/flash-linear-attention/blob/2e7336262c11f8bc6cd6a94b1eb5ee353ae8b4cd/fla/ops/common/chunk_delta_h.py#L439
return 64

return chunk_size

def get_multimodal_config(self) -> MultiModalConfig:
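For reference, a minimal sketch (not part of the diff) of what the hardcoded 64-token chunk size implies for the GDN prefix-caching path: the kernel materialises one state snapshot per (possibly partial) 64-token chunk, so a prefill of num_tokens tokens is expected to yield ceil(num_tokens / 64) snapshots. The helper below is illustrative only; its name is not an identifier from this PR.

import math

GDN_CHUNK_SIZE = 64  # hardcoded in the FLA chunk_delta_h kernel linked above

def expected_intermediate_state_count(num_tokens: int, chunk_size: int = GDN_CHUNK_SIZE) -> int:
    # One chunk-state snapshot per (possibly partial) chunk of the prefill.
    return math.ceil(num_tokens / chunk_size)

assert expected_intermediate_state_count(64) == 1
assert expected_intermediate_state_count(200) == 4  # ceil(200 / 64)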
40 changes: 34 additions & 6 deletions vllm/model_executor/layers/fla/ops/chunk.py
@@ -32,6 +32,7 @@ def chunk_gated_delta_rule_fwd(
initial_state: torch.Tensor,
output_final_state: bool,
cu_seqlens: torch.LongTensor | None = None,
return_intermediate_states: bool = False,
):
g = chunk_local_cumsum(g, chunk_size=64, cu_seqlens=cu_seqlens)
# obtain WY representation. u is actually the new v.
@@ -66,7 +67,15 @@ def chunk_gated_delta_rule_fwd(
cu_seqlens=cu_seqlens,
)
if SUPPRESS_LEVEL < 3:
return g, o, A, final_state, None, None, None
return (
g,
o,
A,
final_state,
None,
h if return_intermediate_states else None,
None,
)
elif SUPPRESS_LEVEL >= 3:
return g, o, A, final_state, w, h, v_new

@@ -87,6 +96,7 @@ def forward(
output_final_state: bool,
cu_seqlens: torch.LongTensor | None = None,
use_qk_l2norm_in_kernel: bool = False,
return_intermediate_states: bool = False,
):
if use_qk_l2norm_in_kernel:
q = l2norm_fwd(q)
@@ -102,10 +112,22 @@
initial_state=initial_state,
output_final_state=output_final_state,
cu_seqlens=cu_seqlens,
return_intermediate_states=return_intermediate_states,
)
ctx.scale = scale
ctx.use_qk_l2norm_in_kernel = use_qk_l2norm_in_kernel
return o.to(q.dtype), final_state
intermediate_states = None
if return_intermediate_states:
assert h is not None
# Convert intermediate states into "chunk-major" form
# Equal-length batches keep their batch dimension; flatten it together
# with the chunk axis so callers receive a contiguous chunk stream.
# Variable-length inputs collapse the batch dimension during preprocessing,
# so the kernel already emits a linearised chunk stream in ``states[:, i]``.
# Flattening mirrors the metadata builder's chunk enumeration order.
# Last three axes of h are [H, K, V], producing [num_chunks_total, H, K, V]
intermediate_states = h.reshape(-1, *h.shape[-3:])
return o.to(q.dtype), final_state, intermediate_states


@torch.compiler.disable
@@ -121,6 +143,7 @@ def chunk_gated_delta_rule(
cu_seqlens: torch.LongTensor | None = None,
head_first: bool = False,
use_qk_l2norm_in_kernel: bool = False,
return_intermediate_states: bool = False,
):
r"""
Args:
@@ -155,6 +178,10 @@
Outputs of shape `[B, T, H, V]` if `head_first=False` else `[B, H, T, V]`.
final_state (torch.Tensor):
Final state of shape `[N, H, K, V]` if `output_final_state=True` else `None`.
intermediate_states (Optional[torch.Tensor]):
When ``return_intermediate_states`` is ``True``, a tensor containing
the per-chunk state snapshots, shaped ``[num_chunks_total, H, K, V]``;
otherwise ``None``.

Examples::
>>> import torch
@@ -169,7 +196,7 @@
>>> beta = torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda').sigmoid()
>>> g = F.logsigmoid(torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda'))
>>> h0 = torch.randn(B, H, K, V, dtype=torch.bfloat16, device='cuda')
>>> o, ht = chunk_gated_delta_rule(
>>> o, ht, _ = chunk_gated_delta_rule(
q, k, v, g, beta,
initial_state=h0,
output_final_state=True
@@ -178,7 +205,7 @@
>>> q, k, v, beta, g = map(lambda x: rearrange(x, 'b t ... -> 1 (b t) ...'), (q, k, v, beta, g))
# for a batch with 4 sequences, `cu_seqlens` with 5 start/end positions are expected
>>> cu_seqlens = q.new_tensor([0, 2048, 4096, 6144, 8192], dtype=torch.long)
>>> o_var, ht_var = chunk_gated_delta_rule(
>>> o_var, ht_var, _ = chunk_gated_delta_rule(
q, k, v, g, beta,
initial_state=h0,
output_final_state=True,
@@ -223,7 +250,7 @@ def chunk_gated_delta_rule(
)
if scale is None:
scale = k.shape[-1] ** -0.5
o, final_state = ChunkGatedDeltaRuleFunction.apply(
o, final_state, intermediate_states = ChunkGatedDeltaRuleFunction.apply(
q,
k,
v,
@@ -234,7 +261,8 @@
output_final_state,
cu_seqlens,
use_qk_l2norm_in_kernel,
return_intermediate_states,
)
if head_first:
o = rearrange(o, "b t h ... -> b h t ...")
return o, final_state
return o, final_state, intermediate_states
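A minimal usage sketch of the extended return signature (not part of the diff; it mirrors the docstring example above, assumes a CUDA device and bfloat16 inputs, and uses arbitrary toy shapes):

import torch
import torch.nn.functional as F

from vllm.model_executor.layers.fla.ops.chunk import chunk_gated_delta_rule

B, T, H, K, V = 1, 256, 4, 64, 64
q = torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda')
k = torch.randn(B, T, H, K, dtype=torch.bfloat16, device='cuda')
v = torch.randn(B, T, H, V, dtype=torch.bfloat16, device='cuda')
beta = torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda').sigmoid()
g = F.logsigmoid(torch.rand(B, T, H, dtype=torch.bfloat16, device='cuda'))
h0 = torch.randn(B, H, K, V, dtype=torch.bfloat16, device='cuda')

o, final_state, intermediate_states = chunk_gated_delta_rule(
    q, k, v, g, beta,
    initial_state=h0,
    output_final_state=True,
    return_intermediate_states=True,
)
# o: [B, T, H, V]; final_state: [N, H, K, V] with N == B here.
# intermediate_states: [num_chunks_total, H, K, V], one snapshot per 64-token
# chunk of the prefill (T=256 is expected to give 4 snapshots). Callers that do
# not need the snapshots can leave return_intermediate_states=False and receive
# None as the third return value.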
13 changes: 10 additions & 3 deletions vllm/model_executor/models/config.py
@@ -305,12 +305,19 @@ def verify_and_update_config(cls, vllm_config: "VllmConfig") -> None:
"NemotronHForCausalLM",
"Zamba2ForCausalLM",
]
GDN_MODELS = [
"Qwen3NextForCausalLM",
]
if cache_config.enable_prefix_caching:
if model_config.architecture in MAMBA2_MODELS:
if model_config.architecture in MAMBA2_MODELS + GDN_MODELS:
layer_type = (
"Mamba2" if model_config.architecture in MAMBA2_MODELS else "GDN"
)
logger.info(
"Warning: Prefix caching is currently enabled. "
"Its support for Mamba2 layers is experimental. "
"Please report any issues you may observe."
"Its support for %s layers is experimental. "
"Please report any issues you may observe.",
layer_type,
)
else:
logger.info(