apache
diff --git a/python/tvm/relax/backend/cuda/flashinfer.py b/python/tvm/relax/backend/cuda/flashinfer.py
Lines changed: 121 additions & 360 deletions
diff --git a/python/tvm/relax/frontend/nn/llm/kv_cache.py b/python/tvm/relax/frontend/nn/llm/kv_cache.py
Lines changed: 11 additions & 10 deletions
diff --git a/src/runtime/vm/attn_backend.cc b/src/runtime/vm/attn_backend.cc
Lines changed: 9 additions & 2 deletions
@@ -371,8 +371,7 @@ def __init__(  # pylint: disable=too-many-locals
         enable_disaggregation : bool
             Whether to enable disaggregation in the KV cache.
         """
-        if rope_mode == RopeMode.INLINE:
-            assert rotary_dim == qk_head_dim, "FlashInfer RoPE does not support partial rotary dim."
+        assert rope_mode != RopeMode.INLINE, "FlashInfer RoPE does not support inline mode."
 
         attn_kind_single = attn_kind[0] if isinstance(attn_kind, List) else attn_kind
         if attn_kind_single == "mha_sliding":
@@ -383,8 +382,8 @@ def __init__(  # pylint: disable=too-many-locals
             dtype_o=dtype,
             qk_head_dim=(qk_head_dim if attn_kind_single == "mha" else mla_original_qk_head_dim),
             v_head_dim=(v_head_dim if attn_kind_single == "mha" else mla_original_v_head_dim),
-            target=target,
-            enable_inline_rope=rope_mode == RopeMode.INLINE,
+            enable_inline_rope=False,
+            return_static_libs=True,
         )
         flashinfer_decode_mods = (
             rx.backend.cuda.flashinfer.gen_flashinfer_decode_module(
@@ -393,7 +392,8 @@ def __init__(  # pylint: disable=too-many-locals
                 dtype_o=dtype,
                 qk_head_dim=qk_head_dim,
                 v_head_dim=v_head_dim,
-                target=target,
+                enable_inline_rope=False,
+                return_static_libs=True,
             )
             if attn_kind_single == "mha"
             else []
@@ -405,7 +405,7 @@ def __init__(  # pylint: disable=too-many-locals
                 dtype_o=dtype,
                 head_dim_ckv=v_head_dim,
                 head_dim_kpe=qk_head_dim - v_head_dim,
-                target=target,
+                return_static_libs=True,
             )
             if attn_kind_single == "mla"
             else []
@@ -417,8 +417,8 @@ def __init__(  # pylint: disable=too-many-locals
         bb = rx.BlockBuilder.current()
         mha_functions = (
             [
-                rx.Tuple([rx.StringImm("flashinfer"), rx.ExternFunc("batch_prefill_with_paged_kv_cache_run"), rx.ExternFunc("batch_prefill_with_kv_cache_plan")]),
-                rx.Tuple([rx.StringImm("flashinfer"), rx.ExternFunc("batch_decode_with_paged_kv_cache_run"), rx.ExternFunc("batch_decode_with_paged_kv_cache_plan")]),
+                rx.Tuple([rx.StringImm("flashinfer"), rx.ExternFunc("batch_prefill_paged_run"), rx.ExternFunc("batch_prefill_plan")]),
+                rx.Tuple([rx.StringImm("flashinfer"), rx.ExternFunc("batch_decode_run"), rx.ExternFunc("batch_decode_plan")]),
                 rx.Tuple([rx.StringImm("tir"), bb.add_func(_attention_prefill(num_key_value_heads, num_attention_heads, qk_head_dim, dtype, True, rope_scaling, target), "tir_attention_prefill_sliding_window")]),
                 rx.Tuple([rx.StringImm("tir"), bb.add_func(_attention_decode(num_key_value_heads, num_attention_heads, qk_head_dim, dtype, True, rope_scaling, target), "tir_attention_decode_sliding_window")]),
                 rx.Tuple([rx.StringImm("tir"), bb.add_func(tree_attn_with_paged_kv_cache(num_key_value_heads, num_attention_heads, qk_head_dim, dtype, rope_scaling, target), "tir_attention_prefill_with_tree_mask_with_paged_kv_cache")]),
@@ -427,7 +427,8 @@ def __init__(  # pylint: disable=too-many-locals
             if attn_kind_single == "mha"
             else [rx.Tuple([]) for _ in range(6)]
         )
-        mla_function = rx.Tuple([rx.StringImm("flashinfer"), rx.ExternFunc("batch_mla_paged_attention_run"), rx.ExternFunc("batch_mla_paged_attention_plan")] if attn_kind_single == "mla" else [])
+        ragged_prefill_function = rx.Tuple([rx.StringImm("flashinfer"), rx.ExternFunc("batch_prefill_ragged_run"), rx.ExternFunc("batch_prefill_plan")]) if attn_kind_single == "mha" else rx.Tuple([rx.StringImm("flashinfer"), rx.ExternFunc("batch_prefill_ragged_run"), rx.ExternFunc("batch_prefill_plan"), rx.PrimValue(mla_original_qk_head_dim), rx.PrimValue(mla_original_v_head_dim)])
+        mla_function = rx.Tuple([rx.StringImm("flashinfer"), rx.ExternFunc("batch_mla_run"), rx.ExternFunc("batch_mla_plan")] if attn_kind_single == "mla" else [])
         attn_merge_functions = [
             bb.add_func(_merge_state_inplace(num_attention_heads, v_head_dim, dtype, target, "tir_attention_merge_state"), "tir_attention_merge_state"),
         ]
@@ -463,7 +464,7 @@ def __init__(  # pylint: disable=too-many-locals
             rx.op.zeros((), dtype),
             bb.add_func(_kv_cache_transpose_append(num_key_value_heads, qk_head_dim, dtype), "kv_cache_transpose_append"),
             bb.add_func(_kv_cache_transpose_append_mla(qk_head_dim, dtype), "kv_cache_transpose_append_mla"),
-            rx.Tuple([rx.StringImm("flashinfer"), rx.ExternFunc("batch_prefill_with_ragged_kv_cache_run"), rx.ExternFunc("batch_prefill_with_kv_cache_plan")]),
+            ragged_prefill_function,
             *mha_functions,
             mla_function,
             rx.Tuple(attn_merge_functions),
 
@@ -59,11 +59,18 @@ std::unique_ptr<RaggedPrefillFunc> ConvertRaggedPrefillFunc(ffi::Array<ffi::Any>
     return std::make_unique<TIRRaggedPrefillFunc>(std::move(attn_func), attn_kind);
   }
   if (backend_name == "flashinfer") {
-    CHECK_EQ(args.size(), 3);
+    CHECK(args.size() == 3 || args.size() == 5);
     ffi::Function attn_func = args[1].cast<ffi::Function>();
     ffi::Function plan_func = args[2].cast<ffi::Function>();
+    int64_t qk_head_dim_override = -1;
+    int64_t v_head_dim_override = -1;
+    if (args.size() == 5) {
+      qk_head_dim_override = args[3].cast<int64_t>();
+      v_head_dim_override = args[4].cast<int64_t>();
+    }
     return std::make_unique<FlashInferRaggedPrefillFunc>(std::move(attn_func), std::move(plan_func),
-                                                         attn_kind);
+                                                         attn_kind, qk_head_dim_override,
+                                                         v_head_dim_override);
   }
   LOG(FATAL) << "Cannot reach here";
   throw;