Commit ed83138

fix wrong size calculation
Signed-off-by: jiahanc <[email protected]>
1 parent 6d59e6f commit ed83138

5 files changed: +32 -35 lines changed

csrc/trtllm_batched_gemm_runner.cu

Lines changed: 0 additions & 1 deletion
@@ -169,7 +169,6 @@ void TrtllmGenBatchedGemmRunner::run(
   auto const configs = bmm.getBatchedGemmConfigs();
 
   auto const& config = configs[configIndex];
-  std::cout << "config.mFunctionName: " << config.mFunctionName << std::endl;
   FLASHINFER_CHECK(numBatches > 0, "Batched GEMM requires numBatches > 0");
   if (!mOptions.staticBatch) {
     FLASHINFER_CHECK(totalNumPaddedTokens,

csrc/trtllm_fused_moe_kernel_launcher.cu

Lines changed: 24 additions & 26 deletions
@@ -62,8 +62,7 @@ void trtllm_fp8_per_tensor_scale_moe_launcher(
 
   if (use_routing_scales_on_input) {
     TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_bfloat16) << "routing_logits must be bfloat16.";
-  } else if (static_cast<RoutingMethodType>(routing_method_type) ==
-             RoutingMethodType::DeepSeekV3) {
+  } else if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::DeepSeekV3) {
     TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_float32) << "routing_logits must be float.";
   } else {
     TVM_FFI_ICHECK_EQ(routing_logits->dtype, dl_bfloat16) << "routing_logits must be bfloat16.";
@@ -99,8 +98,7 @@ void trtllm_fp8_per_tensor_scale_moe_launcher(
              RoutingMethodType::RenormalizeNaive) {
     TVM_FFI_LOG_AND_THROW(NotImplementedError)
         << "Don't support routing method type Renormalize(Naive).";
-  } else if (static_cast<RoutingMethodType>(routing_method_type) ==
-             RoutingMethodType::Llama4) {
+  } else if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::Llama4) {
     TVM_FFI_ICHECK_EQ(top_k, 1)
         << "Current routing kernel (no groups, Llama4) only supports top_k=1.";
   }
@@ -144,7 +142,8 @@ void trtllm_fp8_per_tensor_scale_moe_launcher(
   args.topk_group = topk_group.has_value() ? topk_group.value() : 0;
   args.local_expert_offset = local_expert_offset;
   args.local_num_experts = local_num_experts;
-  args.routed_scaling_factor = routed_scaling_factor.has_value() ? routed_scaling_factor.value() : 1.0;
+  args.routed_scaling_factor =
+      routed_scaling_factor.has_value() ? routed_scaling_factor.value() : 1.0;
   args.intermediate_size = intermediate_size;
   args.mUseRoutingScalesOnInput = use_routing_scales_on_input;
 
@@ -300,9 +299,10 @@ void trtllm_fp8_per_tensor_scale_moe(
     TensorView gemm1_weights, TensorView output1_scales_scalar,
     TensorView output1_scales_gate_scalar, TensorView gemm2_weights,
     TensorView output2_scales_scalar, TensorView output, int64_t num_experts, int64_t top_k,
-    Optional<int64_t> n_group, Optional<int64_t> topk_group, int64_t intermediate_size, int64_t local_expert_offset,
-    int64_t local_num_experts, Optional<double> routed_scaling_factor, bool use_routing_scales_on_input,
-    int64_t tile_tokens_dim, int64_t routing_method_type, bool enable_pdl) {
+    Optional<int64_t> n_group, Optional<int64_t> topk_group, int64_t intermediate_size,
+    int64_t local_expert_offset, int64_t local_num_experts, Optional<double> routed_scaling_factor,
+    bool use_routing_scales_on_input, int64_t tile_tokens_dim, int64_t routing_method_type,
+    bool enable_pdl) {
   auto dtype = hidden_states->dtype;
   if (dtype == dl_float16 || dtype == dl_bfloat16 || dtype == dl_float8_e4m3fn) {
     trtllm_fp8_per_tensor_scale_moe_launcher(
@@ -320,10 +320,11 @@ void trtllm_fp8_block_scale_moe_launcher(
     TensorView routing_logits, Optional<TensorView> routing_bias, TensorView hidden_states,
     TensorView hidden_states_scale, TensorView gemm1_weights, TensorView gemm1_weights_scale,
     TensorView gemm2_weights, TensorView gemm2_weights_scale, TensorView output,
-    int64_t const num_experts, int64_t const top_k, Optional<int64_t> const n_group, Optional<int64_t> const topk_group,
-    int64_t const intermediate_size, int64_t const local_expert_offset,
-    int64_t const local_num_experts, Optional<double> const routed_scaling_factor,
-    int64_t const tile_tokens_dim, int64_t const routing_method_type,
+    int64_t const num_experts, int64_t const top_k, Optional<int64_t> const n_group,
+    Optional<int64_t> const topk_group, int64_t const intermediate_size,
+    int64_t const local_expert_offset, int64_t const local_num_experts,
+    Optional<double> const routed_scaling_factor, int64_t const tile_tokens_dim,
+    int64_t const routing_method_type,
     tensorrt_llm::kernels::trtllmgen_moe::MoE::Runner& moe_runner, int64_t moeConfigIndex,
     bool enable_pdl) {
   static const std::tuple<int, int> device_props = [hidden_states] {
@@ -380,8 +381,7 @@ void trtllm_fp8_block_scale_moe_launcher(
              RoutingMethodType::RenormalizeNaive) {
     TVM_FFI_ICHECK(top_k <= 10 && top_k > 0)
         << "Current routing kernel (no groups, renormalize) only supports top_k<=10 && top_k>0.";
-  } else if (static_cast<RoutingMethodType>(routing_method_type) ==
-             RoutingMethodType::Llama4) {
+  } else if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::Llama4) {
     TVM_FFI_ICHECK_EQ(top_k, 1)
         << "Current routing kernel (no groups, Llama4) only supports top_k=1.";
   }
@@ -424,7 +424,8 @@ void trtllm_fp8_block_scale_moe_launcher(
   args.topk_group = topk_group.has_value() ? topk_group.value() : 0;
   args.local_expert_offset = local_expert_offset;
   args.local_num_experts = local_num_experts;
-  args.routed_scaling_factor = routed_scaling_factor.has_value() ? routed_scaling_factor.value() : 1.0;
+  args.routed_scaling_factor =
+      routed_scaling_factor.has_value() ? routed_scaling_factor.value() : 1.0;
   args.intermediate_size = intermediate_size;
   args.mUseDeepSeekFp8 = true;
 
@@ -610,11 +611,11 @@ void trtllm_fp8_block_scale_moe(TensorView routing_logits, Optional<TensorView>
                                 TensorView gemm1_weights, TensorView gemm1_weights_scale,
                                 TensorView gemm2_weights, TensorView gemm2_weights_scale,
                                 TensorView output, int64_t num_experts, int64_t top_k,
-                                Optional<int64_t> n_group, Optional<int64_t> topk_group, int64_t intermediate_size,
-                                int64_t local_expert_offset, int64_t local_num_experts,
-                                Optional<double> routed_scaling_factor, int64_t tile_tokens_dim,
-                                int64_t routing_method_type, bool use_shuffled_weight,
-                                int64_t weight_layout, bool enable_pdl) {
+                                Optional<int64_t> n_group, Optional<int64_t> topk_group,
+                                int64_t intermediate_size, int64_t local_expert_offset,
+                                int64_t local_num_experts, Optional<double> routed_scaling_factor,
+                                int64_t tile_tokens_dim, int64_t routing_method_type,
+                                bool use_shuffled_weight, int64_t weight_layout, bool enable_pdl) {
   auto dtype = hidden_states->dtype;
   if (dtype == dl_float16 || dtype == dl_bfloat16 || dtype == dl_float8_e4m3fn) {
     using RunnerType = tensorrt_llm::kernels::trtllmgen_moe::MoE::Runner;
@@ -829,11 +830,9 @@ Array<Tensor> trtllm_fp4_block_scale_moe_launcher(
   //     {args.num_tokens, args.top_k}, routing_bias_dtype, hidden_states->device);
   // Tensor expert_indexes = alloc_tensor(
   //     {args.num_tokens, args.top_k}, dl_int32, hidden_states->device);
-  int constexpr MAX_NUM_EXPERTS = 384;
-  Tensor expert_count_histogram = alloc_tensor(
-      {2 * MAX_NUM_EXPERTS},
-      dl_int32,  // 256 is the max number of threads per block and max number of experts
-      hidden_states->device);
+  int64_t const size_of_expert_count_histogram = std::max(num_experts * 2, int64_t(256 * 2));
+  Tensor expert_count_histogram =
+      alloc_tensor({size_of_expert_count_histogram}, dl_int32, hidden_states->device);
 
   auto const sf_vec_size = dtype_weights == btg::Dtype::MxE2m1 ? 32 : 16;
 
@@ -1035,7 +1034,6 @@ Array<Tensor> trtllm_fp4_block_scale_moe_launcher(
   workspace.gemm1_output_scale = gemm1_output_scale.has_value()
                                      ? static_cast<float*>(gemm1_output_scale.value()->data)
                                      : nullptr;
-
   // gemm2 intermediate ws
   workspace.gemm2_output = gemm2_output->data;
   workspace.gemm2_output_scale = nullptr;
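
The expert_count_histogram hunk above is the size fix named in the commit message: the buffer was previously sized from a hard-coded MAX_NUM_EXPERTS = 384, independent of the runtime num_experts, and is now sized as max(num_experts * 2, 256 * 2). A minimal standalone sketch of that sizing rule (the helper name and the sample expert counts below are illustrative, not taken from the source):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Sketch of the corrected sizing rule from the hunk above: two int32 counters
// per expert, with a floor of 2 * 256 entries (the removed comment cites 256
// as the max number of threads per block).
int64_t expert_count_histogram_size(int64_t num_experts) {
  return std::max(num_experts * 2, int64_t(256 * 2));
}

int main() {
  // The old code allocated a fixed 2 * 384 = 768 entries regardless of num_experts.
  assert(expert_count_histogram_size(128) == 512);   // floor applies
  assert(expert_count_histogram_size(384) == 768);   // matches the old fixed size
  assert(expert_count_histogram_size(512) == 1024);  // exceeds the old fixed 768
  return 0;
}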

include/flashinfer/trtllm/fused_moe/DevKernel.h

Lines changed: 3 additions & 3 deletions
@@ -122,7 +122,7 @@ namespace moe::dev {
         LAUNCH_ESC(__nv_bfloat16, __nv_bfloat16, 128 /* Always 128 for llama4*/), kernel, \
         numBlocks, numThreads, smemSize, stream); \
   } else { \
-    FLASHINFER_WARN("Unsupported dtypeExpW"); \
+    FLASHINFER_WARN("Unsupported dtypeExpW"); \
   }
 
 #define LAUNCH_ROUTING_WITH_NUM_EXPERTS_FORCE_FLOAT_INPUT(data, coopLaunch, kernel, numBlocks, \
@@ -147,7 +147,7 @@ namespace moe::dev {
     LAUNCH_PDL(data, coopLaunch, LAUNCH_ESC(__nv_bfloat16, __nv_bfloat16, numExperts, false), \
                kernel, numBlocks, numThreads, smemSize, stream); \
   } else { \
-    FLASHINFER_WARN("Unsupported dtypeExpW"); \
+    FLASHINFER_WARN("Unsupported dtypeExpW"); \
   }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -167,7 +167,7 @@ namespace moe::dev {
     LAUNCH_PDL(data, coopLaunch, LAUNCH_ESC(__nv_bfloat16, __nv_bfloat16, numExperts, false), \
                kernel, numBlocks, numThreads, smemSize, stream); \
   } else { \
-    FLASHINFER_WARN("Unsupported dtypeExpW"); \
+    FLASHINFER_WARN("Unsupported dtypeExpW"); \
   }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////

tests/conftest.py

Lines changed: 2 additions & 2 deletions
@@ -137,11 +137,11 @@ def is_cuda_oom_error_str(e: str) -> bool:
     return "CUDA" in e and "out of memory" in e
 
 
-@pytest.hookimpl(hookwrapper=True)
+@pytest.hookimpl(tryfirst=True)
 def pytest_runtest_call(item):
     # Wrap the test call so we don't invoke item.runtest() ourselves; yield lets pytest run it.
     try:
-        yield
+        item.runtest()
     except (torch.cuda.OutOfMemoryError, RuntimeError) as e:
         if isinstance(e, torch.cuda.OutOfMemoryError) or is_cuda_oom_error_str(str(e)):
             pytest.skip("Skipping due to OOM")

tests/moe/test_trtllm_gen_fused_moe.py

Lines changed: 3 additions & 3 deletions
@@ -1838,7 +1838,7 @@ def cache_permute_indices():
 
 @pytest.mark.parametrize("num_tokens", [1, 8, 1024])
 @pytest.mark.parametrize("hidden_size", [1024, 8192])
-@pytest.mark.parametrize("intermediate_size", [ 1024, 768, 384, 512])
+@pytest.mark.parametrize("intermediate_size", [1024, 768, 384, 512])
 @pytest.mark.parametrize(
     "moe_impl",
     [
@@ -1913,7 +1913,7 @@ def cache_permute_indices():
             "routed_scaling": None,
             "has_routing_bias": False,
             "routing_method_type": RoutingMethodType.Renormalize,
-            "compatible_moe_impls": [FP8PerTensorMoe, FP8BlockScaleMoe, FP4Moe],
+            "compatible_moe_impls": [FP8BlockScaleMoe, FP4Moe],
         },
         id="Renorm",
         # marks=pytest.mark.skip(
@@ -2085,7 +2085,7 @@ def test_moe_quantization_classes(
         )
         else 64,
     )
-
+    padding = tile_tokens_dim
     # Validation checks
     assert top_k <= num_experts
     # assert top_k <= 8
