Merged

Changes from 1 commit

Commits (50)
c9ca102  Move apply_w8a8_block_fp8_linear to an op class  (ElizaWszola, Sep 11, 2025)
eef4349  Remove TODO, bring back old one  (ElizaWszola, Sep 11, 2025)
dd53183  CUDA graphs fix  (ElizaWszola, Sep 11, 2025)
bb24881  Clean up  (ElizaWszola, Sep 11, 2025)
1ba47cd  Create linear op objects conditionally, move some arch checks to bloc…  (ElizaWszola, Sep 11, 2025)
02793b9  format  (ElizaWszola, Sep 11, 2025)
b72c9f2  clean up repetitive code  (ElizaWszola, Sep 12, 2025)
d51f35c  More aggressive dispatch of blockscale ops  (ElizaWszola, Sep 12, 2025)
a6ae689  fix  (ElizaWszola, Sep 12, 2025)
3238ff6  Deep_gemm fix  (ElizaWszola, Sep 12, 2025)
f9c79aa  Merge branch 'main' into move-apply_w8a8_block_fp8_linear-to-class  (ElizaWszola, Sep 12, 2025)
23341c2  Merge branch 'main' into move-apply_w8a8_block_fp8_linear-to-class  (ElizaWszola, Sep 12, 2025)
9b09b60  Post-merge fixes, better dispatch  (ElizaWszola, Sep 12, 2025)
e6b0028  small fixes  (ElizaWszola, Sep 12, 2025)
9b5c552  Merge branch 'main' into move-apply_w8a8_block_fp8_linear-to-class  (ElizaWszola, Sep 15, 2025)
ef6f1e2  Fix cutlass compilation issue on Hopper  (ElizaWszola, Sep 17, 2025)
77335de  Cleanup bad transpose  (ElizaWszola, Sep 17, 2025)
5eaf155  Merge branch 'main' into move-apply_w8a8_block_fp8_linear-to-class  (ElizaWszola, Sep 17, 2025)
e036dac  Wrap w8a8_block_fp8_matmul  (ElizaWszola, Sep 17, 2025)
233e874  Rename padded_cutlass to padded_cutlass_scaled_mm, add todo  (ElizaWszola, Sep 17, 2025)
1edfedc  Cleanup dispatch_w8a8_blockscale_func  (ElizaWszola, Sep 17, 2025)
35a0236  Merge branch 'main' into move-apply_w8a8_block_fp8_linear-to-class  (ElizaWszola, Sep 18, 2025)
0ac3a1e  Deep gemm warmup fix  (ElizaWszola, Sep 18, 2025)
9a48100  Fix deep gemm support function  (ElizaWszola, Sep 18, 2025)
b6a8fb8  Feedback  (ElizaWszola, Sep 19, 2025)
e89ecd8  Pre-commit fixes  (ElizaWszola, Sep 19, 2025)
00cb05c  Pre-commit fixes 2  (ElizaWszola, Sep 19, 2025)
66c89e6  Feedback  (ElizaWszola, Sep 19, 2025)
d9b4121  fix type issue  (ElizaWszola, Sep 19, 2025)
1bc81a1  Add use_ue8m0 support to _quantize_group_native  (ElizaWszola, Sep 19, 2025)
ec73268  Fix padding compilation issue  (ElizaWszola, Sep 22, 2025)
d19bf4b  Feedback  (ElizaWszola, Sep 22, 2025)
1f895e9  Update vllm/model_executor/layers/quantization/utils/fp8_utils.py  (ElizaWszola, Sep 22, 2025)
be3ac58  Link bad group shape issue  (ElizaWszola, Sep 22, 2025)
3772f2f  format  (ElizaWszola, Sep 22, 2025)
8b6cbe4  Merge branch 'main' into move-apply_w8a8_block_fp8_linear-to-class  (ElizaWszola, Sep 22, 2025)
2a87a3b  fix quant config condition  (ElizaWszola, Sep 22, 2025)
012eaff  Merge branch 'main' into move-apply_w8a8_block_fp8_linear-to-class  (mgoin, Sep 22, 2025)
e7f6ec9  fix quant issue (TODO test)  (ProExpertProg, Sep 22, 2025)
10829d3  fix custom op test  (ProExpertProg, Sep 22, 2025)
15cf30e  Merge branch 'main' into move-apply_w8a8_block_fp8_linear-to-class  (ElizaWszola, Sep 23, 2025)
ebdcb10  CUDA condition for compressed tensors and H100  (ElizaWszola, Sep 23, 2025)
2e3d206  Fix quantfp8 test  (ElizaWszola, Sep 23, 2025)
bd32cb9  Test scales_col vs. scales_native  (ElizaWszola, Sep 23, 2025)
efa4446  Merge branch 'main' into move-apply_w8a8_block_fp8_linear-to-class  (ElizaWszola, Sep 23, 2025)
1f00804  Add compressed tensors model test  (ElizaWszola, Sep 23, 2025)
e895df6  Extra asserts, don't use enabled()  (ElizaWszola, Sep 23, 2025)
9806cf8  CUDA path for quant  (ProExpertProg, Sep 23, 2025)
2ae1ef9  Merge branch 'main' into move-apply_w8a8_block_fp8_linear-to-class  (ProExpertProg, Sep 23, 2025)
00bd638  Merge branch 'main' into move-apply_w8a8_block_fp8_linear-to-class  (ProExpertProg, Sep 23, 2025)
13 changes: 10 additions & 3 deletions vllm/model_executor/layers/quantization/fp8.py
@@ -49,7 +49,9 @@
from vllm.platforms import current_platform
from vllm.scalar_type import scalar_types
from vllm.utils import has_deep_gemm
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used, is_deep_gemm_supported
from vllm.utils.deep_gemm import (is_deep_gemm_e8m0_used,
is_deep_gemm_supported,
should_use_deepgemm_for_fp8_linear)
from vllm.utils.flashinfer import has_flashinfer_moe

if TYPE_CHECKING:
@@ -251,8 +253,10 @@ def __init__(self, quant_config: Fp8Config):
act_quant_group_shape=self.act_q_group_shape)

self.w8a8_block_fp8_linear = W8A8BlockFp8LinearOp(
self.cutlass_block_fp8_supported,
self.use_aiter_and_is_supported,
cutlass_block_fp8_supported=self.cutlass_block_fp8_supported,
use_aiter_and_is_supported=self.use_aiter_and_is_supported,
ue8m0_deepgemm_supported=is_deep_gemm_e8m0_used(),
is_blackwell=current_platform.has_device_capability(100),
)

def create_weights(
@@ -365,6 +369,9 @@ def create_weights(
else:
layer.register_parameter("input_scale", None)

self.w8a8_block_fp8_linear.set_should_use_deepgemm(
should_use_deepgemm_for_fp8_linear(self.out_dtype, weight))

def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
# Pad the weight tensor. This is an optimization on ROCm platform, which
# can benefit from tensors located far enough from one another in memory
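To make the new configuration flow concrete, here is a minimal, self-contained sketch of the pattern this file now follows. The class and flag values below are illustrative stand-ins, not vLLM's real signatures: capability flags are fixed as keyword arguments when the op object is built in __init__, while the DeepGEMM decision depends on the actual weights and is therefore injected later from create_weights via a setter.

class BlockFp8LinearOpSketch:
    def __init__(self, *, cutlass_block_fp8_supported: bool,
                 use_aiter_and_is_supported: bool,
                 ue8m0_deepgemm_supported: bool,
                 is_blackwell: bool):
        # Capability flags are known at construction time.
        self.cutlass_block_fp8_supported = cutlass_block_fp8_supported
        self.use_aiter_and_is_supported = use_aiter_and_is_supported
        self.ue8m0_deepgemm_supported = ue8m0_deepgemm_supported
        self.is_blackwell = is_blackwell
        # The weight-dependent decision is unknown until weights exist.
        self.should_use_deepgemm = False

    def set_should_use_deepgemm(self, flag: bool) -> None:
        self.should_use_deepgemm = flag


# Construction mirrors Fp8LinearMethod.__init__ above (flag values assumed):
op = BlockFp8LinearOpSketch(cutlass_block_fp8_supported=True,
                            use_aiter_and_is_supported=False,
                            ue8m0_deepgemm_supported=False,
                            is_blackwell=False)
# Later, once the layer's weights are created (mirrors create_weights above):
op.set_should_use_deepgemm(True)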
23 changes: 18 additions & 5 deletions vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -112,7 +112,7 @@
# TODO fix ROCm->Triton custom path:
# https://github.com/vllm-project/vllm/issues/14397
class W8A8BlockFp8LinearOp:
"""

Check failure on line 115 (GitHub Actions / pre-commit, Ruff E501): vllm/model_executor/layers/quantization/utils/fp8_utils.py:115:81: Line too long (81 > 80)
This class executes a Blocked FP8 linear layer using cutlass if supported and
torch.scaled_mm otherwise.
"""
@@ -121,9 +121,20 @@
self,
cutlass_block_fp8_supported: bool = CUTLASS_BLOCK_FP8_SUPPORTED,
use_aiter_and_is_supported: bool = False,
ue8m0_deepgemm_supported: bool = False,
is_blackwell: bool = False,
):
self.cutlass_block_fp8_supported = cutlass_block_fp8_supported
self.use_aiter_and_is_supported = use_aiter_and_is_supported
self.ue8m0_deepgemm_supported = ue8m0_deepgemm_supported
self.is_blackwell = is_blackwell
self.should_use_deepgemm = False

def set_should_use_deepgemm(
self,
should_use_deepgemm: bool,
):
self.should_use_deepgemm = should_use_deepgemm

def apply(
self,
@@ -140,7 +151,7 @@
output_shape = [*input.shape[:-1], weight.shape[0]]
output_dtype = input.dtype

if should_use_deepgemm_for_fp8_linear(output_dtype, weight):
if self.should_use_deepgemm:

input_2d = input.view(-1, input.shape[-1])
output_shape = [*input.shape[:-1], weight.shape[0]]
@@ -149,6 +160,7 @@
input_2d,
block_size[1],
column_major_scales=True,
use_ue8m0=self.ue8m0_deepgemm_supported,
)

# ensure DeepGEMM-backed custom op is registered before use
@@ -166,12 +178,11 @@
return output.to(dtype=output_dtype).view(*output_shape)

if current_platform.is_cuda():
if current_platform.has_device_capability(100):

if self.is_blackwell:
use_cutlass = self.cutlass_block_fp8_supported and (
cdiv(weight.shape[0], 128) == weight_scale.shape[0]
and cdiv(weight.shape[1], 128) == weight_scale.shape[1])
else:

Check failure on line 185 (GitHub Actions / pre-commit, Ruff E501): vllm/model_executor/layers/quantization/utils/fp8_utils.py:185:81: Line too long (83 > 80)
# TODO: update this after switching to public sm90 block scale gemm
# as it also supports weight.shape % 128 != 0
use_cutlass = self.cutlass_block_fp8_supported and (
@@ -183,7 +194,8 @@
use_cutlass, self.use_aiter_and_is_supported)
if use_cutlass:
q_input, x_scale = per_token_group_quant_fp8(
input_2d, block_size[1], column_major_scales=use_cutlass)
input_2d, block_size[1], column_major_scales=use_cutlass,
use_ue8m0=self.ue8m0_deepgemm_supported)
output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale,
block_size, input.dtype)
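As a side note on the use_cutlass condition in the Blackwell branch above: the cdiv comparison checks that the weight_scale tensor carries exactly one scale per 128x128 weight block. A rough standalone illustration follows; the helper and the shapes are assumed for the example, not taken from the PR.

def cdiv(a: int, b: int) -> int:
    # Ceiling division, a stand-in for the cdiv helper used above.
    return -(a // -b)

def scales_match_block_layout(weight_shape, scale_shape, block: int = 128) -> bool:
    n, k = weight_shape
    return (cdiv(n, block), cdiv(k, block)) == tuple(scale_shape)

print(scales_match_block_layout((7168, 2048), (56, 16)))  # True: 7168/128 = 56, 2048/128 = 16
print(scales_match_block_layout((7168, 2000), (56, 16)))  # True: ceil(2000/128) = 16
print(scales_match_block_layout((7168, 2048), (56, 15)))  # False: scale grid does not match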

@@ -193,7 +205,8 @@
input_2d.contiguous(), quant_dtype=rocm_aiter.dtypes.fp8)
else:
q_input, x_scale = per_token_group_quant_fp8(
input_2d, block_size[1], column_major_scales=use_cutlass)
input_2d, block_size[1], column_major_scales=use_cutlass,
use_ue8m0=self.ue8m0_deepgemm_supported)

output = w8a8_blockscale_func(q_input, weight, x_scale, weight_scale,
block_size, input.dtype)
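Stepping back, the apply path in this file now picks a backend in a fixed priority order. The sketch below is a simplified pure-Python paraphrase of that dispatch; the backend labels and the helper name are illustrative, while the real code selects kernel functions via dispatch_w8a8_blockscale_func and quantizes activations accordingly.

def pick_w8a8_block_fp8_backend(*, should_use_deepgemm: bool,
                                use_cutlass: bool,
                                use_aiter: bool) -> str:
    if should_use_deepgemm:
        # DeepGEMM path: per-token-group activation quant with
        # column-major scales (ue8m0 scales when supported).
        return "deepgemm"
    if use_cutlass:
        # Cutlass block-scaled GEMM, also with column-major activation scales.
        return "cutlass"
    if use_aiter:
        # ROCm aiter path quantizes via its own fp8 helpers.
        return "aiter"
    # Default Triton/native w8a8 block fp8 matmul.
    return "triton"

assert pick_w8a8_block_fp8_backend(should_use_deepgemm=True,
                                   use_cutlass=True,
                                   use_aiter=False) == "deepgemm"
assert pick_w8a8_block_fp8_backend(should_use_deepgemm=False,
                                   use_cutlass=False,
                                   use_aiter=False) == "triton"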