Merged
69 commits
bb94782
load w8a8
yiliu30 Aug 11, 2025
9bef826
refactor
yiliu30 Aug 12, 2025
b30a126
add ut
yiliu30 Aug 12, 2025
eaad3a6
remove example
yiliu30 Aug 12, 2025
c411ca5
fix typo
yiliu30 Aug 12, 2025
9802313
Merge branch 'main' into wfp8-afp8
yiliu30 Aug 12, 2025
6597d5c
Update auto_round/export/export_to_autoround/export_to_fp8_woq.py
yiliu30 Aug 13, 2025
9b0f32f
Update export_to_fp8_woq.py
yiliu30 Aug 13, 2025
c32daa6
Merge branch 'main' into wfp8-afp8
yiliu30 Aug 13, 2025
c136339
megre main
yiliu30 Aug 24, 2025
5ebca24
update shape
yiliu30 Aug 24, 2025
03cb217
refactor
yiliu30 Aug 26, 2025
e7280f6
Merge branch 'main' into wfp8-afp8
yiliu30 Aug 26, 2025
66388e5
tmp add bk
yiliu30 Aug 26, 2025
17ddd2d
refactor code
yiliu30 Aug 27, 2025
808449d
refine code
yiliu30 Aug 27, 2025
f74ed6f
fix device list
yiliu30 Aug 27, 2025
632cf8a
fix
yiliu30 Aug 27, 2025
5b8b29d
refactor code
yiliu30 Aug 27, 2025
57b4c19
fix
yiliu30 Aug 27, 2025
bdf5f3e
update
yiliu30 Aug 27, 2025
ce3384f
fix ut
yiliu30 Aug 27, 2025
7cea90e
Merge branch 'main' into wfp8-afp8
yiliu30 Aug 28, 2025
22d11de
correct
yiliu30 Aug 28, 2025
9082613
clean
yiliu30 Aug 28, 2025
6503355
Merge branch 'wfp8-afp8' of https://github.com/intel/auto-round into …
yiliu30 Aug 28, 2025
b687633
Merge branch 'main' into wfp8-afp8
yiliu30 Aug 28, 2025
2202856
fix shape
yiliu30 Aug 28, 2025
10f5753
Merge branch 'wfp8-afp8' of https://github.com/intel/auto-round into …
yiliu30 Aug 29, 2025
cc42e47
merge with main
yiliu30 Aug 29, 2025
d0b99a8
fix check
yiliu30 Aug 29, 2025
31845d0
clean code
yiliu30 Aug 29, 2025
fdecdde
merge
yiliu30 Sep 4, 2025
1f2e674
fix backend check
yiliu30 Sep 4, 2025
b56ad25
Merge branch 'main' into wfp8-afp8
yiliu30 Sep 4, 2025
4cec318
update config
yiliu30 Sep 4, 2025
6b2962f
revert change
yiliu30 Sep 4, 2025
638718e
fix
yiliu30 Sep 4, 2025
4df3e8f
fix
yiliu30 Sep 4, 2025
e01603c
update
yiliu30 Sep 4, 2025
0cdf28b
propagate the config
yiliu30 Sep 4, 2025
27910da
pass config to checker
yiliu30 Sep 4, 2025
d46acdb
add more check
yiliu30 Sep 4, 2025
fd05799
refine code
yiliu30 Sep 4, 2025
3d75c27
fix equal check
yiliu30 Sep 4, 2025
e0c0d58
fix equal check
yiliu30 Sep 4, 2025
75f2928
Merge branch 'wfp8-afp8' of https://github.com/intel/auto-round into …
yiliu30 Sep 4, 2025
fa3ec2d
fix get
yiliu30 Sep 4, 2025
ad5269e
rename
yiliu30 Sep 5, 2025
35e45ed
update check
yiliu30 Sep 5, 2025
f4e254b
add warning
yiliu30 Sep 5, 2025
7cba242
Merge branch 'main' into wfp8-afp8
yiliu30 Sep 5, 2025
ff5a1e9
rename check
yiliu30 Sep 5, 2025
b98f3db
Merge branch 'main' into wfp8-afp8
yiliu30 Sep 5, 2025
50968fd
rename
yiliu30 Sep 5, 2025
586d6a2
Merge branch 'main' into wfp8-afp8
yiliu30 Sep 5, 2025
5e84ff9
Merge branch 'main' into wfp8-afp8
yiliu30 Sep 6, 2025
abd83ac
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 6, 2025
9e2c63f
Merge branch 'main' into wfp8-afp8
yiliu30 Sep 8, 2025
d332a95
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 8, 2025
94508e3
Merge branch 'main' into wfp8-afp8
yiliu30 Sep 8, 2025
8a4a533
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 8, 2025
f05e38b
fix
yiliu30 Sep 9, 2025
d759ca3
Merge branch 'main' into wfp8-afp8
yiliu30 Sep 9, 2025
c58a61c
update
yiliu30 Sep 9, 2025
c89ffc0
Merge branch 'main' into wfp8-afp8
yiliu30 Sep 9, 2025
04ae0fd
Merge branch 'main' into wfp8-afp8
yiliu30 Sep 10, 2025
2c34244
fix
yiliu30 Sep 10, 2025
b3a0910
Merge branch 'wfp8-afp8' of https://github.com/intel/auto-round into …
yiliu30 Sep 10, 2025
10 changes: 6 additions & 4 deletions auto_round/autoround.py
@@ -19,6 +19,7 @@
import time
import traceback
from dataclasses import asdict, fields
from enum import Enum
from typing import Any, Callable, Union

import accelerate
@@ -30,6 +31,7 @@

from auto_round.data_type import QUANT_FUNC_WITH_DTYPE
from auto_round.data_type.utils import reshape_pad_tensor_by_group_size
from auto_round.export.export_to_autoround import AutoRoundFormat
from auto_round.export.export_to_gguf.config import GGUF_CONFIG, GGUF_INNER_CONFIG, ModelType
from auto_round.low_cpu_mem.utils import get_layers_before_block
from auto_round.schemes import QuantizationScheme, preset_name_to_scheme
@@ -857,8 +859,8 @@ def remove_duplicates(lst):
format = "auto_round:auto_awq"
elif is_nv_fp(self.data_type) or is_mx_fp(self.data_type):
format = f"auto_round:{self.data_type}"
elif is_wfp8afp8(self): # staic wfp8afp8
format = "auto_round:fp8"
elif is_static_wfp8afp8(self): # staic wfp8afp8
Contributor comment: @WeiweiZhang1 you have an AR to refine the formats-related code; please be aware of this change.

format = f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}"
elif self.data_type == "fp" and self.bits == 8 and self.act_bits >= 16: # woq fp8
format = "auto_round:fp8"
elif self.act_bits < 16:
@@ -956,10 +958,10 @@ def _check_supported_format(self, format: str) -> bool:
)
format = "fake"
else:
if not (format == "auto_round" or format == "auto_round:fp8"):
if not (format == "auto_round" or format == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}"):
logger.warning(
f"Currently only support to export auto_round or fake format for static W{self.bits}AFP8 model,"
" change format to auto_round"
f" change format {format} to auto_round"
)
format = "auto_round"
if self.act_group_size != 0 and not self.act_dynamic and format == "auto_round:fp8":
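
For reviewers who want to exercise the new path end to end, a minimal usage sketch follows. Only the format string "auto_round:fp8_static" comes from this diff; the model name and the AutoRound constructor keywords (chosen to mirror the attributes checked in this hunk: data_type, bits, act_bits, act_dynamic, act_group_size) are illustrative assumptions rather than a prescribed recipe.

# Hedged usage sketch: request the new static W8A8-FP8 export format.
# The keyword values below are assumptions; only "auto_round:fp8_static" is from the PR.
from transformers import AutoModelForCausalLM, AutoTokenizer

from auto_round import AutoRound

model_name = "facebook/opt-125m"  # small model, illustration only
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

ar = AutoRound(
    model,
    tokenizer,
    data_type="fp",      # FP weight data type
    bits=8,              # FP8 weights
    act_bits=8,          # FP8 activations
    act_dynamic=False,   # static activation scales
    act_group_size=0,    # per-tensor activation scale
)
ar.quantize()
ar.save_quantized("./opt-125m-w8a8-fp8", format="auto_round:fp8_static")
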
54 changes: 54 additions & 0 deletions auto_round/experimental/qmodules/base.py
@@ -0,0 +1,54 @@
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from abc import ABC, abstractmethod
from typing import Optional, Union

import torch

__all__ = ["QModuleBase"]


class QModuleBase(torch.nn.Module):
"""
Base class used to describe the weight creation and forward pass
of different quantization schemes supported by Auto-Round.
The design is inspired by vLLM's CompressedTensorsScheme:
https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py

"""

def __init__(self):
super().__init__()

@classmethod
@abstractmethod
def from_original(cls, config, original_layer: torch.nn.Module):
raise NotImplementedError

@classmethod
@abstractmethod
def get_min_capability(cls) -> int:
"""
Get minimum device capability.
"""
raise NotImplementedError

@abstractmethod
def process_weights_after_loading(self, layer: torch.nn.Module):
"""
Called after weight loading is complete for any cleanup that
needs to occur.
"""
raise NotImplementedError
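
To make the contract concrete, a minimal hypothetical subclass is sketched below; the class name and its internals are invented for illustration, and only the three overridden methods are what QModuleBase actually requires.

# Hypothetical minimal subclass illustrating the QModuleBase contract.
import torch

from auto_round.experimental.qmodules.base import QModuleBase


class NoopQuantLinear(QModuleBase):
    """Keeps the float weight unchanged; exists only to show the required API."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.zeros(out_features, in_features), requires_grad=False)

    @classmethod
    def from_original(cls, config, original_layer: torch.nn.Module):
        # Build the replacement module with the same shape as the float layer.
        new_layer = cls(original_layer.in_features, original_layer.out_features)
        new_layer.weight.data.copy_(original_layer.weight.data)
        return new_layer

    @classmethod
    def get_min_capability(cls) -> int:
        return 0  # no special device capability needed for this no-op scheme

    def process_weights_after_loading(self, layer: torch.nn.Module):
        pass  # nothing to repack after loading
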
122 changes: 122 additions & 0 deletions auto_round/experimental/qmodules/fp8_static.py
@@ -0,0 +1,122 @@
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


from typing import Optional, Union

import torch

from auto_round.experimental.qmodules.base import QModuleBase
from auto_round.utils import logger

__all__ = ["WeightFP8ActFP8StaticQuantLinear"]


def _quant_tensor_to_fp8_with_scale(tensor: torch.Tensor, scale: torch.Tensor):
FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max
qtensor = tensor / scale
clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE)
clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn)
return scale, clipped_qtensor_fp8


class WeightFP8ActFP8StaticQuantLinear(QModuleBase):
hp_dtype = torch.bfloat16
fp8_dtype = torch.float8_e4m3fn

def __init__(
self,
in_features,
out_features,
weight: Optional[torch.Tensor] = None,
weight_scale: Optional[torch.Tensor] = None,
bias: Union[torch.Tensor, bool, None] = None,
input_scale: Optional[torch.Tensor] = None,
dtype=torch.bfloat16,
):
super().__init__()
self.in_features = in_features
self.out_features = out_features
init_weight = torch.zeros((out_features, in_features), dtype=dtype) if weight is None else weight
self.weight = torch.nn.Parameter(init_weight, requires_grad=False)
self.dtype = dtype
if bias is not None:
if isinstance(bias, bool):
bias = torch.zeros((out_features,), dtype=dtype)
self.bias = torch.nn.Parameter(bias, requires_grad=False)
else:
self.register_parameter("bias", None)
init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale
self.register_buffer("weight_scale", init_weight_scale.to(dtype))

init_input_scale = torch.zeros((1), dtype=dtype) if input_scale is None else input_scale
self.register_buffer("input_scale", init_input_scale.to(dtype))
self.pre_dequantized = False

@classmethod
def get_min_capability(cls) -> int:
"""
Get minimum device capability.
"""
# TODO: correct that config once we add fp8 op support.
logger.warning_once("FP8 ops are not yet supported. Using capability 0.")
return 0

def process_weights_after_loading(self, layer: torch.nn.Module):
pass

@classmethod
def from_original(cls, config, original_layer):
"""
Create an `WeightFP8ActFP8StaticQuantLinear` layer from an original linear layer.
"""
logger.warning_once(
"FP8 static quantization is still in experimental stage, the inference speed might be slow."
)
device = original_layer.weight.device
with torch.device(device):
qdq_linear = cls(
in_features=original_layer.in_features,
out_features=original_layer.out_features,
bias=original_layer.bias,
)
return qdq_linear

def dequant_weight_online(self):
if self.pre_dequantized:
return self.weight
qdq_weight = self.weight.to(self.dtype) * self.weight_scale.unsqueeze(1)
return qdq_weight

def pre_dequantize(self):
if self.pre_dequantized:
return
dequant_weight = self.dequant_weight_online()
del self.weight
del self.weight_scale
self.weight = torch.nn.Parameter(dequant_weight, requires_grad=False)
self.pre_dequantized = True

def qdq_input(self, bf16_input: torch.Tensor):
input_scale, input_fp8 = _quant_tensor_to_fp8_with_scale(bf16_input, self.input_scale.data)
qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale
return qdq_input_bf16

@torch.no_grad()
def forward(self, bf16_input: torch.Tensor) -> torch.Tensor:

qdq_input = self.qdq_input(bf16_input)
qdq_weight = self.dequant_weight_online()
out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias)
return out
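
A short round-trip sketch of the QDQ path implemented above: quantize a bf16 weight per output channel to FP8, load it into the module, and run a forward pass. The per-channel scale computation and the calibration value used for input_scale are illustrative assumptions, not code from this PR.

# Hedged round-trip sketch for WeightFP8ActFP8StaticQuantLinear.
import torch

from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear

torch.manual_seed(0)
in_features, out_features = 16, 8
linear = torch.nn.Linear(in_features, out_features, dtype=torch.bfloat16)

fp8_max = torch.finfo(torch.float8_e4m3fn).max                     # 448.0 for e4m3fn
weight = linear.weight.data
weight_scale = weight.abs().amax(dim=1) / fp8_max                  # one scale per output channel
weight_fp8 = (weight / weight_scale.unsqueeze(1)).to(torch.float8_e4m3fn)

qlinear = WeightFP8ActFP8StaticQuantLinear.from_original(config=None, original_layer=linear)
qlinear.weight.data = weight_fp8                                   # FP8 weight, as it would come from a checkpoint
qlinear.weight_scale.data = weight_scale.to(torch.bfloat16)
qlinear.input_scale.data = torch.tensor([0.05], dtype=torch.bfloat16)  # assumed static activation scale

x = torch.randn(2, in_features, dtype=torch.bfloat16)
y = qlinear(x)   # QDQ the input with input_scale, dequantize the weight, then F.linear
print(y.shape)   # torch.Size([2, 8])
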
2 changes: 1 addition & 1 deletion auto_round/export/export_to_autoround/__init__.py
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from .export import save_quantized_as_autoround
from .export import save_quantized_as_autoround, AutoRoundFormat
16 changes: 14 additions & 2 deletions auto_round/export/export_to_autoround/export.py
@@ -18,6 +18,7 @@
import json
import os
from concurrent.futures import ThreadPoolExecutor
from enum import Enum

import threadpoolctl as tctl
import torch
@@ -43,6 +44,12 @@
)


class AutoRoundFormat(str, Enum):
# Weight: FP8, per-channel, may be extended to per-tensor in future
# Activation: FP8, per-tensor
TORCH_FP8_STATIC = "fp8_static"


def dynamic_import_quant_linear_for_packing(backend, bits, group_size, sym, act_bits=16):
"""
Dynamically imports and returns the appropriate QuantLinear class based on the specified backend and parameters.
@@ -152,7 +159,7 @@ def pack_layer(layer_name, model, backend, device=None):

return pack_layer(layer_name, model, backend, device)

if backend == "auto_round:fp8":
if backend == "auto_round:fp8" or backend == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}":
from auto_round.export.export_to_autoround.export_to_fp8 import pack_layer

return pack_layer(layer_name, model, backend, device)
@@ -268,9 +275,14 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex
from auto_round.export.export_to_autoround.export_to_fp8 import save_quantized_as_autoround

return save_quantized_as_autoround(output_dir, inplace=inplace, backend="auto_round", **kwargs)
from auto_round.autoround import AutoRoundFormat

##if using sym, we change to gptq sym kernel to avoid compiling from auto_round source
if (kwargs.get("sym") is None or kwargs.get("sym")) and ("gptq" not in backend and "awq" not in backend):
if (
(kwargs.get("sym") is None or kwargs.get("sym"))
and ("gptq" not in backend and "awq" not in backend)
and (AutoRoundFormat.TORCH_FP8_STATIC.value not in backend)
):
backend = backend.replace("auto_round", "auto_round:auto_gptq")

model = kwargs["model"]
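
A quick illustration of the enum-driven backend checks introduced above; the strings are the ones added in this diff, and the snippet only mimics the two branches instead of calling the real packing code.

# Mirrors the backend checks above; does not pack anything.
from auto_round.export.export_to_autoround import AutoRoundFormat

backend = f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}"   # "auto_round:fp8_static"

# pack_layer() routes both FP8 backends to the FP8 packer:
uses_fp8_packer = backend in ("auto_round:fp8", f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}")

# save_quantized_as_autoround() no longer rewrites the backend to the GPTQ kernel for FP8-static:
sym = True
if sym and "gptq" not in backend and "awq" not in backend and AutoRoundFormat.TORCH_FP8_STATIC.value not in backend:
    backend = backend.replace("auto_round", "auto_round:auto_gptq")

print(uses_fp8_packer, backend)   # True auto_round:fp8_static
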