From bb947822170a33324c1016bde7f5ec6661a23ab7 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 11 Aug 2025 10:19:44 +0000 Subject: [PATCH 01/47] load w8a8 Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 108 +++++++++++++++++++- examples/load_w8a8.py | 136 ++++++++++++++++++++++++++ 2 files changed, 243 insertions(+), 1 deletion(-) create mode 100644 examples/load_w8a8.py diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index bd6dde836..1fff106d5 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -472,6 +472,8 @@ def infer_target_device(device_map=None): def post_init(model, used_backends): + if is_weight_fp8_activation_static_fp8(model.config.quantization_config): + return need_autogptq_init = False need_gptqmodel_init = False need_ipex_itrex_init = False @@ -526,6 +528,108 @@ def post_init(model, used_backends): logger.warning("force model to bfloat16") + +def quant_tensor_with_scale(tensor, scale): + FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max + qtensor = tensor / scale + cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) + return scale, cliped_qtensor_fp8 + + +class FP8QDQLinear(torch.nn.Module): + dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__( + self, in_features: int, out_features: int, bias: bool = True, device=None + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = nn.Parameter( + torch.empty(out_features, in_features, dtype=FP8QDQLinear.fp8_dtype), + requires_grad=True, + ) + self.weight_scale = nn.Parameter( + torch.empty((out_features, 1), dtype=FP8QDQLinear.dtype), + requires_grad=False, + ) + self.input_scale = nn.Parameter( + torch.empty((1, 1), dtype=FP8QDQLinear.dtype), requires_grad=False + ) + if bias: + self.bias = nn.Parameter(torch.empty(out_features)) + else: + self.register_parameter("bias", None) + self.pre_dequantized = False + + def dequant_weight_online(self): + if self.pre_dequantized: + return self.weight + fp8_weight = self.weight + qdq_weight = fp8_weight.to(FP8QDQLinear.dtype) * self.weight_scale + return qdq_weight + + def pre_dequantize(self): + if self.pre_dequantized: + return + dequant_weight = self.dequant_weight_online() + del self.weight + del self.weight_scale + self.weight = nn.Parameter(dequant_weight, requires_grad=False) + self.pre_dequantized = True + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = quant_tensor_with_scale( + bf16_input, self.input_scale.data + ) + qdq_input_bf16 = input_fp8.to(FP8QDQLinear.dtype) * input_scale + return qdq_input_bf16 + + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out + + @classmethod + def from_original(cls, config, original_layer): + """ + Create an FP8QDQLinear layer from an original linear layer. 
+ """ + device = original_layer.weight.device + with torch.device(device): + qdq_linear = cls( + in_features=original_layer.in_features, + out_features=original_layer.out_features, + bias=original_layer.bias is not None, + ) + return qdq_linear + + +def _patching_mod( + mod, config, src_cls, dst_cls +): + named_children_list = list(mod.named_children()) + for name, layer in named_children_list: + if isinstance(layer, src_cls): + new_layer = dst_cls.from_original(config, layer) + setattr(mod, name, new_layer) + print(f"Patched {name} with {new_layer.__class__.__name__}") + elif isinstance(layer, nn.Module): + _patching_mod(layer, config, src_cls, dst_cls) + return mod + + +def patching_model(model): + model = _patching_mod(model, None, torch.nn.Linear, FP8QDQLinear) + return model + + +def is_weight_fp8_activation_static_fp8(quant_config): + return True + def convert_hf_model(model: nn.Module, target_device="cpu"): """Converts the given model to an AutoRound model by replacing its layers with quantized layers. @@ -547,7 +651,9 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): """ quantization_config = model.config.quantization_config - + if is_weight_fp8_activation_static_fp8(quantization_config): + model = patching_model(model) + if hasattr(quantization_config, "desc_act") and quantization_config.desc_act: ##check static_group if (hasattr(quantization_config, "static_groups") and not quantization_config.static_groups) or ( diff --git a/examples/load_w8a8.py b/examples/load_w8a8.py new file mode 100644 index 000000000..df10b6c10 --- /dev/null +++ b/examples/load_w8a8.py @@ -0,0 +1,136 @@ +import os +import torch +import tqdm +from loguru import logger +import logging +import safetensors +from safetensors import safe_open +from safetensors.torch import save_file +import json + +logging.basicConfig(level=logging.DEBUG) +torch.set_grad_enabled(False) + +# CONSTANTS +SAFETENSORS = "safetensors" +WEIGHT_SCALE_NAME = "weight_scale" +INPUT_SCALE_NAME = "scale_input" +SCALE_DTYPE = torch.bfloat16 +SCALE_FILE_NAME = f"scales.{SAFETENSORS}" +FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max +WEIGHT_BACKOFF = 1.0 +QUANT_MODULE_TYPES = (torch.nn.Linear,) +SKIP_WEIGHT_LST = { + "model.norm", + "layernorm", + "e_score_correction_bias", + # "lm_head.weight", + "embed_tokens", + "mlp.gate.weight", # mlp.gate is not linear +} + +MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" + + +seed = 0 +import random + +random.seed(seed) +import torch + +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +import numpy as np + +np.random.seed(seed) + + +# torch.use_deterministic_algorithms(True) +def seed_worker(worker_id): + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + + +g = torch.Generator() +g.manual_seed(0) + + + + +def pre_dequantize(model): + """ + Pre-dequantize all FP8QDQLinear layers in the model. 
+ """ + for name, module in model.named_modules(): + if module.__class__.__name__ == "FP8QDQLinear": + logger.info(f"Pre-dequantizing {name}") + module.pre_dequantize() + else: + logger.debug(f"Skipping {name} as it is not FP8QDQLinear") + + +def qdq_eval(model_path, not_patch_lin=False): + import transformers + from transformers.modeling_utils import no_init_weights + + + model = transformers.AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + logger.info(f"Patched model: {model}") + model.eval() + model.to("cuda") + import torch + + model = torch.compile(model) + # pre_dequantize(model) + with torch.device("cuda"): + tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) + prompt = "Hi, who" + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate(encode, max_length=100) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + logger.info(f"Prompt: {prompt}") + logger.info(f"Output: {output}") + + # from auto_round.script.llm import eval_task_by_task + + # eval_task_by_task( + # model=model, + # device="cuda", + # tasks="gsm8k", + # batch_size=32, + # limit=128, + # # trust_remote_code=not args.disable_trust_remote_code, + # # eval_model_dtype=args.eval_model_dtype + # ) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('-m', "--qmodel_path", type=str, required=True) + parser.add_argument( + "--not_patch_lin", action="store_true", help="Measure float model" + ) + args = parser.parse_args() + qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) + + +""" +p load_w8a8.py --qmodel_path /data5/yliu7/HF_HOME/Qwen3-32B-w8afp8 +Running generate_until requests: 76%|███ | 97/128 [11:45<03: +Running generate_until requests: 100%|███| 128/128 [11:45<00:00, 5.51s/it] +|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| +|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| +|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7422|± |0.0388| +| | |strict-match | 5|exact_match|↑ |0.6797|± |0.0414| + +total eval time: 742.8823928833008 +""" \ No newline at end of file From 9bef8263328fe7ef152d828c1775d4aa385885cc Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 02:44:17 -0400 Subject: [PATCH 02/47] refactor Signed-off-by: yiliu30 --- .../export_to_autoround/export_to_fp8_woq.py | 89 +++++++++++++ auto_round/inference/backend.py | 20 ++- auto_round/inference/convert_model.py | 120 ++---------------- examples/load_w8a8.py | 39 +++--- 4 files changed, 141 insertions(+), 127 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 5b6a4c400..8b357e090 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -16,6 +16,7 @@ import json import os from concurrent.futures import ThreadPoolExecutor +from typing import Optional, Union import threadpoolctl as tctl import torch @@ -83,6 +84,94 @@ def __init__( self.register_buffer("input_scale", input_scale.to(dtype)) +def quant_tensor_with_scale(tensor, scale): + FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max + qtensor = tensor / scale + cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) + return scale, cliped_qtensor_fp8 + + +class 
WeightFP8ActFP8StaticQuantLinear(torch.nn.Module): + hp_dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__( + self, + in_features, + out_features, + weight: Optional[torch.Tensor] = None, + weight_scale: Optional[torch.Tensor] = None, + bias: Union[torch.Tensor, bool, None] = None, + weight_zp: Optional[torch.Tensor] = None, + input_scale: Optional[torch.Tensor] = None, + dtype=torch.bfloat16, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + init_weight = torch.empty((out_features, in_features), dtype=dtype) if weight is None else weight + self.weight = torch.nn.Parameter(init_weight, requires_grad=False) + self.dtype = dtype + if bias is not None: + if isinstance(bias, bool): + bias = torch.zeros((out_features,), dtype=dtype) + self.bias = torch.nn.Parameter(bias, requires_grad=False) + else: + self.register_parameter("bias", None) + init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale + self.register_buffer("weight_scale", init_weight_scale.to(dtype)) + + init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp + if weight_zp: + self.register_buffer("weight_zp", init_weight_zp.to(dtype)) + + init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale + self.register_buffer("input_scale", init_input_scale.to(dtype)) + self.pre_dequantized = False + + @classmethod + def from_original(cls, config, original_layer): + """ + Create an FP8WOQLinear layer from an original linear layer. + """ + device = original_layer.weight.device + with torch.device(device): + qdq_linear = cls( + in_features=original_layer.in_features, + out_features=original_layer.out_features, + bias=original_layer.bias, + ) + return qdq_linear + + def dequant_weight_online(self): + if self.pre_dequantized: + return self.weight + fp8_weight = self.weight + qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale + return qdq_weight + + def pre_dequantize(self): + if self.pre_dequantized: + return + dequant_weight = self.dequant_weight_online() + del self.weight + del self.weight_scale + self.weight = torch.nn.Parameter(dequant_weight, requires_grad=False) + self.pre_dequantized = True + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = quant_tensor_with_scale(bf16_input, self.input_scale.data) + qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale + return qdq_input_bf16 + + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out + + def pack_layer(layer_name, model, data_type, packing_device=None): """ Packs a model layer for quantization based on its type and configuration. 
diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index a4f578726..4b259db0a 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -410,7 +410,18 @@ def check_compatible( return True -def dynamic_import_inference_linear(backend, bits, group_size, sym): +def is_weight_fp8_activation_static_fp8(config): + bits, group_size, sym, data_type, act_dynamic = ( + config["bits"], + config["group_size"], + config["sym"], + config["data_type"], + config["act_dynamic"], + ) + return bits == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic + + +def dynamic_import_inference_linear(backend, config): """Dynamically imports and returns the appropriate QuantLinear class based on the given backend. This function dynamically loads the correct `QuantLinear` class based on the backend and quantization @@ -435,6 +446,13 @@ def dynamic_import_inference_linear(backend, bits, group_size, sym): ImportError: If required modules are missing for a backend (e.g., Intel Extension, GPTQ, auto_awq). """ + bits, group_size, sym = config["bits"], config["group_size"], config["sym"] + + if is_weight_fp8_activation_static_fp8(config): + from auto_round.export.export_to_autoround.export_to_fp8_woq import WeightFP8ActFP8StaticQuantLinear + + return WeightFP8ActFP8StaticQuantLinear + if "qbits" in backend: try: from intel_extension_for_transformers import qbits # pylint: disable=E0401 diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index 1fff106d5..bbca26f4f 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -27,6 +27,7 @@ find_backend, get_highest_priority_backend, get_layer_backend, + is_weight_fp8_activation_static_fp8, process_requirement, ) from auto_round.utils import ( @@ -61,7 +62,7 @@ def skip_not_convert_modules(model, quantization_config, layer_names, layer_conf try: # transformers new api modules_to_not_convert = get_modules_to_not_convert(model, modules_to_not_convert, add_default_skips=True) except: - modules_to_not_convert = get_modules_to_not_convert(model, modules_to_not_convert) + modules_to_not_convert = _get_modules_to_not_convert(model, modules_to_not_convert) if modules_to_not_convert: for layer_name in layer_names: if any([re.search(re.compile(n), layer_name) for n in modules_to_not_convert]): @@ -219,6 +220,7 @@ def get_layer_config(model, quantization_config): - group_size (int): Group size for weight quantization. - data_type (str, optional): Data type for quantization (default: "int"). - sym (bool): Whether to use symmetric quantization. + - act_dynamic (bool, optional): Whether to use dynamic activation quantization (default: False). - quant_block_list (list, optional): Predefined list of blocks to quantize. - to_quant_block_names (list or str, optional): Blocks to quantize (if quant_block_list is None). - extra_config (dict, optional): Per-layer overrides for quantization settings. @@ -231,13 +233,14 @@ def get_layer_config(model, quantization_config): - "group_size" (int): Group size for quantization. - "data_type" (str): Data type used for quantization. - "sym" (bool): Whether symmetric quantization is applied. + - "act_dynamic" (bool): Whether dynamic activation quantization is used. - "clip" (bool): Whether weight clipping is enabled. 
""" bits = quantization_config.bits group_size = quantization_config.group_size data_type = getattr(quantization_config, "data_type", "int") # Default to "int" if not specified sym = quantization_config.sym - + act_dynamic = getattr(quantization_config, "act_dynamic", False) # Determine the quantization block list quant_block_list = getattr(quantization_config, "quant_block_list", None) if quant_block_list is None: @@ -290,11 +293,11 @@ def get_layer_config(model, quantization_config): "group_size": extra_config.get(layer_name, {}).get("group_size", group_size), "data_type": extra_config.get(layer_name, {}).get("data_type", data_type), "sym": extra_config.get(layer_name, {}).get("sym", sym), + "act_dynamic": extra_config.get(layer_name, {}).get("act_dynamic", act_dynamic), "clip": extra_config.get(layer_name, {}).get("clip", False), } for layer_name in layer_names } - return layer_configs @@ -415,7 +418,7 @@ def _import_exllamav2_kernels(): def _create_quant_layer(layer, layer_backend, config, in_features, out_features): """Creates a quantized layer using the appropriate class.""" - QuantLinear = dynamic_import_inference_linear(layer_backend, config["bits"], config["group_size"], config["sym"]) + QuantLinear = dynamic_import_inference_linear(layer_backend, config) bias = layer.bias is not None # Special handling for AWQ layers @@ -437,6 +440,8 @@ def _create_quant_layer(layer, layer_backend, config, in_features, out_features) out_features=out_features, bias=bias, ) + elif is_weight_fp8_activation_static_fp8(config): + return QuantLinear.from_original(config, layer) # Default quantized layer creation try: return QuantLinear( @@ -528,108 +533,6 @@ def post_init(model, used_backends): logger.warning("force model to bfloat16") - -def quant_tensor_with_scale(tensor, scale): - FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max - qtensor = tensor / scale - cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) - cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) - return scale, cliped_qtensor_fp8 - - -class FP8QDQLinear(torch.nn.Module): - dtype = torch.bfloat16 - fp8_dtype = torch.float8_e4m3fn - - def __init__( - self, in_features: int, out_features: int, bias: bool = True, device=None - ): - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.weight = nn.Parameter( - torch.empty(out_features, in_features, dtype=FP8QDQLinear.fp8_dtype), - requires_grad=True, - ) - self.weight_scale = nn.Parameter( - torch.empty((out_features, 1), dtype=FP8QDQLinear.dtype), - requires_grad=False, - ) - self.input_scale = nn.Parameter( - torch.empty((1, 1), dtype=FP8QDQLinear.dtype), requires_grad=False - ) - if bias: - self.bias = nn.Parameter(torch.empty(out_features)) - else: - self.register_parameter("bias", None) - self.pre_dequantized = False - - def dequant_weight_online(self): - if self.pre_dequantized: - return self.weight - fp8_weight = self.weight - qdq_weight = fp8_weight.to(FP8QDQLinear.dtype) * self.weight_scale - return qdq_weight - - def pre_dequantize(self): - if self.pre_dequantized: - return - dequant_weight = self.dequant_weight_online() - del self.weight - del self.weight_scale - self.weight = nn.Parameter(dequant_weight, requires_grad=False) - self.pre_dequantized = True - - def qdq_input(self, bf16_input: torch.Tensor): - input_scale, input_fp8 = quant_tensor_with_scale( - bf16_input, self.input_scale.data - ) - qdq_input_bf16 = input_fp8.to(FP8QDQLinear.dtype) * input_scale - return qdq_input_bf16 - - def forward(self, bf16_input: 
torch.Tensor) -> torch.Tensor: - qdq_input = self.qdq_input(bf16_input) - qdq_weight = self.dequant_weight_online() - out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) - return out - - @classmethod - def from_original(cls, config, original_layer): - """ - Create an FP8QDQLinear layer from an original linear layer. - """ - device = original_layer.weight.device - with torch.device(device): - qdq_linear = cls( - in_features=original_layer.in_features, - out_features=original_layer.out_features, - bias=original_layer.bias is not None, - ) - return qdq_linear - - -def _patching_mod( - mod, config, src_cls, dst_cls -): - named_children_list = list(mod.named_children()) - for name, layer in named_children_list: - if isinstance(layer, src_cls): - new_layer = dst_cls.from_original(config, layer) - setattr(mod, name, new_layer) - print(f"Patched {name} with {new_layer.__class__.__name__}") - elif isinstance(layer, nn.Module): - _patching_mod(layer, config, src_cls, dst_cls) - return mod - - -def patching_model(model): - model = _patching_mod(model, None, torch.nn.Linear, FP8QDQLinear) - return model - - -def is_weight_fp8_activation_static_fp8(quant_config): - return True - def convert_hf_model(model: nn.Module, target_device="cpu"): """Converts the given model to an AutoRound model by replacing its layers with quantized layers. @@ -651,9 +554,7 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): """ quantization_config = model.config.quantization_config - if is_weight_fp8_activation_static_fp8(quantization_config): - model = patching_model(model) - + if hasattr(quantization_config, "desc_act") and quantization_config.desc_act: ##check static_group if (hasattr(quantization_config, "static_groups") and not quantization_config.static_groups) or ( @@ -694,7 +595,6 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): backend = backend[len("auto_round:") :] used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, orig_backend) - if backend == "auto" or backend == "": best_backend = get_highest_priority_backend( quantization_config.bits, diff --git a/examples/load_w8a8.py b/examples/load_w8a8.py index df10b6c10..ad6218f9b 100644 --- a/examples/load_w8a8.py +++ b/examples/load_w8a8.py @@ -1,12 +1,13 @@ +import json +import logging import os + +import safetensors import torch import tqdm from loguru import logger -import logging -import safetensors from safetensors import safe_open from safetensors.torch import save_file -import json logging.basicConfig(level=logging.DEBUG) torch.set_grad_enabled(False) @@ -42,13 +43,13 @@ torch.cuda.manual_seed(seed) import numpy as np -np.random.seed(seed) +np.random.Generator(seed) # torch.use_deterministic_algorithms(True) def seed_worker(worker_id): worker_seed = torch.initial_seed() % 2**32 - np.random.seed(worker_seed) + np.random.Generator(worker_seed) random.seed(worker_seed) @@ -56,8 +57,6 @@ def seed_worker(worker_id): g.manual_seed(0) - - def pre_dequantize(model): """ Pre-dequantize all FP8QDQLinear layers in the model. 
@@ -70,10 +69,15 @@ def pre_dequantize(model): logger.debug(f"Skipping {name} as it is not FP8QDQLinear") +import torch + + +@torch.no_grad() def qdq_eval(model_path, not_patch_lin=False): + import transformers - from transformers.modeling_utils import no_init_weights + # from transformers.modeling_utils import no_init_weights model = transformers.AutoModelForCausalLM.from_pretrained( model_path, @@ -86,14 +90,19 @@ def qdq_eval(model_path, not_patch_lin=False): model.to("cuda") import torch - model = torch.compile(model) - # pre_dequantize(model) with torch.device("cuda"): + from transformers import GenerationConfig + + gen_config = GenerationConfig(use_cache=True, cache_implementation="static") tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) prompt = "Hi, who" encode = tokenizer.encode(prompt, return_tensors="pt") with torch.no_grad(): - output_tokens = model.generate(encode, max_length=100) + output_tokens = model.generate( + encode, + max_length=10, + # generation_config=gen_config + ) output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) logger.info(f"Prompt: {prompt}") logger.info(f"Output: {output}") @@ -115,10 +124,8 @@ def qdq_eval(model_path, not_patch_lin=False): import argparse parser = argparse.ArgumentParser() - parser.add_argument('-m', "--qmodel_path", type=str, required=True) - parser.add_argument( - "--not_patch_lin", action="store_true", help="Measure float model" - ) + parser.add_argument("-m", "--qmodel_path", type=str, required=True) + parser.add_argument("--not_patch_lin", action="store_true", help="Measure float model") args = parser.parse_args() qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) @@ -133,4 +140,4 @@ def qdq_eval(model_path, not_patch_lin=False): | | |strict-match | 5|exact_match|↑ |0.6797|± |0.0414| total eval time: 742.8823928833008 -""" \ No newline at end of file +""" From b30a126fed56bd07473d2bba53d1dcbe9ed9bd7b Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 03:01:56 -0400 Subject: [PATCH 03/47] add ut Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 2 -- test/test_cpu/test_export.py | 28 ++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index bbca26f4f..bd8b4621d 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -477,8 +477,6 @@ def infer_target_device(device_map=None): def post_init(model, used_backends): - if is_weight_fp8_activation_static_fp8(model.config.quantization_config): - return need_autogptq_init = False need_gptqmodel_init = False need_ipex_itrex_init = False diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index bbce4036b..367d20c5d 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -199,7 +199,7 @@ def test_autoround_3bit_sym_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_static_afp8_export(self): + def test_static_afp8_export_and_load(self): import os from safetensors import safe_open @@ -226,6 +226,32 @@ def test_static_afp8_export(self): self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1, 1])) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, 
torch.float8_e4m3fn) + with torch.no_grad(): + import transformers + + model = transformers.AutoModelForCausalLM.from_pretrained( + quantized_model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + model.eval() + assert ( + model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__ == "WeightFP8ActFP8StaticQuantLinear" + ), f"Expected WeightFP8ActFP8StaticQuantLinear, got {model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__}" + tokenizer = transformers.AutoTokenizer.from_pretrained(quantized_model_path) + prompt = "AI is " + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate( + encode, + max_length=10, + ) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + print(f"Prompt: {prompt}") + print(f"Output: {output}") + assert output is not None, "Output should not be None" + shutil.rmtree(quantized_model_path, ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) From eaad3a6e150d8830c96460b333ed557c04e165ae Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 03:02:38 -0400 Subject: [PATCH 04/47] remove example Signed-off-by: yiliu30 --- examples/load_w8a8.py | 143 ------------------------------------------ 1 file changed, 143 deletions(-) delete mode 100644 examples/load_w8a8.py diff --git a/examples/load_w8a8.py b/examples/load_w8a8.py deleted file mode 100644 index ad6218f9b..000000000 --- a/examples/load_w8a8.py +++ /dev/null @@ -1,143 +0,0 @@ -import json -import logging -import os - -import safetensors -import torch -import tqdm -from loguru import logger -from safetensors import safe_open -from safetensors.torch import save_file - -logging.basicConfig(level=logging.DEBUG) -torch.set_grad_enabled(False) - -# CONSTANTS -SAFETENSORS = "safetensors" -WEIGHT_SCALE_NAME = "weight_scale" -INPUT_SCALE_NAME = "scale_input" -SCALE_DTYPE = torch.bfloat16 -SCALE_FILE_NAME = f"scales.{SAFETENSORS}" -FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max -WEIGHT_BACKOFF = 1.0 -QUANT_MODULE_TYPES = (torch.nn.Linear,) -SKIP_WEIGHT_LST = { - "model.norm", - "layernorm", - "e_score_correction_bias", - # "lm_head.weight", - "embed_tokens", - "mlp.gate.weight", # mlp.gate is not linear -} - -MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" - - -seed = 0 -import random - -random.seed(seed) -import torch - -torch.manual_seed(seed) -torch.cuda.manual_seed(seed) -import numpy as np - -np.random.Generator(seed) - - -# torch.use_deterministic_algorithms(True) -def seed_worker(worker_id): - worker_seed = torch.initial_seed() % 2**32 - np.random.Generator(worker_seed) - random.seed(worker_seed) - - -g = torch.Generator() -g.manual_seed(0) - - -def pre_dequantize(model): - """ - Pre-dequantize all FP8QDQLinear layers in the model. 
- """ - for name, module in model.named_modules(): - if module.__class__.__name__ == "FP8QDQLinear": - logger.info(f"Pre-dequantizing {name}") - module.pre_dequantize() - else: - logger.debug(f"Skipping {name} as it is not FP8QDQLinear") - - -import torch - - -@torch.no_grad() -def qdq_eval(model_path, not_patch_lin=False): - - import transformers - - # from transformers.modeling_utils import no_init_weights - - model = transformers.AutoModelForCausalLM.from_pretrained( - model_path, - torch_dtype="auto", - low_cpu_mem_usage=True, - trust_remote_code=True, - ) - logger.info(f"Patched model: {model}") - model.eval() - model.to("cuda") - import torch - - with torch.device("cuda"): - from transformers import GenerationConfig - - gen_config = GenerationConfig(use_cache=True, cache_implementation="static") - tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) - prompt = "Hi, who" - encode = tokenizer.encode(prompt, return_tensors="pt") - with torch.no_grad(): - output_tokens = model.generate( - encode, - max_length=10, - # generation_config=gen_config - ) - output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) - logger.info(f"Prompt: {prompt}") - logger.info(f"Output: {output}") - - # from auto_round.script.llm import eval_task_by_task - - # eval_task_by_task( - # model=model, - # device="cuda", - # tasks="gsm8k", - # batch_size=32, - # limit=128, - # # trust_remote_code=not args.disable_trust_remote_code, - # # eval_model_dtype=args.eval_model_dtype - # ) - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument("-m", "--qmodel_path", type=str, required=True) - parser.add_argument("--not_patch_lin", action="store_true", help="Measure float model") - args = parser.parse_args() - qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) - - -""" -p load_w8a8.py --qmodel_path /data5/yliu7/HF_HOME/Qwen3-32B-w8afp8 -Running generate_until requests: 76%|███ | 97/128 [11:45<03: -Running generate_until requests: 100%|███| 128/128 [11:45<00:00, 5.51s/it] -|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| -|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| -|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7422|± |0.0388| -| | |strict-match | 5|exact_match|↑ |0.6797|± |0.0414| - -total eval time: 742.8823928833008 -""" From c411ca5f86fdc2f84a5fa301ceab34d98ddf2bcb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 03:04:26 -0400 Subject: [PATCH 05/47] fix typo Signed-off-by: yiliu30 --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 8b357e090..1b2d7c222 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -133,7 +133,7 @@ def __init__( @classmethod def from_original(cls, config, original_layer): """ - Create an FP8WOQLinear layer from an original linear layer. + Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. 
""" device = original_layer.weight.device with torch.device(device): @@ -165,6 +165,7 @@ def qdq_input(self, bf16_input: torch.Tensor): qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale return qdq_input_bf16 + @torch.no_grad() def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: qdq_input = self.qdq_input(bf16_input) qdq_weight = self.dequant_weight_online() From 6597d5ca36d084848f76cde2a972bc684f888d4c Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 13 Aug 2025 08:39:45 +0800 Subject: [PATCH 06/47] Update auto_round/export/export_to_autoround/export_to_fp8_woq.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 1b2d7c222..09af9e270 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -89,7 +89,9 @@ def quant_tensor_with_scale(tensor, scale): qtensor = tensor / scale cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) - return scale, cliped_qtensor_fp8 + clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) + return scale, clipped_qtensor_fp8 class WeightFP8ActFP8StaticQuantLinear(torch.nn.Module): From 9b0f32ffdd0cb4aac2c36922588c8cdd56296346 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 13 Aug 2025 08:40:45 +0800 Subject: [PATCH 07/47] Update export_to_fp8_woq.py --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 09af9e270..4d2b924d1 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -87,8 +87,6 @@ def __init__( def quant_tensor_with_scale(tensor, scale): FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max qtensor = tensor / scale - cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) - cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) return scale, clipped_qtensor_fp8 From 5ebca24b6ee300f4205ae3798c5568ac419cf134 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 24 Aug 2025 05:00:23 -0400 Subject: [PATCH 08/47] update shape Signed-off-by: yiliu30 --- .../export/export_to_autoround/export_to_fp8_woq.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 7bcfb8011..e7b473593 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -94,7 +94,7 @@ def __init__( super().__init__() self.in_features = in_features self.out_features = out_features - init_weight = torch.empty((out_features, in_features), dtype=dtype) if weight is None else weight + init_weight = torch.zeros((out_features, in_features), dtype=dtype) if weight is None else weight self.weight = torch.nn.Parameter(init_weight, requires_grad=False) self.dtype = dtype if bias 
is not None: @@ -103,14 +103,14 @@ def __init__( self.bias = torch.nn.Parameter(bias, requires_grad=False) else: self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale + init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale self.register_buffer("weight_scale", init_weight_scale.to(dtype)) init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp if weight_zp: self.register_buffer("weight_zp", init_weight_zp.to(dtype)) - init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale + init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False @@ -132,7 +132,7 @@ def dequant_weight_online(self): if self.pre_dequantized: return self.weight fp8_weight = self.weight - qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale + qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) return qdq_weight def pre_dequantize(self): From 03cb21711a34b22fc002ebca399d9a58b7d07ec9 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 02:54:16 -0400 Subject: [PATCH 09/47] refactor Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 28 +++++ .../experimental/qmodules/fp8_static.py | 108 ++++++++++++++++++ .../export_to_autoround/export_to_fp8_woq.py | 89 --------------- 3 files changed, 136 insertions(+), 89 deletions(-) create mode 100644 auto_round/experimental/qmodules/base.py create mode 100644 auto_round/experimental/qmodules/fp8_static.py diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py new file mode 100644 index 000000000..860e66836 --- /dev/null +++ b/auto_round/experimental/qmodules/base.py @@ -0,0 +1,28 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +from typing import Optional, Union + +import torch + + +class QModuleBase(torch.nn.Module): + def __init__(self): + super().__init__() + + @classmethod + @abstractmethod + def from_original(cls, config, original_layer): + raise NotImplementedError diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py new file mode 100644 index 000000000..8d58480d3 --- /dev/null +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -0,0 +1,108 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import torch + +from auto_round.experimental.qmodules.base import QModuleBase + + +def _quant_tensor_to_fp8_with_scale(tensor: torch.Tensor, scale: torch.Tensor): + FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max + qtensor = tensor / scale + clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) + return scale, clipped_qtensor_fp8 + + +class WeightFP8ActFP8StaticQuantLinear(QModuleBase): + hp_dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__( + self, + in_features, + out_features, + weight: Optional[torch.Tensor] = None, + weight_scale: Optional[torch.Tensor] = None, + bias: Union[torch.Tensor, bool, None] = None, + weight_zp: Optional[torch.Tensor] = None, + input_scale: Optional[torch.Tensor] = None, + dtype=torch.bfloat16, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + init_weight = torch.zeros((out_features, in_features), dtype=dtype) if weight is None else weight + self.weight = torch.nn.Parameter(init_weight, requires_grad=False) + self.dtype = dtype + if bias is not None: + if isinstance(bias, bool): + bias = torch.zeros((out_features,), dtype=dtype) + self.bias = torch.nn.Parameter(bias, requires_grad=False) + else: + self.register_parameter("bias", None) + init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale + self.register_buffer("weight_scale", init_weight_scale.to(dtype)) + + init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp + if weight_zp: + self.register_buffer("weight_zp", init_weight_zp.to(dtype)) + + init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale + self.register_buffer("input_scale", init_input_scale.to(dtype)) + self.pre_dequantized = False + + @classmethod + def from_original(cls, config, original_layer): + """ + Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. 
+ """ + device = original_layer.weight.device + with torch.device(device): + qdq_linear = cls( + in_features=original_layer.in_features, + out_features=original_layer.out_features, + bias=original_layer.bias, + ) + return qdq_linear + + def dequant_weight_online(self): + if self.pre_dequantized: + return self.weight + fp8_weight = self.weight + qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) + return qdq_weight + + def pre_dequantize(self): + if self.pre_dequantized: + return + dequant_weight = self.dequant_weight_online() + del self.weight + del self.weight_scale + self.weight = torch.nn.Parameter(dequant_weight, requires_grad=False) + self.pre_dequantized = True + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = _quant_tensor_to_fp8_with_scale(bf16_input, self.input_scale.data) + qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale + return qdq_input_bf16 + + @torch.no_grad() + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index e7b473593..214e5046e 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -68,95 +68,6 @@ def __init__( self.register_buffer("input_scale", input_scale.to(dtype)) -def quant_tensor_with_scale(tensor, scale): - FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max - qtensor = tensor / scale - clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) - clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) - return scale, clipped_qtensor_fp8 - - -class WeightFP8ActFP8StaticQuantLinear(torch.nn.Module): - hp_dtype = torch.bfloat16 - fp8_dtype = torch.float8_e4m3fn - - def __init__( - self, - in_features, - out_features, - weight: Optional[torch.Tensor] = None, - weight_scale: Optional[torch.Tensor] = None, - bias: Union[torch.Tensor, bool, None] = None, - weight_zp: Optional[torch.Tensor] = None, - input_scale: Optional[torch.Tensor] = None, - dtype=torch.bfloat16, - ): - super().__init__() - self.in_features = in_features - self.out_features = out_features - init_weight = torch.zeros((out_features, in_features), dtype=dtype) if weight is None else weight - self.weight = torch.nn.Parameter(init_weight, requires_grad=False) - self.dtype = dtype - if bias is not None: - if isinstance(bias, bool): - bias = torch.zeros((out_features,), dtype=dtype) - self.bias = torch.nn.Parameter(bias, requires_grad=False) - else: - self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale - self.register_buffer("weight_scale", init_weight_scale.to(dtype)) - - init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp - if weight_zp: - self.register_buffer("weight_zp", init_weight_zp.to(dtype)) - - init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale - self.register_buffer("input_scale", init_input_scale.to(dtype)) - self.pre_dequantized = False - - @classmethod - def from_original(cls, config, original_layer): - """ - Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. 
- """ - device = original_layer.weight.device - with torch.device(device): - qdq_linear = cls( - in_features=original_layer.in_features, - out_features=original_layer.out_features, - bias=original_layer.bias, - ) - return qdq_linear - - def dequant_weight_online(self): - if self.pre_dequantized: - return self.weight - fp8_weight = self.weight - qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) - return qdq_weight - - def pre_dequantize(self): - if self.pre_dequantized: - return - dequant_weight = self.dequant_weight_online() - del self.weight - del self.weight_scale - self.weight = torch.nn.Parameter(dequant_weight, requires_grad=False) - self.pre_dequantized = True - - def qdq_input(self, bf16_input: torch.Tensor): - input_scale, input_fp8 = quant_tensor_with_scale(bf16_input, self.input_scale.data) - qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale - return qdq_input_bf16 - - @torch.no_grad() - def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: - qdq_input = self.qdq_input(bf16_input) - qdq_weight = self.dequant_weight_online() - out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) - return out - - def pack_layer(layer_name, model, data_type, packing_device=None): """ Packs a model layer for quantization based on its type and configuration. From 66388e5360173de4e4b6340a4e075bdd1749c46c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 05:23:48 -0400 Subject: [PATCH 10/47] tmp add bk Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 12 ++++++++++++ auto_round/inference/convert_model.py | 3 +-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 4e3f42861..867d9f398 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -172,6 +172,17 @@ def feature_multiply_checker_group_size( requirements=["auto-round>=0.5.1"], ) +BackendInfos["auto_round:torch_fp8_static"] = BackendInfo( + device=["cuda", "cpu"], + packing_format="", + sym=[True], + bits=[8], + priority=0, + feature_checks=[], + alias=["auto_round", "torch"], + requirements=["auto-round>=0.6.1"], +) + BackendInfos["auto_round:tritonv2_zp"] = BackendInfo( device=["cuda", "xpu"], sym=[True], ## asym has accuracys @@ -732,6 +743,7 @@ def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_f If no compatible backend is found for the given layer configuration. """ # Check if the provided backend is in BackendInfos + # breakpoint() backend = find_backend(backend) if backend not in BackendInfos.keys(): raise ValueError(f"Unsupported backend '{backend}'. 
Please set it to 'auto' to enable automatic selection.") diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index bd8b4621d..fbdfb8804 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -566,7 +566,6 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): backend = quantization_config.backend else: backend = "auto" - ##target_backend could be None _, backend = parse_target_device_and_backend(backend) @@ -591,7 +590,7 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): if backend.startswith("auto_round:") and ("gptq" in packing_format or "awq" in packing_format): backend = backend[len("auto_round:") :] - + # breakpoint() used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, orig_backend) if backend == "auto" or backend == "": best_backend = get_highest_priority_backend( From 17ddd2d0d22d42a990a1dafcc47d47f14e45f0a5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:00:54 -0400 Subject: [PATCH 11/47] refactor code Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 7 ++++--- auto_round/inference/convert_model.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 867d9f398..3e4c8a7f2 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -176,11 +176,12 @@ def feature_multiply_checker_group_size( device=["cuda", "cpu"], packing_format="", sym=[True], + dtype=["float32", "float16", "bfloat16"], bits=[8], priority=0, feature_checks=[], alias=["auto_round", "torch"], - requirements=["auto-round>=0.6.1"], + requirements=["auto-round>=0.6.1.dev0"], ) BackendInfos["auto_round:tritonv2_zp"] = BackendInfo( @@ -463,7 +464,7 @@ def dynamic_import_inference_linear(backend, config): bits, group_size, sym = config["bits"], config["group_size"], config["sym"] if is_weight_fp8_activation_static_fp8(config): - from auto_round.export.export_to_autoround.export_to_fp8_woq import WeightFP8ActFP8StaticQuantLinear + from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear return WeightFP8ActFP8StaticQuantLinear @@ -743,7 +744,6 @@ def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_f If no compatible backend is found for the given layer configuration. """ # Check if the provided backend is in BackendInfos - # breakpoint() backend = find_backend(backend) if backend not in BackendInfos.keys(): raise ValueError(f"Unsupported backend '{backend}'. 
Please set it to 'auto' to enable automatic selection.") @@ -855,6 +855,7 @@ def build_pip_commands(gptq_req, other_reqs): # Instructional messages install_instructions = [] + for cmd in pip_cmds: if "intel-extension-for-pytorch" in cmd and target_device == "xpu": install_instructions.append( diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index fbdfb8804..df8b52c07 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -590,7 +590,7 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): if backend.startswith("auto_round:") and ("gptq" in packing_format or "awq" in packing_format): backend = backend[len("auto_round:") :] - # breakpoint() + used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, orig_backend) if backend == "auto" or backend == "": best_backend = get_highest_priority_backend( From 808449d71e0d004298c183d76a417a3df83f3528 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:12:52 -0400 Subject: [PATCH 12/47] refine code Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 24 +++++++++++++++++++ .../experimental/qmodules/fp8_static.py | 12 ++++++++++ auto_round/inference/backend.py | 4 ++++ 3 files changed, 40 insertions(+) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index 860e66836..affc7552d 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -19,6 +19,14 @@ class QModuleBase(torch.nn.Module): + """ + Abstract class used to describe the weight creation and forward pass + of different quantization schemes supported by Auto-Round. + The design is inspired by vLLM's CompressedTensorsScheme: + https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py + + """ + def __init__(self): super().__init__() @@ -26,3 +34,19 @@ def __init__(self): @abstractmethod def from_original(cls, config, original_layer): raise NotImplementedError + + @classmethod + @abstractmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. + """ + raise NotImplementedError + + @abstractmethod + def process_weights_after_loading(self, layer: torch.nn.Module): + """ + Called after weight loading is complete for any cleanup that + needs to occur. + """ + raise NotImplementedError diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 8d58480d3..3774da810 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from abc import abstractmethod from typing import Optional, Union import torch @@ -106,3 +107,14 @@ def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: qdq_weight = self.dequant_weight_online() out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) return out + + @classmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. 
+ """ + # FIXME: set to 0 for now, as fp8 kernels are not available yet + return 0 + + def process_weights_after_loading(self, layer: torch.nn.Module): + pass diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 3e4c8a7f2..0ca0d4726 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -172,6 +172,10 @@ def feature_multiply_checker_group_size( requirements=["auto-round>=0.5.1"], ) +# FP8 static quant +# Weight: FP8, per-channel, may be extended to per-tensor in future +# Activation: FP8, per-tensor + BackendInfos["auto_round:torch_fp8_static"] = BackendInfo( device=["cuda", "cpu"], packing_format="", From f74ed6f6ffd7c40b55ce2886a9882f55b5f96bce Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:17:49 -0400 Subject: [PATCH 13/47] fix device list Signed-off-by: yiliu30 --- .../experimental/qmodules/fp8_static.py | 22 +++++++++---------- auto_round/inference/backend.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 3774da810..074cf34e7 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -66,6 +66,17 @@ def __init__( self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False + @classmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. + """ + # FIXME: set to 0 for now, as fp8 kernels are not available yet + return 0 + + def process_weights_after_loading(self, layer: torch.nn.Module): + pass + @classmethod def from_original(cls, config, original_layer): """ @@ -107,14 +118,3 @@ def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: qdq_weight = self.dequant_weight_online() out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) return out - - @classmethod - def get_min_capability(cls) -> int: - """ - Get minimum device capability. 
- """ - # FIXME: set to 0 for now, as fp8 kernels are not available yet - return 0 - - def process_weights_after_loading(self, layer: torch.nn.Module): - pass diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 0ca0d4726..f74f22b75 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -177,7 +177,7 @@ def feature_multiply_checker_group_size( # Activation: FP8, per-tensor BackendInfos["auto_round:torch_fp8_static"] = BackendInfo( - device=["cuda", "cpu"], + device=["xpu", "cuda", "cpu"], packing_format="", sym=[True], dtype=["float32", "float16", "bfloat16"], From 632cf8a91046608bb26afedf63c81e0920a3d822 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:25:13 -0400 Subject: [PATCH 14/47] fix Signed-off-by: yiliu30 --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 9dbbca5ab..b8a32896f 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -16,7 +16,6 @@ import json import os from concurrent.futures import ThreadPoolExecutor -from typing import Optional, Union import threadpoolctl as tctl import torch From 5b8b29d4a2e315b9656eb90c8b3948015bcb4a20 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:14:04 -0400 Subject: [PATCH 15/47] refactor code Signed-off-by: yiliu30 --- auto_round/autoround.py | 19 +++++++++++--- .../export/export_to_autoround/export.py | 8 +++++- auto_round/inference/backend.py | 13 +--------- auto_round/utils.py | 26 +++++++++++++++++++ 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index fed33df34..85ea75e60 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -19,6 +19,7 @@ import sys import time import traceback +from enum import Enum from typing import Any, Union import accelerate @@ -74,6 +75,7 @@ is_optimum_habana_available, is_standard_fp, is_static_afp8, + is_torch_fp8_static, llm_load_model, logger, mv_module_from_gpu, @@ -87,6 +89,12 @@ from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block +class AutoRoundFormat(str, Enum): + # Weight: FP8, per-channel, may be extended to per-tensor in future + # Activation: FP8, per-tensor + TORCH_FP8_STATIC = "torch_fp8_static" + + class AutoRound(object): """Automatic weight rounding (Signed Gradient Descent) for LLM quantization @@ -663,9 +671,14 @@ def _parse_format_to_list(self, format: str) -> list: ) if enable_awq: formats[index] = format.replace("auto_round", "auto_round:auto_awq") - if is_nv_fp(self.data_type) or is_mx_fp(self.data_type) or is_standard_fp(self.data_type): + if is_nv_fp(self.data_type) or is_mx_fp(self.data_type): format = format.replace("auto_round", f"auto_round:{self.data_type}") formats[index] = format + if is_torch_fp8_static(self): + format = format.replace("auto_round", f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}") + formats[index] = format + # if is_torch_fp8_static(self): + # formats[index] = "auto_round:torch_fp8_static" elif format == "llmcompressor": from auto_round.export.export_to_llmcompressor import check_compressed_tensors_supported @@ -731,10 +744,10 @@ def _check_supported_format(self, format: str) -> bool: ) format = "fake" else: - if not (format == "auto_round" or format == 
"auto_round:fp8"): + if not (format == "auto_round" or format == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}"): logger.warning( f"Currently only support to export auto_round or fake format for static W{self.bits}AFP8 model," - " change format to auto_round" + f" change format {format} to auto_round" ) format = "auto_round" if self.act_group_size != 0 and not self.act_dynamic and format == "auto_round:fp8": diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 1640528b6..38b815eb1 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -263,6 +263,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex Raises: ValueError: If the backend is not supported. """ + # breakpoint() data_type = kwargs.get("data_type", None) if is_nv_fp(data_type) or is_mx_fp(data_type): ## detect nvfp & mxfp first from auto_round.export.export_to_autoround.export_to_fp import save_quantized_as_fp @@ -273,9 +274,14 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex from auto_round.export.export_to_autoround.export_to_fp8_woq import save_quantized_as_autoround return save_quantized_as_autoround(output_dir, inplace=inplace, backend="auto_round", **kwargs) + from auto_round.autoround import AutoRoundFormat ##if using sym, we change to gptq sym kernel to avoid compiling from auto_round source - if (kwargs.get("sym") is None or kwargs.get("sym")) and ("gptq" not in backend and "awq" not in backend): + if ( + (kwargs.get("sym") is None or kwargs.get("sym")) + and ("gptq" not in backend and "awq" not in backend) + and (AutoRoundFormat.TORCH_FP8_STATIC.value not in backend) + ): backend = backend.replace("auto_round", "auto_round:auto_gptq") model = kwargs["model"] diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index f74f22b75..739ff4e89 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -19,7 +19,7 @@ from transformers.utils.versions import require_version import auto_round_extension.cuda.gptqmodel_marlin -from auto_round.utils import get_library_version, logger +from auto_round.utils import get_library_version, is_weight_fp8_activation_static_fp8, logger BackendInfos = {} @@ -429,17 +429,6 @@ def check_compatible( return True -def is_weight_fp8_activation_static_fp8(config): - bits, group_size, sym, data_type, act_dynamic = ( - config["bits"], - config["group_size"], - config["sym"], - config["data_type"], - config["act_dynamic"], - ) - return bits == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic - - def dynamic_import_inference_linear(backend, config): """Dynamically imports and returns the appropriate QuantLinear class based on the given backend. 
diff --git a/auto_round/utils.py b/auto_round/utils.py index 74999c624..c13556827 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2516,3 +2516,29 @@ def is_nv_fp(backend): def is_static_afp8(ar): return not ar.act_dynamic and "fp8" in ar.act_data_type + + +def _is_weight_fp8_activation_static_fp8(bit, group_size, sym, data_type, act_dynamic): + return bit == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic + + +def is_weight_fp8_activation_static_fp8(config): + bits, group_size, sym, data_type, act_dynamic = ( + config["bits"], + config["group_size"], + config["sym"], + config["data_type"], + config["act_dynamic"], + ) + return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) + + +def is_torch_fp8_static(ar): + bits, group_size, sym, data_type, act_dynamic = ( + ar.bits, + ar.group_size, + ar.sym, + ar.data_type, + ar.act_dynamic, + ) + return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) From 57b4c19913c442434144e8ba50df1dfb6f5ba7df Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:18:02 -0400 Subject: [PATCH 16/47] fix Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 6 +++--- auto_round/export/export_to_autoround/export.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index affc7552d..c069f5151 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from abc import abstractmethod +from abc import ABC, abstractmethod from typing import Optional, Union import torch -class QModuleBase(torch.nn.Module): +class QModuleBase(ABC): """ Abstract class used to describe the weight creation and forward pass of different quantization schemes supported by Auto-Round. @@ -32,7 +32,7 @@ def __init__(self): @classmethod @abstractmethod - def from_original(cls, config, original_layer): + def from_original(cls, config, original_layer: torch.nn.Module): raise NotImplementedError @classmethod diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 38b815eb1..48a59f5e5 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -263,7 +263,6 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex Raises: ValueError: If the backend is not supported. 
""" - # breakpoint() data_type = kwargs.get("data_type", None) if is_nv_fp(data_type) or is_mx_fp(data_type): ## detect nvfp & mxfp first from auto_round.export.export_to_autoround.export_to_fp import save_quantized_as_fp From bdf5f3e554da100b337f327257fa2308b90811f5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:19:06 -0400 Subject: [PATCH 17/47] update Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 2 ++ auto_round/experimental/qmodules/fp8_static.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index c069f5151..2a74a470d 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -17,6 +17,8 @@ import torch +__all__ = ["QModuleBase"] + class QModuleBase(ABC): """ diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 074cf34e7..b5c7d2dd2 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -19,6 +19,8 @@ from auto_round.experimental.qmodules.base import QModuleBase +__all__ = ["WeightFP8ActFP8StaticQuantLinear"] + def _quant_tensor_to_fp8_with_scale(tensor: torch.Tensor, scale: torch.Tensor): FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max From ce3384f33ec861f00e4c704f032dc99b907c8536 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:26:05 -0400 Subject: [PATCH 18/47] fix ut Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 4 +- test/test_cpu/test_export.py | 48 ++++++++++++------------ 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index 2a74a470d..8b7a9c138 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -20,9 +20,9 @@ __all__ = ["QModuleBase"] -class QModuleBase(ABC): +class QModuleBase(torch.nn.Module): """ - Abstract class used to describe the weight creation and forward pass + Base class used to describe the weight creation and forward pass of different quantization schemes supported by Auto-Round. 
The design is inspired by vLLM's CompressedTensorsScheme: https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 24498c780..d648fd721 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -230,31 +230,33 @@ def test_static_afp8_export(self, static_kv_dtype): self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) - with torch.no_grad(): - import transformers - - model = transformers.AutoModelForCausalLM.from_pretrained( - quantized_model_path, - torch_dtype="auto", - low_cpu_mem_usage=True, - trust_remote_code=True, - ) - model.eval() - assert ( - model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__ == "WeightFP8ActFP8StaticQuantLinear" - ), f"Expected WeightFP8ActFP8StaticQuantLinear, got {model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__}" - tokenizer = transformers.AutoTokenizer.from_pretrained(quantized_model_path) - prompt = "AI is " - encode = tokenizer.encode(prompt, return_tensors="pt") + if static_kv_dtype is None: with torch.no_grad(): - output_tokens = model.generate( - encode, - max_length=10, + import transformers + + model = transformers.AutoModelForCausalLM.from_pretrained( + quantized_model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, ) - output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) - print(f"Prompt: {prompt}") - print(f"Output: {output}") - assert output is not None, "Output should not be None" + model.eval() + assert ( + model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__ + == "WeightFP8ActFP8StaticQuantLinear" + ), f"Expected WeightFP8ActFP8StaticQuantLinear, got {model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__}" + tokenizer = transformers.AutoTokenizer.from_pretrained(quantized_model_path) + prompt = "AI is " + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate( + encode, + max_length=10, + ) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + print(f"Prompt: {prompt}") + print(f"Output: {output}") + assert output is not None, "Output should not be None" if static_kv_dtype == "fp8": self.assertIn("model.decoder.layers.8.self_attn.k_scale", f.keys()) From 22d11de19ce77a04b29f28c5c19e6639a7130298 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 01:04:39 -0400 Subject: [PATCH 19/47] correct Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/fp8_static.py | 13 +++++-------- auto_round/utils.py | 14 +++++++++++--- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index b5c7d2dd2..90ee09357 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -18,6 +18,7 @@ import torch from auto_round.experimental.qmodules.base import QModuleBase +from auto_round.utils import logger __all__ = ["WeightFP8ActFP8StaticQuantLinear"] @@ -41,7 +42,6 @@ def __init__( weight: Optional[torch.Tensor] = None, weight_scale: Optional[torch.Tensor] = None, bias: Union[torch.Tensor, bool, 
None] = None, - weight_zp: Optional[torch.Tensor] = None, input_scale: Optional[torch.Tensor] = None, dtype=torch.bfloat16, ): @@ -57,14 +57,10 @@ def __init__( self.bias = torch.nn.Parameter(bias, requires_grad=False) else: self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale + init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale self.register_buffer("weight_scale", init_weight_scale.to(dtype)) - init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp - if weight_zp: - self.register_buffer("weight_zp", init_weight_zp.to(dtype)) - - init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale + init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False @@ -73,7 +69,8 @@ def get_min_capability(cls) -> int: """ Get minimum device capability. """ - # FIXME: set to 0 for now, as fp8 kernels are not available yet + # TODO: correct that config once we add fp8 op support. + logger.warning_once("FP8 ops are not yet supported. Using capability 0.") return 0 def process_weights_after_loading(self, layer: torch.nn.Module): diff --git a/auto_round/utils.py b/auto_round/utils.py index c13556827..2fd78f7a0 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -108,9 +108,17 @@ def infer_bits_by_data_type(data_type: str): return None -@lru_cache(None) -def warning_once(self, msg: str): - self.warning(msg) +@lru_cache(maxsize=None) +def warning_once(self, msg, *args, **kwargs): + """ + Log a warning message only once per unique message/arguments combination. 
+ + Args: + msg: The warning message format string + *args: Variable positional arguments for message formatting + **kwargs: Variable keyword arguments for message formatting and logging options + """ + self.warning(msg, *args, **kwargs) class AutoRoundFormatter(logging.Formatter): From 90826139a8ddfb53a983ad2e87b2ef978fcbe3fb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 01:05:28 -0400 Subject: [PATCH 20/47] clean Signed-off-by: yiliu30 --- auto_round/autoround.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 85ea75e60..2af8df95e 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -677,8 +677,7 @@ def _parse_format_to_list(self, format: str) -> list: if is_torch_fp8_static(self): format = format.replace("auto_round", f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}") formats[index] = format - # if is_torch_fp8_static(self): - # formats[index] = "auto_round:torch_fp8_static" + elif format == "llmcompressor": from auto_round.export.export_to_llmcompressor import check_compressed_tensors_supported From 2202856fabc8abe2f8ad7a964899450621fbd598 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 03:34:11 -0400 Subject: [PATCH 21/47] fix shape Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/fp8_static.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 90ee09357..a6798f53d 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -57,10 +57,10 @@ def __init__( self.bias = torch.nn.Parameter(bias, requires_grad=False) else: self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale + init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale self.register_buffer("weight_scale", init_weight_scale.to(dtype)) - init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale + init_input_scale = torch.zeros((1), dtype=dtype) if input_scale is None else input_scale self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False From d0b99a8f1c493d8484e10871b3a533705c8f1401 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 20:59:33 -0400 Subject: [PATCH 22/47] fix check Signed-off-by: yiliu30 --- auto_round/autoround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 934486c5a..6ef3884a9 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -687,7 +687,7 @@ def _parse_format_to_list(self, format: str) -> list: format = "auto_round:auto_awq" elif is_nv_fp(self.data_type) or is_mx_fp(self.data_type): format = f"auto_round:{self.data_type}" - elif is_wfp8afp8(self): # staic wfp8afp8 + elif is_static_wfp8afp8(self): # staic wfp8afp8 format = f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}" elif self.data_type == "fp" and self.bits == 8 and self.act_bits >= 16: # woq fp8 format = "auto_round:fp8" From 31845d0d025db8b24e4676192a5b998c56188c8e Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 21:02:34 -0400 Subject: [PATCH 23/47] clean code Signed-off-by: yiliu30 --- auto_round/autoround.py | 2 -- auto_round/utils.py | 15 --------------- 2 files changed, 17 deletions(-) diff --git 
a/auto_round/autoround.py b/auto_round/autoround.py index 6ef3884a9..49e3984a7 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -73,9 +73,7 @@ is_nv_fp, is_optimum_habana_available, is_standard_fp, - is_static_afp8, is_static_wfp8afp8, - is_torch_fp8_static, is_wfp8afp8, llm_load_model, logger, diff --git a/auto_round/utils.py b/auto_round/utils.py index 9886a5337..21363688b 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2527,10 +2527,6 @@ def is_nv_fp(backend): return BackendDataType.NV_FP in backend -def is_static_afp8(ar): - return not ar.act_dynamic and "fp8" in ar.act_data_type - - def _is_weight_fp8_activation_static_fp8(bit, group_size, sym, data_type, act_dynamic): return bit == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic @@ -2546,17 +2542,6 @@ def is_weight_fp8_activation_static_fp8(config): return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) -def is_torch_fp8_static(ar): - bits, group_size, sym, data_type, act_dynamic = ( - ar.bits, - ar.group_size, - ar.sym, - ar.data_type, - ar.act_dynamic, - ) - return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) - - def is_wfp8afp8(ar): if ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8)) and ( "fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8) From 1f2e6749d230e0948d6e2177d6ae1f48de93abc6 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 01:00:33 -0400 Subject: [PATCH 24/47] fix backend check Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/fp8_static.py | 4 ++-- auto_round/export/export_to_autoround/export.py | 4 +++- auto_round/utils.py | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index a6798f53d..a61148d82 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -93,8 +93,7 @@ def from_original(cls, config, original_layer): def dequant_weight_online(self): if self.pre_dequantized: return self.weight - fp8_weight = self.weight - qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) + qdq_weight = self.weight.to(self.dtype) * self.weight_scale.unsqueeze(1) return qdq_weight def pre_dequantize(self): @@ -113,6 +112,7 @@ def qdq_input(self, bf16_input: torch.Tensor): @torch.no_grad() def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) qdq_weight = self.dequant_weight_online() out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index ee2d61587..2f0552c1b 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -151,8 +151,10 @@ def pack_layer(layer_name, model, backend): from auto_round.export.export_to_autoround.export_to_nvfp_mxfp import pack_layer return pack_layer(layer_name, model, backend) + # breakpoint() + from auto_round.autoround import AutoRoundFormat - if backend == "auto_round:fp8": + if backend == "auto_round:fp8" or backend == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}": from auto_round.export.export_to_autoround.export_to_fp8 import pack_layer return pack_layer(layer_name, model, backend) diff --git a/auto_round/utils.py b/auto_round/utils.py index 09dcb92b5..f8c878b43 100644 --- 
a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2554,8 +2554,8 @@ def is_nv_fp(backend): return BackendDataType.NV_FP in backend -def _is_weight_fp8_activation_static_fp8(bit, group_size, sym, data_type, act_dynamic): - return bit == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic +def _is_weight_fp8_activation_static_fp8(bit: int, group_size: int, sym: bool, data_type: str, act_dynamic: bool): + return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic def is_weight_fp8_activation_static_fp8(config): From 4cec318ffa1476f74822db40a58eeba7e4951e67 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 02:01:25 -0400 Subject: [PATCH 25/47] update config Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 37 +++++++++++++++++++-------- auto_round/schemes.py | 9 +++++-- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index df8b52c07..8b569a654 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -30,6 +30,7 @@ is_weight_fp8_activation_static_fp8, process_requirement, ) +from auto_round.schemes import QuantizationScheme from auto_round.utils import ( SUPPORTED_LAYER_TYPES, check_start_with_block_name, @@ -240,7 +241,23 @@ def get_layer_config(model, quantization_config): group_size = quantization_config.group_size data_type = getattr(quantization_config, "data_type", "int") # Default to "int" if not specified sym = quantization_config.sym + + act_bits = getattr(quantization_config, "act_bits", None) + act_group_size = getattr(quantization_config, "act_group_size", False) + act_sym = getattr(quantization_config, "act_sym", None) + act_data_type = getattr(quantization_config, "act_data_type", None) act_dynamic = getattr(quantization_config, "act_dynamic", False) + + default_quant_scheme = QuantizationScheme( + bits=bits, + group_size=group_size, + data_type=data_type, + sym=sym, + act_bits=act_bits, + act_group_size=act_group_size, + act_sym=act_sym, + ) + # Determine the quantization block list quant_block_list = getattr(quantization_config, "quant_block_list", None) if quant_block_list is None: @@ -287,17 +304,15 @@ def get_layer_config(model, quantization_config): layer_names = list(set(layer_names).union(extra_config.keys())) # Construct final layer configuration - layer_configs = { - layer_name: { - "bits": extra_config.get(layer_name, {}).get("bits", bits), - "group_size": extra_config.get(layer_name, {}).get("group_size", group_size), - "data_type": extra_config.get(layer_name, {}).get("data_type", data_type), - "sym": extra_config.get(layer_name, {}).get("sym", sym), - "act_dynamic": extra_config.get(layer_name, {}).get("act_dynamic", act_dynamic), - "clip": extra_config.get(layer_name, {}).get("clip", False), - } - for layer_name in layer_names - } + layer_configs = {} + quant_scheme_attrs = QuantizationScheme.get_attributes() + for layer_name in layer_names: + layer_config = {} + layer_extra_config = extra_config.get(layer_name, {}) + for scheme_attr in quant_scheme_attrs: + layer_config[scheme_attr] = layer_extra_config.get(scheme_attr, getattr(default_quant_scheme, scheme_attr)) + layer_configs[layer_name] = layer_config + return layer_configs diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 496af179c..2908cd73b 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -13,8 +13,8 @@ # limitations under the License. 
import copy from copy import deepcopy -from dataclasses import dataclass -from typing import Optional +from dataclasses import dataclass, fields +from typing import List, Optional __all__ = ["QuantizationScheme", "preset_name_to_scheme"] @@ -32,11 +32,16 @@ class QuantizationScheme: act_dynamic: Optional[bool] = None super_bits: Optional[int] = None super_group_size: Optional[int] = None + clip: Optional[bool] = False @classmethod def from_dict(cls, config: dict): return cls(**config) + @classmethod + def get_attributes(cls: "QuantizationScheme") -> List[str]: + return [field.name for field in fields(cls)] + def preset_name_to_scheme(name: str) -> QuantizationScheme: """Get a QuantizationScheme instance from a preset scheme name.""" From 6b2962fde6472470f9ef8be845008567c16bd095 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 02:03:51 -0400 Subject: [PATCH 26/47] revert change Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index 8b569a654..4aec6338d 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -221,7 +221,6 @@ def get_layer_config(model, quantization_config): - group_size (int): Group size for weight quantization. - data_type (str, optional): Data type for quantization (default: "int"). - sym (bool): Whether to use symmetric quantization. - - act_dynamic (bool, optional): Whether to use dynamic activation quantization (default: False). - quant_block_list (list, optional): Predefined list of blocks to quantize. - to_quant_block_names (list or str, optional): Blocks to quantize (if quant_block_list is None). - extra_config (dict, optional): Per-layer overrides for quantization settings. @@ -234,7 +233,6 @@ def get_layer_config(model, quantization_config): - "group_size" (int): Group size for quantization. - "data_type" (str): Data type used for quantization. - "sym" (bool): Whether symmetric quantization is applied. - - "act_dynamic" (bool): Whether dynamic activation quantization is used. - "clip" (bool): Whether weight clipping is enabled. 
""" bits = quantization_config.bits From 638718e7ae859ae47430b9a10f4a050e96efafa8 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 02:06:36 -0400 Subject: [PATCH 27/47] fix Signed-off-by: yiliu30 --- auto_round/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/utils.py b/auto_round/utils.py index f8c878b43..b5514ac38 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -24,7 +24,7 @@ from collections import UserDict from enum import Enum from functools import lru_cache -from typing import Any, Callable, Tuple, Union +from typing import Any, Callable, Dict, Tuple, Union import cpuinfo import torch @@ -2558,7 +2558,7 @@ def _is_weight_fp8_activation_static_fp8(bit: int, group_size: int, sym: bool, d return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic -def is_weight_fp8_activation_static_fp8(config): +def is_weight_fp8_activation_static_fp8(config: Dict): bits, group_size, sym, data_type, act_dynamic = ( config["bits"], config["group_size"], From 4df3e8f754b745b9fa4498ef01f954d064b21f62 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 02:25:09 -0400 Subject: [PATCH 28/47] fix Signed-off-by: yiliu30 --- auto_round/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/auto_round/utils.py b/auto_round/utils.py index b5514ac38..5410609cf 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2554,7 +2554,9 @@ def is_nv_fp(backend): return BackendDataType.NV_FP in backend -def _is_weight_fp8_activation_static_fp8(bit: int, group_size: int, sym: bool, data_type: str, act_dynamic: bool): +def _is_weight_fp8_activation_static_fp8( + bit: int, group_size: int, sym: bool, data_type: str, act_dynamic: bool +) -> bool: return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic From e01603ce2e6b8673d229ab039cf7ac4af3b1d690 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 02:40:35 -0400 Subject: [PATCH 29/47] update Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 00f0ce648..bacdac8df 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -185,7 +185,7 @@ def feature_multiply_checker_group_size( priority=0, feature_checks=[], alias=["auto_round", "torch"], - requirements=["auto-round>=0.6.1.dev0"], + requirements=["auto-round>0.6.0"], ) BackendInfos["auto_round:tritonv2_zp"] = BackendInfo( From 0cdf28b1507c7023dd17e178b445501bacd44434 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 03:41:02 -0400 Subject: [PATCH 30/47] propagate the config Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 20 +++++++++----------- auto_round/inference/convert_model.py | 4 +--- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index bacdac8df..77bb64784 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -356,9 +356,7 @@ def feature_multiply_checker_group_size( ) -def check_compatible( - backend_name, device, bits, group_size, sym, packing_format, in_features, out_features, check_requirements=True -): +def check_compatible(backend_name, device, config, packing_format, in_features, out_features, check_requirements=True): """Checks if the given configuration is compatible with the specified backend. 
Args: @@ -388,7 +386,7 @@ def check_compatible( - If the packing format does not match, it must be convertible. """ backend = BackendInfos[backend_name] - + bits, group_size, sym = config["bits"], config["group_size"], config["sym"] # Check if device is supported by the backend if device not in backend.device: return False @@ -685,7 +683,7 @@ def find_backend(target_backend: str, orig_backend: str = None): ) -def get_all_compatible_backend(device, backend, orig_backend, bits, group_size, sym, in_features, out_features): +def get_all_compatible_backend(device, backend, orig_backend, config, in_features, out_features): # Get packing format from the original backend packing_format = BackendInfos[orig_backend].packing_format @@ -693,16 +691,14 @@ def get_all_compatible_backend(device, backend, orig_backend, bits, group_size, compatible_backends = [ key for key in BackendInfos.keys() - if check_compatible( - key, device, bits, group_size, sym, packing_format, in_features, out_features, check_requirements=False - ) + if check_compatible(key, device, config, packing_format, in_features, out_features, check_requirements=False) ] # Return the first compatible backend or an empty list if none found return compatible_backends -def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_features, out_features): +def get_layer_backend(device, backend, orig_backend, config, in_features, out_features): """Selects the most suitable backend for the layer based on compatibility and priority. This function first checks if the specified backend supports the layer with the provided configuration. @@ -736,8 +732,10 @@ def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_f If the specified backend is not supported. If no compatible backend is found for the given layer configuration. """ + bits, group_size, sym = config["bits"], config["group_size"], config["sym"] # Check if the provided backend is in BackendInfos backend = find_backend(backend) + if backend not in BackendInfos.keys(): raise ValueError(f"Unsupported backend '{backend}'. 
Please set it to 'auto' to enable automatic selection.") @@ -746,13 +744,13 @@ def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_f # Find and store other compatible backends supported_backends = [] for key in BackendInfos.keys(): - if check_compatible(key, device, bits, group_size, sym, packing_format, in_features, out_features): + if check_compatible(key, device, config, packing_format, in_features, out_features): supported_backends.append(key) # Raise an error if no compatible backends are found if len(supported_backends) == 0: supported_backends_need_package = get_all_compatible_backend( - device, backend, orig_backend, bits, group_size, sym, in_features, out_features + device, backend, orig_backend, config, in_features, out_features ) if len(supported_backends_need_package) > 0: diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index 4aec6338d..73d1385e9 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -407,9 +407,7 @@ def _get_layer_backend(target_device, target_backend, orig_backend, config, in_f target_device, target_backend, orig_backend, - config["bits"], - config["group_size"], - config["sym"], + config, in_features, out_features, ) From 27910da00fa5c0be064ad8c54de8b81bbb369d5f Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 03:48:06 -0400 Subject: [PATCH 31/47] pass config to checker Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 77bb64784..ca07e1972 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -73,15 +73,16 @@ class BackendInfo: requirements: Optional[List[str]] = None -def feature_multiply_checker(in_feature, out_feature, group_size, in_feature_multiplier, out_feature_multiplier=None): +def feature_multiply_checker(in_feature, out_feature, config, in_feature_multiplier, out_feature_multiplier=None): if out_feature_multiplier is None: out_feature_multiplier = in_feature_multiplier return in_feature % in_feature_multiplier == 0 and out_feature % out_feature_multiplier == 0 def feature_multiply_checker_group_size( - in_feature, out_feature, group_size, in_feature_multiplier, out_feature_multiplier=None + in_feature, out_feature, config, in_feature_multiplier, out_feature_multiplier=None ): + group_size = config["group_size"] if out_feature_multiplier is None: out_feature_multiplier = in_feature_multiplier return ( @@ -410,7 +411,7 @@ def check_compatible(backend_name, device, config, packing_format, in_features, return False for check in backend.feature_checks: - if not check(in_features, out_features, group_size): + if not check(in_features, out_features, config): return False if check_requirements and backend.requirements is not None: From d46acdb2b9ac61255f97a14cb1fd41dcc0356c45 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 04:10:24 -0400 Subject: [PATCH 32/47] add more check Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 19 ++++++++++++++++-- auto_round/inference/convert_model.py | 2 +- auto_round/schemes.py | 29 ++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index ca07e1972..0f822d7c2 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -19,6 +19,7 @@ from 
transformers.utils.versions import require_version import auto_round_extension.cuda.gptqmodel_marlin +from auto_round.schemes import QuantizationScheme from auto_round.utils import get_library_version, is_weight_fp8_activation_static_fp8, logger BackendInfos = {} @@ -105,6 +106,21 @@ def feature_multiply_checker_group_size( feature_multiply_checker_group_size, in_feature_multiplier=1, out_feature_multiplier=64 ) + +def torch_fp8_static_check( + in_feature: int, + out_feature: int, + config: QuantizationScheme, + in_feature_multiplier: Optional[int] = None, + out_feature_multiplier: Optional[int] = None, +): + if not is_weight_fp8_activation_static_fp8(config): + return False + from auto_round.schemes import FPW8_STATIC + + return config == FPW8_STATIC + + BackendInfos["auto_gptq:exllamav2"] = BackendInfo( device=["cuda"], sym=[True, False], @@ -184,7 +200,7 @@ def feature_multiply_checker_group_size( dtype=["float32", "float16", "bfloat16"], bits=[8], priority=0, - feature_checks=[], + feature_checks=[torch_fp8_static_check], alias=["auto_round", "torch"], requirements=["auto-round>0.6.0"], ) @@ -733,7 +749,6 @@ def get_layer_backend(device, backend, orig_backend, config, in_features, out_fe If the specified backend is not supported. If no compatible backend is found for the given layer configuration. """ - bits, group_size, sym = config["bits"], config["group_size"], config["sym"] # Check if the provided backend is in BackendInfos backend = find_backend(backend) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index 73d1385e9..e50bf71e8 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -309,7 +309,7 @@ def get_layer_config(model, quantization_config): layer_extra_config = extra_config.get(layer_name, {}) for scheme_attr in quant_scheme_attrs: layer_config[scheme_attr] = layer_extra_config.get(scheme_attr, getattr(default_quant_scheme, scheme_attr)) - layer_configs[layer_name] = layer_config + layer_configs[layer_name] = QuantizationScheme.from_dict(layer_config) return layer_configs diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 2908cd73b..c7221a1fd 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -14,7 +14,7 @@ import copy from copy import deepcopy from dataclasses import dataclass, fields -from typing import List, Optional +from typing import Generator, List, Optional __all__ = ["QuantizationScheme", "preset_name_to_scheme"] @@ -42,6 +42,33 @@ def from_dict(cls, config: dict): def get_attributes(cls: "QuantizationScheme") -> List[str]: return [field.name for field in fields(cls)] + def __getitem__(self, key: str): + if key not in self.get_attributes(): + raise KeyError(f"{key} is not a valid attribute") + return getattr(self, key) + + def __setitem__(self, key: str, value: None | int | str): + if key not in self.get_attributes(): + raise KeyError(f"{key} is not a valid attribute") + setattr(self, key, value) + + def items(self): + return ((field, getattr(self, field)) for field in self.get_attributes()) + + def keys(self): + return self.get_attributes() + + def values(self): + return (getattr(self, field) for field in self.get_attributes()) + + def __eq__(self, other: "QuantizationScheme") -> bool: + if not isinstance(other, QuantizationScheme): + return False + for field in self.get_attributes(): + if getattr(self, field) != getattr(other, field): + return False + return True + def preset_name_to_scheme(name: str) -> QuantizationScheme: """Get a 
QuantizationScheme instance from a preset scheme name.""" From fd05799340e863813a3c4c75857fd99ec1b0eeb2 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 05:10:03 -0400 Subject: [PATCH 33/47] refine code Signed-off-by: yiliu30 --- auto_round/export/export_to_autoround/export.py | 3 +-- auto_round/inference/backend.py | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 2f0552c1b..89c9032d6 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -25,6 +25,7 @@ import transformers from tqdm import tqdm +from auto_round.autoround import AutoRoundFormat from auto_round.utils import ( SUPPORTED_FORMATS, SUPPORTED_LAYER_TYPES, @@ -151,8 +152,6 @@ def pack_layer(layer_name, model, backend): from auto_round.export.export_to_autoround.export_to_nvfp_mxfp import pack_layer return pack_layer(layer_name, model, backend) - # breakpoint() - from auto_round.autoround import AutoRoundFormat if backend == "auto_round:fp8" or backend == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}": from auto_round.export.export_to_autoround.export_to_fp8 import pack_layer diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 0f822d7c2..cb521bf82 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -114,8 +114,6 @@ def torch_fp8_static_check( in_feature_multiplier: Optional[int] = None, out_feature_multiplier: Optional[int] = None, ): - if not is_weight_fp8_activation_static_fp8(config): - return False from auto_round.schemes import FPW8_STATIC return config == FPW8_STATIC From 3d75c276ad234d6d15f61afb5c08be3ff0303c97 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 06:24:09 -0400 Subject: [PATCH 34/47] fix equal check Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 3 ++- auto_round/schemes.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index e50bf71e8..c2d4a6b6a 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -254,6 +254,8 @@ def get_layer_config(model, quantization_config): act_bits=act_bits, act_group_size=act_group_size, act_sym=act_sym, + act_data_type=act_data_type, + act_dynamic=act_dynamic, ) # Determine the quantization block list @@ -310,7 +312,6 @@ def get_layer_config(model, quantization_config): for scheme_attr in quant_scheme_attrs: layer_config[scheme_attr] = layer_extra_config.get(scheme_attr, getattr(default_quant_scheme, scheme_attr)) layer_configs[layer_name] = QuantizationScheme.from_dict(layer_config) - return layer_configs diff --git a/auto_round/schemes.py b/auto_round/schemes.py index c7221a1fd..a07cc220a 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -183,6 +183,7 @@ def is_preset_scheme(name: str) -> bool: "act_group_size": 0, "act_data_type": "fp", "act_dynamic": False, + "act_sym": True, } ) From e0c0d58e5d197c085368ff485cd787d95442551a Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 06:24:09 -0400 Subject: [PATCH 35/47] fix equal check Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 3 ++- auto_round/schemes.py | 1 + auto_round/utils.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index e50bf71e8..c2d4a6b6a 
100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -254,6 +254,8 @@ def get_layer_config(model, quantization_config): act_bits=act_bits, act_group_size=act_group_size, act_sym=act_sym, + act_data_type=act_data_type, + act_dynamic=act_dynamic, ) # Determine the quantization block list @@ -310,7 +312,6 @@ def get_layer_config(model, quantization_config): for scheme_attr in quant_scheme_attrs: layer_config[scheme_attr] = layer_extra_config.get(scheme_attr, getattr(default_quant_scheme, scheme_attr)) layer_configs[layer_name] = QuantizationScheme.from_dict(layer_config) - return layer_configs diff --git a/auto_round/schemes.py b/auto_round/schemes.py index c7221a1fd..a07cc220a 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -183,6 +183,7 @@ def is_preset_scheme(name: str) -> bool: "act_group_size": 0, "act_data_type": "fp", "act_dynamic": False, + "act_sym": True, } ) diff --git a/auto_round/utils.py b/auto_round/utils.py index 5410609cf..4c42ec334 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -33,6 +33,7 @@ from torch.amp import autocast from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType +from auto_round.schemes import QuantizationScheme SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") @@ -524,7 +525,7 @@ def check_to_quantized(config): bool: True if the configuration is valid for quantization (bits <= 8), False otherwise. """ - if isinstance(config, dict): + if isinstance(config, (dict, QuantizationScheme)): bits = int(config.get("bits", 16)) act_bits = int(config.get("act_bits", 16)) elif hasattr(config, "orig_layer"): From fa3ec2dd5048da6522e7a0dffe05716c9f6a5eba Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 08:55:43 -0400 Subject: [PATCH 36/47] fix get Signed-off-by: yiliu30 --- auto_round/schemes.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/auto_round/schemes.py b/auto_round/schemes.py index a07cc220a..007828290 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -61,6 +61,15 @@ def keys(self): def values(self): return (getattr(self, field) for field in self.get_attributes()) + def get(self, key: str, default=None): + if key not in self.get_attributes(): + return default + res = getattr(self, key) + # In case the attribute is explicitly set to None, return default + if res is None: + return default + return getattr(self, key) + def __eq__(self, other: "QuantizationScheme") -> bool: if not isinstance(other, QuantizationScheme): return False From ad5269e011310dd3ad28c10d373161124bca0cee Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 22:11:48 -0400 Subject: [PATCH 37/47] rename Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index cb521bf82..0f224d740 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -98,16 +98,16 @@ def feature_multiply_checker_group_size( feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=32 ) -exllamav2_feature_check = functools.partial( +exllamav2_feature_checker = functools.partial( feature_multiply_checker_group_size, in_feature_multiplier=32, out_feature_multiplier=32 ) -gptqmodel_marlin_feature_check = functools.partial( +gptqmodel_marlin_feature_checker = functools.partial( 
feature_multiply_checker_group_size, in_feature_multiplier=1, out_feature_multiplier=64 ) -def torch_fp8_static_check( +def torch_fp8_static_checker( in_feature: int, out_feature: int, config: QuantizationScheme, @@ -128,7 +128,7 @@ def torch_fp8_static_check( dtype=["float16"], ##16, 384,768 accuracy issue group_size=[-1, 32, 64, 128, 256, 512, 1024, 2048], - feature_checks=[exllamav2_feature_check], + feature_checks=[exllamav2_feature_checker], alias=["gptq", "auto_gptq", "exllamav2", "gptq:exllamav2", "auto_gptq:exllamav2"], requirements=["torch<2.6.0", "auto-gptq>=0.7.1"], ) @@ -141,7 +141,7 @@ def torch_fp8_static_check( group_size=None, dtype=["float16"], priority=0, - feature_checks=[exllamav2_feature_check], + feature_checks=[exllamav2_feature_checker], alias=["auto_gptq:tritonv2"], requirements=["torch<2.6.0", "auto-gptq>=0.7.1", "triton>=2.0"], ) @@ -153,7 +153,7 @@ def torch_fp8_static_check( bits=[2, 3, 4, 8], group_size=None, priority=1, - feature_checks=[exllamav2_feature_check], + feature_checks=[exllamav2_feature_checker], alias=["auto_gptq:cuda"], dtype=["float16"], convertable_format=["int32_zp"], @@ -182,7 +182,7 @@ def torch_fp8_static_check( dtype=["float16", "bfloat16"], bits=[2, 3, 4, 8], priority=0, - feature_checks=[exllamav2_feature_check], + feature_checks=[exllamav2_feature_checker], alias=["auto_round", "torch"], requirements=["auto-round>=0.5.1"], ) @@ -198,7 +198,7 @@ def torch_fp8_static_check( dtype=["float32", "float16", "bfloat16"], bits=[8], priority=0, - feature_checks=[torch_fp8_static_check], + feature_checks=[torch_fp8_static_checker], alias=["auto_round", "torch"], requirements=["auto-round>0.6.0"], ) @@ -223,7 +223,7 @@ def torch_fp8_static_check( dtype=["float16", "bfloat16"], bits=[2, 3, 4, 8], priority=0, - feature_checks=[exllamav2_feature_check], + feature_checks=[exllamav2_feature_checker], alias=["torch", "torch_zp"], requirements=["auto-round>=0.5.1"], ) @@ -236,7 +236,7 @@ def torch_fp8_static_check( group_size=[-1, 32, 64, 128], dtype=["float16", "bfloat16"], priority=6, - feature_checks=[gptqmodel_marlin_feature_check], + feature_checks=[gptqmodel_marlin_feature_checker], alias=["marlin", "gptqmodel"], requirements=["gptqmodel>=2.0"], ) @@ -249,7 +249,7 @@ def torch_fp8_static_check( group_size=[-1, 32, 64, 128], dtype=["float16", "bfloat16"], priority=6, - feature_checks=[gptqmodel_marlin_feature_check], + feature_checks=[gptqmodel_marlin_feature_checker], alias=["marlin", "gptqmodel"], requirements=["gptqmodel>=2.0"], ) @@ -262,7 +262,7 @@ def torch_fp8_static_check( group_size=[-1, 32, 64, 128], ##16 seems has accuracy issue dtype=["float16", "bfloat16"], priority=5, - feature_checks=[exllamav2_feature_check], + feature_checks=[exllamav2_feature_checker], alias=["exllamav2"], requirements=["gptqmodel>=2.0"], ) From 35e45ed0c314682d9530ea30488519a781915f04 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 22:22:34 -0400 Subject: [PATCH 38/47] update check Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 6 ++++-- auto_round/inference/convert_model.py | 4 ++-- auto_round/utils.py | 11 ----------- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 0f224d740..cb054aa72 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -19,8 +19,9 @@ from transformers.utils.versions import require_version import auto_round_extension.cuda.gptqmodel_marlin +from auto_round.autoround import AutoRoundFormat from 
auto_round.schemes import QuantizationScheme -from auto_round.utils import get_library_version, is_weight_fp8_activation_static_fp8, logger +from auto_round.utils import get_library_version, logger BackendInfos = {} @@ -469,7 +470,8 @@ def dynamic_import_inference_linear(backend, config): """ bits, group_size, sym = config["bits"], config["group_size"], config["sym"] - if is_weight_fp8_activation_static_fp8(config): + if AutoRoundFormat.TORCH_FP8_STATIC.value in backend: + logger.warning_once("FP8 static quantization is still experimental.") from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear return WeightFP8ActFP8StaticQuantLinear diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index c2d4a6b6a..46b0bf2ab 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -21,13 +21,13 @@ from tqdm import tqdm from transformers.pytorch_utils import Conv1D +from auto_round.autoround import AutoRoundFormat from auto_round.inference.backend import ( BackendInfos, dynamic_import_inference_linear, find_backend, get_highest_priority_backend, get_layer_backend, - is_weight_fp8_activation_static_fp8, process_requirement, ) from auto_round.schemes import QuantizationScheme @@ -452,7 +452,7 @@ def _create_quant_layer(layer, layer_backend, config, in_features, out_features) out_features=out_features, bias=bias, ) - elif is_weight_fp8_activation_static_fp8(config): + elif AutoRoundFormat.TORCH_FP8_STATIC.value in layer_backend: return QuantLinear.from_original(config, layer) # Default quantized layer creation try: diff --git a/auto_round/utils.py b/auto_round/utils.py index 4c42ec334..077127fe8 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2561,17 +2561,6 @@ def _is_weight_fp8_activation_static_fp8( return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic -def is_weight_fp8_activation_static_fp8(config: Dict): - bits, group_size, sym, data_type, act_dynamic = ( - config["bits"], - config["group_size"], - config["sym"], - config["data_type"], - config["act_dynamic"], - ) - return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) - - def is_wfp8afp8(ar): if ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8)) and ( "fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8) From f4e254ba7d064dd9893c08078ef9074e0f192715 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 22:27:23 -0400 Subject: [PATCH 39/47] add warning Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/fp8_static.py | 7 +++++-- auto_round/inference/backend.py | 1 - 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index a61148d82..e7c55086d 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from abc import abstractmethod + from typing import Optional, Union import torch @@ -79,8 +79,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module): @classmethod def from_original(cls, config, original_layer): """ - Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. + Create an `WeightFP8ActFP8StaticQuantLinear` layer from an original linear layer. 
""" + logger.warning_once( + "FP8 static quantization is still in experimental stage, the inference speed might be slow." + ) device = original_layer.weight.device with torch.device(device): qdq_linear = cls( diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index cb054aa72..0006b1061 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -471,7 +471,6 @@ def dynamic_import_inference_linear(backend, config): bits, group_size, sym = config["bits"], config["group_size"], config["sym"] if AutoRoundFormat.TORCH_FP8_STATIC.value in backend: - logger.warning_once("FP8 static quantization is still experimental.") from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear return WeightFP8ActFP8StaticQuantLinear From ff5a1e99b1950146fbdff33bb8a080fa29b0dbe3 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 5 Sep 2025 03:22:02 -0400 Subject: [PATCH 40/47] rename check Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 38 ++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 0006b1061..3b14da330 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -55,7 +55,7 @@ class BackendInfo: indicate higher priority. Defaults to 0. convertable_format: A list of strings specifying the formats that the backend can convert from. Defaults to an empty list. - feature_checks: A list of feature check functions (e.g., validation methods) + check_list: A list of feature check functions (e.g., validation methods) used to verify whether the backend supports certain features. Defaults to an empty list. alias: An optional list of strings representing alternative names for the @@ -70,7 +70,7 @@ class BackendInfo: group_size: Optional[List[int]] = None priority: int = 0 ##higher is better convertable_format: List[str] = field(default_factory=list) - feature_checks: List[Any] = field(default_factory=list) + check_list: List[Any] = field(default_factory=list) alias: Optional[List[str]] = None requirements: Optional[List[str]] = None @@ -129,7 +129,7 @@ def torch_fp8_static_checker( dtype=["float16"], ##16, 384,768 accuracy issue group_size=[-1, 32, 64, 128, 256, 512, 1024, 2048], - feature_checks=[exllamav2_feature_checker], + check_list=[exllamav2_feature_checker], alias=["gptq", "auto_gptq", "exllamav2", "gptq:exllamav2", "auto_gptq:exllamav2"], requirements=["torch<2.6.0", "auto-gptq>=0.7.1"], ) @@ -142,7 +142,7 @@ def torch_fp8_static_checker( group_size=None, dtype=["float16"], priority=0, - feature_checks=[exllamav2_feature_checker], + check_list=[exllamav2_feature_checker], alias=["auto_gptq:tritonv2"], requirements=["torch<2.6.0", "auto-gptq>=0.7.1", "triton>=2.0"], ) @@ -154,7 +154,7 @@ def torch_fp8_static_checker( bits=[2, 3, 4, 8], group_size=None, priority=1, - feature_checks=[exllamav2_feature_checker], + check_list=[exllamav2_feature_checker], alias=["auto_gptq:cuda"], dtype=["float16"], convertable_format=["int32_zp"], @@ -171,7 +171,7 @@ def torch_fp8_static_checker( dtype=["float16", "bfloat16"], bits=[2, 4, 8], priority=2, - feature_checks=[feature_multiply_checker_32], + check_list=[feature_multiply_checker_32], alias=["auto_round", "tritonv2", "triton"], requirements=["triton>=2.0", "auto-round>=0.5.0"], ) @@ -183,7 +183,7 @@ def torch_fp8_static_checker( dtype=["float16", "bfloat16"], bits=[2, 3, 4, 8], priority=0, - feature_checks=[exllamav2_feature_checker], + 
+    check_list=[exllamav2_feature_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>=0.5.1"],
 )
@@ -199,7 +199,7 @@ def torch_fp8_static_checker(
     dtype=["float32", "float16", "bfloat16"],
     bits=[8],
     priority=0,
-    feature_checks=[torch_fp8_static_checker],
+    check_list=[torch_fp8_static_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.6.0"],
 )
@@ -212,7 +212,7 @@ def torch_fp8_static_checker(
     dtype=["float16", "bfloat16"],
     bits=[2, 4, 8],
     priority=2,
-    feature_checks=[feature_multiply_checker_32],
+    check_list=[feature_multiply_checker_32],
     alias=["tritonv2", "tritonv2_zp", "triton"],
     requirements=["triton>=2.0", "auto-round>=0.5.0"],
 )
@@ -224,7 +224,7 @@ def torch_fp8_static_checker(
     dtype=["float16", "bfloat16"],
     bits=[2, 3, 4, 8],
     priority=0,
-    feature_checks=[exllamav2_feature_checker],
+    check_list=[exllamav2_feature_checker],
     alias=["torch", "torch_zp"],
     requirements=["auto-round>=0.5.1"],
 )
@@ -237,7 +237,7 @@ def torch_fp8_static_checker(
     group_size=[-1, 32, 64, 128],
     dtype=["float16", "bfloat16"],
     priority=6,
-    feature_checks=[gptqmodel_marlin_feature_checker],
+    check_list=[gptqmodel_marlin_feature_checker],
     alias=["marlin", "gptqmodel"],
     requirements=["gptqmodel>=2.0"],
 )
@@ -250,7 +250,7 @@ def torch_fp8_static_checker(
     group_size=[-1, 32, 64, 128],
     dtype=["float16", "bfloat16"],
     priority=6,
-    feature_checks=[gptqmodel_marlin_feature_checker],
+    check_list=[gptqmodel_marlin_feature_checker],
     alias=["marlin", "gptqmodel"],
     requirements=["gptqmodel>=2.0"],
 )
@@ -263,7 +263,7 @@ def torch_fp8_static_checker(
     group_size=[-1, 32, 64, 128],  ##16 seems has accuracy issue
     dtype=["float16", "bfloat16"],
     priority=5,
-    feature_checks=[exllamav2_feature_checker],
+    check_list=[exllamav2_feature_checker],
     alias=["exllamav2"],
     requirements=["gptqmodel>=2.0"],
 )
@@ -287,7 +287,7 @@ def torch_fp8_static_checker(
     bits=[2, 4, 8],
     group_size=None,
     priority=1,
-    feature_checks=[],
+    check_list=[],
     alias=["itrex", "qbits"],
     dtype=["float16", "bfloat16"],
     convertable_format=["int32"],
@@ -302,7 +302,7 @@ def torch_fp8_static_checker(
     group_size=None,
     dtype=["float16", "bfloat16"],
     priority=1,
-    feature_checks=[],
+    check_list=[],
     alias=["itrex", "qbits"],
     convertable_format=["int32_zp"],
     requirements=["torch<2.7.0", "intel-extension-for-transformers"],
@@ -316,7 +316,7 @@ def torch_fp8_static_checker(
     group_size=None,
     dtype=["float16", "bfloat16"],
     priority=1,
-    feature_checks=[],
+    check_list=[],
     alias=["itrex", "qbits"],
     requirements=["torch<2.7.0", "intel-extension-for-transformers"],
 )
@@ -328,7 +328,7 @@ def torch_fp8_static_checker(
     bits=[4],
     group_size=None,
     priority=5,
-    feature_checks=[],
+    check_list=[],
     dtype=["float16", "bfloat16"],
     convertable_format=["int32_zp"],
     alias=["ipex"],
@@ -343,7 +343,7 @@ def torch_fp8_static_checker(
     group_size=None,
     priority=1,
     dtype=["float16", "bfloat16"],
-    feature_checks=[],
+    check_list=[],
     alias=["ipex"],
     convertable_format=["awq"],
     requirements=["intel-extension-for-pytorch>=2.6"],
@@ -425,7 +425,7 @@ def check_compatible(backend_name, device, config, packing_format, in_features,
     else:
         return False
 
-    for check in backend.feature_checks:
+    for check in backend.check_list:
         if not check(in_features, out_features, config):
             return False
 

From 50968fd0acea6e2db1057e8db22478a9a24f8fcc Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 5 Sep 2025 03:36:15 -0400
Subject: [PATCH 41/47] rename

Signed-off-by: yiliu30
---
 auto_round/inference/backend.py | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index 3b14da330..9d3aa58c5 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -55,7 +55,7 @@ class BackendInfo:
             indicate higher priority. Defaults to 0.
         convertable_format: A list of strings specifying the formats that the backend
             can convert from. Defaults to an empty list.
-        check_list: A list of feature check functions (e.g., validation methods)
+        checkers: A list of check functions (e.g., validation methods)
            used to verify whether the backend supports certain features.
            Defaults to an empty list.
         alias: An optional list of strings representing alternative names for the
@@ -70,7 +70,7 @@ class BackendInfo:
     group_size: Optional[List[int]] = None
     priority: int = 0  ##higher is better
     convertable_format: List[str] = field(default_factory=list)
-    check_list: List[Any] = field(default_factory=list)
+    checkers: List[Any] = field(default_factory=list)
     alias: Optional[List[str]] = None
     requirements: Optional[List[str]] = None
 
@@ -129,7 +129,7 @@ def torch_fp8_static_checker(
     dtype=["float16"],
     ##16, 384,768 accuracy issue
     group_size=[-1, 32, 64, 128, 256, 512, 1024, 2048],
-    check_list=[exllamav2_feature_checker],
+    checkers=[exllamav2_feature_checker],
     alias=["gptq", "auto_gptq", "exllamav2", "gptq:exllamav2", "auto_gptq:exllamav2"],
     requirements=["torch<2.6.0", "auto-gptq>=0.7.1"],
 )
@@ -142,7 +142,7 @@ def torch_fp8_static_checker(
     group_size=None,
     dtype=["float16"],
     priority=0,
-    check_list=[exllamav2_feature_checker],
+    checkers=[exllamav2_feature_checker],
     alias=["auto_gptq:tritonv2"],
     requirements=["torch<2.6.0", "auto-gptq>=0.7.1", "triton>=2.0"],
 )
@@ -154,7 +154,7 @@ def torch_fp8_static_checker(
     bits=[2, 3, 4, 8],
     group_size=None,
     priority=1,
-    check_list=[exllamav2_feature_checker],
+    checkers=[exllamav2_feature_checker],
     alias=["auto_gptq:cuda"],
     dtype=["float16"],
     convertable_format=["int32_zp"],
@@ -171,7 +171,7 @@ def torch_fp8_static_checker(
     dtype=["float16", "bfloat16"],
     bits=[2, 4, 8],
     priority=2,
-    check_list=[feature_multiply_checker_32],
+    checkers=[feature_multiply_checker_32],
     alias=["auto_round", "tritonv2", "triton"],
     requirements=["triton>=2.0", "auto-round>=0.5.0"],
 )
@@ -183,7 +183,7 @@ def torch_fp8_static_checker(
     dtype=["float16", "bfloat16"],
     bits=[2, 3, 4, 8],
     priority=0,
-    check_list=[exllamav2_feature_checker],
+    checkers=[exllamav2_feature_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>=0.5.1"],
 )
@@ -199,7 +199,7 @@ def torch_fp8_static_checker(
     dtype=["float32", "float16", "bfloat16"],
     bits=[8],
     priority=0,
-    check_list=[torch_fp8_static_checker],
+    checkers=[torch_fp8_static_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.6.0"],
 )
@@ -212,7 +212,7 @@ def torch_fp8_static_checker(
     dtype=["float16", "bfloat16"],
     bits=[2, 4, 8],
     priority=2,
-    check_list=[feature_multiply_checker_32],
+    checkers=[feature_multiply_checker_32],
     alias=["tritonv2", "tritonv2_zp", "triton"],
     requirements=["triton>=2.0", "auto-round>=0.5.0"],
 )
@@ -224,7 +224,7 @@ def torch_fp8_static_checker(
     dtype=["float16", "bfloat16"],
     bits=[2, 3, 4, 8],
     priority=0,
-    check_list=[exllamav2_feature_checker],
+    checkers=[exllamav2_feature_checker],
     alias=["torch", "torch_zp"],
     requirements=["auto-round>=0.5.1"],
 )
@@ -237,7 +237,7 @@ def torch_fp8_static_checker(
     group_size=[-1, 32, 64, 128],
     dtype=["float16", "bfloat16"],
     priority=6,
-    check_list=[gptqmodel_marlin_feature_checker],
+    checkers=[gptqmodel_marlin_feature_checker],
     alias=["marlin", "gptqmodel"],
     requirements=["gptqmodel>=2.0"],
 )
@@ -250,7 +250,7 @@ def torch_fp8_static_checker(
     group_size=[-1, 32, 64, 128],
     dtype=["float16", "bfloat16"],
     priority=6,
-    check_list=[gptqmodel_marlin_feature_checker],
+    checkers=[gptqmodel_marlin_feature_checker],
     alias=["marlin", "gptqmodel"],
     requirements=["gptqmodel>=2.0"],
 )
@@ -263,7 +263,7 @@ def torch_fp8_static_checker(
     group_size=[-1, 32, 64, 128],  ##16 seems has accuracy issue
     dtype=["float16", "bfloat16"],
     priority=5,
-    check_list=[exllamav2_feature_checker],
+    checkers=[exllamav2_feature_checker],
     alias=["exllamav2"],
     requirements=["gptqmodel>=2.0"],
 )
@@ -287,7 +287,7 @@ def torch_fp8_static_checker(
     bits=[2, 4, 8],
     group_size=None,
     priority=1,
-    check_list=[],
+    checkers=[],
     alias=["itrex", "qbits"],
     dtype=["float16", "bfloat16"],
     convertable_format=["int32"],
@@ -302,7 +302,7 @@ def torch_fp8_static_checker(
     group_size=None,
     dtype=["float16", "bfloat16"],
     priority=1,
-    check_list=[],
+    checkers=[],
     alias=["itrex", "qbits"],
     convertable_format=["int32_zp"],
     requirements=["torch<2.7.0", "intel-extension-for-transformers"],
@@ -316,7 +316,7 @@ def torch_fp8_static_checker(
     group_size=None,
     dtype=["float16", "bfloat16"],
     priority=1,
-    check_list=[],
+    checkers=[],
     alias=["itrex", "qbits"],
     requirements=["torch<2.7.0", "intel-extension-for-transformers"],
 )
@@ -328,7 +328,7 @@ def torch_fp8_static_checker(
     bits=[4],
     group_size=None,
     priority=5,
-    check_list=[],
+    checkers=[],
     dtype=["float16", "bfloat16"],
     convertable_format=["int32_zp"],
     alias=["ipex"],
@@ -343,7 +343,7 @@ def torch_fp8_static_checker(
     group_size=None,
     priority=1,
     dtype=["float16", "bfloat16"],
-    check_list=[],
+    checkers=[],
     alias=["ipex"],
     convertable_format=["awq"],
     requirements=["intel-extension-for-pytorch>=2.6"],
@@ -425,7 +425,7 @@ def check_compatible(backend_name, device, config, packing_format, in_features,
     else:
         return False
 
-    for check in backend.check_list:
+    for check in backend.checkers:
         if not check(in_features, out_features, config):
             return False
 

From abd83acf652b6e1d176110472f636cf3142b9654 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 6 Sep 2025 06:59:06 +0000
Subject: [PATCH 42/47] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/export/export_to_autoround/export.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index c66f109da..472d18e06 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -25,7 +25,6 @@
 import transformers
 from tqdm import tqdm
 
-from auto_round.autoround import AutoRoundFormat
 from auto_round.export.export_to_autoround.utils import REQUIRED_CONFIG_KEYS, check_neq_config
 from auto_round.utils import (

From d332a957db8e2ace5cd11efe67a9b5ab3ec83966 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 8 Sep 2025 08:30:11 +0000
Subject: [PATCH 43/47] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/autoround.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 3cacaf4a4..7cb77acfe 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -18,8 +18,8 @@
 import sys
 import time
 import traceback
-from enum import Enum
 from dataclasses import asdict, fields
+from enum import Enum
 from typing import Any, Callable, Union
 
 import accelerate

From 8a4a533451ed51d7554a5853a87e19006e7256c9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 8 Sep 2025 12:36:53 +0000
Subject: [PATCH 44/47] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/utils.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/auto_round/utils.py b/auto_round/utils.py
index 24850f860..e91b37918 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -24,11 +24,8 @@
 from collections import UserDict
 from enum import Enum
 from functools import lru_cache
-
-from typing import Any, Callable, Dict, Tuple, Union
-
 from pathlib import Path
-
+from typing import Any, Callable, Dict, Tuple, Union
 
 import cpuinfo
 import torch

From f05e38ba4b4af0c17fb81e8daeac6aefd5ebaa70 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Mon, 8 Sep 2025 22:12:21 -0400
Subject: [PATCH 45/47] fix

Signed-off-by: yiliu30
---
 auto_round/inference/backend.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index 9d3aa58c5..cedcbbd64 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -115,9 +115,9 @@ def torch_fp8_static_checker(
     in_feature: int,
     out_feature: int,
     config: QuantizationScheme,
     in_feature_multiplier: Optional[int] = None,
     out_feature_multiplier: Optional[int] = None,
 ):
-    from auto_round.schemes import FPW8_STATIC
+    from auto_round.schemes import FP8_STATIC
 
-    return config == FPW8_STATIC
+    return config == FP8_STATIC

From c58a61c6176bd03a49d16ace0a0bf2089fdf838a Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Tue, 9 Sep 2025 07:25:20 -0400
Subject: [PATCH 46/47] update

Signed-off-by: yiliu30
---
 auto_round/autoround.py                           | 7 +------
 auto_round/export/export_to_autoround/__init__.py | 2 +-
 auto_round/export/export_to_autoround/export.py   | 8 +++++++-
 auto_round/inference/backend.py                   | 2 +-
 auto_round/inference/convert_model.py             | 2 +-
 auto_round/utils.py                               | 2 +-
 6 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index b67d729af..ef01063b4 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -31,6 +31,7 @@
 
 from auto_round.data_type import QUANT_FUNC_WITH_DTYPE
 from auto_round.data_type.utils import reshape_pad_tensor_by_group_size
+from auto_round.export.export_to_autoround import AutoRoundFormat
 from auto_round.export.export_to_gguf.config import GGUF_CONFIG, GGUF_INNER_CONFIG, ModelType
 from auto_round.low_cpu_mem.utils import get_layers_before_block
 from auto_round.schemes import QuantizationScheme, preset_name_to_scheme
@@ -97,12 +98,6 @@
 from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block
 
 
-class AutoRoundFormat(str, Enum):
-    # Weight: FP8, per-channel, may be extended to per-tensor in future
-    # Activation: FP8, per-tensor
-    TORCH_FP8_STATIC = "torch_fp8_static"
-
-
 class AutoRound(object):
     """Automatic weight rounding (Signed Gradient Descent) for LLM quantization
 
diff --git a/auto_round/export/export_to_autoround/__init__.py b/auto_round/export/export_to_autoround/__init__.py
index 0c036d831..6cdcd5aed 100644
--- a/auto_round/export/export_to_autoround/__init__.py
+++ b/auto_round/export/export_to_autoround/__init__.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .export import save_quantized_as_autoround
+from .export import save_quantized_as_autoround, AutoRoundFormat
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index de8d15752..03f8ee0c2 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -18,6 +18,7 @@
 import json
 import os
 from concurrent.futures import ThreadPoolExecutor
+from enum import Enum
 
 import threadpoolctl as tctl
 import torch
@@ -25,7 +26,6 @@
 import transformers
 from tqdm import tqdm
 
-from auto_round.autoround import AutoRoundFormat
 from auto_round.export.export_to_autoround.utils import REQUIRED_CONFIG_KEYS, check_neq_config
 from auto_round.utils import (
     SUPPORTED_FORMATS,
@@ -44,6 +44,12 @@
 )
 
 
+class AutoRoundFormat(str, Enum):
+    # Weight: FP8, per-channel, may be extended to per-tensor in future
+    # Activation: FP8, per-tensor
+    TORCH_FP8_STATIC = "torch_fp8_static"
+
+
 def dynamic_import_quant_linear_for_packing(backend, bits, group_size, sym, act_bits=16):
     """
     Dynamically imports and returns the appropriate QuantLinear class based on the specified backend and parameters.
diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index cedcbbd64..3c47af6b3 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -19,7 +19,7 @@
 from transformers.utils.versions import require_version
 
 import auto_round_extension.cuda.gptqmodel_marlin
-from auto_round.autoround import AutoRoundFormat
+from auto_round.export.export_to_autoround import AutoRoundFormat
 from auto_round.schemes import QuantizationScheme
 from auto_round.utils import get_library_version, logger
 
diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py
index 59bf29a09..fcfb83b4c 100644
--- a/auto_round/inference/convert_model.py
+++ b/auto_round/inference/convert_model.py
@@ -21,7 +21,7 @@
 from tqdm import tqdm
 from transformers.pytorch_utils import Conv1D
 
-from auto_round.autoround import AutoRoundFormat
+from auto_round.export.export_to_autoround import AutoRoundFormat
 from auto_round.inference.backend import (
     BackendInfos,
     dynamic_import_inference_linear,
diff --git a/auto_round/utils.py b/auto_round/utils.py
index 65695ff87..c14131e57 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -25,7 +25,7 @@
 from enum import Enum
 from functools import lru_cache
 from pathlib import Path
-from typing import Any, Callable, Dict, Tuple, Union
+from typing import Any, Callable, Tuple, Union
 
 import cpuinfo
 import torch
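
Editor's note: taken together, PATCH 42-46 move AutoRoundFormat out of auto_round.autoround and into auto_round/export/export_to_autoround/export.py, re-exporting it from the package __init__, so backend.py, convert_model.py, and autoround.py can all import it without going through the circular path. The minimal sketch below only illustrates that usage pattern; the stand-in enum mirrors the definition added in PATCH 46 (its value becomes "fp8_static" in the next patch), and the uses_fp8_static helper is hypothetical, not part of the repository.

# Sketch only: after PATCH 46 the real import would be
#   from auto_round.export.export_to_autoround import AutoRoundFormat
# A local stand-in is defined here so the example runs without the package.
from enum import Enum


class AutoRoundFormat(str, Enum):
    # Weight: FP8, per-channel; activation: FP8, per-tensor
    TORCH_FP8_STATIC = "torch_fp8_static"  # renamed to "fp8_static" in PATCH 47


def uses_fp8_static(backend: str) -> bool:
    # Same membership test as dynamic_import_inference_linear() earlier in the
    # series: the enum's string value is matched against the backend name.
    return AutoRoundFormat.TORCH_FP8_STATIC.value in backend


print(uses_fp8_static("auto_round:torch_fp8_static"))  # True
print(uses_fp8_static("auto_round:exllamav2"))         # False
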
From 2c34244c98e69e2495e3f1f303f643d9097241e6 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Tue, 9 Sep 2025 21:34:00 -0400
Subject: [PATCH 47/47] fix

Signed-off-by: yiliu30
---
 auto_round/export/export_to_autoround/export.py | 2 +-
 auto_round/inference/backend.py                 | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 03f8ee0c2..ffd10036c 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -47,7 +47,7 @@ class AutoRoundFormat(str, Enum):
     # Weight: FP8, per-channel, may be extended to per-tensor in future
     # Activation: FP8, per-tensor
-    TORCH_FP8_STATIC = "torch_fp8_static"
+    TORCH_FP8_STATIC = "fp8_static"
 
 
 def dynamic_import_quant_linear_for_packing(backend, bits, group_size, sym, act_bits=16):
diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index 3c47af6b3..1868ee14c 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -73,6 +73,7 @@ class BackendInfo:
     checkers: List[Any] = field(default_factory=list)
     alias: Optional[List[str]] = None
     requirements: Optional[List[str]] = None
+    # TODO(Yi): Add more fields for activation dtype, group size, etc.
 
 
 def feature_multiply_checker(in_feature, out_feature, config, in_feature_multiplier, out_feature_multiplier=None):
@@ -108,7 +109,7 @@ def feature_multiply_checker_group_size(
 )
 
 
-def torch_fp8_static_checker(
+def fp8_static_scheme_checker(
     in_feature: int,
     out_feature: int,
     config: QuantizationScheme,
@@ -192,14 +193,14 @@ def torch_fp8_static_checker(
 
 # Weight: FP8, per-channel, may be extended to per-tensor in future
 # Activation: FP8, per-tensor
-BackendInfos["auto_round:torch_fp8_static"] = BackendInfo(
+BackendInfos["auto_round:fp8_static"] = BackendInfo(
     device=["xpu", "cuda", "cpu"],
     packing_format="",
     sym=[True],
     dtype=["float32", "float16", "bfloat16"],
     bits=[8],
     priority=0,
-    checkers=[torch_fp8_static_checker],
+    checkers=[fp8_static_scheme_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.6.0"],
 )
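
Editor's note: the checker protocol this series renames (feature_checks -> check_list -> checkers) and specializes for FP8 static quantization reduces to the loop shown in check_compatible in PATCH 40/41: every callable in backend.checkers is invoked as check(in_features, out_features, config), and all of them must return True. The sketch below is illustrative only; MiniBackendInfo, is_compatible, and the string stand-in for the scheme are simplified substitutes for BackendInfo, check_compatible, and auto_round.schemes.FP8_STATIC.

# Sketch of the checker contract, not the real auto_round implementation.
from dataclasses import dataclass, field
from typing import Any, Callable, List

FP8_STATIC = "fp8_static"  # stand-in for auto_round.schemes.FP8_STATIC


@dataclass
class MiniBackendInfo:
    # Mirrors the renamed field BackendInfo.checkers (PATCH 41).
    checkers: List[Callable[[int, int, Any], bool]] = field(default_factory=list)


def fp8_static_scheme_checker(in_feature: int, out_feature: int, config: Any) -> bool:
    # PATCH 45/47: the checker simply compares the layer config to the scheme.
    return config == FP8_STATIC


def is_compatible(backend: MiniBackendInfo, in_features: int, out_features: int, config: Any) -> bool:
    # Same shape as the loop in check_compatible(): every checker must pass.
    return all(check(in_features, out_features, config) for check in backend.checkers)


backend = MiniBackendInfo(checkers=[fp8_static_scheme_checker])
print(is_compatible(backend, 4096, 4096, FP8_STATIC))  # True
print(is_compatible(backend, 4096, 4096, "int4"))      # False
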