From bb947822170a33324c1016bde7f5ec6661a23ab7 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Mon, 11 Aug 2025 10:19:44 +0000 Subject: [PATCH 01/47] load w8a8 Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 108 +++++++++++++++++++- examples/load_w8a8.py | 136 ++++++++++++++++++++++++++ 2 files changed, 243 insertions(+), 1 deletion(-) create mode 100644 examples/load_w8a8.py diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index bd6dde836..1fff106d5 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -472,6 +472,8 @@ def infer_target_device(device_map=None): def post_init(model, used_backends): + if is_weight_fp8_activation_static_fp8(model.config.quantization_config): + return need_autogptq_init = False need_gptqmodel_init = False need_ipex_itrex_init = False @@ -526,6 +528,108 @@ def post_init(model, used_backends): logger.warning("force model to bfloat16") + +def quant_tensor_with_scale(tensor, scale): + FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max + qtensor = tensor / scale + cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) + return scale, cliped_qtensor_fp8 + + +class FP8QDQLinear(torch.nn.Module): + dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__( + self, in_features: int, out_features: int, bias: bool = True, device=None + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.weight = nn.Parameter( + torch.empty(out_features, in_features, dtype=FP8QDQLinear.fp8_dtype), + requires_grad=True, + ) + self.weight_scale = nn.Parameter( + torch.empty((out_features, 1), dtype=FP8QDQLinear.dtype), + requires_grad=False, + ) + self.input_scale = nn.Parameter( + torch.empty((1, 1), dtype=FP8QDQLinear.dtype), requires_grad=False + ) + if bias: + self.bias = nn.Parameter(torch.empty(out_features)) + else: + self.register_parameter("bias", None) + self.pre_dequantized = False + + def dequant_weight_online(self): + if self.pre_dequantized: + return self.weight + fp8_weight = self.weight + qdq_weight = fp8_weight.to(FP8QDQLinear.dtype) * self.weight_scale + return qdq_weight + + def pre_dequantize(self): + if self.pre_dequantized: + return + dequant_weight = self.dequant_weight_online() + del self.weight + del self.weight_scale + self.weight = nn.Parameter(dequant_weight, requires_grad=False) + self.pre_dequantized = True + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = quant_tensor_with_scale( + bf16_input, self.input_scale.data + ) + qdq_input_bf16 = input_fp8.to(FP8QDQLinear.dtype) * input_scale + return qdq_input_bf16 + + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out + + @classmethod + def from_original(cls, config, original_layer): + """ + Create an FP8QDQLinear layer from an original linear layer. 
+ """ + device = original_layer.weight.device + with torch.device(device): + qdq_linear = cls( + in_features=original_layer.in_features, + out_features=original_layer.out_features, + bias=original_layer.bias is not None, + ) + return qdq_linear + + +def _patching_mod( + mod, config, src_cls, dst_cls +): + named_children_list = list(mod.named_children()) + for name, layer in named_children_list: + if isinstance(layer, src_cls): + new_layer = dst_cls.from_original(config, layer) + setattr(mod, name, new_layer) + print(f"Patched {name} with {new_layer.__class__.__name__}") + elif isinstance(layer, nn.Module): + _patching_mod(layer, config, src_cls, dst_cls) + return mod + + +def patching_model(model): + model = _patching_mod(model, None, torch.nn.Linear, FP8QDQLinear) + return model + + +def is_weight_fp8_activation_static_fp8(quant_config): + return True + def convert_hf_model(model: nn.Module, target_device="cpu"): """Converts the given model to an AutoRound model by replacing its layers with quantized layers. @@ -547,7 +651,9 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): """ quantization_config = model.config.quantization_config - + if is_weight_fp8_activation_static_fp8(quantization_config): + model = patching_model(model) + if hasattr(quantization_config, "desc_act") and quantization_config.desc_act: ##check static_group if (hasattr(quantization_config, "static_groups") and not quantization_config.static_groups) or ( diff --git a/examples/load_w8a8.py b/examples/load_w8a8.py new file mode 100644 index 000000000..df10b6c10 --- /dev/null +++ b/examples/load_w8a8.py @@ -0,0 +1,136 @@ +import os +import torch +import tqdm +from loguru import logger +import logging +import safetensors +from safetensors import safe_open +from safetensors.torch import save_file +import json + +logging.basicConfig(level=logging.DEBUG) +torch.set_grad_enabled(False) + +# CONSTANTS +SAFETENSORS = "safetensors" +WEIGHT_SCALE_NAME = "weight_scale" +INPUT_SCALE_NAME = "scale_input" +SCALE_DTYPE = torch.bfloat16 +SCALE_FILE_NAME = f"scales.{SAFETENSORS}" +FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max +WEIGHT_BACKOFF = 1.0 +QUANT_MODULE_TYPES = (torch.nn.Linear,) +SKIP_WEIGHT_LST = { + "model.norm", + "layernorm", + "e_score_correction_bias", + # "lm_head.weight", + "embed_tokens", + "mlp.gate.weight", # mlp.gate is not linear +} + +MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" + + +seed = 0 +import random + +random.seed(seed) +import torch + +torch.manual_seed(seed) +torch.cuda.manual_seed(seed) +import numpy as np + +np.random.seed(seed) + + +# torch.use_deterministic_algorithms(True) +def seed_worker(worker_id): + worker_seed = torch.initial_seed() % 2**32 + np.random.seed(worker_seed) + random.seed(worker_seed) + + +g = torch.Generator() +g.manual_seed(0) + + + + +def pre_dequantize(model): + """ + Pre-dequantize all FP8QDQLinear layers in the model. 
+ """ + for name, module in model.named_modules(): + if module.__class__.__name__ == "FP8QDQLinear": + logger.info(f"Pre-dequantizing {name}") + module.pre_dequantize() + else: + logger.debug(f"Skipping {name} as it is not FP8QDQLinear") + + +def qdq_eval(model_path, not_patch_lin=False): + import transformers + from transformers.modeling_utils import no_init_weights + + + model = transformers.AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + logger.info(f"Patched model: {model}") + model.eval() + model.to("cuda") + import torch + + model = torch.compile(model) + # pre_dequantize(model) + with torch.device("cuda"): + tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) + prompt = "Hi, who" + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate(encode, max_length=100) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + logger.info(f"Prompt: {prompt}") + logger.info(f"Output: {output}") + + # from auto_round.script.llm import eval_task_by_task + + # eval_task_by_task( + # model=model, + # device="cuda", + # tasks="gsm8k", + # batch_size=32, + # limit=128, + # # trust_remote_code=not args.disable_trust_remote_code, + # # eval_model_dtype=args.eval_model_dtype + # ) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('-m', "--qmodel_path", type=str, required=True) + parser.add_argument( + "--not_patch_lin", action="store_true", help="Measure float model" + ) + args = parser.parse_args() + qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) + + +""" +p load_w8a8.py --qmodel_path /data5/yliu7/HF_HOME/Qwen3-32B-w8afp8 +Running generate_until requests: 76%|███ | 97/128 [11:45<03: +Running generate_until requests: 100%|███| 128/128 [11:45<00:00, 5.51s/it] +|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| +|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| +|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7422|± |0.0388| +| | |strict-match | 5|exact_match|↑ |0.6797|± |0.0414| + +total eval time: 742.8823928833008 +""" \ No newline at end of file From 9bef8263328fe7ef152d828c1775d4aa385885cc Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 02:44:17 -0400 Subject: [PATCH 02/47] refactor Signed-off-by: yiliu30 --- .../export_to_autoround/export_to_fp8_woq.py | 89 +++++++++++++ auto_round/inference/backend.py | 20 ++- auto_round/inference/convert_model.py | 120 ++---------------- examples/load_w8a8.py | 39 +++--- 4 files changed, 141 insertions(+), 127 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 5b6a4c400..8b357e090 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -16,6 +16,7 @@ import json import os from concurrent.futures import ThreadPoolExecutor +from typing import Optional, Union import threadpoolctl as tctl import torch @@ -83,6 +84,94 @@ def __init__( self.register_buffer("input_scale", input_scale.to(dtype)) +def quant_tensor_with_scale(tensor, scale): + FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max + qtensor = tensor / scale + cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) + return scale, cliped_qtensor_fp8 + + +class 
WeightFP8ActFP8StaticQuantLinear(torch.nn.Module): + hp_dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__( + self, + in_features, + out_features, + weight: Optional[torch.Tensor] = None, + weight_scale: Optional[torch.Tensor] = None, + bias: Union[torch.Tensor, bool, None] = None, + weight_zp: Optional[torch.Tensor] = None, + input_scale: Optional[torch.Tensor] = None, + dtype=torch.bfloat16, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + init_weight = torch.empty((out_features, in_features), dtype=dtype) if weight is None else weight + self.weight = torch.nn.Parameter(init_weight, requires_grad=False) + self.dtype = dtype + if bias is not None: + if isinstance(bias, bool): + bias = torch.zeros((out_features,), dtype=dtype) + self.bias = torch.nn.Parameter(bias, requires_grad=False) + else: + self.register_parameter("bias", None) + init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale + self.register_buffer("weight_scale", init_weight_scale.to(dtype)) + + init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp + if weight_zp: + self.register_buffer("weight_zp", init_weight_zp.to(dtype)) + + init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale + self.register_buffer("input_scale", init_input_scale.to(dtype)) + self.pre_dequantized = False + + @classmethod + def from_original(cls, config, original_layer): + """ + Create an FP8WOQLinear layer from an original linear layer. + """ + device = original_layer.weight.device + with torch.device(device): + qdq_linear = cls( + in_features=original_layer.in_features, + out_features=original_layer.out_features, + bias=original_layer.bias, + ) + return qdq_linear + + def dequant_weight_online(self): + if self.pre_dequantized: + return self.weight + fp8_weight = self.weight + qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale + return qdq_weight + + def pre_dequantize(self): + if self.pre_dequantized: + return + dequant_weight = self.dequant_weight_online() + del self.weight + del self.weight_scale + self.weight = torch.nn.Parameter(dequant_weight, requires_grad=False) + self.pre_dequantized = True + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = quant_tensor_with_scale(bf16_input, self.input_scale.data) + qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale + return qdq_input_bf16 + + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out + + def pack_layer(layer_name, model, data_type, packing_device=None): """ Packs a model layer for quantization based on its type and configuration. 
diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index a4f578726..4b259db0a 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -410,7 +410,18 @@ def check_compatible( return True -def dynamic_import_inference_linear(backend, bits, group_size, sym): +def is_weight_fp8_activation_static_fp8(config): + bits, group_size, sym, data_type, act_dynamic = ( + config["bits"], + config["group_size"], + config["sym"], + config["data_type"], + config["act_dynamic"], + ) + return bits == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic + + +def dynamic_import_inference_linear(backend, config): """Dynamically imports and returns the appropriate QuantLinear class based on the given backend. This function dynamically loads the correct `QuantLinear` class based on the backend and quantization @@ -435,6 +446,13 @@ def dynamic_import_inference_linear(backend, bits, group_size, sym): ImportError: If required modules are missing for a backend (e.g., Intel Extension, GPTQ, auto_awq). """ + bits, group_size, sym = config["bits"], config["group_size"], config["sym"] + + if is_weight_fp8_activation_static_fp8(config): + from auto_round.export.export_to_autoround.export_to_fp8_woq import WeightFP8ActFP8StaticQuantLinear + + return WeightFP8ActFP8StaticQuantLinear + if "qbits" in backend: try: from intel_extension_for_transformers import qbits # pylint: disable=E0401 diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index 1fff106d5..bbca26f4f 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -27,6 +27,7 @@ find_backend, get_highest_priority_backend, get_layer_backend, + is_weight_fp8_activation_static_fp8, process_requirement, ) from auto_round.utils import ( @@ -61,7 +62,7 @@ def skip_not_convert_modules(model, quantization_config, layer_names, layer_conf try: # transformers new api modules_to_not_convert = get_modules_to_not_convert(model, modules_to_not_convert, add_default_skips=True) except: - modules_to_not_convert = get_modules_to_not_convert(model, modules_to_not_convert) + modules_to_not_convert = _get_modules_to_not_convert(model, modules_to_not_convert) if modules_to_not_convert: for layer_name in layer_names: if any([re.search(re.compile(n), layer_name) for n in modules_to_not_convert]): @@ -219,6 +220,7 @@ def get_layer_config(model, quantization_config): - group_size (int): Group size for weight quantization. - data_type (str, optional): Data type for quantization (default: "int"). - sym (bool): Whether to use symmetric quantization. + - act_dynamic (bool, optional): Whether to use dynamic activation quantization (default: False). - quant_block_list (list, optional): Predefined list of blocks to quantize. - to_quant_block_names (list or str, optional): Blocks to quantize (if quant_block_list is None). - extra_config (dict, optional): Per-layer overrides for quantization settings. @@ -231,13 +233,14 @@ def get_layer_config(model, quantization_config): - "group_size" (int): Group size for quantization. - "data_type" (str): Data type used for quantization. - "sym" (bool): Whether symmetric quantization is applied. + - "act_dynamic" (bool): Whether dynamic activation quantization is used. - "clip" (bool): Whether weight clipping is enabled. 
""" bits = quantization_config.bits group_size = quantization_config.group_size data_type = getattr(quantization_config, "data_type", "int") # Default to "int" if not specified sym = quantization_config.sym - + act_dynamic = getattr(quantization_config, "act_dynamic", False) # Determine the quantization block list quant_block_list = getattr(quantization_config, "quant_block_list", None) if quant_block_list is None: @@ -290,11 +293,11 @@ def get_layer_config(model, quantization_config): "group_size": extra_config.get(layer_name, {}).get("group_size", group_size), "data_type": extra_config.get(layer_name, {}).get("data_type", data_type), "sym": extra_config.get(layer_name, {}).get("sym", sym), + "act_dynamic": extra_config.get(layer_name, {}).get("act_dynamic", act_dynamic), "clip": extra_config.get(layer_name, {}).get("clip", False), } for layer_name in layer_names } - return layer_configs @@ -415,7 +418,7 @@ def _import_exllamav2_kernels(): def _create_quant_layer(layer, layer_backend, config, in_features, out_features): """Creates a quantized layer using the appropriate class.""" - QuantLinear = dynamic_import_inference_linear(layer_backend, config["bits"], config["group_size"], config["sym"]) + QuantLinear = dynamic_import_inference_linear(layer_backend, config) bias = layer.bias is not None # Special handling for AWQ layers @@ -437,6 +440,8 @@ def _create_quant_layer(layer, layer_backend, config, in_features, out_features) out_features=out_features, bias=bias, ) + elif is_weight_fp8_activation_static_fp8(config): + return QuantLinear.from_original(config, layer) # Default quantized layer creation try: return QuantLinear( @@ -528,108 +533,6 @@ def post_init(model, used_backends): logger.warning("force model to bfloat16") - -def quant_tensor_with_scale(tensor, scale): - FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max - qtensor = tensor / scale - cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) - cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) - return scale, cliped_qtensor_fp8 - - -class FP8QDQLinear(torch.nn.Module): - dtype = torch.bfloat16 - fp8_dtype = torch.float8_e4m3fn - - def __init__( - self, in_features: int, out_features: int, bias: bool = True, device=None - ): - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.weight = nn.Parameter( - torch.empty(out_features, in_features, dtype=FP8QDQLinear.fp8_dtype), - requires_grad=True, - ) - self.weight_scale = nn.Parameter( - torch.empty((out_features, 1), dtype=FP8QDQLinear.dtype), - requires_grad=False, - ) - self.input_scale = nn.Parameter( - torch.empty((1, 1), dtype=FP8QDQLinear.dtype), requires_grad=False - ) - if bias: - self.bias = nn.Parameter(torch.empty(out_features)) - else: - self.register_parameter("bias", None) - self.pre_dequantized = False - - def dequant_weight_online(self): - if self.pre_dequantized: - return self.weight - fp8_weight = self.weight - qdq_weight = fp8_weight.to(FP8QDQLinear.dtype) * self.weight_scale - return qdq_weight - - def pre_dequantize(self): - if self.pre_dequantized: - return - dequant_weight = self.dequant_weight_online() - del self.weight - del self.weight_scale - self.weight = nn.Parameter(dequant_weight, requires_grad=False) - self.pre_dequantized = True - - def qdq_input(self, bf16_input: torch.Tensor): - input_scale, input_fp8 = quant_tensor_with_scale( - bf16_input, self.input_scale.data - ) - qdq_input_bf16 = input_fp8.to(FP8QDQLinear.dtype) * input_scale - return qdq_input_bf16 - - def forward(self, bf16_input: 
torch.Tensor) -> torch.Tensor: - qdq_input = self.qdq_input(bf16_input) - qdq_weight = self.dequant_weight_online() - out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) - return out - - @classmethod - def from_original(cls, config, original_layer): - """ - Create an FP8QDQLinear layer from an original linear layer. - """ - device = original_layer.weight.device - with torch.device(device): - qdq_linear = cls( - in_features=original_layer.in_features, - out_features=original_layer.out_features, - bias=original_layer.bias is not None, - ) - return qdq_linear - - -def _patching_mod( - mod, config, src_cls, dst_cls -): - named_children_list = list(mod.named_children()) - for name, layer in named_children_list: - if isinstance(layer, src_cls): - new_layer = dst_cls.from_original(config, layer) - setattr(mod, name, new_layer) - print(f"Patched {name} with {new_layer.__class__.__name__}") - elif isinstance(layer, nn.Module): - _patching_mod(layer, config, src_cls, dst_cls) - return mod - - -def patching_model(model): - model = _patching_mod(model, None, torch.nn.Linear, FP8QDQLinear) - return model - - -def is_weight_fp8_activation_static_fp8(quant_config): - return True - def convert_hf_model(model: nn.Module, target_device="cpu"): """Converts the given model to an AutoRound model by replacing its layers with quantized layers. @@ -651,9 +554,7 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): """ quantization_config = model.config.quantization_config - if is_weight_fp8_activation_static_fp8(quantization_config): - model = patching_model(model) - + if hasattr(quantization_config, "desc_act") and quantization_config.desc_act: ##check static_group if (hasattr(quantization_config, "static_groups") and not quantization_config.static_groups) or ( @@ -694,7 +595,6 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): backend = backend[len("auto_round:") :] used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, orig_backend) - if backend == "auto" or backend == "": best_backend = get_highest_priority_backend( quantization_config.bits, diff --git a/examples/load_w8a8.py b/examples/load_w8a8.py index df10b6c10..ad6218f9b 100644 --- a/examples/load_w8a8.py +++ b/examples/load_w8a8.py @@ -1,12 +1,13 @@ +import json +import logging import os + +import safetensors import torch import tqdm from loguru import logger -import logging -import safetensors from safetensors import safe_open from safetensors.torch import save_file -import json logging.basicConfig(level=logging.DEBUG) torch.set_grad_enabled(False) @@ -42,13 +43,13 @@ torch.cuda.manual_seed(seed) import numpy as np -np.random.seed(seed) +np.random.Generator(seed) # torch.use_deterministic_algorithms(True) def seed_worker(worker_id): worker_seed = torch.initial_seed() % 2**32 - np.random.seed(worker_seed) + np.random.Generator(worker_seed) random.seed(worker_seed) @@ -56,8 +57,6 @@ def seed_worker(worker_id): g.manual_seed(0) - - def pre_dequantize(model): """ Pre-dequantize all FP8QDQLinear layers in the model. 
@@ -70,10 +69,15 @@ def pre_dequantize(model): logger.debug(f"Skipping {name} as it is not FP8QDQLinear") +import torch + + +@torch.no_grad() def qdq_eval(model_path, not_patch_lin=False): + import transformers - from transformers.modeling_utils import no_init_weights + # from transformers.modeling_utils import no_init_weights model = transformers.AutoModelForCausalLM.from_pretrained( model_path, @@ -86,14 +90,19 @@ def qdq_eval(model_path, not_patch_lin=False): model.to("cuda") import torch - model = torch.compile(model) - # pre_dequantize(model) with torch.device("cuda"): + from transformers import GenerationConfig + + gen_config = GenerationConfig(use_cache=True, cache_implementation="static") tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) prompt = "Hi, who" encode = tokenizer.encode(prompt, return_tensors="pt") with torch.no_grad(): - output_tokens = model.generate(encode, max_length=100) + output_tokens = model.generate( + encode, + max_length=10, + # generation_config=gen_config + ) output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) logger.info(f"Prompt: {prompt}") logger.info(f"Output: {output}") @@ -115,10 +124,8 @@ def qdq_eval(model_path, not_patch_lin=False): import argparse parser = argparse.ArgumentParser() - parser.add_argument('-m', "--qmodel_path", type=str, required=True) - parser.add_argument( - "--not_patch_lin", action="store_true", help="Measure float model" - ) + parser.add_argument("-m", "--qmodel_path", type=str, required=True) + parser.add_argument("--not_patch_lin", action="store_true", help="Measure float model") args = parser.parse_args() qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) @@ -133,4 +140,4 @@ def qdq_eval(model_path, not_patch_lin=False): | | |strict-match | 5|exact_match|↑ |0.6797|± |0.0414| total eval time: 742.8823928833008 -""" \ No newline at end of file +""" From b30a126fed56bd07473d2bba53d1dcbe9ed9bd7b Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 03:01:56 -0400 Subject: [PATCH 03/47] add ut Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 2 -- test/test_cpu/test_export.py | 28 ++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index bbca26f4f..bd8b4621d 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -477,8 +477,6 @@ def infer_target_device(device_map=None): def post_init(model, used_backends): - if is_weight_fp8_activation_static_fp8(model.config.quantization_config): - return need_autogptq_init = False need_gptqmodel_init = False need_ipex_itrex_init = False diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index bbce4036b..367d20c5d 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -199,7 +199,7 @@ def test_autoround_3bit_sym_format(self): print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) shutil.rmtree(quantized_model_path, ignore_errors=True) - def test_static_afp8_export(self): + def test_static_afp8_export_and_load(self): import os from safetensors import safe_open @@ -226,6 +226,32 @@ def test_static_afp8_export(self): self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1, 1])) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, 
torch.float8_e4m3fn) + with torch.no_grad(): + import transformers + + model = transformers.AutoModelForCausalLM.from_pretrained( + quantized_model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + model.eval() + assert ( + model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__ == "WeightFP8ActFP8StaticQuantLinear" + ), f"Expected WeightFP8ActFP8StaticQuantLinear, got {model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__}" + tokenizer = transformers.AutoTokenizer.from_pretrained(quantized_model_path) + prompt = "AI is " + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate( + encode, + max_length=10, + ) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + print(f"Prompt: {prompt}") + print(f"Output: {output}") + assert output is not None, "Output should not be None" + shutil.rmtree(quantized_model_path, ignore_errors=True) model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", trust_remote_code=True) From eaad3a6e150d8830c96460b333ed557c04e165ae Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 03:02:38 -0400 Subject: [PATCH 04/47] remove example Signed-off-by: yiliu30 --- examples/load_w8a8.py | 143 ------------------------------------------ 1 file changed, 143 deletions(-) delete mode 100644 examples/load_w8a8.py diff --git a/examples/load_w8a8.py b/examples/load_w8a8.py deleted file mode 100644 index ad6218f9b..000000000 --- a/examples/load_w8a8.py +++ /dev/null @@ -1,143 +0,0 @@ -import json -import logging -import os - -import safetensors -import torch -import tqdm -from loguru import logger -from safetensors import safe_open -from safetensors.torch import save_file - -logging.basicConfig(level=logging.DEBUG) -torch.set_grad_enabled(False) - -# CONSTANTS -SAFETENSORS = "safetensors" -WEIGHT_SCALE_NAME = "weight_scale" -INPUT_SCALE_NAME = "scale_input" -SCALE_DTYPE = torch.bfloat16 -SCALE_FILE_NAME = f"scales.{SAFETENSORS}" -FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max -WEIGHT_BACKOFF = 1.0 -QUANT_MODULE_TYPES = (torch.nn.Linear,) -SKIP_WEIGHT_LST = { - "model.norm", - "layernorm", - "e_score_correction_bias", - # "lm_head.weight", - "embed_tokens", - "mlp.gate.weight", # mlp.gate is not linear -} - -MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" - - -seed = 0 -import random - -random.seed(seed) -import torch - -torch.manual_seed(seed) -torch.cuda.manual_seed(seed) -import numpy as np - -np.random.Generator(seed) - - -# torch.use_deterministic_algorithms(True) -def seed_worker(worker_id): - worker_seed = torch.initial_seed() % 2**32 - np.random.Generator(worker_seed) - random.seed(worker_seed) - - -g = torch.Generator() -g.manual_seed(0) - - -def pre_dequantize(model): - """ - Pre-dequantize all FP8QDQLinear layers in the model. 
- """ - for name, module in model.named_modules(): - if module.__class__.__name__ == "FP8QDQLinear": - logger.info(f"Pre-dequantizing {name}") - module.pre_dequantize() - else: - logger.debug(f"Skipping {name} as it is not FP8QDQLinear") - - -import torch - - -@torch.no_grad() -def qdq_eval(model_path, not_patch_lin=False): - - import transformers - - # from transformers.modeling_utils import no_init_weights - - model = transformers.AutoModelForCausalLM.from_pretrained( - model_path, - torch_dtype="auto", - low_cpu_mem_usage=True, - trust_remote_code=True, - ) - logger.info(f"Patched model: {model}") - model.eval() - model.to("cuda") - import torch - - with torch.device("cuda"): - from transformers import GenerationConfig - - gen_config = GenerationConfig(use_cache=True, cache_implementation="static") - tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) - prompt = "Hi, who" - encode = tokenizer.encode(prompt, return_tensors="pt") - with torch.no_grad(): - output_tokens = model.generate( - encode, - max_length=10, - # generation_config=gen_config - ) - output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) - logger.info(f"Prompt: {prompt}") - logger.info(f"Output: {output}") - - # from auto_round.script.llm import eval_task_by_task - - # eval_task_by_task( - # model=model, - # device="cuda", - # tasks="gsm8k", - # batch_size=32, - # limit=128, - # # trust_remote_code=not args.disable_trust_remote_code, - # # eval_model_dtype=args.eval_model_dtype - # ) - - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument("-m", "--qmodel_path", type=str, required=True) - parser.add_argument("--not_patch_lin", action="store_true", help="Measure float model") - args = parser.parse_args() - qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) - - -""" -p load_w8a8.py --qmodel_path /data5/yliu7/HF_HOME/Qwen3-32B-w8afp8 -Running generate_until requests: 76%|███ | 97/128 [11:45<03: -Running generate_until requests: 100%|███| 128/128 [11:45<00:00, 5.51s/it] -|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr| -|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:| -|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.7422|± |0.0388| -| | |strict-match | 5|exact_match|↑ |0.6797|± |0.0414| - -total eval time: 742.8823928833008 -""" From c411ca5f86fdc2f84a5fa301ceab34d98ddf2bcb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 12 Aug 2025 03:04:26 -0400 Subject: [PATCH 05/47] fix typo Signed-off-by: yiliu30 --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 8b357e090..1b2d7c222 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -133,7 +133,7 @@ def __init__( @classmethod def from_original(cls, config, original_layer): """ - Create an FP8WOQLinear layer from an original linear layer. + Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. 
""" device = original_layer.weight.device with torch.device(device): @@ -165,6 +165,7 @@ def qdq_input(self, bf16_input: torch.Tensor): qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale return qdq_input_bf16 + @torch.no_grad() def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: qdq_input = self.qdq_input(bf16_input) qdq_weight = self.dequant_weight_online() From 6597d5ca36d084848f76cde2a972bc684f888d4c Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 13 Aug 2025 08:39:45 +0800 Subject: [PATCH 06/47] Update auto_round/export/export_to_autoround/export_to_fp8_woq.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 1b2d7c222..09af9e270 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -89,7 +89,9 @@ def quant_tensor_with_scale(tensor, scale): qtensor = tensor / scale cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) - return scale, cliped_qtensor_fp8 + clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) + return scale, clipped_qtensor_fp8 class WeightFP8ActFP8StaticQuantLinear(torch.nn.Module): From 9b0f32ffdd0cb4aac2c36922588c8cdd56296346 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 13 Aug 2025 08:40:45 +0800 Subject: [PATCH 07/47] Update export_to_fp8_woq.py --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 09af9e270..4d2b924d1 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -87,8 +87,6 @@ def __init__( def quant_tensor_with_scale(tensor, scale): FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max qtensor = tensor / scale - cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) - cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) return scale, clipped_qtensor_fp8 From 5ebca24b6ee300f4205ae3798c5568ac419cf134 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Sun, 24 Aug 2025 05:00:23 -0400 Subject: [PATCH 08/47] update shape Signed-off-by: yiliu30 --- .../export/export_to_autoround/export_to_fp8_woq.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 7bcfb8011..e7b473593 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -94,7 +94,7 @@ def __init__( super().__init__() self.in_features = in_features self.out_features = out_features - init_weight = torch.empty((out_features, in_features), dtype=dtype) if weight is None else weight + init_weight = torch.zeros((out_features, in_features), dtype=dtype) if weight is None else weight self.weight = torch.nn.Parameter(init_weight, requires_grad=False) self.dtype = dtype if bias 
is not None: @@ -103,14 +103,14 @@ def __init__( self.bias = torch.nn.Parameter(bias, requires_grad=False) else: self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale + init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale self.register_buffer("weight_scale", init_weight_scale.to(dtype)) init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp if weight_zp: self.register_buffer("weight_zp", init_weight_zp.to(dtype)) - init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale + init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False @@ -132,7 +132,7 @@ def dequant_weight_online(self): if self.pre_dequantized: return self.weight fp8_weight = self.weight - qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale + qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) return qdq_weight def pre_dequantize(self): From 03cb21711a34b22fc002ebca399d9a58b7d07ec9 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 02:54:16 -0400 Subject: [PATCH 09/47] refactor Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 28 +++++ .../experimental/qmodules/fp8_static.py | 108 ++++++++++++++++++ .../export_to_autoround/export_to_fp8_woq.py | 89 --------------- 3 files changed, 136 insertions(+), 89 deletions(-) create mode 100644 auto_round/experimental/qmodules/base.py create mode 100644 auto_round/experimental/qmodules/fp8_static.py diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py new file mode 100644 index 000000000..860e66836 --- /dev/null +++ b/auto_round/experimental/qmodules/base.py @@ -0,0 +1,28 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import abstractmethod +from typing import Optional, Union + +import torch + + +class QModuleBase(torch.nn.Module): + def __init__(self): + super().__init__() + + @classmethod + @abstractmethod + def from_original(cls, config, original_layer): + raise NotImplementedError diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py new file mode 100644 index 000000000..8d58480d3 --- /dev/null +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -0,0 +1,108 @@ +# Copyright (c) 2025 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import torch + +from auto_round.experimental.qmodules.base import QModuleBase + + +def _quant_tensor_to_fp8_with_scale(tensor: torch.Tensor, scale: torch.Tensor): + FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max + qtensor = tensor / scale + clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) + return scale, clipped_qtensor_fp8 + + +class WeightFP8ActFP8StaticQuantLinear(QModuleBase): + hp_dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__( + self, + in_features, + out_features, + weight: Optional[torch.Tensor] = None, + weight_scale: Optional[torch.Tensor] = None, + bias: Union[torch.Tensor, bool, None] = None, + weight_zp: Optional[torch.Tensor] = None, + input_scale: Optional[torch.Tensor] = None, + dtype=torch.bfloat16, + ): + super().__init__() + self.in_features = in_features + self.out_features = out_features + init_weight = torch.zeros((out_features, in_features), dtype=dtype) if weight is None else weight + self.weight = torch.nn.Parameter(init_weight, requires_grad=False) + self.dtype = dtype + if bias is not None: + if isinstance(bias, bool): + bias = torch.zeros((out_features,), dtype=dtype) + self.bias = torch.nn.Parameter(bias, requires_grad=False) + else: + self.register_parameter("bias", None) + init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale + self.register_buffer("weight_scale", init_weight_scale.to(dtype)) + + init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp + if weight_zp: + self.register_buffer("weight_zp", init_weight_zp.to(dtype)) + + init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale + self.register_buffer("input_scale", init_input_scale.to(dtype)) + self.pre_dequantized = False + + @classmethod + def from_original(cls, config, original_layer): + """ + Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. 
+ """ + device = original_layer.weight.device + with torch.device(device): + qdq_linear = cls( + in_features=original_layer.in_features, + out_features=original_layer.out_features, + bias=original_layer.bias, + ) + return qdq_linear + + def dequant_weight_online(self): + if self.pre_dequantized: + return self.weight + fp8_weight = self.weight + qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) + return qdq_weight + + def pre_dequantize(self): + if self.pre_dequantized: + return + dequant_weight = self.dequant_weight_online() + del self.weight + del self.weight_scale + self.weight = torch.nn.Parameter(dequant_weight, requires_grad=False) + self.pre_dequantized = True + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = _quant_tensor_to_fp8_with_scale(bf16_input, self.input_scale.data) + qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale + return qdq_input_bf16 + + @torch.no_grad() + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index e7b473593..214e5046e 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -68,95 +68,6 @@ def __init__( self.register_buffer("input_scale", input_scale.to(dtype)) -def quant_tensor_with_scale(tensor, scale): - FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max - qtensor = tensor / scale - clipped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) - clipped_qtensor_fp8 = clipped_qtensor.to(torch.float8_e4m3fn) - return scale, clipped_qtensor_fp8 - - -class WeightFP8ActFP8StaticQuantLinear(torch.nn.Module): - hp_dtype = torch.bfloat16 - fp8_dtype = torch.float8_e4m3fn - - def __init__( - self, - in_features, - out_features, - weight: Optional[torch.Tensor] = None, - weight_scale: Optional[torch.Tensor] = None, - bias: Union[torch.Tensor, bool, None] = None, - weight_zp: Optional[torch.Tensor] = None, - input_scale: Optional[torch.Tensor] = None, - dtype=torch.bfloat16, - ): - super().__init__() - self.in_features = in_features - self.out_features = out_features - init_weight = torch.zeros((out_features, in_features), dtype=dtype) if weight is None else weight - self.weight = torch.nn.Parameter(init_weight, requires_grad=False) - self.dtype = dtype - if bias is not None: - if isinstance(bias, bool): - bias = torch.zeros((out_features,), dtype=dtype) - self.bias = torch.nn.Parameter(bias, requires_grad=False) - else: - self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale - self.register_buffer("weight_scale", init_weight_scale.to(dtype)) - - init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp - if weight_zp: - self.register_buffer("weight_zp", init_weight_zp.to(dtype)) - - init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale - self.register_buffer("input_scale", init_input_scale.to(dtype)) - self.pre_dequantized = False - - @classmethod - def from_original(cls, config, original_layer): - """ - Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. 
- """ - device = original_layer.weight.device - with torch.device(device): - qdq_linear = cls( - in_features=original_layer.in_features, - out_features=original_layer.out_features, - bias=original_layer.bias, - ) - return qdq_linear - - def dequant_weight_online(self): - if self.pre_dequantized: - return self.weight - fp8_weight = self.weight - qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) - return qdq_weight - - def pre_dequantize(self): - if self.pre_dequantized: - return - dequant_weight = self.dequant_weight_online() - del self.weight - del self.weight_scale - self.weight = torch.nn.Parameter(dequant_weight, requires_grad=False) - self.pre_dequantized = True - - def qdq_input(self, bf16_input: torch.Tensor): - input_scale, input_fp8 = quant_tensor_with_scale(bf16_input, self.input_scale.data) - qdq_input_bf16 = input_fp8.to(self.dtype) * input_scale - return qdq_input_bf16 - - @torch.no_grad() - def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: - qdq_input = self.qdq_input(bf16_input) - qdq_weight = self.dequant_weight_online() - out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) - return out - - def pack_layer(layer_name, model, data_type, packing_device=None): """ Packs a model layer for quantization based on its type and configuration. From 66388e5360173de4e4b6340a4e075bdd1749c46c Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 05:23:48 -0400 Subject: [PATCH 10/47] tmp add bk Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 12 ++++++++++++ auto_round/inference/convert_model.py | 3 +-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 4e3f42861..867d9f398 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -172,6 +172,17 @@ def feature_multiply_checker_group_size( requirements=["auto-round>=0.5.1"], ) +BackendInfos["auto_round:torch_fp8_static"] = BackendInfo( + device=["cuda", "cpu"], + packing_format="", + sym=[True], + bits=[8], + priority=0, + feature_checks=[], + alias=["auto_round", "torch"], + requirements=["auto-round>=0.6.1"], +) + BackendInfos["auto_round:tritonv2_zp"] = BackendInfo( device=["cuda", "xpu"], sym=[True], ## asym has accuracys @@ -732,6 +743,7 @@ def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_f If no compatible backend is found for the given layer configuration. """ # Check if the provided backend is in BackendInfos + # breakpoint() backend = find_backend(backend) if backend not in BackendInfos.keys(): raise ValueError(f"Unsupported backend '{backend}'. 
Please set it to 'auto' to enable automatic selection.") diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index bd8b4621d..fbdfb8804 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -566,7 +566,6 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): backend = quantization_config.backend else: backend = "auto" - ##target_backend could be None _, backend = parse_target_device_and_backend(backend) @@ -591,7 +590,7 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): if backend.startswith("auto_round:") and ("gptq" in packing_format or "awq" in packing_format): backend = backend[len("auto_round:") :] - + # breakpoint() used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, orig_backend) if backend == "auto" or backend == "": best_backend = get_highest_priority_backend( From 17ddd2d0d22d42a990a1dafcc47d47f14e45f0a5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:00:54 -0400 Subject: [PATCH 11/47] refactor code Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 7 ++++--- auto_round/inference/convert_model.py | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 867d9f398..3e4c8a7f2 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -176,11 +176,12 @@ def feature_multiply_checker_group_size( device=["cuda", "cpu"], packing_format="", sym=[True], + dtype=["float32", "float16", "bfloat16"], bits=[8], priority=0, feature_checks=[], alias=["auto_round", "torch"], - requirements=["auto-round>=0.6.1"], + requirements=["auto-round>=0.6.1.dev0"], ) BackendInfos["auto_round:tritonv2_zp"] = BackendInfo( @@ -463,7 +464,7 @@ def dynamic_import_inference_linear(backend, config): bits, group_size, sym = config["bits"], config["group_size"], config["sym"] if is_weight_fp8_activation_static_fp8(config): - from auto_round.export.export_to_autoround.export_to_fp8_woq import WeightFP8ActFP8StaticQuantLinear + from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear return WeightFP8ActFP8StaticQuantLinear @@ -743,7 +744,6 @@ def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_f If no compatible backend is found for the given layer configuration. """ # Check if the provided backend is in BackendInfos - # breakpoint() backend = find_backend(backend) if backend not in BackendInfos.keys(): raise ValueError(f"Unsupported backend '{backend}'. 
Please set it to 'auto' to enable automatic selection.") @@ -855,6 +855,7 @@ def build_pip_commands(gptq_req, other_reqs): # Instructional messages install_instructions = [] + for cmd in pip_cmds: if "intel-extension-for-pytorch" in cmd and target_device == "xpu": install_instructions.append( diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index fbdfb8804..df8b52c07 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -590,7 +590,7 @@ def convert_hf_model(model: nn.Module, target_device="cpu"): if backend.startswith("auto_round:") and ("gptq" in packing_format or "awq" in packing_format): backend = backend[len("auto_round:") :] - # breakpoint() + used_backends = _replace_by_quant_layers(model, layer_configs, backend, target_device, orig_backend) if backend == "auto" or backend == "": best_backend = get_highest_priority_backend( From 808449d71e0d004298c183d76a417a3df83f3528 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:12:52 -0400 Subject: [PATCH 12/47] refine code Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 24 +++++++++++++++++++ .../experimental/qmodules/fp8_static.py | 12 ++++++++++ auto_round/inference/backend.py | 4 ++++ 3 files changed, 40 insertions(+) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index 860e66836..affc7552d 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -19,6 +19,14 @@ class QModuleBase(torch.nn.Module): + """ + Abstract class used to describe the weight creation and forward pass + of different quantization schemes supported by Auto-Round. + The design is inspired by vLLM's CompressedTensorsScheme: + https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py + + """ + def __init__(self): super().__init__() @@ -26,3 +34,19 @@ def __init__(self): @abstractmethod def from_original(cls, config, original_layer): raise NotImplementedError + + @classmethod + @abstractmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. + """ + raise NotImplementedError + + @abstractmethod + def process_weights_after_loading(self, layer: torch.nn.Module): + """ + Called after weight loading is complete for any cleanup that + needs to occur. + """ + raise NotImplementedError diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 8d58480d3..3774da810 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from abc import abstractmethod from typing import Optional, Union import torch @@ -106,3 +107,14 @@ def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: qdq_weight = self.dequant_weight_online() out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) return out + + @classmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. 
+ """ + # FIXME: set to 0 for now, as fp8 kernels are not available yet + return 0 + + def process_weights_after_loading(self, layer: torch.nn.Module): + pass diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 3e4c8a7f2..0ca0d4726 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -172,6 +172,10 @@ def feature_multiply_checker_group_size( requirements=["auto-round>=0.5.1"], ) +# FP8 static quant +# Weight: FP8, per-channel, may be extended to per-tensor in future +# Activation: FP8, per-tensor + BackendInfos["auto_round:torch_fp8_static"] = BackendInfo( device=["cuda", "cpu"], packing_format="", From f74ed6f6ffd7c40b55ce2886a9882f55b5f96bce Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:17:49 -0400 Subject: [PATCH 13/47] fix device list Signed-off-by: yiliu30 --- .../experimental/qmodules/fp8_static.py | 22 +++++++++---------- auto_round/inference/backend.py | 2 +- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 3774da810..074cf34e7 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -66,6 +66,17 @@ def __init__( self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False + @classmethod + def get_min_capability(cls) -> int: + """ + Get minimum device capability. + """ + # FIXME: set to 0 for now, as fp8 kernels are not available yet + return 0 + + def process_weights_after_loading(self, layer: torch.nn.Module): + pass + @classmethod def from_original(cls, config, original_layer): """ @@ -107,14 +118,3 @@ def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: qdq_weight = self.dequant_weight_online() out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) return out - - @classmethod - def get_min_capability(cls) -> int: - """ - Get minimum device capability. 
- """ - # FIXME: set to 0 for now, as fp8 kernels are not available yet - return 0 - - def process_weights_after_loading(self, layer: torch.nn.Module): - pass diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 0ca0d4726..f74f22b75 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -177,7 +177,7 @@ def feature_multiply_checker_group_size( # Activation: FP8, per-tensor BackendInfos["auto_round:torch_fp8_static"] = BackendInfo( - device=["cuda", "cpu"], + device=["xpu", "cuda", "cpu"], packing_format="", sym=[True], dtype=["float32", "float16", "bfloat16"], From 632cf8a91046608bb26afedf63c81e0920a3d822 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Tue, 26 Aug 2025 23:25:13 -0400 Subject: [PATCH 14/47] fix Signed-off-by: yiliu30 --- auto_round/export/export_to_autoround/export_to_fp8_woq.py | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_round/export/export_to_autoround/export_to_fp8_woq.py b/auto_round/export/export_to_autoround/export_to_fp8_woq.py index 9dbbca5ab..b8a32896f 100644 --- a/auto_round/export/export_to_autoround/export_to_fp8_woq.py +++ b/auto_round/export/export_to_autoround/export_to_fp8_woq.py @@ -16,7 +16,6 @@ import json import os from concurrent.futures import ThreadPoolExecutor -from typing import Optional, Union import threadpoolctl as tctl import torch From 5b8b29d4a2e315b9656eb90c8b3948015bcb4a20 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:14:04 -0400 Subject: [PATCH 15/47] refactor code Signed-off-by: yiliu30 --- auto_round/autoround.py | 19 +++++++++++--- .../export/export_to_autoround/export.py | 8 +++++- auto_round/inference/backend.py | 13 +--------- auto_round/utils.py | 26 +++++++++++++++++++ 4 files changed, 50 insertions(+), 16 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index fed33df34..85ea75e60 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -19,6 +19,7 @@ import sys import time import traceback +from enum import Enum from typing import Any, Union import accelerate @@ -74,6 +75,7 @@ is_optimum_habana_available, is_standard_fp, is_static_afp8, + is_torch_fp8_static, llm_load_model, logger, mv_module_from_gpu, @@ -87,6 +89,12 @@ from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block +class AutoRoundFormat(str, Enum): + # Weight: FP8, per-channel, may be extended to per-tensor in future + # Activation: FP8, per-tensor + TORCH_FP8_STATIC = "torch_fp8_static" + + class AutoRound(object): """Automatic weight rounding (Signed Gradient Descent) for LLM quantization @@ -663,9 +671,14 @@ def _parse_format_to_list(self, format: str) -> list: ) if enable_awq: formats[index] = format.replace("auto_round", "auto_round:auto_awq") - if is_nv_fp(self.data_type) or is_mx_fp(self.data_type) or is_standard_fp(self.data_type): + if is_nv_fp(self.data_type) or is_mx_fp(self.data_type): format = format.replace("auto_round", f"auto_round:{self.data_type}") formats[index] = format + if is_torch_fp8_static(self): + format = format.replace("auto_round", f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}") + formats[index] = format + # if is_torch_fp8_static(self): + # formats[index] = "auto_round:torch_fp8_static" elif format == "llmcompressor": from auto_round.export.export_to_llmcompressor import check_compressed_tensors_supported @@ -731,10 +744,10 @@ def _check_supported_format(self, format: str) -> bool: ) format = "fake" else: - if not (format == "auto_round" or format == 
"auto_round:fp8"): + if not (format == "auto_round" or format == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}"): logger.warning( f"Currently only support to export auto_round or fake format for static W{self.bits}AFP8 model," - " change format to auto_round" + f" change format {format} to auto_round" ) format = "auto_round" if self.act_group_size != 0 and not self.act_dynamic and format == "auto_round:fp8": diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 1640528b6..38b815eb1 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -263,6 +263,7 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex Raises: ValueError: If the backend is not supported. """ + # breakpoint() data_type = kwargs.get("data_type", None) if is_nv_fp(data_type) or is_mx_fp(data_type): ## detect nvfp & mxfp first from auto_round.export.export_to_autoround.export_to_fp import save_quantized_as_fp @@ -273,9 +274,14 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex from auto_round.export.export_to_autoround.export_to_fp8_woq import save_quantized_as_autoround return save_quantized_as_autoround(output_dir, inplace=inplace, backend="auto_round", **kwargs) + from auto_round.autoround import AutoRoundFormat ##if using sym, we change to gptq sym kernel to avoid compiling from auto_round source - if (kwargs.get("sym") is None or kwargs.get("sym")) and ("gptq" not in backend and "awq" not in backend): + if ( + (kwargs.get("sym") is None or kwargs.get("sym")) + and ("gptq" not in backend and "awq" not in backend) + and (AutoRoundFormat.TORCH_FP8_STATIC.value not in backend) + ): backend = backend.replace("auto_round", "auto_round:auto_gptq") model = kwargs["model"] diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index f74f22b75..739ff4e89 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -19,7 +19,7 @@ from transformers.utils.versions import require_version import auto_round_extension.cuda.gptqmodel_marlin -from auto_round.utils import get_library_version, logger +from auto_round.utils import get_library_version, is_weight_fp8_activation_static_fp8, logger BackendInfos = {} @@ -429,17 +429,6 @@ def check_compatible( return True -def is_weight_fp8_activation_static_fp8(config): - bits, group_size, sym, data_type, act_dynamic = ( - config["bits"], - config["group_size"], - config["sym"], - config["data_type"], - config["act_dynamic"], - ) - return bits == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic - - def dynamic_import_inference_linear(backend, config): """Dynamically imports and returns the appropriate QuantLinear class based on the given backend. 
diff --git a/auto_round/utils.py b/auto_round/utils.py index 74999c624..c13556827 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2516,3 +2516,29 @@ def is_nv_fp(backend): def is_static_afp8(ar): return not ar.act_dynamic and "fp8" in ar.act_data_type + + +def _is_weight_fp8_activation_static_fp8(bit, group_size, sym, data_type, act_dynamic): + return bit == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic + + +def is_weight_fp8_activation_static_fp8(config): + bits, group_size, sym, data_type, act_dynamic = ( + config["bits"], + config["group_size"], + config["sym"], + config["data_type"], + config["act_dynamic"], + ) + return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) + + +def is_torch_fp8_static(ar): + bits, group_size, sym, data_type, act_dynamic = ( + ar.bits, + ar.group_size, + ar.sym, + ar.data_type, + ar.act_dynamic, + ) + return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) From 57b4c19913c442434144e8ba50df1dfb6f5ba7df Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:18:02 -0400 Subject: [PATCH 16/47] fix Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 6 +++--- auto_round/export/export_to_autoround/export.py | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index affc7552d..c069f5151 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from abc import abstractmethod +from abc import ABC, abstractmethod from typing import Optional, Union import torch -class QModuleBase(torch.nn.Module): +class QModuleBase(ABC): """ Abstract class used to describe the weight creation and forward pass of different quantization schemes supported by Auto-Round. @@ -32,7 +32,7 @@ def __init__(self): @classmethod @abstractmethod - def from_original(cls, config, original_layer): + def from_original(cls, config, original_layer: torch.nn.Module): raise NotImplementedError @classmethod diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 38b815eb1..48a59f5e5 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -263,7 +263,6 @@ def save_quantized_as_autoround(output_dir, inplace=True, backend="auto_round:ex Raises: ValueError: If the backend is not supported. 
""" - # breakpoint() data_type = kwargs.get("data_type", None) if is_nv_fp(data_type) or is_mx_fp(data_type): ## detect nvfp & mxfp first from auto_round.export.export_to_autoround.export_to_fp import save_quantized_as_fp From bdf5f3e554da100b337f327257fa2308b90811f5 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:19:06 -0400 Subject: [PATCH 17/47] update Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 2 ++ auto_round/experimental/qmodules/fp8_static.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index c069f5151..2a74a470d 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -17,6 +17,8 @@ import torch +__all__ = ["QModuleBase"] + class QModuleBase(ABC): """ diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 074cf34e7..b5c7d2dd2 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -19,6 +19,8 @@ from auto_round.experimental.qmodules.base import QModuleBase +__all__ = ["WeightFP8ActFP8StaticQuantLinear"] + def _quant_tensor_to_fp8_with_scale(tensor: torch.Tensor, scale: torch.Tensor): FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max From ce3384f33ec861f00e4c704f032dc99b907c8536 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Wed, 27 Aug 2025 03:26:05 -0400 Subject: [PATCH 18/47] fix ut Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/base.py | 4 +- test/test_cpu/test_export.py | 48 ++++++++++++------------ 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/auto_round/experimental/qmodules/base.py b/auto_round/experimental/qmodules/base.py index 2a74a470d..8b7a9c138 100644 --- a/auto_round/experimental/qmodules/base.py +++ b/auto_round/experimental/qmodules/base.py @@ -20,9 +20,9 @@ __all__ = ["QModuleBase"] -class QModuleBase(ABC): +class QModuleBase(torch.nn.Module): """ - Abstract class used to describe the weight creation and forward pass + Base class used to describe the weight creation and forward pass of different quantization schemes supported by Auto-Round. 
The design is inspired by vLLM's CompressedTensorsScheme: https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py diff --git a/test/test_cpu/test_export.py b/test/test_cpu/test_export.py index 24498c780..d648fd721 100644 --- a/test/test_cpu/test_export.py +++ b/test/test_cpu/test_export.py @@ -230,31 +230,33 @@ def test_static_afp8_export(self, static_kv_dtype): self.assertIn("model.decoder.layers.8.self_attn.k_proj.weight_scale", f.keys()) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.input_scale").shape, torch.Size([1])) self.assertEqual(f.get_tensor("model.decoder.layers.5.self_attn.v_proj.weight").dtype, torch.float8_e4m3fn) - with torch.no_grad(): - import transformers - - model = transformers.AutoModelForCausalLM.from_pretrained( - quantized_model_path, - torch_dtype="auto", - low_cpu_mem_usage=True, - trust_remote_code=True, - ) - model.eval() - assert ( - model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__ == "WeightFP8ActFP8StaticQuantLinear" - ), f"Expected WeightFP8ActFP8StaticQuantLinear, got {model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__}" - tokenizer = transformers.AutoTokenizer.from_pretrained(quantized_model_path) - prompt = "AI is " - encode = tokenizer.encode(prompt, return_tensors="pt") + if static_kv_dtype is None: with torch.no_grad(): - output_tokens = model.generate( - encode, - max_length=10, + import transformers + + model = transformers.AutoModelForCausalLM.from_pretrained( + quantized_model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, ) - output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) - print(f"Prompt: {prompt}") - print(f"Output: {output}") - assert output is not None, "Output should not be None" + model.eval() + assert ( + model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__ + == "WeightFP8ActFP8StaticQuantLinear" + ), f"Expected WeightFP8ActFP8StaticQuantLinear, got {model.model.decoder.layers[0].self_attn.k_proj.__class__.__name__}" + tokenizer = transformers.AutoTokenizer.from_pretrained(quantized_model_path) + prompt = "AI is " + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate( + encode, + max_length=10, + ) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + print(f"Prompt: {prompt}") + print(f"Output: {output}") + assert output is not None, "Output should not be None" if static_kv_dtype == "fp8": self.assertIn("model.decoder.layers.8.self_attn.k_scale", f.keys()) From 22d11de19ce77a04b29f28c5c19e6639a7130298 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 01:04:39 -0400 Subject: [PATCH 19/47] correct Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/fp8_static.py | 13 +++++-------- auto_round/utils.py | 14 +++++++++++--- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index b5c7d2dd2..90ee09357 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -18,6 +18,7 @@ import torch from auto_round.experimental.qmodules.base import QModuleBase +from auto_round.utils import logger __all__ = ["WeightFP8ActFP8StaticQuantLinear"] @@ -41,7 +42,6 @@ def __init__( weight: Optional[torch.Tensor] = None, weight_scale: Optional[torch.Tensor] = None, bias: Union[torch.Tensor, bool, 
None] = None, - weight_zp: Optional[torch.Tensor] = None, input_scale: Optional[torch.Tensor] = None, dtype=torch.bfloat16, ): @@ -57,14 +57,10 @@ def __init__( self.bias = torch.nn.Parameter(bias, requires_grad=False) else: self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale + init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale self.register_buffer("weight_scale", init_weight_scale.to(dtype)) - init_weight_zp = torch.zeros((out_features, 1), dtype=dtype) if weight_zp is None else weight_zp - if weight_zp: - self.register_buffer("weight_zp", init_weight_zp.to(dtype)) - - init_input_scale = torch.zeros((1,), dtype=dtype) if input_scale is None else input_scale + init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False @@ -73,7 +69,8 @@ def get_min_capability(cls) -> int: """ Get minimum device capability. """ - # FIXME: set to 0 for now, as fp8 kernels are not available yet + # TODO: correct that config once we add fp8 op support. + logger.warning_once("FP8 ops are not yet supported. Using capability 0.") return 0 def process_weights_after_loading(self, layer: torch.nn.Module): diff --git a/auto_round/utils.py b/auto_round/utils.py index c13556827..2fd78f7a0 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -108,9 +108,17 @@ def infer_bits_by_data_type(data_type: str): return None -@lru_cache(None) -def warning_once(self, msg: str): - self.warning(msg) +@lru_cache(maxsize=None) +def warning_once(self, msg, *args, **kwargs): + """ + Log a warning message only once per unique message/arguments combination. 
+ + Args: + msg: The warning message format string + *args: Variable positional arguments for message formatting + **kwargs: Variable keyword arguments for message formatting and logging options + """ + self.warning(msg, *args, **kwargs) class AutoRoundFormatter(logging.Formatter): From 90826139a8ddfb53a983ad2e87b2ef978fcbe3fb Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 01:05:28 -0400 Subject: [PATCH 20/47] clean Signed-off-by: yiliu30 --- auto_round/autoround.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 85ea75e60..2af8df95e 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -677,8 +677,7 @@ def _parse_format_to_list(self, format: str) -> list: if is_torch_fp8_static(self): format = format.replace("auto_round", f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}") formats[index] = format - # if is_torch_fp8_static(self): - # formats[index] = "auto_round:torch_fp8_static" + elif format == "llmcompressor": from auto_round.export.export_to_llmcompressor import check_compressed_tensors_supported From 2202856fabc8abe2f8ad7a964899450621fbd598 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 03:34:11 -0400 Subject: [PATCH 21/47] fix shape Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/fp8_static.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index 90ee09357..a6798f53d 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -57,10 +57,10 @@ def __init__( self.bias = torch.nn.Parameter(bias, requires_grad=False) else: self.register_parameter("bias", None) - init_weight_scale = torch.empty((out_features, 1), dtype=dtype) if weight_scale is None else weight_scale + init_weight_scale = torch.empty((out_features), dtype=dtype) if weight_scale is None else weight_scale self.register_buffer("weight_scale", init_weight_scale.to(dtype)) - init_input_scale = torch.zeros((1, 1), dtype=dtype) if input_scale is None else input_scale + init_input_scale = torch.zeros((1), dtype=dtype) if input_scale is None else input_scale self.register_buffer("input_scale", init_input_scale.to(dtype)) self.pre_dequantized = False From d0b99a8f1c493d8484e10871b3a533705c8f1401 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 20:59:33 -0400 Subject: [PATCH 22/47] fix check Signed-off-by: yiliu30 --- auto_round/autoround.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index 934486c5a..6ef3884a9 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -687,7 +687,7 @@ def _parse_format_to_list(self, format: str) -> list: format = "auto_round:auto_awq" elif is_nv_fp(self.data_type) or is_mx_fp(self.data_type): format = f"auto_round:{self.data_type}" - elif is_wfp8afp8(self): # staic wfp8afp8 + elif is_static_wfp8afp8(self): # staic wfp8afp8 format = f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}" elif self.data_type == "fp" and self.bits == 8 and self.act_bits >= 16: # woq fp8 format = "auto_round:fp8" From 31845d0d025db8b24e4676192a5b998c56188c8e Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 28 Aug 2025 21:02:34 -0400 Subject: [PATCH 23/47] clean code Signed-off-by: yiliu30 --- auto_round/autoround.py | 2 -- auto_round/utils.py | 15 --------------- 2 files changed, 17 deletions(-) diff --git 
a/auto_round/autoround.py b/auto_round/autoround.py index 6ef3884a9..49e3984a7 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -73,9 +73,7 @@ is_nv_fp, is_optimum_habana_available, is_standard_fp, - is_static_afp8, is_static_wfp8afp8, - is_torch_fp8_static, is_wfp8afp8, llm_load_model, logger, diff --git a/auto_round/utils.py b/auto_round/utils.py index 9886a5337..21363688b 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2527,10 +2527,6 @@ def is_nv_fp(backend): return BackendDataType.NV_FP in backend -def is_static_afp8(ar): - return not ar.act_dynamic and "fp8" in ar.act_data_type - - def _is_weight_fp8_activation_static_fp8(bit, group_size, sym, data_type, act_dynamic): return bit == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic @@ -2546,17 +2542,6 @@ def is_weight_fp8_activation_static_fp8(config): return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) -def is_torch_fp8_static(ar): - bits, group_size, sym, data_type, act_dynamic = ( - ar.bits, - ar.group_size, - ar.sym, - ar.data_type, - ar.act_dynamic, - ) - return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) - - def is_wfp8afp8(ar): if ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8)) and ( "fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8) From 1f2e6749d230e0948d6e2177d6ae1f48de93abc6 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 01:00:33 -0400 Subject: [PATCH 24/47] fix backend check Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/fp8_static.py | 4 ++-- auto_round/export/export_to_autoround/export.py | 4 +++- auto_round/utils.py | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index a6798f53d..a61148d82 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -93,8 +93,7 @@ def from_original(cls, config, original_layer): def dequant_weight_online(self): if self.pre_dequantized: return self.weight - fp8_weight = self.weight - qdq_weight = fp8_weight.to(self.dtype) * self.weight_scale.unsqueeze(1) + qdq_weight = self.weight.to(self.dtype) * self.weight_scale.unsqueeze(1) return qdq_weight def pre_dequantize(self): @@ -113,6 +112,7 @@ def qdq_input(self, bf16_input: torch.Tensor): @torch.no_grad() def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) qdq_weight = self.dequant_weight_online() out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index ee2d61587..2f0552c1b 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -151,8 +151,10 @@ def pack_layer(layer_name, model, backend): from auto_round.export.export_to_autoround.export_to_nvfp_mxfp import pack_layer return pack_layer(layer_name, model, backend) + # breakpoint() + from auto_round.autoround import AutoRoundFormat - if backend == "auto_round:fp8": + if backend == "auto_round:fp8" or backend == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}": from auto_round.export.export_to_autoround.export_to_fp8 import pack_layer return pack_layer(layer_name, model, backend) diff --git a/auto_round/utils.py b/auto_round/utils.py index 09dcb92b5..f8c878b43 100644 --- 
a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2554,8 +2554,8 @@ def is_nv_fp(backend): return BackendDataType.NV_FP in backend -def _is_weight_fp8_activation_static_fp8(bit, group_size, sym, data_type, act_dynamic): - return bit == 8 and group_size == -1 and sym and data_type == "fp8" and not act_dynamic +def _is_weight_fp8_activation_static_fp8(bit: int, group_size: int, sym: bool, data_type: str, act_dynamic: bool): + return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic def is_weight_fp8_activation_static_fp8(config): From 4cec318ffa1476f74822db40a58eeba7e4951e67 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 02:01:25 -0400 Subject: [PATCH 25/47] update config Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 37 +++++++++++++++++++-------- auto_round/schemes.py | 9 +++++-- 2 files changed, 33 insertions(+), 13 deletions(-) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index df8b52c07..8b569a654 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -30,6 +30,7 @@ is_weight_fp8_activation_static_fp8, process_requirement, ) +from auto_round.schemes import QuantizationScheme from auto_round.utils import ( SUPPORTED_LAYER_TYPES, check_start_with_block_name, @@ -240,7 +241,23 @@ def get_layer_config(model, quantization_config): group_size = quantization_config.group_size data_type = getattr(quantization_config, "data_type", "int") # Default to "int" if not specified sym = quantization_config.sym + + act_bits = getattr(quantization_config, "act_bits", None) + act_group_size = getattr(quantization_config, "act_group_size", False) + act_sym = getattr(quantization_config, "act_sym", None) + act_data_type = getattr(quantization_config, "act_data_type", None) act_dynamic = getattr(quantization_config, "act_dynamic", False) + + default_quant_scheme = QuantizationScheme( + bits=bits, + group_size=group_size, + data_type=data_type, + sym=sym, + act_bits=act_bits, + act_group_size=act_group_size, + act_sym=act_sym, + ) + # Determine the quantization block list quant_block_list = getattr(quantization_config, "quant_block_list", None) if quant_block_list is None: @@ -287,17 +304,15 @@ def get_layer_config(model, quantization_config): layer_names = list(set(layer_names).union(extra_config.keys())) # Construct final layer configuration - layer_configs = { - layer_name: { - "bits": extra_config.get(layer_name, {}).get("bits", bits), - "group_size": extra_config.get(layer_name, {}).get("group_size", group_size), - "data_type": extra_config.get(layer_name, {}).get("data_type", data_type), - "sym": extra_config.get(layer_name, {}).get("sym", sym), - "act_dynamic": extra_config.get(layer_name, {}).get("act_dynamic", act_dynamic), - "clip": extra_config.get(layer_name, {}).get("clip", False), - } - for layer_name in layer_names - } + layer_configs = {} + quant_scheme_attrs = QuantizationScheme.get_attributes() + for layer_name in layer_names: + layer_config = {} + layer_extra_config = extra_config.get(layer_name, {}) + for scheme_attr in quant_scheme_attrs: + layer_config[scheme_attr] = layer_extra_config.get(scheme_attr, getattr(default_quant_scheme, scheme_attr)) + layer_configs[layer_name] = layer_config + return layer_configs diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 496af179c..2908cd73b 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -13,8 +13,8 @@ # limitations under the License. 
import copy from copy import deepcopy -from dataclasses import dataclass -from typing import Optional +from dataclasses import dataclass, fields +from typing import List, Optional __all__ = ["QuantizationScheme", "preset_name_to_scheme"] @@ -32,11 +32,16 @@ class QuantizationScheme: act_dynamic: Optional[bool] = None super_bits: Optional[int] = None super_group_size: Optional[int] = None + clip: Optional[bool] = False @classmethod def from_dict(cls, config: dict): return cls(**config) + @classmethod + def get_attributes(cls: "QuantizationScheme") -> List[str]: + return [field.name for field in fields(cls)] + def preset_name_to_scheme(name: str) -> QuantizationScheme: """Get a QuantizationScheme instance from a preset scheme name.""" From 6b2962fde6472470f9ef8be845008567c16bd095 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 02:03:51 -0400 Subject: [PATCH 26/47] revert change Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index 8b569a654..4aec6338d 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -221,7 +221,6 @@ def get_layer_config(model, quantization_config): - group_size (int): Group size for weight quantization. - data_type (str, optional): Data type for quantization (default: "int"). - sym (bool): Whether to use symmetric quantization. - - act_dynamic (bool, optional): Whether to use dynamic activation quantization (default: False). - quant_block_list (list, optional): Predefined list of blocks to quantize. - to_quant_block_names (list or str, optional): Blocks to quantize (if quant_block_list is None). - extra_config (dict, optional): Per-layer overrides for quantization settings. @@ -234,7 +233,6 @@ def get_layer_config(model, quantization_config): - "group_size" (int): Group size for quantization. - "data_type" (str): Data type used for quantization. - "sym" (bool): Whether symmetric quantization is applied. - - "act_dynamic" (bool): Whether dynamic activation quantization is used. - "clip" (bool): Whether weight clipping is enabled. 
""" bits = quantization_config.bits From 638718e7ae859ae47430b9a10f4a050e96efafa8 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 02:06:36 -0400 Subject: [PATCH 27/47] fix Signed-off-by: yiliu30 --- auto_round/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/auto_round/utils.py b/auto_round/utils.py index f8c878b43..b5514ac38 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -24,7 +24,7 @@ from collections import UserDict from enum import Enum from functools import lru_cache -from typing import Any, Callable, Tuple, Union +from typing import Any, Callable, Dict, Tuple, Union import cpuinfo import torch @@ -2558,7 +2558,7 @@ def _is_weight_fp8_activation_static_fp8(bit: int, group_size: int, sym: bool, d return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic -def is_weight_fp8_activation_static_fp8(config): +def is_weight_fp8_activation_static_fp8(config: Dict): bits, group_size, sym, data_type, act_dynamic = ( config["bits"], config["group_size"], From 4df3e8f754b745b9fa4498ef01f954d064b21f62 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 02:25:09 -0400 Subject: [PATCH 28/47] fix Signed-off-by: yiliu30 --- auto_round/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/auto_round/utils.py b/auto_round/utils.py index b5514ac38..5410609cf 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2554,7 +2554,9 @@ def is_nv_fp(backend): return BackendDataType.NV_FP in backend -def _is_weight_fp8_activation_static_fp8(bit: int, group_size: int, sym: bool, data_type: str, act_dynamic: bool): +def _is_weight_fp8_activation_static_fp8( + bit: int, group_size: int, sym: bool, data_type: str, act_dynamic: bool +) -> bool: return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic From e01603ce2e6b8673d229ab039cf7ac4af3b1d690 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 02:40:35 -0400 Subject: [PATCH 29/47] update Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 00f0ce648..bacdac8df 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -185,7 +185,7 @@ def feature_multiply_checker_group_size( priority=0, feature_checks=[], alias=["auto_round", "torch"], - requirements=["auto-round>=0.6.1.dev0"], + requirements=["auto-round>0.6.0"], ) BackendInfos["auto_round:tritonv2_zp"] = BackendInfo( From 0cdf28b1507c7023dd17e178b445501bacd44434 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 03:41:02 -0400 Subject: [PATCH 30/47] propagate the config Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 20 +++++++++----------- auto_round/inference/convert_model.py | 4 +--- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index bacdac8df..77bb64784 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -356,9 +356,7 @@ def feature_multiply_checker_group_size( ) -def check_compatible( - backend_name, device, bits, group_size, sym, packing_format, in_features, out_features, check_requirements=True -): +def check_compatible(backend_name, device, config, packing_format, in_features, out_features, check_requirements=True): """Checks if the given configuration is compatible with the specified backend. 
Args: @@ -388,7 +386,7 @@ def check_compatible( - If the packing format does not match, it must be convertible. """ backend = BackendInfos[backend_name] - + bits, group_size, sym = config["bits"], config["group_size"], config["sym"] # Check if device is supported by the backend if device not in backend.device: return False @@ -685,7 +683,7 @@ def find_backend(target_backend: str, orig_backend: str = None): ) -def get_all_compatible_backend(device, backend, orig_backend, bits, group_size, sym, in_features, out_features): +def get_all_compatible_backend(device, backend, orig_backend, config, in_features, out_features): # Get packing format from the original backend packing_format = BackendInfos[orig_backend].packing_format @@ -693,16 +691,14 @@ def get_all_compatible_backend(device, backend, orig_backend, bits, group_size, compatible_backends = [ key for key in BackendInfos.keys() - if check_compatible( - key, device, bits, group_size, sym, packing_format, in_features, out_features, check_requirements=False - ) + if check_compatible(key, device, config, packing_format, in_features, out_features, check_requirements=False) ] # Return the first compatible backend or an empty list if none found return compatible_backends -def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_features, out_features): +def get_layer_backend(device, backend, orig_backend, config, in_features, out_features): """Selects the most suitable backend for the layer based on compatibility and priority. This function first checks if the specified backend supports the layer with the provided configuration. @@ -736,8 +732,10 @@ def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_f If the specified backend is not supported. If no compatible backend is found for the given layer configuration. """ + bits, group_size, sym = config["bits"], config["group_size"], config["sym"] # Check if the provided backend is in BackendInfos backend = find_backend(backend) + if backend not in BackendInfos.keys(): raise ValueError(f"Unsupported backend '{backend}'. 
Please set it to 'auto' to enable automatic selection.") @@ -746,13 +744,13 @@ def get_layer_backend(device, backend, orig_backend, bits, group_size, sym, in_f # Find and store other compatible backends supported_backends = [] for key in BackendInfos.keys(): - if check_compatible(key, device, bits, group_size, sym, packing_format, in_features, out_features): + if check_compatible(key, device, config, packing_format, in_features, out_features): supported_backends.append(key) # Raise an error if no compatible backends are found if len(supported_backends) == 0: supported_backends_need_package = get_all_compatible_backend( - device, backend, orig_backend, bits, group_size, sym, in_features, out_features + device, backend, orig_backend, config, in_features, out_features ) if len(supported_backends_need_package) > 0: diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index 4aec6338d..73d1385e9 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -407,9 +407,7 @@ def _get_layer_backend(target_device, target_backend, orig_backend, config, in_f target_device, target_backend, orig_backend, - config["bits"], - config["group_size"], - config["sym"], + config, in_features, out_features, ) From 27910da00fa5c0be064ad8c54de8b81bbb369d5f Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 03:48:06 -0400 Subject: [PATCH 31/47] pass config to checker Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 77bb64784..ca07e1972 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -73,15 +73,16 @@ class BackendInfo: requirements: Optional[List[str]] = None -def feature_multiply_checker(in_feature, out_feature, group_size, in_feature_multiplier, out_feature_multiplier=None): +def feature_multiply_checker(in_feature, out_feature, config, in_feature_multiplier, out_feature_multiplier=None): if out_feature_multiplier is None: out_feature_multiplier = in_feature_multiplier return in_feature % in_feature_multiplier == 0 and out_feature % out_feature_multiplier == 0 def feature_multiply_checker_group_size( - in_feature, out_feature, group_size, in_feature_multiplier, out_feature_multiplier=None + in_feature, out_feature, config, in_feature_multiplier, out_feature_multiplier=None ): + group_size = config["group_size"] if out_feature_multiplier is None: out_feature_multiplier = in_feature_multiplier return ( @@ -410,7 +411,7 @@ def check_compatible(backend_name, device, config, packing_format, in_features, return False for check in backend.feature_checks: - if not check(in_features, out_features, group_size): + if not check(in_features, out_features, config): return False if check_requirements and backend.requirements is not None: From d46acdb2b9ac61255f97a14cb1fd41dcc0356c45 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 04:10:24 -0400 Subject: [PATCH 32/47] add more check Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 19 ++++++++++++++++-- auto_round/inference/convert_model.py | 2 +- auto_round/schemes.py | 29 ++++++++++++++++++++++++++- 3 files changed, 46 insertions(+), 4 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index ca07e1972..0f822d7c2 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -19,6 +19,7 @@ from 
transformers.utils.versions import require_version import auto_round_extension.cuda.gptqmodel_marlin +from auto_round.schemes import QuantizationScheme from auto_round.utils import get_library_version, is_weight_fp8_activation_static_fp8, logger BackendInfos = {} @@ -105,6 +106,21 @@ def feature_multiply_checker_group_size( feature_multiply_checker_group_size, in_feature_multiplier=1, out_feature_multiplier=64 ) + +def torch_fp8_static_check( + in_feature: int, + out_feature: int, + config: QuantizationScheme, + in_feature_multiplier: Optional[int] = None, + out_feature_multiplier: Optional[int] = None, +): + if not is_weight_fp8_activation_static_fp8(config): + return False + from auto_round.schemes import FPW8_STATIC + + return config == FPW8_STATIC + + BackendInfos["auto_gptq:exllamav2"] = BackendInfo( device=["cuda"], sym=[True, False], @@ -184,7 +200,7 @@ def feature_multiply_checker_group_size( dtype=["float32", "float16", "bfloat16"], bits=[8], priority=0, - feature_checks=[], + feature_checks=[torch_fp8_static_check], alias=["auto_round", "torch"], requirements=["auto-round>0.6.0"], ) @@ -733,7 +749,6 @@ def get_layer_backend(device, backend, orig_backend, config, in_features, out_fe If the specified backend is not supported. If no compatible backend is found for the given layer configuration. """ - bits, group_size, sym = config["bits"], config["group_size"], config["sym"] # Check if the provided backend is in BackendInfos backend = find_backend(backend) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index 73d1385e9..e50bf71e8 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -309,7 +309,7 @@ def get_layer_config(model, quantization_config): layer_extra_config = extra_config.get(layer_name, {}) for scheme_attr in quant_scheme_attrs: layer_config[scheme_attr] = layer_extra_config.get(scheme_attr, getattr(default_quant_scheme, scheme_attr)) - layer_configs[layer_name] = layer_config + layer_configs[layer_name] = QuantizationScheme.from_dict(layer_config) return layer_configs diff --git a/auto_round/schemes.py b/auto_round/schemes.py index 2908cd73b..c7221a1fd 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -14,7 +14,7 @@ import copy from copy import deepcopy from dataclasses import dataclass, fields -from typing import List, Optional +from typing import Generator, List, Optional __all__ = ["QuantizationScheme", "preset_name_to_scheme"] @@ -42,6 +42,33 @@ def from_dict(cls, config: dict): def get_attributes(cls: "QuantizationScheme") -> List[str]: return [field.name for field in fields(cls)] + def __getitem__(self, key: str): + if key not in self.get_attributes(): + raise KeyError(f"{key} is not a valid attribute") + return getattr(self, key) + + def __setitem__(self, key: str, value: None | int | str): + if key not in self.get_attributes(): + raise KeyError(f"{key} is not a valid attribute") + setattr(self, key, value) + + def items(self): + return ((field, getattr(self, field)) for field in self.get_attributes()) + + def keys(self): + return self.get_attributes() + + def values(self): + return (getattr(self, field) for field in self.get_attributes()) + + def __eq__(self, other: "QuantizationScheme") -> bool: + if not isinstance(other, QuantizationScheme): + return False + for field in self.get_attributes(): + if getattr(self, field) != getattr(other, field): + return False + return True + def preset_name_to_scheme(name: str) -> QuantizationScheme: """Get a 
QuantizationScheme instance from a preset scheme name.""" From fd05799340e863813a3c4c75857fd99ec1b0eeb2 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 05:10:03 -0400 Subject: [PATCH 33/47] refine code Signed-off-by: yiliu30 --- auto_round/export/export_to_autoround/export.py | 3 +-- auto_round/inference/backend.py | 2 -- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index 2f0552c1b..89c9032d6 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -25,6 +25,7 @@ import transformers from tqdm import tqdm +from auto_round.autoround import AutoRoundFormat from auto_round.utils import ( SUPPORTED_FORMATS, SUPPORTED_LAYER_TYPES, @@ -151,8 +152,6 @@ def pack_layer(layer_name, model, backend): from auto_round.export.export_to_autoround.export_to_nvfp_mxfp import pack_layer return pack_layer(layer_name, model, backend) - # breakpoint() - from auto_round.autoround import AutoRoundFormat if backend == "auto_round:fp8" or backend == f"auto_round:{AutoRoundFormat.TORCH_FP8_STATIC.value}": from auto_round.export.export_to_autoround.export_to_fp8 import pack_layer diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 0f822d7c2..cb521bf82 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -114,8 +114,6 @@ def torch_fp8_static_check( in_feature_multiplier: Optional[int] = None, out_feature_multiplier: Optional[int] = None, ): - if not is_weight_fp8_activation_static_fp8(config): - return False from auto_round.schemes import FPW8_STATIC return config == FPW8_STATIC From 3d75c276ad234d6d15f61afb5c08be3ff0303c97 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 06:24:09 -0400 Subject: [PATCH 34/47] fix equal check Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 3 ++- auto_round/schemes.py | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index e50bf71e8..c2d4a6b6a 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -254,6 +254,8 @@ def get_layer_config(model, quantization_config): act_bits=act_bits, act_group_size=act_group_size, act_sym=act_sym, + act_data_type=act_data_type, + act_dynamic=act_dynamic, ) # Determine the quantization block list @@ -310,7 +312,6 @@ def get_layer_config(model, quantization_config): for scheme_attr in quant_scheme_attrs: layer_config[scheme_attr] = layer_extra_config.get(scheme_attr, getattr(default_quant_scheme, scheme_attr)) layer_configs[layer_name] = QuantizationScheme.from_dict(layer_config) - return layer_configs diff --git a/auto_round/schemes.py b/auto_round/schemes.py index c7221a1fd..a07cc220a 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -183,6 +183,7 @@ def is_preset_scheme(name: str) -> bool: "act_group_size": 0, "act_data_type": "fp", "act_dynamic": False, + "act_sym": True, } ) From e0c0d58e5d197c085368ff485cd787d95442551a Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 06:24:09 -0400 Subject: [PATCH 35/47] fix equal check Signed-off-by: yiliu30 --- auto_round/inference/convert_model.py | 3 ++- auto_round/schemes.py | 1 + auto_round/utils.py | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index e50bf71e8..c2d4a6b6a 
100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -254,6 +254,8 @@ def get_layer_config(model, quantization_config): act_bits=act_bits, act_group_size=act_group_size, act_sym=act_sym, + act_data_type=act_data_type, + act_dynamic=act_dynamic, ) # Determine the quantization block list @@ -310,7 +312,6 @@ def get_layer_config(model, quantization_config): for scheme_attr in quant_scheme_attrs: layer_config[scheme_attr] = layer_extra_config.get(scheme_attr, getattr(default_quant_scheme, scheme_attr)) layer_configs[layer_name] = QuantizationScheme.from_dict(layer_config) - return layer_configs diff --git a/auto_round/schemes.py b/auto_round/schemes.py index c7221a1fd..a07cc220a 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -183,6 +183,7 @@ def is_preset_scheme(name: str) -> bool: "act_group_size": 0, "act_data_type": "fp", "act_dynamic": False, + "act_sym": True, } ) diff --git a/auto_round/utils.py b/auto_round/utils.py index 5410609cf..4c42ec334 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -33,6 +33,7 @@ from torch.amp import autocast from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES, GGUF_CONFIG, GGUF_INNER_CONFIG, QK_K, ModelType +from auto_round.schemes import QuantizationScheme SHARED_CACHE_KEYS = ("position_ids", "cache_position", "position_embeddings") @@ -524,7 +525,7 @@ def check_to_quantized(config): bool: True if the configuration is valid for quantization (bits <= 8), False otherwise. """ - if isinstance(config, dict): + if isinstance(config, (dict, QuantizationScheme)): bits = int(config.get("bits", 16)) act_bits = int(config.get("act_bits", 16)) elif hasattr(config, "orig_layer"): From fa3ec2dd5048da6522e7a0dffe05716c9f6a5eba Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 08:55:43 -0400 Subject: [PATCH 36/47] fix get Signed-off-by: yiliu30 --- auto_round/schemes.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/auto_round/schemes.py b/auto_round/schemes.py index a07cc220a..007828290 100644 --- a/auto_round/schemes.py +++ b/auto_round/schemes.py @@ -61,6 +61,15 @@ def keys(self): def values(self): return (getattr(self, field) for field in self.get_attributes()) + def get(self, key: str, default=None): + if key not in self.get_attributes(): + return default + res = getattr(self, key) + # In case the attribute is explicitly set to None, return default + if res is None: + return default + return getattr(self, key) + def __eq__(self, other: "QuantizationScheme") -> bool: if not isinstance(other, QuantizationScheme): return False From ad5269e011310dd3ad28c10d373161124bca0cee Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 22:11:48 -0400 Subject: [PATCH 37/47] rename Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index cb521bf82..0f224d740 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -98,16 +98,16 @@ def feature_multiply_checker_group_size( feature_multiply_checker, in_feature_multiplier=32, out_feature_multiplier=32 ) -exllamav2_feature_check = functools.partial( +exllamav2_feature_checker = functools.partial( feature_multiply_checker_group_size, in_feature_multiplier=32, out_feature_multiplier=32 ) -gptqmodel_marlin_feature_check = functools.partial( +gptqmodel_marlin_feature_checker = functools.partial( 
feature_multiply_checker_group_size, in_feature_multiplier=1, out_feature_multiplier=64 ) -def torch_fp8_static_check( +def torch_fp8_static_checker( in_feature: int, out_feature: int, config: QuantizationScheme, @@ -128,7 +128,7 @@ def torch_fp8_static_check( dtype=["float16"], ##16, 384,768 accuracy issue group_size=[-1, 32, 64, 128, 256, 512, 1024, 2048], - feature_checks=[exllamav2_feature_check], + feature_checks=[exllamav2_feature_checker], alias=["gptq", "auto_gptq", "exllamav2", "gptq:exllamav2", "auto_gptq:exllamav2"], requirements=["torch<2.6.0", "auto-gptq>=0.7.1"], ) @@ -141,7 +141,7 @@ def torch_fp8_static_check( group_size=None, dtype=["float16"], priority=0, - feature_checks=[exllamav2_feature_check], + feature_checks=[exllamav2_feature_checker], alias=["auto_gptq:tritonv2"], requirements=["torch<2.6.0", "auto-gptq>=0.7.1", "triton>=2.0"], ) @@ -153,7 +153,7 @@ def torch_fp8_static_check( bits=[2, 3, 4, 8], group_size=None, priority=1, - feature_checks=[exllamav2_feature_check], + feature_checks=[exllamav2_feature_checker], alias=["auto_gptq:cuda"], dtype=["float16"], convertable_format=["int32_zp"], @@ -182,7 +182,7 @@ def torch_fp8_static_check( dtype=["float16", "bfloat16"], bits=[2, 3, 4, 8], priority=0, - feature_checks=[exllamav2_feature_check], + feature_checks=[exllamav2_feature_checker], alias=["auto_round", "torch"], requirements=["auto-round>=0.5.1"], ) @@ -198,7 +198,7 @@ def torch_fp8_static_check( dtype=["float32", "float16", "bfloat16"], bits=[8], priority=0, - feature_checks=[torch_fp8_static_check], + feature_checks=[torch_fp8_static_checker], alias=["auto_round", "torch"], requirements=["auto-round>0.6.0"], ) @@ -223,7 +223,7 @@ def torch_fp8_static_check( dtype=["float16", "bfloat16"], bits=[2, 3, 4, 8], priority=0, - feature_checks=[exllamav2_feature_check], + feature_checks=[exllamav2_feature_checker], alias=["torch", "torch_zp"], requirements=["auto-round>=0.5.1"], ) @@ -236,7 +236,7 @@ def torch_fp8_static_check( group_size=[-1, 32, 64, 128], dtype=["float16", "bfloat16"], priority=6, - feature_checks=[gptqmodel_marlin_feature_check], + feature_checks=[gptqmodel_marlin_feature_checker], alias=["marlin", "gptqmodel"], requirements=["gptqmodel>=2.0"], ) @@ -249,7 +249,7 @@ def torch_fp8_static_check( group_size=[-1, 32, 64, 128], dtype=["float16", "bfloat16"], priority=6, - feature_checks=[gptqmodel_marlin_feature_check], + feature_checks=[gptqmodel_marlin_feature_checker], alias=["marlin", "gptqmodel"], requirements=["gptqmodel>=2.0"], ) @@ -262,7 +262,7 @@ def torch_fp8_static_check( group_size=[-1, 32, 64, 128], ##16 seems has accuracy issue dtype=["float16", "bfloat16"], priority=5, - feature_checks=[exllamav2_feature_check], + feature_checks=[exllamav2_feature_checker], alias=["exllamav2"], requirements=["gptqmodel>=2.0"], ) From 35e45ed0c314682d9530ea30488519a781915f04 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 22:22:34 -0400 Subject: [PATCH 38/47] update check Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 6 ++++-- auto_round/inference/convert_model.py | 4 ++-- auto_round/utils.py | 11 ----------- 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 0f224d740..cb054aa72 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -19,8 +19,9 @@ from transformers.utils.versions import require_version import auto_round_extension.cuda.gptqmodel_marlin +from auto_round.autoround import AutoRoundFormat from 
auto_round.schemes import QuantizationScheme -from auto_round.utils import get_library_version, is_weight_fp8_activation_static_fp8, logger +from auto_round.utils import get_library_version, logger BackendInfos = {} @@ -469,7 +470,8 @@ def dynamic_import_inference_linear(backend, config): """ bits, group_size, sym = config["bits"], config["group_size"], config["sym"] - if is_weight_fp8_activation_static_fp8(config): + if AutoRoundFormat.TORCH_FP8_STATIC.value in backend: + logger.warning_once("FP8 static quantization is still experimental.") from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear return WeightFP8ActFP8StaticQuantLinear diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py index c2d4a6b6a..46b0bf2ab 100644 --- a/auto_round/inference/convert_model.py +++ b/auto_round/inference/convert_model.py @@ -21,13 +21,13 @@ from tqdm import tqdm from transformers.pytorch_utils import Conv1D +from auto_round.autoround import AutoRoundFormat from auto_round.inference.backend import ( BackendInfos, dynamic_import_inference_linear, find_backend, get_highest_priority_backend, get_layer_backend, - is_weight_fp8_activation_static_fp8, process_requirement, ) from auto_round.schemes import QuantizationScheme @@ -452,7 +452,7 @@ def _create_quant_layer(layer, layer_backend, config, in_features, out_features) out_features=out_features, bias=bias, ) - elif is_weight_fp8_activation_static_fp8(config): + elif AutoRoundFormat.TORCH_FP8_STATIC.value in layer_backend: return QuantLinear.from_original(config, layer) # Default quantized layer creation try: diff --git a/auto_round/utils.py b/auto_round/utils.py index 4c42ec334..077127fe8 100644 --- a/auto_round/utils.py +++ b/auto_round/utils.py @@ -2561,17 +2561,6 @@ def _is_weight_fp8_activation_static_fp8( return bit == 8 and group_size == -1 and sym and data_type == "fp" and not act_dynamic -def is_weight_fp8_activation_static_fp8(config: Dict): - bits, group_size, sym, data_type, act_dynamic = ( - config["bits"], - config["group_size"], - config["sym"], - config["data_type"], - config["act_dynamic"], - ) - return _is_weight_fp8_activation_static_fp8(bits, group_size, sym, data_type, act_dynamic) - - def is_wfp8afp8(ar): if ("fp8" in ar.act_data_type or ("fp" in ar.act_data_type and ar.act_bits == 8)) and ( "fp8" in ar.data_type or ("fp" in ar.data_type and ar.bits == 8) From f4e254ba7d064dd9893c08078ef9074e0f192715 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Thu, 4 Sep 2025 22:27:23 -0400 Subject: [PATCH 39/47] add warning Signed-off-by: yiliu30 --- auto_round/experimental/qmodules/fp8_static.py | 7 +++++-- auto_round/inference/backend.py | 1 - 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/auto_round/experimental/qmodules/fp8_static.py b/auto_round/experimental/qmodules/fp8_static.py index a61148d82..e7c55086d 100644 --- a/auto_round/experimental/qmodules/fp8_static.py +++ b/auto_round/experimental/qmodules/fp8_static.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from abc import abstractmethod + from typing import Optional, Union import torch @@ -79,8 +79,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module): @classmethod def from_original(cls, config, original_layer): """ - Create an WeightFP8ActFP8StaticQuantLinear layer from an original linear layer. + Create an `WeightFP8ActFP8StaticQuantLinear` layer from an original linear layer. 
""" + logger.warning_once( + "FP8 static quantization is still in experimental stage, the inference speed might be slow." + ) device = original_layer.weight.device with torch.device(device): qdq_linear = cls( diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index cb054aa72..0006b1061 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -471,7 +471,6 @@ def dynamic_import_inference_linear(backend, config): bits, group_size, sym = config["bits"], config["group_size"], config["sym"] if AutoRoundFormat.TORCH_FP8_STATIC.value in backend: - logger.warning_once("FP8 static quantization is still experimental.") from auto_round.experimental.qmodules.fp8_static import WeightFP8ActFP8StaticQuantLinear return WeightFP8ActFP8StaticQuantLinear From ff5a1e99b1950146fbdff33bb8a080fa29b0dbe3 Mon Sep 17 00:00:00 2001 From: yiliu30 Date: Fri, 5 Sep 2025 03:22:02 -0400 Subject: [PATCH 40/47] rename check Signed-off-by: yiliu30 --- auto_round/inference/backend.py | 38 ++++++++++++++++----------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py index 0006b1061..3b14da330 100644 --- a/auto_round/inference/backend.py +++ b/auto_round/inference/backend.py @@ -55,7 +55,7 @@ class BackendInfo: indicate higher priority. Defaults to 0. convertable_format: A list of strings specifying the formats that the backend can convert from. Defaults to an empty list. - feature_checks: A list of feature check functions (e.g., validation methods) + check_list: A list of feature check functions (e.g., validation methods) used to verify whether the backend supports certain features. Defaults to an empty list. alias: An optional list of strings representing alternative names for the @@ -70,7 +70,7 @@ class BackendInfo: group_size: Optional[List[int]] = None priority: int = 0 ##higher is better convertable_format: List[str] = field(default_factory=list) - feature_checks: List[Any] = field(default_factory=list) + check_list: List[Any] = field(default_factory=list) alias: Optional[List[str]] = None requirements: Optional[List[str]] = None @@ -129,7 +129,7 @@ def torch_fp8_static_checker( dtype=["float16"], ##16, 384,768 accuracy issue group_size=[-1, 32, 64, 128, 256, 512, 1024, 2048], - feature_checks=[exllamav2_feature_checker], + check_list=[exllamav2_feature_checker], alias=["gptq", "auto_gptq", "exllamav2", "gptq:exllamav2", "auto_gptq:exllamav2"], requirements=["torch<2.6.0", "auto-gptq>=0.7.1"], ) @@ -142,7 +142,7 @@ def torch_fp8_static_checker( group_size=None, dtype=["float16"], priority=0, - feature_checks=[exllamav2_feature_checker], + check_list=[exllamav2_feature_checker], alias=["auto_gptq:tritonv2"], requirements=["torch<2.6.0", "auto-gptq>=0.7.1", "triton>=2.0"], ) @@ -154,7 +154,7 @@ def torch_fp8_static_checker( bits=[2, 3, 4, 8], group_size=None, priority=1, - feature_checks=[exllamav2_feature_checker], + check_list=[exllamav2_feature_checker], alias=["auto_gptq:cuda"], dtype=["float16"], convertable_format=["int32_zp"], @@ -171,7 +171,7 @@ def torch_fp8_static_checker( dtype=["float16", "bfloat16"], bits=[2, 4, 8], priority=2, - feature_checks=[feature_multiply_checker_32], + check_list=[feature_multiply_checker_32], alias=["auto_round", "tritonv2", "triton"], requirements=["triton>=2.0", "auto-round>=0.5.0"], ) @@ -183,7 +183,7 @@ def torch_fp8_static_checker( dtype=["float16", "bfloat16"], bits=[2, 3, 4, 8], priority=0, - feature_checks=[exllamav2_feature_checker], + 
+    check_list=[exllamav2_feature_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>=0.5.1"],
 )
@@ -199,7 +199,7 @@ def torch_fp8_static_checker(
     dtype=["float32", "float16", "bfloat16"],
     bits=[8],
     priority=0,
-    feature_checks=[torch_fp8_static_checker],
+    check_list=[torch_fp8_static_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.6.0"],
 )
@@ -212,7 +212,7 @@ def torch_fp8_static_checker(
     dtype=["float16", "bfloat16"],
     bits=[2, 4, 8],
     priority=2,
-    feature_checks=[feature_multiply_checker_32],
+    check_list=[feature_multiply_checker_32],
     alias=["tritonv2", "tritonv2_zp", "triton"],
     requirements=["triton>=2.0", "auto-round>=0.5.0"],
 )
@@ -224,7 +224,7 @@ def torch_fp8_static_checker(
     dtype=["float16", "bfloat16"],
     bits=[2, 3, 4, 8],
     priority=0,
-    feature_checks=[exllamav2_feature_checker],
+    check_list=[exllamav2_feature_checker],
     alias=["torch", "torch_zp"],
     requirements=["auto-round>=0.5.1"],
 )
@@ -237,7 +237,7 @@ def torch_fp8_static_checker(
     group_size=[-1, 32, 64, 128],
     dtype=["float16", "bfloat16"],
     priority=6,
-    feature_checks=[gptqmodel_marlin_feature_checker],
+    check_list=[gptqmodel_marlin_feature_checker],
     alias=["marlin", "gptqmodel"],
     requirements=["gptqmodel>=2.0"],
 )
@@ -250,7 +250,7 @@ def torch_fp8_static_checker(
     group_size=[-1, 32, 64, 128],
     dtype=["float16", "bfloat16"],
     priority=6,
-    feature_checks=[gptqmodel_marlin_feature_checker],
+    check_list=[gptqmodel_marlin_feature_checker],
     alias=["marlin", "gptqmodel"],
     requirements=["gptqmodel>=2.0"],
 )
@@ -263,7 +263,7 @@ def torch_fp8_static_checker(
     group_size=[-1, 32, 64, 128],  ##16 seems has accuracy issue
     dtype=["float16", "bfloat16"],
     priority=5,
-    feature_checks=[exllamav2_feature_checker],
+    check_list=[exllamav2_feature_checker],
     alias=["exllamav2"],
     requirements=["gptqmodel>=2.0"],
 )
@@ -287,7 +287,7 @@ def torch_fp8_static_checker(
     bits=[2, 4, 8],
     group_size=None,
     priority=1,
-    feature_checks=[],
+    check_list=[],
     alias=["itrex", "qbits"],
     dtype=["float16", "bfloat16"],
     convertable_format=["int32"],
@@ -302,7 +302,7 @@ def torch_fp8_static_checker(
     group_size=None,
     dtype=["float16", "bfloat16"],
     priority=1,
-    feature_checks=[],
+    check_list=[],
     alias=["itrex", "qbits"],
     convertable_format=["int32_zp"],
     requirements=["torch<2.7.0", "intel-extension-for-transformers"],
@@ -316,7 +316,7 @@ def torch_fp8_static_checker(
     group_size=None,
     dtype=["float16", "bfloat16"],
     priority=1,
-    feature_checks=[],
+    check_list=[],
     alias=["itrex", "qbits"],
     requirements=["torch<2.7.0", "intel-extension-for-transformers"],
 )
@@ -328,7 +328,7 @@ def torch_fp8_static_checker(
     bits=[4],
     group_size=None,
     priority=5,
-    feature_checks=[],
+    check_list=[],
     dtype=["float16", "bfloat16"],
     convertable_format=["int32_zp"],
     alias=["ipex"],
@@ -343,7 +343,7 @@ def torch_fp8_static_checker(
     group_size=None,
     priority=1,
     dtype=["float16", "bfloat16"],
-    feature_checks=[],
+    check_list=[],
     alias=["ipex"],
     convertable_format=["awq"],
     requirements=["intel-extension-for-pytorch>=2.6"],
@@ -425,7 +425,7 @@ def check_compatible(backend_name, device, config, packing_format, in_features,
     else:
         return False
 
-    for check in backend.feature_checks:
+    for check in backend.check_list:
         if not check(in_features, out_features, config):
             return False
 

From 50968fd0acea6e2db1057e8db22478a9a24f8fcc Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Fri, 5 Sep 2025 03:36:15 -0400
Subject: [PATCH 41/47] rename

Signed-off-by: yiliu30
---
 auto_round/inference/backend.py | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index 3b14da330..9d3aa58c5 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -55,7 +55,7 @@ class BackendInfo:
             indicate higher priority. Defaults to 0.
         convertable_format: A list of strings specifying the formats that the backend
             can convert from. Defaults to an empty list.
-        check_list: A list of feature check functions (e.g., validation methods)
+        checkers: A list of check functions (e.g., validation methods)
            used to verify whether the backend supports certain features.
            Defaults to an empty list.
         alias: An optional list of strings representing alternative names for the
@@ -70,7 +70,7 @@ class BackendInfo:
     group_size: Optional[List[int]] = None
     priority: int = 0  ##higher is better
     convertable_format: List[str] = field(default_factory=list)
-    check_list: List[Any] = field(default_factory=list)
+    checkers: List[Any] = field(default_factory=list)
     alias: Optional[List[str]] = None
     requirements: Optional[List[str]] = None
 
@@ -129,7 +129,7 @@ def torch_fp8_static_checker(
     dtype=["float16"],
     ##16, 384,768 accuracy issue
     group_size=[-1, 32, 64, 128, 256, 512, 1024, 2048],
-    check_list=[exllamav2_feature_checker],
+    checkers=[exllamav2_feature_checker],
     alias=["gptq", "auto_gptq", "exllamav2", "gptq:exllamav2", "auto_gptq:exllamav2"],
     requirements=["torch<2.6.0", "auto-gptq>=0.7.1"],
 )
@@ -142,7 +142,7 @@ def torch_fp8_static_checker(
     group_size=None,
     dtype=["float16"],
     priority=0,
-    check_list=[exllamav2_feature_checker],
+    checkers=[exllamav2_feature_checker],
     alias=["auto_gptq:tritonv2"],
     requirements=["torch<2.6.0", "auto-gptq>=0.7.1", "triton>=2.0"],
 )
@@ -154,7 +154,7 @@ def torch_fp8_static_checker(
     bits=[2, 3, 4, 8],
     group_size=None,
     priority=1,
-    check_list=[exllamav2_feature_checker],
+    checkers=[exllamav2_feature_checker],
     alias=["auto_gptq:cuda"],
     dtype=["float16"],
     convertable_format=["int32_zp"],
@@ -171,7 +171,7 @@ def torch_fp8_static_checker(
     dtype=["float16", "bfloat16"],
     bits=[2, 4, 8],
     priority=2,
-    check_list=[feature_multiply_checker_32],
+    checkers=[feature_multiply_checker_32],
     alias=["auto_round", "tritonv2", "triton"],
     requirements=["triton>=2.0", "auto-round>=0.5.0"],
 )
@@ -183,7 +183,7 @@ def torch_fp8_static_checker(
     dtype=["float16", "bfloat16"],
     bits=[2, 3, 4, 8],
     priority=0,
-    check_list=[exllamav2_feature_checker],
+    checkers=[exllamav2_feature_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>=0.5.1"],
 )
@@ -199,7 +199,7 @@ def torch_fp8_static_checker(
     dtype=["float32", "float16", "bfloat16"],
     bits=[8],
     priority=0,
-    check_list=[torch_fp8_static_checker],
+    checkers=[torch_fp8_static_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.6.0"],
 )
@@ -212,7 +212,7 @@ def torch_fp8_static_checker(
     dtype=["float16", "bfloat16"],
     bits=[2, 4, 8],
     priority=2,
-    check_list=[feature_multiply_checker_32],
+    checkers=[feature_multiply_checker_32],
     alias=["tritonv2", "tritonv2_zp", "triton"],
     requirements=["triton>=2.0", "auto-round>=0.5.0"],
 )
@@ -224,7 +224,7 @@ def torch_fp8_static_checker(
     dtype=["float16", "bfloat16"],
     bits=[2, 3, 4, 8],
     priority=0,
-    check_list=[exllamav2_feature_checker],
+    checkers=[exllamav2_feature_checker],
     alias=["torch", "torch_zp"],
     requirements=["auto-round>=0.5.1"],
 )
@@ -237,7 +237,7 @@ def torch_fp8_static_checker(
     group_size=[-1, 32, 64, 128],
     dtype=["float16", "bfloat16"],
     priority=6,
-    check_list=[gptqmodel_marlin_feature_checker],
+    checkers=[gptqmodel_marlin_feature_checker],
     alias=["marlin", "gptqmodel"],
     requirements=["gptqmodel>=2.0"],
 )
@@ -250,7 +250,7 @@ def torch_fp8_static_checker(
     group_size=[-1, 32, 64, 128],
     dtype=["float16", "bfloat16"],
     priority=6,
-    check_list=[gptqmodel_marlin_feature_checker],
+    checkers=[gptqmodel_marlin_feature_checker],
     alias=["marlin", "gptqmodel"],
     requirements=["gptqmodel>=2.0"],
 )
@@ -263,7 +263,7 @@ def torch_fp8_static_checker(
     group_size=[-1, 32, 64, 128],  ##16 seems has accuracy issue
     dtype=["float16", "bfloat16"],
     priority=5,
-    check_list=[exllamav2_feature_checker],
+    checkers=[exllamav2_feature_checker],
     alias=["exllamav2"],
     requirements=["gptqmodel>=2.0"],
 )
@@ -287,7 +287,7 @@ def torch_fp8_static_checker(
     bits=[2, 4, 8],
     group_size=None,
     priority=1,
-    check_list=[],
+    checkers=[],
     alias=["itrex", "qbits"],
     dtype=["float16", "bfloat16"],
     convertable_format=["int32"],
@@ -302,7 +302,7 @@ def torch_fp8_static_checker(
     group_size=None,
     dtype=["float16", "bfloat16"],
     priority=1,
-    check_list=[],
+    checkers=[],
     alias=["itrex", "qbits"],
     convertable_format=["int32_zp"],
     requirements=["torch<2.7.0", "intel-extension-for-transformers"],
@@ -316,7 +316,7 @@ def torch_fp8_static_checker(
     group_size=None,
     dtype=["float16", "bfloat16"],
     priority=1,
-    check_list=[],
+    checkers=[],
     alias=["itrex", "qbits"],
     requirements=["torch<2.7.0", "intel-extension-for-transformers"],
 )
@@ -328,7 +328,7 @@ def torch_fp8_static_checker(
     bits=[4],
     group_size=None,
     priority=5,
-    check_list=[],
+    checkers=[],
     dtype=["float16", "bfloat16"],
     convertable_format=["int32_zp"],
     alias=["ipex"],
@@ -343,7 +343,7 @@ def torch_fp8_static_checker(
     group_size=None,
     priority=1,
     dtype=["float16", "bfloat16"],
-    check_list=[],
+    checkers=[],
     alias=["ipex"],
     convertable_format=["awq"],
     requirements=["intel-extension-for-pytorch>=2.6"],
@@ -425,7 +425,7 @@ def check_compatible(backend_name, device, config, packing_format, in_features,
     else:
         return False
 
-    for check in backend.check_list:
+    for check in backend.checkers:
         if not check(in_features, out_features, config):
             return False
 

From abd83acf652b6e1d176110472f636cf3142b9654 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 6 Sep 2025 06:59:06 +0000
Subject: [PATCH 42/47] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/export/export_to_autoround/export.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index c66f109da..472d18e06 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -25,7 +25,6 @@
 import transformers
 from tqdm import tqdm
 
-from auto_round.autoround import AutoRoundFormat
 from auto_round.export.export_to_autoround.utils import REQUIRED_CONFIG_KEYS, check_neq_config
 from auto_round.utils import (

From d332a957db8e2ace5cd11efe67a9b5ab3ec83966 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 8 Sep 2025 08:30:11 +0000
Subject: [PATCH 43/47] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/autoround.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 3cacaf4a4..7cb77acfe 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -18,8 +18,8 @@
 import sys
 import time
 import traceback
-from enum import Enum
 from dataclasses import asdict, fields
+from enum import Enum
 from typing import Any, Callable, Union
 
 import accelerate

From 8a4a533451ed51d7554a5853a87e19006e7256c9 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 8 Sep 2025 12:36:53 +0000
Subject: [PATCH 44/47] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 auto_round/utils.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/auto_round/utils.py b/auto_round/utils.py
index 24850f860..e91b37918 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -24,11 +24,8 @@
 from collections import UserDict
 from enum import Enum
 from functools import lru_cache
-
-from typing import Any, Callable, Dict, Tuple, Union
-
 from pathlib import Path
-
+from typing import Any, Callable, Dict, Tuple, Union
 
 import cpuinfo
 import torch

From f05e38ba4b4af0c17fb81e8daeac6aefd5ebaa70 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Mon, 8 Sep 2025 22:12:21 -0400
Subject: [PATCH 45/47] fix

Signed-off-by: yiliu30
---
 auto_round/inference/backend.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index 9d3aa58c5..cedcbbd64 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -115,9 +115,9 @@ def torch_fp8_static_checker(
     in_feature: int,
     out_feature: int,
     config: QuantizationScheme,
     in_feature_multiplier: Optional[int] = None,
     out_feature_multiplier: Optional[int] = None,
 ):
-    from auto_round.schemes import FPW8_STATIC
+    from auto_round.schemes import FP8_STATIC
 
-    return config == FPW8_STATIC
+    return config == FP8_STATIC

From c58a61c6176bd03a49d16ace0a0bf2089fdf838a Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Tue, 9 Sep 2025 07:25:20 -0400
Subject: [PATCH 46/47] update

Signed-off-by: yiliu30
---
 auto_round/autoround.py                           | 7 +------
 auto_round/export/export_to_autoround/__init__.py | 2 +-
 auto_round/export/export_to_autoround/export.py   | 8 +++++++-
 auto_round/inference/backend.py                   | 2 +-
 auto_round/inference/convert_model.py             | 2 +-
 auto_round/utils.py                               | 2 +-
 6 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index b67d729af..ef01063b4 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -31,6 +31,7 @@
 
 from auto_round.data_type import QUANT_FUNC_WITH_DTYPE
 from auto_round.data_type.utils import reshape_pad_tensor_by_group_size
+from auto_round.export.export_to_autoround import AutoRoundFormat
 from auto_round.export.export_to_gguf.config import GGUF_CONFIG, GGUF_INNER_CONFIG, ModelType
 from auto_round.low_cpu_mem.utils import get_layers_before_block
 from auto_round.schemes import QuantizationScheme, preset_name_to_scheme
@@ -97,12 +98,6 @@
 from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block
 
 
-class AutoRoundFormat(str, Enum):
-    # Weight: FP8, per-channel, may be extended to per-tensor in future
-    # Activation: FP8, per-tensor
-    TORCH_FP8_STATIC = "torch_fp8_static"
-
-
 class AutoRound(object):
     """Automatic weight rounding (Signed Gradient Descent) for LLM quantization
 
diff --git a/auto_round/export/export_to_autoround/__init__.py b/auto_round/export/export_to_autoround/__init__.py
index 0c036d831..6cdcd5aed 100644
--- a/auto_round/export/export_to_autoround/__init__.py
+++ b/auto_round/export/export_to_autoround/__init__.py
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .export import save_quantized_as_autoround
+from .export import save_quantized_as_autoround, AutoRoundFormat
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index de8d15752..03f8ee0c2 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -18,6 +18,7 @@
 import json
 import os
 from concurrent.futures import ThreadPoolExecutor
+from enum import Enum
 
 import threadpoolctl as tctl
 import torch
@@ -25,7 +26,6 @@
 import transformers
 from tqdm import tqdm
 
-from auto_round.autoround import AutoRoundFormat
 from auto_round.export.export_to_autoround.utils import REQUIRED_CONFIG_KEYS, check_neq_config
 from auto_round.utils import (
     SUPPORTED_FORMATS,
@@ -44,6 +44,12 @@
 )
 
 
+class AutoRoundFormat(str, Enum):
+    # Weight: FP8, per-channel, may be extended to per-tensor in future
+    # Activation: FP8, per-tensor
+    TORCH_FP8_STATIC = "torch_fp8_static"
+
+
 def dynamic_import_quant_linear_for_packing(backend, bits, group_size, sym, act_bits=16):
     """
     Dynamically imports and returns the appropriate QuantLinear class based on the specified backend and parameters.
diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index cedcbbd64..3c47af6b3 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -19,7 +19,7 @@
 from transformers.utils.versions import require_version
 
 import auto_round_extension.cuda.gptqmodel_marlin
-from auto_round.autoround import AutoRoundFormat
+from auto_round.export.export_to_autoround import AutoRoundFormat
 from auto_round.schemes import QuantizationScheme
 from auto_round.utils import get_library_version, logger
 
diff --git a/auto_round/inference/convert_model.py b/auto_round/inference/convert_model.py
index 59bf29a09..fcfb83b4c 100644
--- a/auto_round/inference/convert_model.py
+++ b/auto_round/inference/convert_model.py
@@ -21,7 +21,7 @@
 from tqdm import tqdm
 from transformers.pytorch_utils import Conv1D
 
-from auto_round.autoround import AutoRoundFormat
+from auto_round.export.export_to_autoround import AutoRoundFormat
 from auto_round.inference.backend import (
     BackendInfos,
     dynamic_import_inference_linear,
diff --git a/auto_round/utils.py b/auto_round/utils.py
index 65695ff87..c14131e57 100644
--- a/auto_round/utils.py
+++ b/auto_round/utils.py
@@ -25,7 +25,7 @@
 from enum import Enum
 from functools import lru_cache
 from pathlib import Path
-from typing import Any, Callable, Dict, Tuple, Union
+from typing import Any, Callable, Tuple, Union
 
 import cpuinfo
 import torch
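
Editor's note: taken together, PATCH 42-46 move AutoRoundFormat out of auto_round.autoround and into auto_round/export/export_to_autoround/export.py, re-exporting it from the package __init__, so backend.py, convert_model.py, and autoround.py can all import it without going through the circular path. The minimal sketch below only illustrates that usage pattern; the stand-in enum mirrors the definition added in PATCH 46 (its value becomes "fp8_static" in the next patch), and the uses_fp8_static helper is hypothetical, not part of the repository.

# Sketch only: after PATCH 46 the real import would be
#   from auto_round.export.export_to_autoround import AutoRoundFormat
# A local stand-in is defined here so the example runs without the package.
from enum import Enum


class AutoRoundFormat(str, Enum):
    # Weight: FP8, per-channel; activation: FP8, per-tensor
    TORCH_FP8_STATIC = "torch_fp8_static"  # renamed to "fp8_static" in PATCH 47


def uses_fp8_static(backend: str) -> bool:
    # Same membership test as dynamic_import_inference_linear() earlier in the
    # series: the enum's string value is matched against the backend name.
    return AutoRoundFormat.TORCH_FP8_STATIC.value in backend


print(uses_fp8_static("auto_round:torch_fp8_static"))  # True
print(uses_fp8_static("auto_round:exllamav2"))         # False
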
From 2c34244c98e69e2495e3f1f303f643d9097241e6 Mon Sep 17 00:00:00 2001
From: yiliu30
Date: Tue, 9 Sep 2025 21:34:00 -0400
Subject: [PATCH 47/47] fix

Signed-off-by: yiliu30
---
 auto_round/export/export_to_autoround/export.py | 2 +-
 auto_round/inference/backend.py                 | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index 03f8ee0c2..ffd10036c 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -47,7 +47,7 @@ class AutoRoundFormat(str, Enum):
     # Weight: FP8, per-channel, may be extended to per-tensor in future
     # Activation: FP8, per-tensor
-    TORCH_FP8_STATIC = "torch_fp8_static"
+    TORCH_FP8_STATIC = "fp8_static"
 
 
 def dynamic_import_quant_linear_for_packing(backend, bits, group_size, sym, act_bits=16):
diff --git a/auto_round/inference/backend.py b/auto_round/inference/backend.py
index 3c47af6b3..1868ee14c 100644
--- a/auto_round/inference/backend.py
+++ b/auto_round/inference/backend.py
@@ -73,6 +73,7 @@ class BackendInfo:
     checkers: List[Any] = field(default_factory=list)
     alias: Optional[List[str]] = None
     requirements: Optional[List[str]] = None
+    # TODO(Yi): Add more fields for activation dtype, group size, etc.
 
 
 def feature_multiply_checker(in_feature, out_feature, config, in_feature_multiplier, out_feature_multiplier=None):
@@ -108,7 +109,7 @@ def feature_multiply_checker_group_size(
 )
 
 
-def torch_fp8_static_checker(
+def fp8_static_scheme_checker(
     in_feature: int,
     out_feature: int,
     config: QuantizationScheme,
@@ -192,14 +193,14 @@ def torch_fp8_static_checker(
 
 # Weight: FP8, per-channel, may be extended to per-tensor in future
 # Activation: FP8, per-tensor
-BackendInfos["auto_round:torch_fp8_static"] = BackendInfo(
+BackendInfos["auto_round:fp8_static"] = BackendInfo(
     device=["xpu", "cuda", "cpu"],
     packing_format="",
     sym=[True],
     dtype=["float32", "float16", "bfloat16"],
     bits=[8],
     priority=0,
-    checkers=[torch_fp8_static_checker],
+    checkers=[fp8_static_scheme_checker],
     alias=["auto_round", "torch"],
     requirements=["auto-round>0.6.0"],
 )
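
Editor's note: the checker protocol this series renames (feature_checks -> check_list -> checkers) and specializes for FP8 static quantization reduces to the loop shown in check_compatible in PATCH 40/41: every callable in backend.checkers is invoked as check(in_features, out_features, config), and all of them must return True. The sketch below is illustrative only; MiniBackendInfo, is_compatible, and the string stand-in for the scheme are simplified substitutes for BackendInfo, check_compatible, and auto_round.schemes.FP8_STATIC.

# Sketch of the checker contract, not the real auto_round implementation.
from dataclasses import dataclass, field
from typing import Any, Callable, List

FP8_STATIC = "fp8_static"  # stand-in for auto_round.schemes.FP8_STATIC


@dataclass
class MiniBackendInfo:
    # Mirrors the renamed field BackendInfo.checkers (PATCH 41).
    checkers: List[Callable[[int, int, Any], bool]] = field(default_factory=list)


def fp8_static_scheme_checker(in_feature: int, out_feature: int, config: Any) -> bool:
    # PATCH 45/47: the checker simply compares the layer config to the scheme.
    return config == FP8_STATIC


def is_compatible(backend: MiniBackendInfo, in_features: int, out_features: int, config: Any) -> bool:
    # Same shape as the loop in check_compatible(): every checker must pass.
    return all(check(in_features, out_features, config) for check in backend.checkers)


backend = MiniBackendInfo(checkers=[fp8_static_scheme_checker])
print(is_compatible(backend, 4096, 4096, FP8_STATIC))  # True
print(is_compatible(backend, 4096, 4096, "int4"))      # False
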