53 changes: 52 additions & 1 deletion optimum/commands/export/executorch.py
@@ -89,13 +89,25 @@ def parse_args_executorch(parser):
required_group.add_argument(
"--qlinear_group_size", type=int, required=False, help="Group size for decoder linear quantization."
)
required_group.add_argument(
"--qlinear_packing_format",
type=str,
choices=["tile_packed_to_4d"],
required=False,
help=(
"Packing format for decoder linear layers.\n"
"Only applicable to certain backends such as CUDA and Metal\n\n"
"Options:\n"
" tile_packed_to_4d - int4 4d packing format"
),
)
required_group.add_argument(
"--qlinear_encoder",
type=str,
choices=["8da4w", "4w", "8w"],
required=False,
help=(
"Quantization config for linear layers.\n\n"
"Quantization config for encoder linear layers.\n\n"
"Options:\n"
" 8da4w - 8-bit dynamic activation, 4-bit weight\n"
" 4w - 4-bit weight only\n"
@@ -105,6 +117,18 @@ def parse_args_executorch(parser):
required_group.add_argument(
"--qlinear_encoder_group_size", type=int, required=False, help="Group size for encoder linear quantization."
)
required_group.add_argument(
"--qlinear_encoder_packing_format",
type=str,
choices=["tile_packed_to_4d"],
required=False,
help=(
"Packing format for encoder linear layers.\n"
"Only applicable to certain backends such as CUDA and Metal\n\n"
"Options:\n"
" tile_packed_to_4d - int4 4d packing format"
),
)
required_group.add_argument(
"--qembedding",
type=str,
@@ -152,6 +176,29 @@ def parse_args(parser: "ArgumentParser"):
def run(self):
from ...exporters.executorch import main_export

        # Validate that the int4 packing format is only used with CUDA devices and 4w quantization
device = getattr(self.args, "device", None)
qlinear_packing_format = getattr(self.args, "qlinear_packing_format", None)
if qlinear_packing_format:
if not device or not device.startswith("cuda"):
raise ValueError(
"--qlinear_packing_format can only be used when --device is set to CUDA (e.g., 'cuda', 'cuda:0', etc.)"
)
if not self.args.qlinear or self.args.qlinear != "4w":
raise ValueError(
"--qlinear_packing_format can only be used when --qlinear is set to '4w'"
)
qlinear_encoder_packing_format = getattr(self.args, "qlinear_encoder_packing_format", None)
if qlinear_encoder_packing_format:
if not device or not device.startswith("cuda"):
raise ValueError(
"--qlinear_encoder_packing_format can only be used when --device is set to CUDA (e.g., 'cuda', 'cuda:0', etc.)"
)
if not self.args.qlinear_encoder or self.args.qlinear_encoder != "4w":
raise ValueError(
"--qlinear_encoder_packing_format can only be used when --qlinear_encoder is set to '4w'"
)

kwargs = {}
if self.args.use_custom_sdpa:
kwargs["use_custom_sdpa"] = self.args.use_custom_sdpa
@@ -163,10 +210,14 @@ def run(self):
kwargs["qlinear"] = self.args.qlinear
if self.args.qlinear_group_size:
kwargs["qlinear_group_size"] = self.args.qlinear_group_size
if qlinear_packing_format:
kwargs["qlinear_packing_format"] = qlinear_packing_format
if self.args.qlinear_encoder:
kwargs["qlinear_encoder"] = self.args.qlinear_encoder
if self.args.qlinear_encoder_group_size:
kwargs["qlinear_encoder_group_size"] = self.args.qlinear_encoder_group_size
if qlinear_encoder_packing_format:
kwargs["qlinear_encoder_packing_format"] = qlinear_encoder_packing_format
if self.args.qembedding:
kwargs["qembedding"] = self.args.qembedding
if self.args.qembedding_group_size:
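Taken together, the CLI changes above only honor the new packing-format flag for CUDA exports that use 4-bit weight-only linear quantization (e.g. --device cuda --qlinear 4w --qlinear_packing_format tile_packed_to_4d). Below is a minimal sketch, not part of the PR, of how validated flags become export kwargs; the Namespace values are illustrative and the real arguments come from the parser defined above.

from argparse import Namespace

# Illustrative parsed flags, e.g.:
#   --device cuda --qlinear 4w --qlinear_group_size 32 \
#   --qlinear_packing_format tile_packed_to_4d
args = Namespace(
    device="cuda",
    qlinear="4w",
    qlinear_group_size=32,
    qlinear_packing_format="tile_packed_to_4d",
)

# Mirrors the validation in run(): the packing format needs a CUDA device and 4w weights.
assert args.device.startswith("cuda") and args.qlinear == "4w"

kwargs = {}
if args.qlinear:
    kwargs["qlinear"] = args.qlinear
if args.qlinear_group_size:
    kwargs["qlinear_group_size"] = args.qlinear_group_size
if args.qlinear_packing_format:
    kwargs["qlinear_packing_format"] = args.qlinear_packing_format
# kwargs are then forwarded to main_export(...) alongside the model and recipe arguments.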
40 changes: 26 additions & 14 deletions optimum/exporters/executorch/quantization.py
@@ -22,6 +22,7 @@ def quantize_model_(
eager_model: torch.nn.Module,
qlinear_config: Optional[str] = None,
qlinear_group_size: Optional[int] = 32,
qlinear_packing_format: Optional[str] = None,
qembedding_config: Optional[str] = None,
qembedding_group_size: Optional[int] = 0,
) -> torch.nn.Module:
@@ -30,6 +31,7 @@

from torchao.quantization.granularity import PerAxis, PerGroup
from torchao.quantization.quant_api import (
Int4WeightOnlyConfig,
Int8DynamicActivationIntxWeightConfig,
IntxWeightOnlyConfig,
quantize_,
@@ -74,20 +76,30 @@ def quantize_model_(
linear_weight_granularity = PerGroup(qlinear_group_size)

logging.info("Quantizing linear layers.")
-        linear_config = {
-            "8da4w": Int8DynamicActivationIntxWeightConfig(
-                weight_dtype=torch.int4,
-                weight_granularity=linear_weight_granularity,
-            ),
-            "4w": IntxWeightOnlyConfig(
-                weight_dtype=torch.int4,
-                granularity=linear_weight_granularity,
-            ),
-            "8w": IntxWeightOnlyConfig(
-                weight_dtype=torch.int8,
-                granularity=linear_weight_granularity,
-            ),
-        }[qlinear_config]
+        # Determine if we need to use Int4WeightOnlyConfig with int4_packing_format
+        if qlinear_config == "4w" and qlinear_packing_format:
+            linear_config = Int4WeightOnlyConfig(
+                group_size=qlinear_group_size,
+                int4_packing_format=qlinear_packing_format,
+                int4_choose_qparams_algorithm="hqq",
+            )
+        else:
+            linear_config = {
+                "8da4w": Int8DynamicActivationIntxWeightConfig(
+                    weight_dtype=torch.int4,
+                    weight_granularity=linear_weight_granularity,
+                ),
+                "4w": IntxWeightOnlyConfig(
+                    weight_dtype=torch.int4,
+                    granularity=linear_weight_granularity,
+                ),
+                "8w": IntxWeightOnlyConfig(
+                    weight_dtype=torch.int8,
+                    granularity=linear_weight_granularity,
+                ),
+            }[qlinear_config]

quantize_(
eager_model,
linear_config,
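A short usage sketch of quantize_model_ with the new argument; the toy module, group size, and the CUDA/bfloat16 placement are illustrative assumptions (the CLI validation above restricts the packing format to CUDA devices), and real callers pass the loaded HF model instead.

import torch
from torch import nn

from optimum.exporters.executorch.quantization import quantize_model_

# Illustrative stand-in for a loaded eager decoder model.
model = nn.Sequential(nn.Linear(128, 128)).to("cuda", torch.bfloat16).eval()

# "4w" plus a packing format routes to Int4WeightOnlyConfig with
# int4_packing_format="tile_packed_to_4d" and HQQ qparam selection, per the branch above.
quantize_model_(
    model,
    qlinear_config="4w",
    qlinear_group_size=32,
    qlinear_packing_format="tile_packed_to_4d",
)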
8 changes: 7 additions & 1 deletion optimum/exporters/executorch/tasks/causal_lm.py
@@ -133,8 +133,14 @@ def _load_eager_pretrained(
param.requires_grad = False

qlinear_config = kwargs.get("qlinear", None)
qlinear_packing_format = kwargs.get("qlinear_packing_format", None)
qembedding_config = kwargs.get("qembedding", None)
-    quantize_model_(eager_model, qlinear_config=qlinear_config, qembedding_config=qembedding_config)
+    quantize_model_(
+        eager_model,
+        qlinear_config=qlinear_config,
+        qlinear_packing_format=qlinear_packing_format,
+        qembedding_config=qembedding_config,
+    )

return CausalLMExportableModule(
eager_model, max_length, use_custom_kv_cache, use_custom_sdpa, disable_dynamic_shapes
8 changes: 7 additions & 1 deletion optimum/exporters/executorch/tasks/masked_lm.py
@@ -42,7 +42,13 @@ def load_masked_lm_model(model_name_or_path: str, **kwargs) -> MaskedLMExportabl
eager_model = AutoModelForMaskedLM.from_pretrained(model_name_or_path).to("cpu").eval()

qlinear_config = kwargs.get("qlinear", None)
qlinear_packing_format = kwargs.get("qlinear_packing_format", None)
qembedding_config = kwargs.get("qembedding", None)
-    quantize_model_(eager_model, qlinear_config=qlinear_config, qembedding_config=qembedding_config)
+    quantize_model_(
+        eager_model,
+        qlinear_config=qlinear_config,
+        qlinear_packing_format=qlinear_packing_format,
+        qembedding_config=qembedding_config,
+    )

return MaskedLMExportableModule(eager_model)
6 changes: 6 additions & 0 deletions optimum/exporters/executorch/tasks/multimodal_text_to_text.py
@@ -196,8 +196,10 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):

qlinear_config = kwargs.get("qlinear", None)
qlinear_group_size = kwargs.get("qlinear_group_size", None)
qlinear_packing_format = kwargs.get("qlinear_packing_format", None)
qlinear_encoder_config = kwargs.get("qlinear_encoder", None)
qlinear_encoder_group_size = kwargs.get("qlinear_encoder_group_size", None)
qlinear_encoder_packing_format = kwargs.get("qlinear_encoder_packing_format", None)
qembedding_config = kwargs.get("qembedding", None)
qembedding_group_size = kwargs.get("qembedding_group_size", None)

@@ -208,6 +210,8 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
}
if qlinear_group_size is not None:
quantize_decoder_kwargs["qlinear_group_size"] = qlinear_group_size
if qlinear_packing_format is not None:
quantize_decoder_kwargs["qlinear_packing_format"] = qlinear_packing_format
quantize_model_(**quantize_decoder_kwargs)

# Quantize encoder linear weights.
@@ -217,6 +221,8 @@ def load_multimodal_text_to_text_model(model_name_or_path: str, **kwargs):
}
if qlinear_encoder_group_size is not None:
quantize_encoder_kwargs["qlinear_group_size"] = qlinear_encoder_group_size
if qlinear_encoder_packing_format is not None:
quantize_encoder_kwargs["qlinear_packing_format"] = qlinear_encoder_packing_format
quantize_model_(**quantize_encoder_kwargs)

# TODO: quantize other parts of the model, e.g. MultimodalProjector?
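For the multimodal task, decoder and encoder linears are quantized in two separate passes, each with its own config, group size, and packing format. A rough sketch of that split follows; the ToyMultimodal module and its .decoder/.encoder attributes are illustrative stand-ins, since the real module selection lives in the elided lines above.

import torch
from optimum.exporters.executorch.quantization import quantize_model_

class ToyMultimodal(torch.nn.Module):  # illustrative stand-in for the loaded HF model
    def __init__(self):
        super().__init__()
        self.decoder = torch.nn.Sequential(torch.nn.Linear(64, 64))
        self.encoder = torch.nn.Sequential(torch.nn.Linear(64, 64))

model = ToyMultimodal().to("cuda", torch.bfloat16).eval()

# --qlinear / --qlinear_packing_format drive the decoder pass...
quantize_model_(
    model.decoder,
    qlinear_config="4w",
    qlinear_group_size=32,
    qlinear_packing_format="tile_packed_to_4d",
)

# ...while --qlinear_encoder / --qlinear_encoder_packing_format drive the encoder pass,
# both mapping onto quantize_model_'s generic qlinear_* parameters.
quantize_model_(
    model.encoder,
    qlinear_config="4w",
    qlinear_group_size=32,
    qlinear_packing_format="tile_packed_to_4d",
)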