From 1082007001c39f715f8ee0b7b314cc8db08cf8fa Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Wed, 14 Aug 2024 10:08:31 -0700 Subject: [PATCH 1/5] Update install_requirements.sh to use PT 20240814 https://hud.pytorch.org/pytorch/pytorch/commit/3a023a67c47bcde45538c9991e332d21ac548e46 To pick up fix in https://github.com/pytorch/pytorch/pull/133235 --- install_requirements.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_requirements.sh b/install_requirements.sh index 6c6a04350..2668a8cc1 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -47,7 +47,7 @@ fi # NOTE: If a newly-fetched version of the executorch repo changes the value of # NIGHTLY_VERSION, you should re-run this script to install the necessary # package versions. -NIGHTLY_VERSION=dev20240728 +NIGHTLY_VERSION=dev20240814 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same ( From 61efa9da3114efdf084baf7847dc5ab7668e94fe Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Wed, 14 Aug 2024 13:47:21 -0700 Subject: [PATCH 2/5] Update AO pin --- install_requirements.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install_requirements.sh b/install_requirements.sh index 2668a8cc1..9baac5ec0 100755 --- a/install_requirements.sh +++ b/install_requirements.sh @@ -82,7 +82,7 @@ REQUIREMENTS_TO_INSTALL=( # TODO: Remove this and install nightly build, once it supports macos ( set -x - $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@d477c0e59b458b5617dcb3e999290a87df3070d8 + $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@e11201a62669f582d81cdb33e031a07fb8dfc4f3 ) if [[ -x "$(command -v nvidia-smi)" ]]; then ( From c07ddd046efa65c0369183f102598a928994d846 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Wed, 14 Aug 2024 17:59:16 -0700 Subject: [PATCH 3/5] Remove unwrapping tensor_subclass --- quantization/quantize.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/quantization/quantize.py 
b/quantization/quantize.py index c72ef2aa1..5c562d8de 100644 --- a/quantization/quantize.py +++ b/quantization/quantize.py @@ -43,7 +43,6 @@ Int8DynActInt4WeightQuantizer, quantize_, ) -from torchao.utils import unwrap_tensor_subclass ######################################################################### @@ -74,7 +73,6 @@ def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None): # Use tensor subclass API for int4 weight only. if device == "cuda" and quantizer == "linear:int4": quantize_(model, int4_weight_only(q_kwargs["groupsize"])) - unwrap_tensor_subclass(model) continue # Use dtype precision specified in user config, else fallback on global precision. if "precision" in quantize_options: From b890f6187492295106bd49422cca35cac233e892 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Thu, 15 Aug 2024 18:47:24 -0700 Subject: [PATCH 4/5] Conditional subclass unwrap since aoti doesn't support tensor_subclass --- build/builder.py | 3 ++- export.py | 1 + quantization/quantize.py | 10 +++++++++- 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/build/builder.py b/build/builder.py index 5f0876aab..42c1d6b90 100644 --- a/build/builder.py +++ b/build/builder.py @@ -440,6 +440,7 @@ def _initialize_model( quantize, tokenizer=None, max_seq_length=None, + support_tensor_subclass:bool=True, ): print("Loading model...") @@ -510,7 +511,7 @@ def _initialize_model( if quantize: print(f"Quantizing the model with: {quantize}") with measure_time("Time to quantize model: {time:.02f} seconds"): - quantize_model(model, builder_args.device, quantize, tokenizer) + quantize_model(model, builder_args.device, quantize, tokenizer, support_tensor_subclass) device_sync(device=builder_args.device) if builder_args.setup_caches: diff --git a/export.py b/export.py index 6068abb4d..58730896b 100644 --- a/export.py +++ b/export.py @@ -143,6 +143,7 @@ def main(args): model_to_dso = _initialize_model( builder_args, quantize, + support_tensor_subclass=False, ) 
_unset_gguf_kwargs(builder_args) diff --git a/quantization/quantize.py b/quantization/quantize.py index 5c562d8de..b9aa4ddf7 100644 --- a/quantization/quantize.py +++ b/quantization/quantize.py @@ -49,7 +49,13 @@ ### torchchat quantization API ### -def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None): +def quantize_model( + model: nn.Module, + device, + quantize_options, + tokenizer=None, + support_tensor_subclass:bool=True +): """ Quantize the specified model using the quantizers described by a quantization dict of the form: @@ -73,6 +79,8 @@ def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None): # Use tensor subclass API for int4 weight only. if device == "cuda" and quantizer == "linear:int4": quantize_(model, int4_weight_only(q_kwargs["groupsize"])) + if not support_tensor_subclass: + unwrap_tensor_subclass(model) continue # Use dtype precision specified in user config, else fallback on global precision. if "precision" in quantize_options: From 1cd144cb5728ae7fc8462369d11053d69f4799b0 Mon Sep 17 00:00:00 2001 From: Jack-Khuu Date: Fri, 16 Aug 2024 02:29:50 -0700 Subject: [PATCH 5/5] Missed initialize entry point with conditional unwrap --- build/builder.py | 10 ++++++++-- export.py | 1 + quantization/quantize.py | 11 ++++++----- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/build/builder.py b/build/builder.py index 42c1d6b90..d8b803149 100644 --- a/build/builder.py +++ b/build/builder.py @@ -440,7 +440,7 @@ def _initialize_model( quantize, tokenizer=None, max_seq_length=None, - support_tensor_subclass:bool=True, + support_tensor_subclass: bool = True, ): print("Loading model...") @@ -511,7 +511,13 @@ def _initialize_model( if quantize: print(f"Quantizing the model with: {quantize}") with measure_time("Time to quantize model: {time:.02f} seconds"): - quantize_model(model, builder_args.device, quantize, tokenizer, support_tensor_subclass) + quantize_model( + model, + builder_args.device, 
quantize, + tokenizer, + support_tensor_subclass, + ) device_sync(device=builder_args.device) if builder_args.setup_caches: diff --git a/export.py b/export.py index 58730896b..148e9cb65 100644 --- a/export.py +++ b/export.py @@ -126,6 +126,7 @@ def main(args): quantize, tokenizer, max_seq_length=builder_args.max_seq_length, + support_tensor_subclass=output_dso_path is None, ) model_to_pte = model model_to_dso = model diff --git a/quantization/quantize.py b/quantization/quantize.py index b9aa4ddf7..8efc4fa08 100644 --- a/quantization/quantize.py +++ b/quantization/quantize.py @@ -43,6 +43,7 @@ Int8DynActInt4WeightQuantizer, quantize_, ) +from torchao.utils import unwrap_tensor_subclass ######################################################################### @@ -50,11 +51,11 @@ def quantize_model( - model: nn.Module, - device, - quantize_options, - tokenizer=None, - support_tensor_subclass:bool=True + model: nn.Module, + device, + quantize_options, + tokenizer=None, + support_tensor_subclass: bool = True, ): """ Quantize the specified model using the quantizers described by