
Update PT Pin: 2024-08-14 #1029

Merged: 8 commits, merged on Aug 16, 2024
build/builder.py: 8 additions & 1 deletion
@@ -440,6 +440,7 @@ def _initialize_model(
     quantize,
     tokenizer=None,
     max_seq_length=None,
+    support_tensor_subclass: bool = True,
 ):
     print("Loading model...")
@@ -510,7 +511,13 @@ def _initialize_model
     if quantize:
         print(f"Quantizing the model with: {quantize}")
         with measure_time("Time to quantize model: {time:.02f} seconds"):
-            quantize_model(model, builder_args.device, quantize, tokenizer)
+            quantize_model(
+                model,
+                builder_args.device,
+                quantize,
+                tokenizer,
+                support_tensor_subclass,
+            )
     device_sync(device=builder_args.device)

     if builder_args.setup_caches:
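
Note: the builder change above just threads one boolean from the entry point down to the quantizer. A minimal self-contained sketch of that threading (stubs stand in for the real torchchat loading and quantization code, so everything except the two function names and the flag is hypothetical):

def quantize_model(model, device, quantize_options, tokenizer=None,
                   support_tensor_subclass: bool = True):
    # Stub: the real function dispatches to torchao quantizers
    # (see quantization/quantize.py below).
    print(f"quantizing, support_tensor_subclass={support_tensor_subclass}")


def _initialize_model(builder_args, quantize, tokenizer=None,
                      max_seq_length=None,
                      support_tensor_subclass: bool = True):
    model = object()  # stub for the loaded nn.Module
    if quantize:
        # Defaulting to True keeps existing callers on the tensor-subclass
        # path; only the DSO export path opts out (see export.py below).
        quantize_model(model, builder_args.get("device", "cpu"), quantize,
                       tokenizer, support_tensor_subclass)
    return model


_initialize_model({}, {"linear:int4": {"groupsize": 128}})
_initialize_model({}, {"linear:int4": {"groupsize": 128}},
                  support_tensor_subclass=False)
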
export.py: 2 additions & 0 deletions
@@ -126,6 +126,7 @@ def main(args):
         quantize,
         tokenizer,
         max_seq_length=builder_args.max_seq_length,
+        support_tensor_subclass=output_dso_path is None,
     )
     model_to_pte = model
     model_to_dso = model
@@ -143,6 +144,7 @@ def main(args):
         model_to_dso = _initialize_model(
             builder_args,
             quantize,
+            support_tensor_subclass=False,
         )
         _unset_gguf_kwargs(builder_args)
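
Note: reading the two call sites together, the apparent intent is that quantized tensor subclasses stay in place for eager and ExecuTorch (PTE) use but are unwrapped whenever a DSO export is requested, since that path would otherwise have to trace through subclass tensors. A condensed, hypothetical view of how the flag is derived (the two path variables stand in for the real CLI arguments):

output_dso_path = "/tmp/model.so"  # hypothetical: set when a DSO export is requested
output_pte_path = None             # hypothetical: set when a PTE export is requested

# True for eager/PTE paths, False when a DSO export is requested, so
# quantize_model unwraps the subclasses the DSO path cannot consume.
support_tensor_subclass = output_dso_path is None
print(support_tensor_subclass)  # False here: the DSO path opts out
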
install_requirements.sh: 2 additions & 2 deletions
@@ -47,7 +47,7 @@ fi
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-NIGHTLY_VERSION=dev20240728
+NIGHTLY_VERSION=dev20240814

 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same
 (
@@ -82,7 +82,7 @@ REQUIREMENTS_TO_INSTALL=(
 # TODO: Remove this and install nightly build, once it supports macos
 (
   set -x
-  $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@d477c0e59b458b5617dcb3e999290a87df3070d8
+  $PIP_EXECUTABLE install git+https://github.com/pytorch/ao.git@e11201a62669f582d81cdb33e031a07fb8dfc4f3
 )
 if [[ -x "$(command -v nvidia-smi)" ]]; then
   (
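
Note: after re-running install_requirements.sh, a quick sanity check that the new pins took effect (a hypothetical check, not part of the PR):

import torch
import torchao  # an import failure here would point at a broken ao pin

# The nightly tag bumped in this PR should appear in the torch version string.
assert "dev20240814" in torch.__version__, torch.__version__
print("torch", torch.__version__)
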
quantization/quantize.py: 9 additions & 2 deletions
@@ -50,7 +50,13 @@
 ### torchchat quantization API ###


-def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None):
+def quantize_model(
+    model: nn.Module,
+    device,
+    quantize_options,
+    tokenizer=None,
+    support_tensor_subclass: bool = True,
+):
     """
     Quantize the specified model using the quantizers described by
     a quantization dict of the form:
@@ -74,7 +80,8 @@ def quantize_model(model: nn.Module, device, quantize_options, tokenizer=None):
         # Use tensor subclass API for int4 weight only.
         if device == "cuda" and quantizer == "linear:int4":
             quantize_(model, int4_weight_only(q_kwargs["groupsize"]))
-            unwrap_tensor_subclass(model)
+            if not support_tensor_subclass:
+                unwrap_tensor_subclass(model)
             continue
         # Use dtype precision specified in user config, else fallback on global precision.
         if "precision" in quantize_options:
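
Note: a minimal sketch of the int4 tensor-subclass path this hunk guards, assuming the torchao APIs the file already uses (quantize_, int4_weight_only, unwrap_tensor_subclass); import paths vary across torchao versions, and int4 weight-only quantization here requires a CUDA device:

import torch
import torch.nn as nn
from torchao.quantization.quant_api import int4_weight_only, quantize_
from torchao.utils import unwrap_tensor_subclass

model = nn.Sequential(nn.Linear(256, 256)).to(device="cuda", dtype=torch.bfloat16)

# Swap each Linear weight for an int4 weight-only quantized tensor subclass.
quantize_(model, int4_weight_only(128))  # 128 = groupsize, as in q_kwargs

support_tensor_subclass = False  # e.g. the DSO export path in this PR
if not support_tensor_subclass:
    # Re-materialize plain tensors for consumers that cannot handle
    # tensor subclasses; with the flag left at True this step is skipped.
    unwrap_tensor_subclass(model)
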