Commit a73180d
Implicit conversion
1 parent 62bd5da commit a73180d

4 files changed: +36 -21 lines changed


test/float8/test_base.py

Lines changed: 16 additions & 1 deletion

@@ -14,7 +14,7 @@
 import torch
 import torch.nn as nn
 
-from torchao.utils import TORCH_VERSION_AT_LEAST_2_5
+from torchao.utils import TORCH_VERSION_AT_LEAST_2_5, is_sm_89
 
 if not TORCH_VERSION_AT_LEAST_2_5:
     pytest.skip("Unsupported PyTorch version", allow_module_level=True)
@@ -531,6 +531,21 @@ def test_inference_mode(self):
         with torch.inference_mode(mode=True):
             m(x)
 
+    @unittest.skipIf(not is_sm_89(), "CUDA arch 8.9 not available")
+    def test_quantize(self):
+        x = torch.randn(32, 32, device="cuda")
+        m = nn.Sequential(nn.Linear(32, 32)).cuda()
+        m = convert_to_float8_training(m)
+        assert isinstance(m[0], Float8Linear), "Module is not a Float8Linear"
+        from torchao.quantization.quant_api import float8_weight_only, quantize_
+
+        quantize_(m, float8_weight_only())
+        assert (
+            m[0].weight.tensor_impl.float8_data.dtype == torch.float8_e4m3fn
+        ), "Post quantization dtype should be torch.float8_e4m3fn"
+        with torch.no_grad():
+            m(x)
+
 
 class TestScaledMM:
     @unittest.skipIf(

torchao/float8/__init__.py

Lines changed: 0 additions & 2 deletions

@@ -14,7 +14,6 @@
 from torchao.float8.float8_linear import WeightWithDelayedFloat8CastTensor
 from torchao.float8.float8_linear_utils import (
     convert_to_float8_training,
-    dequantize_float8_training,
     linear_requires_sync,
     sync_float8_amax_and_scale_history,
 )
@@ -55,6 +54,5 @@
     "linear_requires_sync",
     "sync_float8_amax_and_scale_history",
     "precompute_float8_dynamic_scale_for_fsdp",
-    "dequantize_float8_training",
     # note: Float8Tensor and Float8Linear are not public APIs
 ]

torchao/float8/float8_linear_utils.py

Lines changed: 0 additions & 18 deletions

@@ -321,21 +321,3 @@ def inner_func():
     for child in fp8_layers:
         # Set a flag to signal that initialization is done
         child.is_amax_initialized = True
-
-
-def dequantize_float8_training(model: nn.Module) -> nn.Module:
-    """
-    Converts `Float8Linear` modules in `model` to `torch.nn.Linear`.
-    """
-
-    def dequant_func(mod: Float8Linear) -> nn.Linear:
-        new_module = nn.Linear(mod.in_features, mod.out_features)
-        new_module.weight = mod.weight
-        new_module.bias = mod.bias
-        return new_module
-
-    return swap_linear_layers(
-        model,
-        dequant_func,
-        target_module=Float8Linear,
-    )

torchao/quantization/quant_api.py

Lines changed: 20 additions & 0 deletions

@@ -39,6 +39,8 @@
     to_affine_quantized_intx,
     to_marlinqqq_quantized_intx,
 )
+from torchao.float8.float8_linear import Float8Linear
+from torchao.float8.float8_linear_utils import swap_linear_layers
 from torchao.float8.inference import Float8MMConfig
 from torchao.quantization.linear_activation_weight_observed_tensor import (
     LinearActivationWeightObservedTensor,
@@ -199,6 +201,22 @@ def change_linear_weights_to_int4_woqtensors(
 ########
 # TO BE DEPRECATED END
 ########
+def dequantize_float8_training(model: nn.Module) -> nn.Module:
+    """
+    Converts `Float8Linear` modules in `model` to `torch.nn.Linear`.
+    """
+
+    def dequant_func(mod: Float8Linear) -> nn.Linear:
+        new_module = nn.Linear(mod.in_features, mod.out_features)
+        new_module.weight = mod.weight
+        new_module.bias = mod.bias
+        return new_module
+
+    return swap_linear_layers(
+        model,
+        dequant_func,
+        target_module=Float8Linear,
+    )
 
 
 def _replace_with_custom_fn_if_matches_filter(
@@ -222,6 +240,8 @@ def _replace_with_custom_fn_if_matches_filter(
     Returns:
         None
     """
+    if isinstance(model, Float8Linear):
+        model = dequantize_float8_training(model)
     if filter_fn(model, cur_fqn[:-1]):
         if device is not None:
             model.to(device=device)  # move to device before quantization
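
Taken together, these diffs let quantize_ be called directly on a model that was converted for float8 training: _replace_with_custom_fn_if_matches_filter now swaps any Float8Linear back to a plain nn.Linear (via dequantize_float8_training) before applying the quantization config. Below is a minimal end-to-end sketch mirroring the new test_quantize test; the 32x32 model shape and the float8_weight_only config are taken from that test, and running it assumes a CUDA device with arch 8.9 or newer (matching the test's skip condition).

import torch
import torch.nn as nn

from torchao.float8 import convert_to_float8_training
from torchao.quantization.quant_api import float8_weight_only, quantize_

# Float8 training swap: nn.Linear -> Float8Linear.
m = nn.Sequential(nn.Linear(32, 32)).cuda()
m = convert_to_float8_training(m)

# With this commit, quantize_ implicitly converts Float8Linear modules back
# to nn.Linear (via dequantize_float8_training) before applying the config,
# so no manual unwrapping step is needed.
quantize_(m, float8_weight_only())

with torch.no_grad():
    m(torch.randn(32, 32, device="cuda"))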
