
Commit cac5261

Add exhaustive config option to intmm kernel (#1392)
* Add exhaustive config option to intmm kernel

Summary: Similar to pytorch/pytorch#126220, we add an exhaustive config option for the int8mm and scaled_mm kernels in torchao. Note that there seems to be native int8mm and scaled_mm support in pytorch (https://github.com/pytorch/pytorch/blob/0610b9730e27d066e26396a2d655ba0d98c2012d/torch/_inductor/kernel/mm.py#L305 for int8mm, https://github.com/pytorch/pytorch/blob/0610b9730e27d066e26396a2d655ba0d98c2012d/torch/_inductor/kernel/mm_scaled.py#L575 for scaled_mm); maybe we should use those at some point.

Test Plan:

```
cd benchmarks
TORCHAO_AUTOTUNER_ENABLE=1 python intmm.py --file_path intmm_shapes.csv
TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE=EXHAUSTIVE TORCHAO_AUTOTUNER_ENABLE=1 python intmm.py --file_path intmm_shapes.csv
```

Reviewers:
Subscribers:
Tasks:
Tags:

* remove unused
* enable all autoquant qtensor
* guard float8 qtensor subclass
* guard exhaustive config torch version
1 parent f258d82 commit cac5261

File tree

7 files changed: +83 −29 lines changed

- torchao/_models/sam/eval_combo.py
- torchao/kernel/README.md
- torchao/kernel/intmm.py
- torchao/kernel/intmm_triton.py
- torchao/prototype/quantization/autoquant_v2.py
- torchao/quantization/__init__.py
- torchao/quantization/autoquant.py

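Before the per-file diffs, a minimal sketch of how the exhaustive gate is consulted from Python. The config field and env-var name come straight from the diffs below; the only assumption is the usual inductor behavior of reading the variable when its config module first loads.

```python
import os

# Must be set before inductor's config module is first imported.
os.environ["TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE"] = "EXHAUSTIVE"

import torch._inductor.config as inductor_config

# torchao's intmm_triton.py branches on exactly this value (see its diff below).
if inductor_config.max_autotune_gemm_search_space == "EXHAUSTIVE":
    print("exhaustive int8mm/scaled_mm search space enabled")
```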

torchao/_models/sam/eval_combo.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -350,6 +350,8 @@ def mlp_only(mod, name):
             autoquant_v2(predictor.model.image_encoder, example_input=example_input, manual=True, qtensor_class_list=torchao.prototype.quantization.autoquant_v2.DEFAULT_INT4_AUTOQUANT_CLASS_LIST)
         elif "autoquant_v2-float8" == compress:
             autoquant_v2(predictor.model.image_encoder, example_input=example_input, manual=True, qtensor_class_list=torchao.prototype.quantization.autoquant_v2.OTHER_AUTOQUANT_CLASS_LIST)
+        elif "autoquant_v2-all" == compress:
+            autoquant_v2(predictor.model.image_encoder, example_input=example_input, manual=True, qtensor_class_list=torchao.prototype.quantization.autoquant_v2.ALL_AUTOQUANT_CLASS_LIST)
         else:
             autoquant_v2(predictor.model.image_encoder, example_input=example_input, manual=True)
 
@@ -362,6 +364,8 @@ def mlp_only(mod, name):
             autoquant(predictor.model.image_encoder, example_input=example_input, manual=True, qtensor_class_list=torchao.quantization.DEFAULT_INT4_AUTOQUANT_CLASS_LIST)
         elif "autoquant-float8" == compress:
             autoquant(predictor.model.image_encoder, example_input=example_input, manual=True, qtensor_class_list=torchao.quantization.OTHER_AUTOQUANT_CLASS_LIST)
+        elif "autoquant-all" == compress:
+            autoquant(predictor.model.image_encoder, example_input=example_input, manual=True, qtensor_class_list=torchao.quantization.ALL_AUTOQUANT_CLASS_LIST)
         else:
             autoquant(predictor.model.image_encoder, example_input=example_input, manual=True)
         predictor.model.image_encoder(example_input)
```
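The new branches reuse the existing `manual=True` flow: quantize, run example inputs to record shapes, then finalize. A hedged sketch on a stand-in module (the `encoder` module and shapes are placeholders and assume a CUDA device; `finalize_autoquant` is torchao's manual-mode hook):

```python
import torch
import torchao
from torchao.quantization import ALL_AUTOQUANT_CLASS_LIST

encoder = torch.nn.Linear(256, 256).cuda().half()  # stand-in for image_encoder
encoder = torchao.autoquant(
    encoder, manual=True, qtensor_class_list=ALL_AUTOQUANT_CLASS_LIST
)
encoder(torch.randn(16, 256, device="cuda", dtype=torch.float16))  # record shapes
encoder.finalize_autoquant()  # benchmark candidates, pick per-linear winners
```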

torchao/kernel/README.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -6,6 +6,9 @@
 
 Set this to a nonzero value to enable the kernels generated by the autotuner. This is turned off by default, because it is still an experimental feature and also can take a long time to run.
 
+`TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE=EXHAUSTIVE`
+Use this to enable exhaustive search for both int8mm and scaled_mm kernels.
+
 Searching a new config can take a long time and we'll save the updated data in `data.pkl`. If you'd like to contributed updated configs for your hardware or shapes, please open a pull request.
 
 `TORCHAO_AUTOTUNER_DATA_PATH=torchao/kernel/configs/data_a100.pkl`
```
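For illustration, a sketch of setting both knobs from Python rather than the shell; the variable names are from this README, and the import-time reading order is the only assumption.

```python
import os

# Both knobs are read at import time, so they must be set first.
os.environ["TORCHAO_AUTOTUNER_ENABLE"] = "1"
os.environ["TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE"] = "EXHAUSTIVE"

import torch  # inductor reads its env var when its config module loads
from torchao.kernel import intmm  # torchao reads TORCHAO_AUTOTUNER_ENABLE here
```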

torchao/kernel/intmm.py

Lines changed: 5 additions & 4 deletions
```diff
@@ -10,7 +10,8 @@
         from torchao.kernel import intmm_triton
     else:
         intmm_triton = None
-except ImportError:
+except ImportError as e:
+    print("import error:", e)
     # On cpu-only builds might not be available.
     intmm_triton = None
 
@@ -56,7 +57,7 @@ def safe_int_mm(input: torch.Tensor, mat2: torch.Tensor) -> torch.Tensor:
             and j_is_nonzero_multiple_of_8
             and k_is_nonzero_multiple_of_8
         )
-
+
         if device_cpu or bad_dimensions_for_cublas:
             # fallback path
             return torch.matmul(input.cpu().to(torch.int32), mat2.cpu().to(torch.int32)).to(
@@ -75,8 +76,8 @@ def safe_int_mm(input: torch.Tensor, mat2: torch.Tensor) -> torch.Tensor:
         try:
             return out_dtype(torch.ops.aten.mm.default, torch.int32, input, mat2)
         except Exception:
-        # fallback path, would run on H100 for float8 dtypes
-        # Exception on H100 float8 dtype : "addmm_cuda" not implemented for 'Float8_e4m3fn'
+            # fallback path, would run on H100 for float8 dtypes
+            # Exception on H100 float8 dtype : "addmm_cuda" not implemented for 'Float8_e4m3fn'
             return torch.matmul(input.to(torch.float32), mat2.to(torch.float32)).to(torch.int32)
 else:
     def safe_int_mm(input: torch.Tensor, mat2: torch.Tensor) -> torch.Tensor:
```
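As a usage sketch of the function this diff touches: `safe_int_mm` multiplies int8 matrices and returns int32, taking the fallback paths above on CPU or on cuBLAS-unfriendly shapes (the shapes below are arbitrary).

```python
import torch
from torchao.kernel.intmm import safe_int_mm

a = torch.randint(-128, 128, (64, 32), dtype=torch.int8)
b = torch.randint(-128, 128, (32, 16), dtype=torch.int8)

c = safe_int_mm(a, b)  # on CPU this takes the torch.matmul int32 fallback
print(c.dtype, c.shape)  # torch.int32 torch.Size([64, 16])
```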

torchao/kernel/intmm_triton.py

Lines changed: 40 additions & 25 deletions
```diff
@@ -7,35 +7,50 @@
 import triton.language as tl
 
 from torchao.kernel.autotuner import get_best_config_fn
+from torchao.utils import TORCH_VERSION_AFTER_2_5
 
-int8_powers_of_two = [32, 64, 128, 256]
-int8_mm_kernel_configs = sum(
-    [
-        # "BLOCK_M", "BLOCK_N", "BLOCK_K", "num_stages", "num_warps"
+# TORCHINDUCTOR_MAX_AUTOTUNE_GEMM_SEARCH_SPACE=EXHAUSTIVE to enable exhaustive option
+int8_mm_kernel_configs = (
+    sum(
         [
-            (i, j, k, 1, 1),
-            (i, j, k, 1, 2),
-            (i, j, k, 2, 2),
-            (i, j, k, 1, 4),
-            (i, j, k, 2, 4),
-            (i, j, k, 3, 4),
-            (i, j, k, 4, 4),
-            (i, j, k, 1, 8),
-            (i, j, k, 2, 8),
-            (i, j, k, 3, 8),
-            (i, j, k, 4, 8),
-            (i, j, k, 5, 8),
-            (i, j, k, 6, 8),
-            (i, j, k, 7, 8),
-            (i, j, k, 8, 8),
-        ]
-        for (i, j, k) in itertools.product(
-            int8_powers_of_two, int8_powers_of_two, int8_powers_of_two
-        )
-    ],
-    [],
+            # "BLOCK_M", "BLOCK_N", "BLOCK_K", "num_stages", "num_warps"
+            [
+                (i, j, k, 1, 1),
+                (i, j, k, 1, 2),
+                (i, j, k, 2, 2),
+                (i, j, k, 1, 4),
+                (i, j, k, 2, 4),
+                (i, j, k, 3, 4),
+                (i, j, k, 4, 4),
+                (i, j, k, 1, 8),
+                (i, j, k, 2, 8),
+                (i, j, k, 3, 8),
+                (i, j, k, 4, 8),
+                (i, j, k, 5, 8),
+                (i, j, k, 6, 8),
+                (i, j, k, 7, 8),
+                (i, j, k, 8, 8),
+            ]
+            for (i, j, k) in itertools.product(
+                [32, 64, 128, 256], repeat=3
+            )
+        ],
+        []
+    )
 )
 
+if TORCH_VERSION_AFTER_2_5:
+    if torch._inductor.config.max_autotune_gemm_search_space == "EXHAUSTIVE":
+        int8_mm_kernel_configs = [
+            (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps)
+            for BLOCK_M, BLOCK_N, BLOCK_K in itertools.product(
+                [16, 32, 64, 128, 256], repeat=3
+            )
+            for num_stages in [1, 2, 3, 4, 5, 6, 7, 8]
+            for num_warps in [2, 4, 8]
+        ]
+
+
 # Baseline configs from pytorch/pytorch
 # https://github.com/pytorch/pytorch/blob/7718a1cd4f8e0b794c18a31ebd6353d6273c534e/torch/_inductor/kernel/mm_common.py#L132-L147
 # int8_mm_kernel_configs = [
```
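A back-of-envelope check on what the hunk above changes: the default space crosses 4 block-size choices per dimension with 15 (num_stages, num_warps) pairs, while the exhaustive space crosses 5 block sizes per dimension with 8 stage counts and 3 warp counts. The numbers below are derived directly from the lists in the diff.

```python
import itertools

# Default: 4^3 block-size triples x 15 (num_stages, num_warps) pairs = 960.
stage_warp_pairs = [(1, 1), (1, 2), (2, 2), (1, 4), (2, 4), (3, 4), (4, 4),
                    (1, 8), (2, 8), (3, 8), (4, 8), (5, 8), (6, 8), (7, 8), (8, 8)]
default = [
    (m, n, k, s, w)
    for (m, n, k) in itertools.product([32, 64, 128, 256], repeat=3)
    for (s, w) in stage_warp_pairs
]

# Exhaustive: 5^3 block-size triples x 8 stage counts x 3 warp counts = 3000.
exhaustive = [
    (m, n, k, s, w)
    for (m, n, k) in itertools.product([16, 32, 64, 128, 256], repeat=3)
    for s in range(1, 9)
    for w in [2, 4, 8]
]

print(len(default), len(exhaustive))  # 960 3000
```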

torchao/prototype/quantization/autoquant_v2.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -31,6 +31,8 @@
     TORCH_VERSION_AT_LEAST_2_3,
     TORCH_VERSION_AT_LEAST_2_5,
     TorchAOBaseTensor,
+    is_sm_at_least_89,
+    is_sm_at_least_90,
 )
 
 from torchao.quantization.granularity import (
@@ -63,6 +65,7 @@
     "DEFAULT_INT4_AUTOQUANT_CLASS_LIST",
     "DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST",
     "OTHER_AUTOQUANT_CLASS_LIST",
+    "ALL_AUTOQUANT_CLASS_LIST",
     "_is_linear",
 ]
 
@@ -1087,6 +1090,13 @@ def get_weight_block_size(x):
     AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight,
 ]
 
+ALL_AUTOQUANT_CLASS_LIST = list(set(DEFAULT_AUTOQUANT_CLASS_LIST + DEFAULT_INT4_AUTOQUANT_CLASS_LIST + DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST))
+if is_sm_at_least_89():
+    ALL_AUTOQUANT_CLASS_LIST += [AQFloat8WeightOnlyQuantizedLinearWeight, AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight]
+
+if is_sm_at_least_90():
+    ALL_AUTOQUANT_CLASS_LIST += [AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight]
+
 
 def _replace_with_custom_fn_if_matches_filter(
     model,
```
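The `is_sm_at_least_89` / `is_sm_at_least_90` helpers come from `torchao.utils`; a rough sketch of the check they perform (the exact implementation here is an assumption based on the standard `torch.cuda.get_device_capability` API):

```python
import torch

def sm_at_least(major: int, minor: int) -> bool:
    """Approximation of torchao.utils.is_sm_at_least_89/_90."""
    return (
        torch.cuda.is_available()
        and torch.cuda.get_device_capability() >= (major, minor)
    )

# SM 8.9 (Ada-class) gates the float8 weight-only and per-tensor classes;
# SM 9.0 (Hopper) additionally gates the per-row scaling class.
print(sm_at_least(8, 9), sm_at_least(9, 0))
```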

torchao/quantization/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -10,6 +10,7 @@
 )
 
 from .autoquant import (
+    ALL_AUTOQUANT_CLASS_LIST,
     DEFAULT_AUTOQUANT_CLASS_LIST,
     DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST,
     DEFAULT_INT4_AUTOQUANT_CLASS_LIST,
@@ -92,6 +93,7 @@
     "DEFAULT_INT4_AUTOQUANT_CLASS_LIST",
     "DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST",
     "OTHER_AUTOQUANT_CLASS_LIST",
+    "ALL_AUTOQUANT_CLASS_LIST",
     # top level API - manual
     "quantize_",
     "int8_dynamic_activation_int4_weight",
```

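With the re-export above, the aggregate list is reachable from `torchao.quantization`; a quick check (no fixed length is asserted, since the membership varies with the detected SM version):

```python
from torchao.quantization import ALL_AUTOQUANT_CLASS_LIST

print(len(ALL_AUTOQUANT_CLASS_LIST))  # member count depends on the GPU's SM version
```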
torchao/quantization/autoquant.py

Lines changed: 19 additions & 0 deletions
```diff
@@ -26,6 +26,8 @@
     TORCH_VERSION_AT_LEAST_2_3,
     TORCH_VERSION_AT_LEAST_2_5,
     TorchAOBaseTensor,
+    is_sm_at_least_89,
+    is_sm_at_least_90,
 )
 
 from .granularity import (
@@ -45,6 +47,7 @@
     "DEFAULT_INT4_AUTOQUANT_CLASS_LIST",
     "DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST",
     "OTHER_AUTOQUANT_CLASS_LIST",
+    "ALL_AUTOQUANT_CLASS_LIST",
 ]
 
 
@@ -951,6 +954,22 @@ def get_weight_block_size(x):
     AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight,
 ]
 
+ALL_AUTOQUANT_CLASS_LIST = list(
+    set(
+        DEFAULT_AUTOQUANT_CLASS_LIST
+        + DEFAULT_INT4_AUTOQUANT_CLASS_LIST
+        + DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST
+    )
+)
+if is_sm_at_least_89():
+    ALL_AUTOQUANT_CLASS_LIST += [
+        AQFloat8WeightOnlyQuantizedLinearWeight,
+        AQFloat8PerTensorScalingDynamicallyQuantizedLinearWeight,
+    ]
+
+if is_sm_at_least_90():
+    ALL_AUTOQUANT_CLASS_LIST += [AQFloat8PerRowScalingDynamicallyQuantizedLinearWeight]
+
 
 def _change_linears_to_autoquantizable(model, **kwargs):
     """
```

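A small sanity sketch of what the aggregation above produces: a de-duplicated union of the three default lists, plus the SM-gated float8 members. All names come from the diffs in this commit; the subset property is the only claim checked.

```python
from torchao.quantization import (
    ALL_AUTOQUANT_CLASS_LIST,
    DEFAULT_AUTOQUANT_CLASS_LIST,
    DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST,
    DEFAULT_INT4_AUTOQUANT_CLASS_LIST,
)

base = (
    set(DEFAULT_AUTOQUANT_CLASS_LIST)
    | set(DEFAULT_INT4_AUTOQUANT_CLASS_LIST)
    | set(DEFAULT_FLOAT_AUTOQUANT_CLASS_LIST)
)
# Every default class appears in the aggregate; float8 extras show up only
# when the SM gates fire on the current device.
assert base.issubset(set(ALL_AUTOQUANT_CLASS_LIST))
```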