Skip to content

Commit ae3e7c6

Browse files
authored
Change arg order in lowbit linear ops to match aten (#982)
Change arg order in lowbit linear ops to match aten (#982) Summary: Pull Request resolved: #982 Discussed with Manuel to align on arg order between CPU/MPS ops. Reviewed By: digantdesai, manuelcandales Differential Revision: D63422524
1 parent 68e1886 commit ae3e7c6

File tree

14 files changed

+164
-92
lines changed

14 files changed

+164
-92
lines changed

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -218,14 +218,14 @@ Tensor pack_weights_with_zeros_meta(
218218
#if defined(USE_ATEN) || defined(USE_EXECUTORCH)
219219
template <int weight_nbit, bool has_weight_zeros>
220220
Tensor linear_out_cpu(
221+
const Tensor& activations,
221222
const Tensor& packed_weights,
222223
// TODO(T200095131): convert n_tensor, k_tensor, group_size_tensor to
223224
// int64_t when supported by AOTI Currently they are tensors with size
224225
// equal to (0, the int they wrap)
226+
const Tensor& group_size_tensor,
225227
const Tensor& n_tensor,
226228
const Tensor& k_tensor,
227-
const Tensor& group_size_tensor,
228-
const Tensor& activations,
229229
Tensor& out) {
230230
int n = n_tensor.size(1);
231231
int k = k_tensor.size(1);
@@ -307,21 +307,21 @@ Tensor linear_out_cpu(
307307
#ifdef USE_ATEN
308308
template <int weight_nbit, bool has_weight_zeros>
309309
Tensor linear_cpu(
310+
const Tensor& activations,
310311
const Tensor& packed_weights,
311312
// TODO(T200095131): convert n_tensor, k_tensor, group_size_tensor to
312313
// int64_t when supported by AOTI Currently they are tensors with size
313314
// equal to (0, the int they wrap)
314-
const Tensor& n_tensor,
315-
const Tensor& k_tensor,
316315
const Tensor& group_size_tensor,
317-
const Tensor& activations) {
316+
const Tensor& n_tensor,
317+
const Tensor& k_tensor) {
318318
Tensor output_tensor = torch::empty({}, torch::kFloat32);
319319
linear_out_cpu<weight_nbit, has_weight_zeros>(
320+
activations,
320321
packed_weights,
322+
group_size_tensor,
321323
n_tensor,
322324
k_tensor,
323-
group_size_tensor,
324-
activations,
325325
output_tensor);
326326
return output_tensor;
327327
}
@@ -330,14 +330,14 @@ Tensor linear_cpu(
330330
#ifdef USE_ATEN
331331
template <int weight_nbit, bool has_weight_zeros>
332332
Tensor linear_meta(
333+
const Tensor& activations,
333334
const Tensor& packed_weights,
334335
// TODO(T200095131): convert n_tensor, k_tensor, group_size_tensor to
335336
// int64_t when supported by AOTI
336337
// Currently they are tensors with size equal to (0, the int they wrap)
337-
const Tensor& n_tensor,
338-
const Tensor& k_tensor,
339338
const Tensor& group_size_tensor,
340-
const Tensor& activations) {
339+
const Tensor& n_tensor,
340+
const Tensor& k_tensor) {
341341
int n = n_tensor.size(1);
342342
int k = k_tensor.size(1);
343343
CHECK_MSG(n >= 1, "n must be >= 1");

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_aten.cpp

Lines changed: 53 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -6,67 +6,78 @@
66

77
#include <torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h>
88

9-
#define DEFINE_OP(weight_nbit) \
10-
m.def( \
11-
"_pack_weights_a8sz_w" #weight_nbit \
12-
"s(Tensor weight_qvals, Tensor weight_scales, Tensor group_size) -> Tensor"); \
13-
m.def( \
14-
"_pack_weights_a8sz_w" #weight_nbit \
15-
"sz(Tensor weight_qvals, Tensor weight_scales, Tensor weight_zeros, Tensor group_size) -> Tensor"); \
16-
m.def( \
17-
"_linear_a8sz_w" #weight_nbit \
18-
"s(Tensor packed_weights, Tensor n, Tensor k, Tensor group_size, Tensor activations) -> Tensor"); \
19-
m.def( \
20-
"_linear_a8sz_w" #weight_nbit \
21-
"sz(Tensor packed_weights, Tensor n, Tensor k, Tensor group_size, Tensor activations) -> Tensor"); \
22-
m.def( \
23-
"_linear_a8sz_w" #weight_nbit \
24-
"s.out(Tensor packed_weights, Tensor n, Tensor k, Tensor group_size, Tensor activations, *, Tensor(a!) out) -> Tensor(a!)"); \
25-
m.def( \
26-
"_linear_a8sz_w" #weight_nbit \
27-
"sz.out(Tensor packed_weights, Tensor n, Tensor k, Tensor group_size, Tensor activations, *, Tensor(a!) out) -> Tensor(a!)")
9+
#define DEFINE_OP(weight_nbit) \
10+
m.def( \
11+
"_pack_8bit_act_" #weight_nbit \
12+
"bit0zp_weight(Tensor weight_qvals, Tensor weight_scales, Tensor group_size) -> Tensor"); \
13+
m.def( \
14+
"_pack_8bit_act_" #weight_nbit \
15+
"bit_weight(Tensor weight_qvals, Tensor weight_scales, Tensor weight_zeros, Tensor group_size) -> Tensor"); \
16+
m.def( \
17+
"_linear_8bit_act_" #weight_nbit \
18+
"bit0zp_weight(Tensor activations, Tensor packed_weights, Tensor group_size, Tensor n, Tensor k) -> Tensor"); \
19+
m.def( \
20+
"_linear_8bit_act_" #weight_nbit \
21+
"bit_weight(Tensor activations, Tensor packed_weights, Tensor group_size, Tensor n, Tensor k) -> Tensor"); \
22+
m.def( \
23+
"_linear_8bit_act_" #weight_nbit \
24+
"bit0zp_weight.out(Tensor activations, Tensor packed_weights, Tensor group_size, Tensor n, Tensor k, *, Tensor(a!) out) -> Tensor(a!)"); \
25+
m.def( \
26+
"_linear_8bit_act_" #weight_nbit \
27+
"bit_weight.out(Tensor activations, Tensor packed_weights, Tensor group_size, Tensor n, Tensor k, *, Tensor(a!) out) -> Tensor(a!)")
2828

29-
#define DEFINE_CPU_IMPL(weight_nbit) \
30-
m.impl( \
31-
"_pack_weights_a8sz_w" #weight_nbit "s", \
32-
&pack_weights_without_zeros_cpu<weight_nbit>); \
33-
m.impl( \
34-
"_pack_weights_a8sz_w" #weight_nbit "sz", \
35-
&pack_weights_with_zeros_cpu<weight_nbit>); \
36-
m.impl("_linear_a8sz_w" #weight_nbit "s", &linear_cpu<weight_nbit, false>); \
37-
m.impl("_linear_a8sz_w" #weight_nbit "sz", &linear_cpu<weight_nbit, true>); \
38-
m.impl( \
39-
"_linear_a8sz_w" #weight_nbit "s.out", \
40-
&linear_out_cpu<weight_nbit, false>); \
41-
m.impl( \
42-
"_linear_a8sz_w" #weight_nbit "sz.out", \
29+
#define DEFINE_CPU_IMPL(weight_nbit) \
30+
m.impl( \
31+
"_pack_8bit_act_" #weight_nbit "bit0zp_weight", \
32+
&pack_weights_without_zeros_cpu<weight_nbit>); \
33+
m.impl( \
34+
"_pack_8bit_act_" #weight_nbit "bit_weight", \
35+
&pack_weights_with_zeros_cpu<weight_nbit>); \
36+
m.impl( \
37+
"_linear_8bit_act_" #weight_nbit "bit0zp_weight", \
38+
&linear_cpu<weight_nbit, false>); \
39+
m.impl( \
40+
"_linear_8bit_act_" #weight_nbit "bit_weight", \
41+
&linear_cpu<weight_nbit, true>); \
42+
m.impl( \
43+
"_linear_8bit_act_" #weight_nbit "bit0zp_weight.out", \
44+
&linear_out_cpu<weight_nbit, false>); \
45+
m.impl( \
46+
"_linear_8bit_act_" #weight_nbit "bit_weight.out", \
4347
&linear_out_cpu<weight_nbit, true>)
4448

45-
#define DEFINE_META_IMPL(weight_nbit) \
46-
m.impl( \
47-
"_pack_weights_a8sz_w" #weight_nbit "s", \
48-
&pack_weights_without_zeros_meta<weight_nbit>); \
49-
m.impl( \
50-
"_pack_weights_a8sz_w" #weight_nbit "sz", \
51-
&pack_weights_with_zeros_meta<weight_nbit>); \
52-
m.impl("_linear_a8sz_w" #weight_nbit "s", &linear_meta<weight_nbit, false>); \
53-
m.impl("_linear_a8sz_w" #weight_nbit "sz", &linear_meta<weight_nbit, true>);
49+
#define DEFINE_META_IMPL(weight_nbit) \
50+
m.impl( \
51+
"_pack_8bit_act_" #weight_nbit "bit0zp_weight", \
52+
&pack_weights_without_zeros_meta<weight_nbit>); \
53+
m.impl( \
54+
"_pack_8bit_act_" #weight_nbit "bit_weight", \
55+
&pack_weights_with_zeros_meta<weight_nbit>); \
56+
m.impl( \
57+
"_linear_8bit_act_" #weight_nbit "bit0zp_weight", \
58+
&linear_meta<weight_nbit, false>); \
59+
m.impl( \
60+
"_linear_8bit_act_" #weight_nbit "bit_weight", \
61+
&linear_meta<weight_nbit, true>);
5462

5563
TORCH_LIBRARY(torchao, m) {
64+
DEFINE_OP(1);
5665
DEFINE_OP(2);
5766
DEFINE_OP(3);
5867
DEFINE_OP(4);
5968
DEFINE_OP(5);
6069
}
6170

6271
TORCH_LIBRARY_IMPL(torchao, CPU, m) {
72+
DEFINE_CPU_IMPL(1);
6373
DEFINE_CPU_IMPL(2);
6474
DEFINE_CPU_IMPL(3);
6575
DEFINE_CPU_IMPL(4);
6676
DEFINE_CPU_IMPL(5);
6777
}
6878

6979
TORCH_LIBRARY_IMPL(torchao, Meta, m) {
80+
DEFINE_META_IMPL(1);
7081
DEFINE_META_IMPL(2);
7182
DEFINE_META_IMPL(3);
7283
DEFINE_META_IMPL(4);
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
// Copyright (c) Meta Platforms, Inc. and affiliates.
2+
// All rights reserved.
3+
//
4+
// This source code is licensed under the license found in the
5+
// LICENSE file in the root directory of this source tree.
6+
7+
// Unlike ATen, ExecuTorch op registration appears to only allow one
8+
// EXECUTORCH_LIBRARY per cpp file due to a name redefinition error, so a new
9+
// file is needed for each variant
10+
11+
#include <torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h>
12+
13+
namespace {
14+
Tensor _op_out(
15+
RuntimeContext& ctx,
16+
const Tensor& activations,
17+
const Tensor& packed_weights,
18+
const Tensor& group_size_tensor,
19+
const Tensor& n_tensor,
20+
const Tensor& k_tensor,
21+
Tensor& out) {
22+
(void)ctx;
23+
linear_out_cpu</*weight_nbit*/ 1, /*has_weight_zeros*/ false>(
24+
activations, packed_weights, group_size_tensor, n_tensor, k_tensor, out);
25+
return out;
26+
}
27+
} // namespace
28+
29+
EXECUTORCH_LIBRARY(torchao, "_linear_8bit_act_1bit0zp_weight.out", _op_out);
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
// Copyright (c) Meta Platforms, Inc. and affiliates.
2+
// All rights reserved.
3+
//
4+
// This source code is licensed under the license found in the
5+
// LICENSE file in the root directory of this source tree.
6+
7+
// Unlike ATen, ExecuTorch op registration appears to only allow on
8+
// EXECUTORCH_LIBRARY per cpp file due to a name redefinition error, so a new
9+
// file is needed for each variant
10+
11+
#include <torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight-impl.h>
12+
13+
namespace {
14+
Tensor _op_out(
15+
RuntimeContext& ctx,
16+
const Tensor& activations,
17+
const Tensor& packed_weights,
18+
const Tensor& group_size_tensor,
19+
const Tensor& n_tensor,
20+
const Tensor& k_tensor,
21+
Tensor& out) {
22+
(void)ctx;
23+
linear_out_cpu</*weight_nbit*/ 1, /*has_weight_zeros*/ true>(
24+
activations, packed_weights, group_size_tensor, n_tensor, k_tensor, out);
25+
return out;
26+
}
27+
} // namespace
28+
29+
EXECUTORCH_LIBRARY(torchao, "_linear_8bit_act_1bit_weight.out", _op_out);

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch/w2s.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@
1313
namespace {
1414
Tensor _op_out(
1515
RuntimeContext& ctx,
16+
const Tensor& activations,
1617
const Tensor& packed_weights,
18+
const Tensor& group_size_tensor,
1719
const Tensor& n_tensor,
1820
const Tensor& k_tensor,
19-
const Tensor& group_size_tensor,
20-
const Tensor& activations,
2121
Tensor& out) {
2222
(void)ctx;
2323
linear_out_cpu</*weight_nbit*/ 2, /*has_weight_zeros*/ false>(
24-
packed_weights, n_tensor, k_tensor, group_size_tensor, activations, out);
24+
activations, packed_weights, group_size_tensor, n_tensor, k_tensor, out);
2525
return out;
2626
}
2727
} // namespace
2828

29-
EXECUTORCH_LIBRARY(torchao, "_linear_a8sz_w2s.out", _op_out);
29+
EXECUTORCH_LIBRARY(torchao, "_linear_8bit_act_2bit0zp_weight.out", _op_out);

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch/w2sz.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@
1313
namespace {
1414
Tensor _op_out(
1515
RuntimeContext& ctx,
16+
const Tensor& activations,
1617
const Tensor& packed_weights,
18+
const Tensor& group_size_tensor,
1719
const Tensor& n_tensor,
1820
const Tensor& k_tensor,
19-
const Tensor& group_size_tensor,
20-
const Tensor& activations,
2121
Tensor& out) {
2222
(void)ctx;
2323
linear_out_cpu</*weight_nbit*/ 2, /*has_weight_zeros*/ true>(
24-
packed_weights, n_tensor, k_tensor, group_size_tensor, activations, out);
24+
activations, packed_weights, group_size_tensor, n_tensor, k_tensor, out);
2525
return out;
2626
}
2727
} // namespace
2828

29-
EXECUTORCH_LIBRARY(torchao, "_linear_a8sz_w2sz.out", _op_out);
29+
EXECUTORCH_LIBRARY(torchao, "_linear_8bit_act_2bit_weight.out", _op_out);

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch/w3s.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@
1313
namespace {
1414
Tensor _op_out(
1515
RuntimeContext& ctx,
16+
const Tensor& activations,
1617
const Tensor& packed_weights,
18+
const Tensor& group_size_tensor,
1719
const Tensor& n_tensor,
1820
const Tensor& k_tensor,
19-
const Tensor& group_size_tensor,
20-
const Tensor& activations,
2121
Tensor& out) {
2222
(void)ctx;
2323
linear_out_cpu</*weight_nbit*/ 3, /*has_weight_zeros*/ false>(
24-
packed_weights, n_tensor, k_tensor, group_size_tensor, activations, out);
24+
activations, packed_weights, group_size_tensor, n_tensor, k_tensor, out);
2525
return out;
2626
}
2727
} // namespace
2828

29-
EXECUTORCH_LIBRARY(torchao, "_linear_a8sz_w3s.out", _op_out);
29+
EXECUTORCH_LIBRARY(torchao, "_linear_8bit_act_3bit0zp_weight.out", _op_out);

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch/w3sz.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@
1313
namespace {
1414
Tensor _op_out(
1515
RuntimeContext& ctx,
16+
const Tensor& activations,
1617
const Tensor& packed_weights,
18+
const Tensor& group_size_tensor,
1719
const Tensor& n_tensor,
1820
const Tensor& k_tensor,
19-
const Tensor& group_size_tensor,
20-
const Tensor& activations,
2121
Tensor& out) {
2222
(void)ctx;
2323
linear_out_cpu</*weight_nbit*/ 3, /*has_weight_zeros*/ true>(
24-
packed_weights, n_tensor, k_tensor, group_size_tensor, activations, out);
24+
activations, packed_weights, group_size_tensor, n_tensor, k_tensor, out);
2525
return out;
2626
}
2727
} // namespace
2828

29-
EXECUTORCH_LIBRARY(torchao, "_linear_a8sz_w3sz.out", _op_out);
29+
EXECUTORCH_LIBRARY(torchao, "_linear_8bit_act_3bit_weight.out", _op_out);

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch/w4s.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@
1313
namespace {
1414
Tensor _op_out(
1515
RuntimeContext& ctx,
16+
const Tensor& activations,
1617
const Tensor& packed_weights,
18+
const Tensor& group_size_tensor,
1719
const Tensor& n_tensor,
1820
const Tensor& k_tensor,
19-
const Tensor& group_size_tensor,
20-
const Tensor& activations,
2121
Tensor& out) {
2222
(void)ctx;
2323
linear_out_cpu</*weight_nbit*/ 4, /*has_weight_zeros*/ false>(
24-
packed_weights, n_tensor, k_tensor, group_size_tensor, activations, out);
24+
activations, packed_weights, group_size_tensor, n_tensor, k_tensor, out);
2525
return out;
2626
}
2727
} // namespace
2828

29-
EXECUTORCH_LIBRARY(torchao, "_linear_a8sz_w4s.out", _op_out);
29+
EXECUTORCH_LIBRARY(torchao, "_linear_8bit_act_4bit0zp_weight.out", _op_out);

torchao/experimental/ops/linear_8bit_act_xbit_weight/op_linear_8bit_act_xbit_weight_executorch/w4sz.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,17 +13,17 @@
1313
namespace {
1414
Tensor _op_out(
1515
RuntimeContext& ctx,
16+
const Tensor& activations,
1617
const Tensor& packed_weights,
18+
const Tensor& group_size_tensor,
1719
const Tensor& n_tensor,
1820
const Tensor& k_tensor,
19-
const Tensor& group_size_tensor,
20-
const Tensor& activations,
2121
Tensor& out) {
2222
(void)ctx;
2323
linear_out_cpu</*weight_nbit*/ 4, /*has_weight_zeros*/ true>(
24-
packed_weights, n_tensor, k_tensor, group_size_tensor, activations, out);
24+
activations, packed_weights, group_size_tensor, n_tensor, k_tensor, out);
2525
return out;
2626
}
2727
} // namespace
2828

29-
EXECUTORCH_LIBRARY(torchao, "_linear_a8sz_w4sz.out", _op_out);
29+
EXECUTORCH_LIBRARY(torchao, "_linear_8bit_act_4bit_weight.out", _op_out);

0 commit comments

Comments
 (0)