diff --git a/setup.py b/setup.py
index 31dd6a1094..6174196de2 100644
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,7 @@ def read_version(file_path="version.txt"):
 
 import platform
 
-build_torchao_experimental = (
+build_macos_arm_auto = (
     use_cpp == "1"
     and platform.machine().startswith("arm64")
     and platform.system() == "Darwin"
@@ -121,8 +121,33 @@ def __init__(self):
                 "TORCHAO_BUILD_EXPERIMENTAL_MPS requires MPS be available"
             )
 
+        # TORCHAO_PARALLEL_BACKEND specifies which parallel backend to use.
+        # Possible values: aten_openmp, executorch, openmp, pthreadpool, single_threaded
+        self.parallel_backend = os.getenv("TORCHAO_PARALLEL_BACKEND", "aten_openmp")
+
+        # TORCHAO_ENABLE_ARM_NEON_DOT enables the ARM NEON Dot Product extension.
+        # Enabled by default on Apple Silicon macOS.
+        self.enable_arm_neon_dot = self._os_bool_var(
+            "TORCHAO_ENABLE_ARM_NEON_DOT",
+            default=(self._is_arm64() and self._is_macos()),
+        )
+        if self.enable_arm_neon_dot:
+            assert self.build_cpu_aarch64, (
+                "TORCHAO_ENABLE_ARM_NEON_DOT requires TORCHAO_BUILD_CPU_AARCH64 be set"
+            )
+
+        # TORCHAO_ENABLE_ARM_I8MM enables the ARM 8-bit Integer Matrix Multiply instructions.
+        # Not enabled by default on macOS because not all Apple Silicon chips support it.
+        self.enable_arm_i8mm = self._os_bool_var(
+            "TORCHAO_ENABLE_ARM_I8MM", default=False
+        )
+        if self.enable_arm_i8mm:
+            assert self.build_cpu_aarch64, (
+                "TORCHAO_ENABLE_ARM_I8MM requires TORCHAO_BUILD_CPU_AARCH64 be set"
+            )
+
     def _is_arm64(self) -> bool:
-        return platform.machine().startswith("arm64")
+        return platform.machine().startswith("arm64") or platform.machine() == "aarch64"
 
     def _is_macos(self) -> bool:
         return platform.system() == "Darwin"
@@ -498,7 +523,8 @@ def get_extensions():
             )
         )
 
-    if build_torchao_experimental:
+    # Build CMakeLists from /torchao/experimental - additional options become available: TORCHAO_BUILD_CPU_AARCH64, TORCHAO_BUILD_KLEIDIAI, TORCHAO_BUILD_MPS_OPS, TORCHAO_PARALLEL_BACKEND
+    if build_macos_arm_auto or os.getenv("BUILD_TORCHAO_EXPERIMENTAL") == "1":
         build_options = BuildOptions()
 
         def bool_to_on_off(value):
@@ -518,6 +544,9 @@ def bool_to_on_off(value):
                     f"-DTORCHAO_BUILD_CPU_AARCH64={bool_to_on_off(build_options.build_cpu_aarch64)}",
                     f"-DTORCHAO_BUILD_KLEIDIAI={bool_to_on_off(build_options.build_kleidi_ai)}",
                     f"-DTORCHAO_BUILD_MPS_OPS={bool_to_on_off(build_options.build_experimental_mps)}",
+                    f"-DTORCHAO_ENABLE_ARM_NEON_DOT={bool_to_on_off(build_options.enable_arm_neon_dot)}",
+                    f"-DTORCHAO_ENABLE_ARM_I8MM={bool_to_on_off(build_options.enable_arm_i8mm)}",
+                    f"-DTORCHAO_PARALLEL_BACKEND={build_options.parallel_backend}",
                     "-DTorch_DIR=" + torch_dir,
                 ]
                 + (
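For reviewers, a minimal sketch of how the new knobs above are exercised end to end. The environment variable names come from this patch; `USE_CPP` and the `setup.py develop` entry point are assumptions about the surrounding build, not something this diff adds:

```python
# Sketch: drive the new build options via environment variables.
import os
import subprocess

env = dict(os.environ)
env.update(
    {
        "USE_CPP": "1",  # assumed: gates build_macos_arm_auto on macOS/arm64
        "BUILD_TORCHAO_EXPERIMENTAL": "1",  # force the experimental build elsewhere
        "TORCHAO_BUILD_CPU_AARCH64": "1",  # required by both ARM flags below
        "TORCHAO_ENABLE_ARM_NEON_DOT": "1",  # armv8.4-a dot-product kernels
        "TORCHAO_ENABLE_ARM_I8MM": "0",  # armv8.6-a i8mm; not on all Apple Silicon
        "TORCHAO_PARALLEL_BACKEND": "aten_openmp",  # default parallel backend
    }
)
subprocess.check_call(["python", "setup.py", "develop"], env=env)
```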
diff --git a/torchao/experimental/CMakeLists.txt b/torchao/experimental/CMakeLists.txt
index e6b2a6aff0..4dd02b2dd7 100644
--- a/torchao/experimental/CMakeLists.txt
+++ b/torchao/experimental/CMakeLists.txt
@@ -15,10 +15,13 @@ if (NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Release)
 endif()
 
+# Platform options
 option(TORCHAO_BUILD_EXECUTORCH_OPS "Building torchao ops for ExecuTorch." OFF)
 option(TORCHAO_BUILD_MPS_OPS "Building torchao MPS ops" OFF)
 option(TORCHAO_BUILD_CPU_AARCH64 "Build torchao's CPU aarch64 kernels" OFF)
 option(TORCHAO_BUILD_KLEIDIAI "Download, build, and link against Arm KleidiAI library (arm64 only)" OFF)
+option(TORCHAO_ENABLE_ARM_NEON_DOT "Enable ARM Neon Dot Product extension" OFF)
+option(TORCHAO_ENABLE_ARM_I8MM "Enable ARM 8-bit Integer Matrix Multiply instructions" OFF)
 
 if(NOT TORCHAO_INCLUDE_DIRS)
   set(TORCHAO_INCLUDE_DIRS ${CMAKE_CURRENT_SOURCE_DIR}/../..)
@@ -28,19 +31,49 @@ if(NOT DEFINED TORCHAO_PARALLEL_BACKEND)
   set(TORCHAO_PARALLEL_BACKEND aten_openmp)
 endif()
 
-include(CMakePrintHelpers)
-
+# Set default compiler options
 add_compile_options("-Wall" "-Werror" "-Wno-deprecated")
 
 include(CMakePrintHelpers)
 message("TORCHAO_INCLUDE_DIRS: ${TORCHAO_INCLUDE_DIRS}")
 include_directories(${TORCHAO_INCLUDE_DIRS})
 
-
 if(TORCHAO_BUILD_CPU_AARCH64)
   message(STATUS "Building with cpu/aarch64")
   add_compile_definitions(TORCHAO_BUILD_CPU_AARCH64)
-  add_compile_definitions(TORCHAO_ENABLE_ARM_NEON_DOT)
+
+  # Set aarch64 compiler options
+  if (CMAKE_SYSTEM_NAME STREQUAL "Linux")
+    message(STATUS "Add aarch64 linux compiler options")
+    add_compile_options(
+      "-fPIC"
+      "-Wno-error=unknown-pragmas"
+      "-Wno-array-parameter"
+      "-Wno-maybe-uninitialized"
+      "-Wno-sign-compare"
+    )
+
+    # Since -march versions are hierarchical (each includes features from prior versions):
+    # - dotprod is included by default in armv8.4-a and later
+    # - i8mm is included by default in armv8.6-a and later
+    if(TORCHAO_ENABLE_ARM_I8MM)
+      message(STATUS "Using armv8.6-a (includes 'i8mm' and 'dotprod' flags)")
+      add_compile_options("-march=armv8.6-a")
+    elseif(TORCHAO_ENABLE_ARM_NEON_DOT)
+      message(STATUS "Using armv8.4-a (includes '+dotprod' flag)")
+      add_compile_options("-march=armv8.4-a")
+    endif()
+  endif()
+
+  if(TORCHAO_ENABLE_ARM_NEON_DOT)
+    message(STATUS "Building with ARM NEON dot product support")
+    add_compile_definitions(TORCHAO_ENABLE_ARM_NEON_DOT)
+  endif()
+
+  if(TORCHAO_ENABLE_ARM_I8MM)
+    message(STATUS "Building with ARM I8MM support")
+    add_compile_definitions(TORCHAO_ENABLE_ARM_I8MM)
+  endif()
 
   # Defines torchao_kernels_aarch64
   add_subdirectory(kernels/cpu/aarch64)
@@ -51,26 +84,33 @@ if(TORCHAO_BUILD_CPU_AARCH64)
   endif()
 endif()
 
+# Add quantized operation dirs
 add_subdirectory(ops/linear_8bit_act_xbit_weight)
 add_subdirectory(ops/embedding_xbit)
 
+# ATen ops lib
 add_library(torchao_ops_aten SHARED)
 target_link_libraries(
   torchao_ops_aten PRIVATE
   torchao_ops_linear_8bit_act_xbit_weight_aten
   torchao_ops_embedding_xbit_aten
 )
+
+# Add MPS support if enabled
 if (TORCHAO_BUILD_MPS_OPS)
   message(STATUS "Building with MPS support")
   add_subdirectory(ops/mps)
   target_link_libraries(torchao_ops_aten PRIVATE torchao_ops_mps_aten)
 endif()
 
+# Install ATen targets
 install(
   TARGETS torchao_ops_aten
   EXPORT _targets
   DESTINATION lib
 )
+
+# Build ExecuTorch lib if enabled
 if(TORCHAO_BUILD_EXECUTORCH_OPS)
   add_library(torchao_ops_executorch STATIC)
   target_link_libraries(torchao_ops_executorch PRIVATE
diff --git a/torchao/experimental/build_torchao_ops.sh b/torchao/experimental/build_torchao_ops.sh
index 782e187092..1bcc1a9658 100644
--- a/torchao/experimental/build_torchao_ops.sh
+++ b/torchao/experimental/build_torchao_ops.sh
@@ -22,6 +22,7 @@ cmake -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} \
     -DCMAKE_INSTALL_PREFIX=${CMAKE_OUT} \
     -DTORCHAO_BUILD_EXECUTORCH_OPS="${TORCHAO_BUILD_EXECUTORCH_OPS}" \
     -DTORCHAO_BUILD_CPU_AARCH64=ON \
+    -DTORCHAO_ENABLE_ARM_NEON_DOT=ON \
     -S . \
     -B ${CMAKE_OUT}
 cmake --build ${CMAKE_OUT} -j 16 --target install --config Release
diff --git a/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp b/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp
index 8cbf036957..0274b0889e 100644
--- a/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp
+++ b/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp
@@ -7,6 +7,7 @@
 #include <torchao/experimental/kernels/cpu/aarch64/valpacking/valpack.h>
 #include <cassert>
 #include <cstring>
+#include <cstdint>
 
 // Interleaves data across channels (row/column) and groups.
 // Each channel is the same size (vals_per_channel) and is
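The `-march` selection above is a compile-time choice only: a binary built with `TORCHAO_ENABLE_ARM_I8MM=ON` can die with SIGILL on a host whose cores lack i8mm. A hedged sketch of probing the host before opting in; the sysctl key and the `/proc/cpuinfo` feature flag are standard OS interfaces, not part of this patch:

```python
# Sketch: decide whether TORCHAO_ENABLE_ARM_I8MM is safe on this host.
# Assumptions: macOS exposes sysctl hw.optional.arm.FEAT_I8MM, and Linux
# lists "i8mm" in the Features line of /proc/cpuinfo on capable cores.
import platform
import subprocess


def host_supports_i8mm() -> bool:
    system = platform.system()
    if system == "Darwin":
        out = subprocess.run(
            ["sysctl", "-n", "hw.optional.arm.FEAT_I8MM"],
            capture_output=True,
            text=True,
        )
        # The key is absent on older chips; sysctl then fails -> False.
        return out.returncode == 0 and out.stdout.strip() == "1"
    if system == "Linux":
        try:
            with open("/proc/cpuinfo") as f:
                return " i8mm" in f.read()
        except OSError:
            return False
    return False


print("i8mm supported:", host_supports_i8mm())
```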
diff --git a/torchao/experimental/op_lib.py b/torchao/experimental/op_lib.py
index 4fe478d1e8..456b0ca160 100644
--- a/torchao/experimental/op_lib.py
+++ b/torchao/experimental/op_lib.py
@@ -10,15 +10,44 @@
 from torch import Tensor
 from torch.library import impl
 
-# Load C++ ops
-lib_path = Path(__file__).parent.parent
-libs = list(lib_path.glob("libtorchao_ops_aten.*"))
-assert len(libs) == 1, (
-    f"Expected to find one libtorchao_ops_aten.* library at {lib_path}, but found {len(libs)}"
-)
-torch.ops.load_library(str(libs[0]))
-
+# Load C++ ops - use multiple potential paths
+potential_paths = [
+    # Standard path from the module location
+    Path(__file__).parent.parent,
+    # Site-packages installation path
+    Path(torch.__file__).parent.parent / "torchao",
+    # For editable installs
+    Path(__file__).parent.parent.parent / "torchao",
+]
+
+
+def find_and_load_libtorchao_ops(potential_paths):
+    for lib_path in potential_paths:
+        libs = list(lib_path.glob("libtorchao_ops_aten.*"))
+
+        if not libs:
+            continue
+
+        assert len(libs) == 1, (
+            f"Expected to find one libtorchao_ops_aten.* library at {lib_path}, but found {len(libs)}"
+        )
+
+        target_lib = libs[0]
+        print(f"Found library at: {target_lib}")
+
+        try:
+            torch.ops.load_library(str(target_lib))
+            return
+        except Exception as e:
+            print(f"Error loading library from {target_lib}: {e}")
+
+    raise FileNotFoundError(
+        "Could not find libtorchao_ops_aten library in any of the provided paths"
+    )
+
+
+find_and_load_libtorchao_ops(potential_paths)
 
 # Define meta ops. To support dynamic shapes, some meta ops need to
 # be defined in python instead of C++.
 torchao_lib = torch.library.Library("torchao", "IMPL")
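A usage sketch for the new loader. The extra search path here is hypothetical; note that importing `op_lib` already runs the default search over `potential_paths` and raises `FileNotFoundError` if nothing is found:

```python
# Sketch: point the new loader at an additional, out-of-tree location
# (e.g. a local CMake output dir; the path below is hypothetical).
from pathlib import Path

from torchao.experimental.op_lib import find_and_load_libtorchao_ops

find_and_load_libtorchao_ops([Path("/tmp/cmake-out/lib")])
```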
diff --git a/torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_config.h b/torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_config.h
index 114e97838c..b699bdd3d3 100644
--- a/torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_config.h
+++ b/torchao/experimental/ops/linear_8bit_act_xbit_weight/kernel_config.h
@@ -190,7 +190,7 @@ struct UKernelConfig {
         pack_weights != nullptr || pack_weights_with_lut != nullptr,
         "pack_weights or pack_weights_with_lut must be set");
     bool linear_configs_set = true; // first linear config must be set
-    for (int i = 0; i < linear_configs.size(); i++) {
+    for (size_t i = 0; i < linear_configs.size(); i++) {
       if (linear_configs_set) {
         TORCHAO_CHECK(
             linear_configs[i].m_step >= 1,
@@ -225,7 +225,7 @@ struct UKernelConfig {
     assert(m >= 1);
     assert(linear_configs[0].m_step >= 1);
 
-    int i = 0;
+    size_t i = 0;
     while (i + 1 < linear_configs.size() && linear_configs[i + 1].m_step >= 1 &&
            linear_configs[i + 1].m_step <= m) {
       assert(linear_configs[i].m_step < linear_configs[i + 1].m_step);
@@ -235,7 +235,7 @@ struct UKernelConfig {
     assert(i < linear_configs.size());
     assert(linear_configs[i].m_step >= 1);
     assert(i == 0 || linear_configs[i].m_step <= m);
-    return i;
+    return static_cast<int>(i);
   }
 };
diff --git a/torchao/experimental/ops/packed_weights_header.h b/torchao/experimental/ops/packed_weights_header.h
index 0869c12ef9..11703e8454 100644
--- a/torchao/experimental/ops/packed_weights_header.h
+++ b/torchao/experimental/ops/packed_weights_header.h
@@ -43,7 +43,7 @@ class PackedWeightsHeader {
     auto header = reinterpret_cast<unsigned short*>(packed_weights);
     header[0] = magic;
     header[1] = static_cast<unsigned short>(type);
-    for (int i = 0; i < params.size(); i++) {
+    for (size_t i = 0; i < params.size(); i++) {
       header[i + 2] = params[i];
     }
   }
@@ -52,7 +52,7 @@ class PackedWeightsHeader {
     auto header = reinterpret_cast<const unsigned short*>(packed_weights);
     assert(header[0] == PackedWeightsHeader::magic);
     params_type params;
-    for (int i = 0; i < params.size(); i++) {
+    for (size_t i = 0; i < params.size(); i++) {
       params[i] = header[i + 2];
     }
     return PackedWeightsHeader(
@@ -63,7 +63,7 @@ class PackedWeightsHeader {
     if (type != other.type) {
       return false;
     }
-    for (int i = 0; i < params.size(); i++) {
+    for (size_t i = 0; i < params.size(); i++) {
       if (params[i] != other.params[i]) {
         return false;
       }
@@ -79,7 +79,7 @@ namespace std {
 struct hash<torchao::ops::PackedWeightsHeader> {
   std::size_t operator()(const torchao::ops::PackedWeightsHeader& f) const {
     std::size_t hash = std::hash<int>()(static_cast<int>(f.type));
-    for (int i = 0; i < f.params.size(); i++) {
+    for (size_t i = 0; i < f.params.size(); i++) {
       hash ^= std::hash<unsigned short>()(f.params[i]);
     }
     return hash;
diff --git a/torchao/experimental/ops/parallel-aten-impl.h b/torchao/experimental/ops/parallel-aten-impl.h
index 07725f70eb..c2eb0b8498 100644
--- a/torchao/experimental/ops/parallel-aten-impl.h
+++ b/torchao/experimental/ops/parallel-aten-impl.h
@@ -5,7 +5,7 @@
 // LICENSE file in the root directory of this source tree.
 #pragma once
 
-#include <torch/torch.h>
+#include <ATen/Parallel.h>
 #include <torchao/experimental/ops/parallel.h>
 #include <algorithm>
diff --git a/torchao/experimental/tests/test_load_libtorchao_ops.py b/torchao/experimental/tests/test_load_libtorchao_ops.py
new file mode 100644
index 0000000000..4fec52f494
--- /dev/null
+++ b/torchao/experimental/tests/test_load_libtorchao_ops.py
@@ -0,0 +1,53 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from pathlib import Path
+from unittest.mock import MagicMock, patch
+
+
+class TestLibTorchAoOpsLoader(unittest.TestCase):
+    def test_find_and_load_success(self):
+        mock_paths = [Path("/test/path1")]
+        mock_lib = MagicMock()
+        mock_lib.__str__.return_value = "/test/path1/libtorchao_ops_aten.so"
+
+        with patch("pathlib.Path.glob", return_value=[mock_lib]):
+            with patch("torch.ops.load_library") as mock_load:
+                from ..op_lib import find_and_load_libtorchao_ops
+
+                find_and_load_libtorchao_ops(mock_paths)
+
+                mock_load.assert_called_once_with("/test/path1/libtorchao_ops_aten.so")
+
+    def test_no_library_found(self):
+        mock_paths = [Path("/test/path1"), Path("/test/path2")]
+
+        with patch("pathlib.Path.glob", return_value=[]):
+            from ..op_lib import find_and_load_libtorchao_ops
+
+            with self.assertRaises(FileNotFoundError):
+                find_and_load_libtorchao_ops(mock_paths)
+
+    def test_multiple_libraries_error(self):
+        mock_paths = [Path("/test/path1")]
+        mock_lib1 = MagicMock()
+        mock_lib2 = MagicMock()
+        mock_libs = [mock_lib1, mock_lib2]
+
+        with patch("pathlib.Path.glob", return_value=mock_libs):
+            from ..op_lib import find_and_load_libtorchao_ops
+
+            try:
+                find_and_load_libtorchao_ops(mock_paths)
+                self.fail("Expected AssertionError was not raised")
+            except AssertionError as e:
+                expected_error_msg = f"Expected to find one libtorchao_ops_aten.* library at {mock_paths[0]}, but found 2"
+                self.assertIn(expected_error_msg, str(e))
+
+
+if __name__ == "__main__":
+    unittest.main()
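Assuming a successful build, a quick smoke check to complement the mocked tests above: it exercises the real loader on import rather than asserting specific op names, which depend on the build flags chosen earlier:

```python
# Sketch: verify the shared library loads end to end. The import runs
# find_and_load_libtorchao_ops and raises on failure; torch.ops.torchao
# is the namespace the C++ library registers its operators into.
import torch

import torchao.experimental.op_lib  # noqa: F401

print("libtorchao_ops_aten loaded; op namespace:", torch.ops.torchao)
```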