diff --git a/torchao/csrc/cuda/fp6_llm/fp6_linear.cu b/torchao/csrc/cuda/fp6_llm/fp6_linear.cu
index 1d44acde08..7f973e6987 100644
--- a/torchao/csrc/cuda/fp6_llm/fp6_linear.cu
+++ b/torchao/csrc/cuda/fp6_llm/fp6_linear.cu
@@ -14,6 +14,8 @@
 //
 // This file is adapted from https://github.com/usyd-fsalab/fp6_llm/blob/5df6737cca32f604e957e3f63f03ccc2e4d1df0d/fp6_llm/csrc/fp6_linear.cu
 
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // at least Ampere
+
 #include "kernel_matmul.cuh"
 #include "kernel_reduction.cuh"
 
@@ -200,3 +202,5 @@ TORCH_LIBRARY_IMPL(torchao, CUDA, m) {
 }
 
 } // namespace torchao
+
+#endif
diff --git a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
index 652bba5ca6..7af29caac9 100644
--- a/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
+++ b/torchao/csrc/cuda/tensor_core_tiled_layout/tensor_core_tiled_layout.cu
@@ -1,3 +1,5 @@
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // at least Ampere
+
 #include
 #include
 #include
@@ -310,3 +312,5 @@ TORCH_LIBRARY_IMPL(torchao, CUDA, m) {
   m.impl("torchao::dequantize_tensor_core_tiled_layout", &_dequantize_tensor_core_tiled_layout);
 }
+
+#endif
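
Both files are wrapped in the same preprocessor guard, so it is worth spelling out how it behaves. The sketch below is not part of the patch (the file name and ampere_only_kernel are hypothetical); it only illustrates the guard pattern under the assumption that nvcc compiles a .cu file once for the host and once per target GPU architecture, defining __CUDA_ARCH__ only during the device passes (e.g. 800 for sm_80). The guard therefore leaves everything visible to the host pass, including the TORCH_LIBRARY_IMPL registrations above, while device passes for pre-Ampere targets see an empty file and emit no device code.

// arch_guard_demo.cu (hypothetical file, illustration only)
// build: nvcc -gencode arch=compute_80,code=sm_80 arch_guard_demo.cu -o demo
#include <cstdio>
#include <cuda_runtime.h>

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800 // at least Ampere

// Hypothetical kernel standing in for the guarded fp6 / tensor-core kernels.
// Device code for it is emitted only when compiling for sm_80 or newer; the
// host pass (__CUDA_ARCH__ undefined) always sees it, so host-side wrappers
// and library registration keep compiling for every target architecture.
__global__ void ampere_only_kernel(float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    out[i] = static_cast<float>(i);
  }
}

#endif

int main() {
  const int n = 32;
  float* d_out = nullptr;
  cudaMalloc(&d_out, n * sizeof(float));

  // If the binary was built only for a pre-Ampere architecture, the guard
  // stripped the device code above, so this launch fails at runtime
  // ("invalid device function") instead of breaking the compile.
  ampere_only_kernel<<<1, n>>>(d_out, n);
  cudaError_t err = cudaDeviceSynchronize();
  printf("kernel launch: %s\n", cudaGetErrorString(err));

  cudaFree(d_out);
  return 0;
}

Built with -gencode arch=compute_70,code=sm_70 instead, this still compiles and links; the kernel simply has no sm_70 device code and the launch reports an error at runtime, which is the trade-off the guard makes to keep pre-Ampere builds compiling.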