easybuilders · VRehnberg · Sep 18, 2024 · Sep 23, 2024 · Sep 23, 2024 · Sep 24, 2024
diff --git a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.4.0-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.4.0-foss-2023a-CUDA-12.1.1.eb
@@ -21,8 +21,12 @@ toolchain = {'name': 'foss', 'version': '2023a'}
 
 github_account = 'NVIDIA'
 source_urls = [GITHUB_LOWER_SOURCE]
 sources = ['v%(version)s.tar.gz']
-checksums = ['49f4b854acc2a520126ceefe4f701cfe8c2b039045873e311b1f10a8ca5d5de1']
+patches = ['CUTLASS-3.4.0_install_tools.patch']
+checksums = [
+    {'v%(version)s.tar.gz': '49f4b854acc2a520126ceefe4f701cfe8c2b039045873e311b1f10a8ca5d5de1'},
+    {'CUTLASS-3.4.0_install_tools.patch': '52c2f17a0fb78febf7100288a25e001a8a7fa24d52c8437225146834d6efce79'},
+]
 
 builddependencies = [
     ('CMake', '3.26.3'),
@@ -38,12 +42,13 @@ _copts = [
     '-DCUTLASS_NVCC_ARCHS="%(cuda_cc_cmake)s"',
     '-DCUTLASS_ENABLE_CUBLAS=1',
     '-DCUTLASS_ENABLE_CUDNN=1',
+    '-DCUTLASS_ENABLE_TOOLS=1',  # the install_tools patch only installs tools/util/include when this is set
 ]
 configopts = ' '.join(_copts)
 
 sanity_check_paths = {
     'files': ['include/cutlass/cutlass.h', 'lib/libcutlass.%s' % SHLIB_EXT],
-    'dirs': ['lib/cmake'],
+    'dirs': ['lib/cmake', 'tools/util/include'],
 }
 
 moduleclass = 'lib'
diff --git a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.4.0_install_tools.patch b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.4.0_install_tools.patch
@@ -0,0 +1,33 @@
+From cb006bb549901f5a15676d9d4f59647508fea21b Mon Sep 17 00:00:00 2001
+From: Viktor Rehnberg <[email protected]>
+Date: Tue, 8 Oct 2024 08:24:23 +0000
+Subject: [PATCH] Optionally install tools/util/include
+
+DeepSpeed EvoformerAttn expects this file, see
+https://github.com/microsoft/DeepSpeed/blob/v0.14.5/op_builder/evoformer_attn.py#L76
+---
+ CMakeLists.txt | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 114d7936..fe506795 100755
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -623,6 +623,14 @@ install(
+   PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+   )
+
++if(CUTLASS_ENABLE_TOOLS)
++  install(
++    DIRECTORY ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}/
++    DESTINATION ${CMAKE_INSTALL_PREFIX}/tools/util/include
++    )
++endif()
++
++
+ ################################################################################
+
+ # Doxygen is available. Generate documentation
+-- 
+2.39.3
+
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch
@@ -0,0 +1,57 @@
+Patch away dependency on ninja python package by falling back to checking
+returncode of `ninja --version`.
+
+Author: Viktor Rehnberg (Chalmers University of Technology)
+
+
+diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py
+index 85a2f9b2..8bb64626 100644
+--- a/deepspeed/env_report.py
++++ b/deepspeed/env_report.py
+@@ -62,7 +62,7 @@ def ninja_installed():
+     try:
+         import ninja  # noqa: F401 # type: ignore
+     except ImportError:
+-        return False
++        return (subprocess.run(["ninja", "--version"]).returncode == 0)
+     return True
+
+
+diff --git a/op_builder/builder.py b/op_builder/builder.py
+index 8dc825c7..970d18b2 100644
+--- a/op_builder/builder.py
++++ b/op_builder/builder.py
+@@ -487,7 +487,8 @@ class OpBuilder(ABC):
+         try:
+             import ninja  # noqa: F401 # type: ignore
+         except ImportError:
+-            raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
++            if subprocess.run(["ninja", "--version"]).returncode != 0:
++                raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
+
+         if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
+             self.build_for_cpu = not torch.cuda.is_available()
+diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py
+index 81b15f19..cf0a1cc0 100644
+--- a/op_builder/xpu/builder.py
++++ b/op_builder/xpu/builder.py
+@@ -89,7 +89,8 @@ class SYCLOpBuilder(OpBuilder):
+         try:
+             import ninja  # noqa: F401
+         except ImportError:
+-            raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
++            if subprocess.run(["ninja", "--version"]).returncode != 0:
++                raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
+
+         self.jit_mode = True
+         from intel_extension_for_pytorch.xpu.cpp_extension import load
+diff --git a/requirements/requirements.txt b/requirements/requirements.txt
+index 80c9f9b3..eed77fa3 100755
+--- a/requirements/requirements.txt
++++ b/requirements/requirements.txt
+@@ -1,5 +1,4 @@
+ hjson
+-ninja
+ numpy
+ packaging>=20.0
+ psutil
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb
@@ -0,0 +1,94 @@
+easyblock = 'PythonBundle'
+
+name = 'DeepSpeed'
+version = '0.14.5'
+versionsuffix = '-CUDA-%(cudaver)s'
+
+homepage = "https://www.deepspeed.ai/"
+description = """
+DeepSpeed is a deep learning optimization library that makes distributed training easy, efficient, and effective.
+"""
+
+
+toolchain = {'name': 'foss', 'version': '2023a'}
+
+builddependencies = [
+    ('Ninja', '1.11.1'),
+    ('Transformers', '4.39.3'),
+]
+
+dependencies = [
+    ('Python', '3.11.3'),
+    ('CUDA', '12.1.1', '', SYSTEM),
+    ('NCCL', '2.18.3', versionsuffix),
+    ('CUTLASS', '3.4.0', versionsuffix),
+    ('PyTorch', '2.1.2', versionsuffix),
+    ('CuPy', '13.0.0', versionsuffix),
+    ('Triton', '2.1.0', versionsuffix),
+    ('accelerate', '0.33.0', versionsuffix),
+    ('PyTorch-bundle', '2.1.2', versionsuffix),  # torchvision dependency for mup
+    ('mpi4py', '3.1.4'),
+    ('Seaborn', '0.13.2'),  # dependency for mup
+    ('DLPack', '0.8'),
+    ('py-cpuinfo', '9.0.0'),
+    ('pydantic', '2.5.3'),
+    ('tqdm', '4.66.1'),
+    ('libaio', '0.3.113'),  # for async_io (builddep only?)
+]
+
+github_account = 'microsoft'
+exts_list = [
+    ('hjson', '3.1.0', {
+        'checksums': ['55af475a27cf83a7969c808399d7bccdec8fb836a07ddbd574587593b9cdcf75'],
+    }),
+    ('nvidia-ml-py', '12.535.161', {
+        'checksums': ['2bcc31ff7a0ea291ed8d7fc39b149391a42c2fb1cb4256c935e692de488b4d17'],
+        'modulename': 'pynvml',
+    }),
+    ('mup', '1.0.0', {
+        'checksums': ['9639e3d19f90e754f985ed444542ed2f8a049f3c0488fcb6efe150f30922cf74'],
+    }),
+    (name, version, {
+        'ds_build_ops_to_skip': [
+            # Sets DS_BUILD_<OPT>=0 https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops
+            'SPARSE_ATTN',  # requires PyTorch<2.0
+            'FP_QUANTIZER',  # Untested triton version (2.1.0), only 2.3.0 and 2.3.1 are known to be compatible
+            'CUTLASS_OPS',  # requires dskernels
+            'RAGGED_DEVICE_OPS',  # requires dskernels
+        ],
+        'fix_python_shebang_for': ['bin/*'],
+        'installopts': '--global-option="build_ext" --global-option="-j%(parallel)s"',
+        'patches': [
+            'DeepSpeed-0.14.2_no-ninja-dep.patch',  # NOTE(review): fallback raises if ninja is not in PATH
+            'DeepSpeed-0.14.5_pic-compile.patch',
+            'DeepSpeed-0.14.5_pdsh-env-vars.patch',
+            'DeepSpeed-0.14.5_use-eb-cutlass.patch',
+            'DeepSpeed-0.14.5_test-nvme-offload.patch',
+        ],
+        'runtest': (
+            'ln -s $PWD/tests/ ../tests'
+            ' && cd ../'
+            ' && pytest tests/unit/'
+            ' -k "not TestTensorBoard'  # requires tensorboard
+            ' and not TestWandb'  # requires wandb
+            ' and not TestCometMonitor"'  # requires comet
+        ),
+        # Test suite not available on pypi
+        'source_urls': [GITHUB_SOURCE],
+        'sources': [{'download_filename': 'v%(version)s.tar.gz', 'filename': SOURCE_TAR_GZ}],
+        'testinstall': True,
+        'checksums': [
+            {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'},
+            {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'},
+            {'DeepSpeed-0.14.5_pic-compile.patch': '1b9c070b77cf24351bff29bab7d23baacde31c7ea211a4bc75732ac38a99d6b0'},
+            {'DeepSpeed-0.14.5_pdsh-env-vars.patch':
+             '02f053d8de17e4e607b223e836658d8223cb26a3a7d8c9135e67b69aaa7f83a9'},
+            {'DeepSpeed-0.14.5_use-eb-cutlass.patch':
+             '43675f7c84fd0b0cea1050a4419020b377de414fc7f83d69b8010ab368964d8d'},
+            {'DeepSpeed-0.14.5_test-nvme-offload.patch':
+             '1592097867c5d4594a434cca727df134fcaa0e3ea8c595eb5951856a501cf422'},
+        ],
+    }),
+]
+
+moduleclass = 'ai'
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch
@@ -0,0 +1,36 @@
+From aba7406021d9ea81f7c99e5d0143ed6509acc9e9 Mon Sep 17 00:00:00 2001
+From: Viktor Rehnberg <[email protected]>
+Date: Wed, 25 Sep 2024 09:29:23 +0000
+Subject: [PATCH] Add software relevant environment variables
+
+The multinode runner launches processes with pdsh, if LD_LIBRARY_PATH is
+not included in these exports then the python .so file may not be found.
+Also including what seemed important and was added from loading DeepSpeed.
+(Couldn't add everything, then argumet list becomes too long).
+
+See
+ - https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098
+for more details.
+---
+ deepspeed/launcher/runner.py | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py
+index 07d1713e..e9cd61b8 100755
+--- a/deepspeed/launcher/runner.py
++++ b/deepspeed/launcher/runner.py
+@@ -32,6 +32,11 @@ from deepspeed.accelerator import get_accelerator
+
+ DLTS_HOSTFILE = "/job/hostfile"
+ EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX']
++EXPORT_ENVS += [ # Extra based on what's added by module load DeepSpeed
++    'LD_LIBRARY_PATH', 'PATH', 'EB', 'TRITON', 'CUDA',  # important
++    'ACLOCAL', 'CMAKE', 'CPATH', 'LIBRARY_PATH', 'MPL', 'NCCL',
++    'PKG_CONFIG_PATH', 'XDG_DATA_DIRS',
++]
+ EXPORT_ENVS += NEBULA_EXPORT_ENVS
+ DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env")
+ DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.']
+-- 
+2.39.3
+
diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch
@@ -0,0 +1,141 @@
+From 90afd671dadf9fd6a7a221428f2c04c16d637494 Mon Sep 17 00:00:00 2001
+From: Viktor Rehnberg <[email protected]>
+Date: Thu, 23 May 2024 07:09:53 +0000
+Subject: [PATCH] Compile with PIC
+
+---
+ op_builder/builder.py     | 15 ++++++++++-----
+ op_builder/cpu/builder.py |  3 ++-
+ op_builder/fused_adam.py  |  4 +++-
+ op_builder/fused_lamb.py  |  4 +++-
+ op_builder/fused_lion.py  |  4 +++-
+ op_builder/xpu/builder.py |  3 ++-
+ 6 files changed, 23 insertions(+), 10 deletions(-)
+
+diff --git a/op_builder/builder.py b/op_builder/builder.py
+index ec7566aa..f08e1799 100644
+--- a/op_builder/builder.py
++++ b/op_builder/builder.py
+@@ -288,13 +288,13 @@ class OpBuilder(ABC):
+         '''
+         Returns optional list of compiler flags to forward to nvcc when building CUDA sources
+         '''
+-        return []
++        return ['-Xcompiler', '-fPIC']
+
+     def cxx_args(self):
+         '''
+         Returns optional list of compiler flags to forward to the build
+         '''
+-        return []
++        return ['-fPIC']
+
+     def is_compatible(self, verbose=True):
+         '''
+@@ -746,15 +746,18 @@ class CUDAOpBuilder(OpBuilder):
+             )
+
+     def cxx_args(self):
++        args = super().cxx_args()
+         if sys.platform == "win32":
+-            return ['-O2']
++            args += ['-O2']
+         else:
+-            return ['-O3', '-std=c++17', '-g', '-Wno-reorder']
++            args += ['-O3', '-std=c++17', '-g', '-Wno-reorder']
++        return args
+
+     def nvcc_args(self):
+         if self.build_for_cpu:
+             return []
+-        args = ['-O3']
++        args = super().nvcc_args()
++        args += ['-O3']
+         if self.is_rocm_pytorch():
+             ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
+             args += [
+@@ -835,6 +838,8 @@ class TorchCPUOpBuilder(CUDAOpBuilder):
+                 '-lcublas',
+                 '-g',
+             ]
++        else:
++            args += super(CUDAOpBuilder, self).cxx_args()
+
+         CPU_ARCH = self.cpu_arch()
+         SIMD_WIDTH = self.simd_width()
+diff --git a/op_builder/cpu/builder.py b/op_builder/cpu/builder.py
+index d881842a..dfc5a31d 100644
+--- a/op_builder/cpu/builder.py
++++ b/op_builder/cpu/builder.py
+@@ -30,7 +30,8 @@ class CPUOpBuilder(OpBuilder):
+         return cpp_ext
+
+     def cxx_args(self):
+-        args = ['-O3', '-g', '-Wno-reorder']
++        args = super().cxx_args()
++        args += ['-O3', '-g', '-Wno-reorder']
+         CPU_ARCH = self.cpu_arch()
+         SIMD_WIDTH = self.simd_width()
+         args += [CPU_ARCH, '-fopenmp', SIMD_WIDTH]
+diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py
+index ac6e4eea..0c723572 100644
+--- a/op_builder/fused_adam.py
++++ b/op_builder/fused_adam.py
+@@ -29,7 +29,9 @@ class FusedAdamBuilder(CUDAOpBuilder):
+         return args + self.version_dependent_macros()
+
+     def nvcc_args(self):
+-        nvcc_flags = ['-O3'] + self.version_dependent_macros()
++        nvcc_flags = super(CUDAOpBuilder, self).nvcc_args()
++        nvcc_flags += ['-O3']
++        nvcc_flags += self.version_dependent_macros()
+         if not self.is_rocm_pytorch():
+             nvcc_flags.extend(
+                 ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] +
+diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py
+index f0cb5577..a59b97d4 100644
+--- a/op_builder/fused_lamb.py
++++ b/op_builder/fused_lamb.py
+@@ -29,7 +29,9 @@ class FusedLambBuilder(CUDAOpBuilder):
+         return args + self.version_dependent_macros()
+
+     def nvcc_args(self):
+-        nvcc_flags = ['-O3'] + self.version_dependent_macros()
++        nvcc_flags = super(CUDAOpBuilder, self).nvcc_args()
++        nvcc_flags += ['-O3']
++        nvcc_flags += self.version_dependent_macros()
+         if self.is_rocm_pytorch():
+             ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
+             nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR]
+diff --git a/op_builder/fused_lion.py b/op_builder/fused_lion.py
+index b900a8f2..119232b5 100644
+--- a/op_builder/fused_lion.py
++++ b/op_builder/fused_lion.py
+@@ -29,7 +29,9 @@ class FusedLionBuilder(CUDAOpBuilder):
+         return args + self.version_dependent_macros()
+
+     def nvcc_args(self):
+-        nvcc_flags = ['-O3'] + self.version_dependent_macros()
++        nvcc_flags = super(CUDAOpBuilder, self).nvcc_args()
++        nvcc_flags += ['-O3']
++        nvcc_flags += self.version_dependent_macros()
+         if not self.is_rocm_pytorch():
+             nvcc_flags.extend(
+                 ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] +
+diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py
+index f430b7b6..5a1a2219 100644
+--- a/op_builder/xpu/builder.py
++++ b/op_builder/xpu/builder.py
+@@ -52,7 +52,8 @@ class SYCLOpBuilder(OpBuilder):
+         return version_ge_1_1 + version_ge_1_3 + version_ge_1_5
+
+     def cxx_args(self):
+-        cxx_flags = [
++        cxx_flags = super().cxx_args()
++        cxx_flags += [
+             '-fsycl', '-fsycl-targets=spir64_gen', '-g', '-gdwarf-4', '-O3', '-std=c++17', '-fPIC', '-DMKL_ILP64',
+             '-fno-strict-aliasing'
+         ]
+-- 
+2.39.3
+