From 1613d53f7ee7de0805dacb1ef17010f41869148b Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Wed, 18 Sep 2024 14:49:43 +0000 Subject: [PATCH 01/19] adding easyconfigs: DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb and patches: DeepSpeed-0.14.5_pic-compile.patch, DeepSpeed-0.14.2_no-ninja-dep.patch --- .../DeepSpeed-0.14.2_no-ninja-dep.patch | 57 +++++++ ...DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 79 ++++++++++ .../DeepSpeed-0.14.5_pic-compile.patch | 141 ++++++++++++++++++ 3 files changed, 277 insertions(+) create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch new file mode 100644 index 00000000000..8a51596fb3b --- /dev/null +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.2_no-ninja-dep.patch @@ -0,0 +1,57 @@ +Patch away dependency on ninja python package by falling back to checking +returncode of `ninja --version`. + +Author: Viktor Rehnberg (Chalmers University of Technology) + + +diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py +index 85a2f9b2..8bb64626 100644 +--- a/deepspeed/env_report.py ++++ b/deepspeed/env_report.py +@@ -62,7 +62,7 @@ def ninja_installed(): + try: + import ninja # noqa: F401 # type: ignore + except ImportError: +- return False ++ return (subprocess.run(["ninja", "--version"]).returncode == 0) + return True + + +diff --git a/op_builder/builder.py b/op_builder/builder.py +index 8dc825c7..970d18b2 100644 +--- a/op_builder/builder.py ++++ b/op_builder/builder.py +@@ -487,7 +487,8 @@ class OpBuilder(ABC): + try: + import ninja # noqa: F401 # type: ignore + except ImportError: +- raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") ++ if subprocess.run(["ninja", "--version"]).returncode != 0: ++ raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") + + if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch(): + self.build_for_cpu = not torch.cuda.is_available() +diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py +index 81b15f19..cf0a1cc0 100644 +--- a/op_builder/xpu/builder.py ++++ b/op_builder/xpu/builder.py +@@ -89,7 +89,8 @@ class SYCLOpBuilder(OpBuilder): + try: + import ninja # noqa: F401 + except ImportError: +- raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") ++ if subprocess.run(["ninja", "--version"]).returncode != 0: ++ raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.") + + self.jit_mode = True + from intel_extension_for_pytorch.xpu.cpp_extension import load +diff --git a/requirements/requirements.txt b/requirements/requirements.txt +index 80c9f9b3..eed77fa3 100755 +--- a/requirements/requirements.txt ++++ b/requirements/requirements.txt +@@ -1,5 +1,4 @@ + hjson +-ninja + numpy + packaging>=20.0 + psutil diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb new file mode 100644 index 00000000000..689d3699077 --- /dev/null +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -0,0 +1,79 @@ +easyblock = 'PythonBundle' + +name = 'DeepSpeed' +version = '0.14.5' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = "http://www.deepspeed.ai/" +description = """ +DeepSpeed is a deep learning optimization library that makes distributed training easy, efficient, and effective. +""" + + +toolchain = {'name': 'foss', 'version': '2023a'} + +builddependencies = [ + ('Ninja', '1.11.1'), +] + +dependencies = [ + ('Python', '3.11.3'), + ('CUDA', '12.1.1', '', SYSTEM), + ('NCCL', '2.18.3', versionsuffix), + ('CUTLASS', '3.5.0', versionsuffix), + ('PyTorch', '2.1.2', versionsuffix), + ('CuPy', '13.0.0', versionsuffix), + ('DLPack', '0.8'), + ('py-cpuinfo', '9.0.0'), + ('pydantic', '2.5.3'), + ('tqdm', '4.66.1'), + ('libaio', '0.3.113'), # for async_io (builddep only?) + ('Transformers', '4.39.3'), +] + +use_pip = True + +github_account = 'microsoft' +exts_list = [ + ('hjson', '3.1.0', { + 'checksums': ['55af475a27cf83a7969c808399d7bccdec8fb836a07ddbd574587593b9cdcf75'], + }), + ('pynvml', '11.5.3', { + 'checksums': ['183d223ae487e5f00402d8da06c68c978ef8a9295793ee75559839c6ade7b229'], + }), + ('mup', '1.0.0', { + 'checksums': ['9639e3d19f90e754f985ed444542ed2f8a049f3c0488fcb6efe150f30922cf74'], + }), + ('accelerate', '0.34.2', { + 'checksums': ['98c1ebe1f5a45c0a3af02dc60b5bb8b7d58d60c3326a326a06ce6d956b18ca5b'], + }), + ('triton', '2.1.0', { + 'sources': [{'filename': '%(name)s-%(version)s-0-cp311-cp311-manylinux2014_%(arch)s.manylinux_2_17_%(arch)s.whl'}], + 'checksums': ['919b06453f0033ea52c13eaf7833de0e57db3178d23d4e04f9fc71c4f2c32bf8'], + }), + (name, version, { + # Test suite not available on pypi + 'installopts': '--global-option="build_ext" --global-option="-j%(parallel)s"', + 'patches': [ + 'DeepSpeed-0.14.2_no-ninja-dep.patch', + 'DeepSpeed-0.14.5_pic-compile.patch', + ], + 'runtest': 'PATH="$PATH:./bin" pytest tests/unit/ -k "not TestTensorBoard and not TestWandb and not TestCometML"', + 'source_urls': [GITHUB_SOURCE], + 'sources': [{'download_filename': 'v%(version)s.tar.gz', 'filename': SOURCE_TAR_GZ}], + 'testinstall': True, + 'checksums': [ + {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'}, + {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'}, + {'DeepSpeed-0.14.5_pic-compile.patch': '7d250f6bf57d006cab01a8763803b026f0d9029634557746c2a759893ab279b3'}, + ], + }), +] + +sanity_check_commands = [ + "deepspeed --help", + "python -m deepspeed.env_report", +] +sanity_pip_check = True + +moduleclass = 'ai' diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch new file mode 100644 index 00000000000..a7b257ad0a4 --- /dev/null +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch @@ -0,0 +1,141 @@ +From 90afd671dadf9fd6a7a221428f2c04c16d637494 Mon Sep 17 00:00:00 2001 +From: Viktor Rehnberg +Date: Thu, 23 May 2024 07:09:53 +0000 +Subject: [PATCH] Compile with PIC + +--- + op_builder/builder.py | 15 ++++++++++----- + op_builder/cpu/builder.py | 3 ++- + op_builder/fused_adam.py | 4 +++- + op_builder/fused_lamb.py | 4 +++- + op_builder/fused_lion.py | 4 +++- + op_builder/xpu/builder.py | 3 ++- + 6 files changed, 23 insertions(+), 10 deletions(-) + +diff --git a/op_builder/builder.py b/op_builder/builder.py +index ec7566aa..f08e1799 100644 +--- a/op_builder/builder.py ++++ b/op_builder/builder.py +@@ -288,13 +288,13 @@ class OpBuilder(ABC): + ''' + Returns optional list of compiler flags to forward to nvcc when building CUDA sources + ''' +- return [] ++ return ['-fPIC'] + + def cxx_args(self): + ''' + Returns optional list of compiler flags to forward to the build + ''' +- return [] ++ return ['-fPIC'] + + def is_compatible(self, verbose=True): + ''' +@@ -746,15 +746,18 @@ class CUDAOpBuilder(OpBuilder): + ) + + def cxx_args(self): ++ args = super().cxx_args() + if sys.platform == "win32": +- return ['-O2'] ++ args += ['-O2'] + else: +- return ['-O3', '-std=c++17', '-g', '-Wno-reorder'] ++ args += ['-O3', '-std=c++17', '-g', '-Wno-reorder'] ++ return args + + def nvcc_args(self): + if self.build_for_cpu: + return [] +- args = ['-O3'] ++ args = super().nvcc_args() ++ args += ['-O3'] + if self.is_rocm_pytorch(): + ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() + args += [ +@@ -835,6 +838,8 @@ class TorchCPUOpBuilder(CUDAOpBuilder): + '-lcublas', + '-g', + ] ++ else: ++ args += super(CUDAOpBuilder, self).cxx_args() + + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() +diff --git a/op_builder/cpu/builder.py b/op_builder/cpu/builder.py +index d881842a..dfc5a31d 100644 +--- a/op_builder/cpu/builder.py ++++ b/op_builder/cpu/builder.py +@@ -30,7 +30,8 @@ class CPUOpBuilder(OpBuilder): + return cpp_ext + + def cxx_args(self): +- args = ['-O3', '-g', '-Wno-reorder'] ++ args = super().cxx_args() ++ args += ['-O3', '-g', '-Wno-reorder'] + CPU_ARCH = self.cpu_arch() + SIMD_WIDTH = self.simd_width() + args += [CPU_ARCH, '-fopenmp', SIMD_WIDTH] +diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py +index ac6e4eea..0c723572 100644 +--- a/op_builder/fused_adam.py ++++ b/op_builder/fused_adam.py +@@ -29,7 +29,9 @@ class FusedAdamBuilder(CUDAOpBuilder): + return args + self.version_dependent_macros() + + def nvcc_args(self): +- nvcc_flags = ['-O3'] + self.version_dependent_macros() ++ nvcc_flags = super(CUDAOpBuilder, self).nvcc_args() ++ nvcc_flags += ['-O3'] ++ nvcc_flags += self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + +diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py +index f0cb5577..a59b97d4 100644 +--- a/op_builder/fused_lamb.py ++++ b/op_builder/fused_lamb.py +@@ -29,7 +29,9 @@ class FusedLambBuilder(CUDAOpBuilder): + return args + self.version_dependent_macros() + + def nvcc_args(self): +- nvcc_flags = ['-O3'] + self.version_dependent_macros() ++ nvcc_flags = super(CUDAOpBuilder, self).nvcc_args() ++ nvcc_flags += ['-O3'] ++ nvcc_flags += self.version_dependent_macros() + if self.is_rocm_pytorch(): + ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version() + nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR] +diff --git a/op_builder/fused_lion.py b/op_builder/fused_lion.py +index b900a8f2..119232b5 100644 +--- a/op_builder/fused_lion.py ++++ b/op_builder/fused_lion.py +@@ -29,7 +29,9 @@ class FusedLionBuilder(CUDAOpBuilder): + return args + self.version_dependent_macros() + + def nvcc_args(self): +- nvcc_flags = ['-O3'] + self.version_dependent_macros() ++ nvcc_flags = super(CUDAOpBuilder, self).nvcc_args() ++ nvcc_flags += ['-O3'] ++ nvcc_flags += self.version_dependent_macros() + if not self.is_rocm_pytorch(): + nvcc_flags.extend( + ['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] + +diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py +index f430b7b6..5a1a2219 100644 +--- a/op_builder/xpu/builder.py ++++ b/op_builder/xpu/builder.py +@@ -52,7 +52,8 @@ class SYCLOpBuilder(OpBuilder): + return version_ge_1_1 + version_ge_1_3 + version_ge_1_5 + + def cxx_args(self): +- cxx_flags = [ ++ cxx_flags = super().cxx_args() ++ cxx_flags += [ + '-fsycl', '-fsycl-targets=spir64_gen', '-g', '-gdwarf-4', '-O3', '-std=c++17', '-fPIC', '-DMKL_ILP64', + '-fno-strict-aliasing' + ] +-- +2.39.3 + From e81c5d2ba1eecf75740eed75b076c621eebf8325 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Mon, 23 Sep 2024 14:13:46 +0000 Subject: [PATCH 02/19] Add Triton dependency --- .../d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index 689d3699077..ea91862d471 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -29,6 +29,7 @@ dependencies = [ ('tqdm', '4.66.1'), ('libaio', '0.3.113'), # for async_io (builddep only?) ('Transformers', '4.39.3'), + ('Triton', '2.1.0', versionsuffix), ] use_pip = True @@ -47,10 +48,6 @@ exts_list = [ ('accelerate', '0.34.2', { 'checksums': ['98c1ebe1f5a45c0a3af02dc60b5bb8b7d58d60c3326a326a06ce6d956b18ca5b'], }), - ('triton', '2.1.0', { - 'sources': [{'filename': '%(name)s-%(version)s-0-cp311-cp311-manylinux2014_%(arch)s.manylinux_2_17_%(arch)s.whl'}], - 'checksums': ['919b06453f0033ea52c13eaf7833de0e57db3178d23d4e04f9fc71c4f2c32bf8'], - }), (name, version, { # Test suite not available on pypi 'installopts': '--global-option="build_ext" --global-option="-j%(parallel)s"', @@ -58,7 +55,7 @@ exts_list = [ 'DeepSpeed-0.14.2_no-ninja-dep.patch', 'DeepSpeed-0.14.5_pic-compile.patch', ], - 'runtest': 'PATH="$PATH:./bin" pytest tests/unit/ -k "not TestTensorBoard and not TestWandb and not TestCometML"', + 'runtest': 'PATH="$PATH:./bin" pytest tests/unit/ -k "not TestTensorBoard and not TestWandb and not TestCometMonitor"', 'source_urls': [GITHUB_SOURCE], 'sources': [{'download_filename': 'v%(version)s.tar.gz', 'filename': SOURCE_TAR_GZ}], 'testinstall': True, From 407099835fd825d9c3effa9bf04af0faceadb2dc Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Mon, 23 Sep 2024 14:20:20 +0000 Subject: [PATCH 03/19] Add easyconfigs: CUTLASS, DLPack --- .../CUTLASS-3.5.0-foss-2023a-CUDA-12.1.1.eb | 49 +++++++++++++++++++ .../d/DLPack/DLPack-0.8-GCC-12.3.0.eb | 26 ++++++++++ 2 files changed, 75 insertions(+) create mode 100644 easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.0-foss-2023a-CUDA-12.1.1.eb create mode 100644 easybuild/easyconfigs/d/DLPack/DLPack-0.8-GCC-12.3.0.eb diff --git a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.0-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.0-foss-2023a-CUDA-12.1.1.eb new file mode 100644 index 00000000000..dc0bfc9acbc --- /dev/null +++ b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.0-foss-2023a-CUDA-12.1.1.eb @@ -0,0 +1,49 @@ +easyblock = 'CMakeMake' + +name = 'CUTLASS' +version = '3.5.0' +versionsuffix = '-CUDA-%(cudaver)s' + +homepage = 'https://github.com/NVIDIA/cutlass' +description = """CUTLASS is a collection of CUDA C++ template +abstractions for implementing high-performance matrix-matrix +multiplication (GEMM) and related computations at all levels and scales +within CUDA. It incorporates strategies for hierarchical decomposition +and data movement similar to those used to implement cuBLAS and cuDNN. +CUTLASS decomposes these "moving parts" into reusable, modular software +components abstracted by C++ template classes. Primitives for different +levels of a conceptual parallelization hierarchy can be specialized and +tuned via custom tiling sizes, data types, and other algorithmic policy. +The resulting flexibility simplifies their use as building blocks within +custom kernels and applications.""" + +toolchain = {'name': 'foss', 'version': '2023a'} + +github_account = 'NVIDIA' +source_urls = [GITHUB_LOWER_SOURCE] +sources = ['v%(version)s.tar.gz'] +checksums = ['ef6af8526e3ad04f9827f35ee57eec555d09447f70a0ad0cf684a2e426ccbcb6'] + +builddependencies = [ + ('CMake', '3.26.3'), + ('Python', '3.11.3'), +] + +dependencies = [ + ('CUDA', '12.1.1', '', SYSTEM), + ('cuDNN', '8.9.2.26', versionsuffix, SYSTEM), +] + +_copts = [ + '-DCUTLASS_NVCC_ARCHS="%(cuda_cc_cmake)s"', + '-DCUTLASS_ENABLE_CUBLAS=1', + '-DCUTLASS_ENABLE_CUDNN=1', +] +configopts = ' '.join(_copts) + +sanity_check_paths = { + 'files': ['include/cutlass/cutlass.h', 'lib/libcutlass.%s' % SHLIB_EXT], + 'dirs': ['lib/cmake'], +} + +moduleclass = 'lib' diff --git a/easybuild/easyconfigs/d/DLPack/DLPack-0.8-GCC-12.3.0.eb b/easybuild/easyconfigs/d/DLPack/DLPack-0.8-GCC-12.3.0.eb new file mode 100644 index 00000000000..30c8128f840 --- /dev/null +++ b/easybuild/easyconfigs/d/DLPack/DLPack-0.8-GCC-12.3.0.eb @@ -0,0 +1,26 @@ +easyblock = 'CMakeMake' + +name = 'DLPack' +version = '0.8' + +homepage = 'https://dmlc.github.io/dlpack/latest/' +description = """DLPack is a stable in-memory data structure for an ndarray +system to interact with a variety of frameworks.""" + +toolchain = {'name': 'GCC', 'version': '12.3.0'} + +github_account = 'dmlc' +source_urls = [GITHUB_LOWER_SOURCE] +sources = ['v%(version)s.tar.gz'] +checksums = ['cf965c26a5430ba4cc53d61963f288edddcd77443aa4c85ce722aaf1e2f29513'] + +builddependencies = [ + ('CMake', '3.26.3'), +] + +sanity_check_paths = { + 'files': ['include/dlpack/dlpack.h', 'lib/cmake/dlpack/dlpackConfig.cmake'], + 'dirs': [], +} + +moduleclass = 'lib' From 0822a41d94c66102fa25d786a6739e785651e471 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Tue, 24 Sep 2024 13:00:53 +0000 Subject: [PATCH 04/19] Split long line for testing --- .../DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index ea91862d471..377ed2cba5a 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -55,7 +55,13 @@ exts_list = [ 'DeepSpeed-0.14.2_no-ninja-dep.patch', 'DeepSpeed-0.14.5_pic-compile.patch', ], - 'runtest': 'PATH="$PATH:./bin" pytest tests/unit/ -k "not TestTensorBoard and not TestWandb and not TestCometMonitor"', + 'runtest': ( + 'PATH="$PATH:$PWD/bin"' # deepspeed cli used in a lot of tests + 'pytest tests/unit/' + ' -k "not TestTensorBoard"' # requires tensorboard + '" and not TestWandb"' # requires wandb + '" and not TestCometMonitor"' # requires comet + ), 'source_urls': [GITHUB_SOURCE], 'sources': [{'download_filename': 'v%(version)s.tar.gz', 'filename': SOURCE_TAR_GZ}], 'testinstall': True, From bf324ace2b708493078066e106287ec54f97d241 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Wed, 25 Sep 2024 08:04:12 +0000 Subject: [PATCH 05/19] Fix whitespace in cmd after linebreaks --- .../DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index 377ed2cba5a..0f0d8c82f2f 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -57,10 +57,10 @@ exts_list = [ ], 'runtest': ( 'PATH="$PATH:$PWD/bin"' # deepspeed cli used in a lot of tests - 'pytest tests/unit/' - ' -k "not TestTensorBoard"' # requires tensorboard - '" and not TestWandb"' # requires wandb - '" and not TestCometMonitor"' # requires comet + ' pytest tests/unit/' + ' -k "not TestTensorBoard' # requires tensorboard + ' and not TestWandb' # requires wandb + ' and not TestCometMonitor"' # requires comet ), 'source_urls': [GITHUB_SOURCE], 'sources': [{'download_filename': 'v%(version)s.tar.gz', 'filename': SOURCE_TAR_GZ}], From 155c09dc0e46646e0e000a0a09f7e68b7fa86d47 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Wed, 25 Sep 2024 09:39:33 +0000 Subject: [PATCH 06/19] Add LD_LIBRARY_PATH and PATH to exported env vars --- ...DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 2 ++ .../DeepSpeed-0.14.5_pdsh-ld-path.patch | 30 +++++++++++++++++++ 2 files changed, 32 insertions(+) create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-ld-path.patch diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index 0f0d8c82f2f..ffea217830d 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -54,6 +54,7 @@ exts_list = [ 'patches': [ 'DeepSpeed-0.14.2_no-ninja-dep.patch', 'DeepSpeed-0.14.5_pic-compile.patch', + 'DeepSpeed-0.14.5_pdsh-ld-path.patch', ], 'runtest': ( 'PATH="$PATH:$PWD/bin"' # deepspeed cli used in a lot of tests @@ -69,6 +70,7 @@ exts_list = [ {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'}, {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'}, {'DeepSpeed-0.14.5_pic-compile.patch': '7d250f6bf57d006cab01a8763803b026f0d9029634557746c2a759893ab279b3'}, + {'DeepSpeed-0.14.5_pdsh-ld-path.patch': '23418ecf453a18b144517a2c476b004c64c09a39fcfbfd74388afff4b22a0313'}, ], }), ] diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-ld-path.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-ld-path.patch new file mode 100644 index 00000000000..c2183d9f0b0 --- /dev/null +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-ld-path.patch @@ -0,0 +1,30 @@ +From ba8e5efdf5a8bb2d3dd2b20c751ea8ad94869492 Mon Sep 17 00:00:00 2001 +From: Viktor Rehnberg +Date: Wed, 25 Sep 2024 09:29:23 +0000 +Subject: [PATCH] Add LD_LIBRARY_PATH and PATH to env vars + +The multinode runner launches processes with pdsh, if LD_LIBRARY_PATH is +not included in these exports then the python .so file may not be found. + +See https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098 +for more details. +--- + deepspeed/launcher/runner.py | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py +index 07d1713e..498cd829 100755 +--- a/deepspeed/launcher/runner.py ++++ b/deepspeed/launcher/runner.py +@@ -31,7 +31,7 @@ from ..autotuning import Autotuner + from deepspeed.accelerator import get_accelerator + + DLTS_HOSTFILE = "/job/hostfile" +-EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX'] ++EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX', 'LD_LIBRARY_PATH', 'PATH'] + EXPORT_ENVS += NEBULA_EXPORT_ENVS + DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env") + DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.'] +-- +2.39.3 + From 4d3751329fa72c98509c6e4fb695108fd5b7320c Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Thu, 26 Sep 2024 13:00:00 +0000 Subject: [PATCH 07/19] Include more relevant env vars for pdsh --- .../DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 4 ++-- ...ch => DeepSpeed-0.14.5_pdsh-env-vars.patch} | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) rename easybuild/easyconfigs/d/DeepSpeed/{DeepSpeed-0.14.5_pdsh-ld-path.patch => DeepSpeed-0.14.5_pdsh-env-vars.patch} (60%) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index ffea217830d..5e7e77a343f 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -54,7 +54,7 @@ exts_list = [ 'patches': [ 'DeepSpeed-0.14.2_no-ninja-dep.patch', 'DeepSpeed-0.14.5_pic-compile.patch', - 'DeepSpeed-0.14.5_pdsh-ld-path.patch', + 'DeepSpeed-0.14.5_pdsh-env-vars.patch', ], 'runtest': ( 'PATH="$PATH:$PWD/bin"' # deepspeed cli used in a lot of tests @@ -70,7 +70,7 @@ exts_list = [ {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'}, {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'}, {'DeepSpeed-0.14.5_pic-compile.patch': '7d250f6bf57d006cab01a8763803b026f0d9029634557746c2a759893ab279b3'}, - {'DeepSpeed-0.14.5_pdsh-ld-path.patch': '23418ecf453a18b144517a2c476b004c64c09a39fcfbfd74388afff4b22a0313'}, + {'DeepSpeed-0.14.5_pdsh-env-vars.patch': 'a22cf89d3eb99b78127ffc850b0d272eb21cbd09bb638fef8fd3160a63f2693d'}, ], }), ] diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-ld-path.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch similarity index 60% rename from easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-ld-path.patch rename to easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch index c2183d9f0b0..7059f07a3cb 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-ld-path.patch +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch @@ -1,27 +1,27 @@ -From ba8e5efdf5a8bb2d3dd2b20c751ea8ad94869492 Mon Sep 17 00:00:00 2001 +From 681049c970de6640a62e9719e1ba784b40ddf9a0 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Wed, 25 Sep 2024 09:29:23 +0000 -Subject: [PATCH] Add LD_LIBRARY_PATH and PATH to env vars +Subject: [PATCH] Add software relevant environment variables The multinode runner launches processes with pdsh, if LD_LIBRARY_PATH is not included in these exports then the python .so file may not be found. +Also including a selection of other variables that seem important. See https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098 for more details. --- - deepspeed/launcher/runner.py | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) + deepspeed/launcher/runner.py | 1 + + 1 file changed, 1 insertion(+) diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py -index 07d1713e..498cd829 100755 +index 07d1713e..e49d08c8 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py -@@ -31,7 +31,7 @@ from ..autotuning import Autotuner - from deepspeed.accelerator import get_accelerator +@@ -32,6 +32,7 @@ from deepspeed.accelerator import get_accelerator DLTS_HOSTFILE = "/job/hostfile" --EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX'] -+EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX', 'LD_LIBRARY_PATH', 'PATH'] + EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX'] ++EXPORT_ENVS += ['LD_LIBRARY_PATH', 'PATH', 'EB', 'TRITON', 'CUDA'] EXPORT_ENVS += NEBULA_EXPORT_ENVS DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env") DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.'] From ee1f77bb7e532f782e335e39da952c837fde7543 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Thu, 26 Sep 2024 13:04:45 +0000 Subject: [PATCH 08/19] Add mup dependencies --- .../d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index 5e7e77a343f..8900af2aa90 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -23,13 +23,15 @@ dependencies = [ ('CUTLASS', '3.5.0', versionsuffix), ('PyTorch', '2.1.2', versionsuffix), ('CuPy', '13.0.0', versionsuffix), + ('Triton', '2.1.0', versionsuffix), + ('PyTorch-bundle', '2.1.2', versionsuffix), # torchvision dependency for mup + ('Seaborn', '0.13.2'), # dependency for mup ('DLPack', '0.8'), ('py-cpuinfo', '9.0.0'), ('pydantic', '2.5.3'), ('tqdm', '4.66.1'), ('libaio', '0.3.113'), # for async_io (builddep only?) ('Transformers', '4.39.3'), - ('Triton', '2.1.0', versionsuffix), ] use_pip = True From 6963a60068eff3d382837b28f239dd26a80d8be3 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Fri, 27 Sep 2024 07:28:40 +0000 Subject: [PATCH 09/19] Further update deps --- .../DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index 8900af2aa90..93124d09480 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -24,6 +24,7 @@ dependencies = [ ('PyTorch', '2.1.2', versionsuffix), ('CuPy', '13.0.0', versionsuffix), ('Triton', '2.1.0', versionsuffix), + ('accelerate', '0.33.0', versionsuffix), ('PyTorch-bundle', '2.1.2', versionsuffix), # torchvision dependency for mup ('Seaborn', '0.13.2'), # dependency for mup ('DLPack', '0.8'), @@ -41,15 +42,13 @@ exts_list = [ ('hjson', '3.1.0', { 'checksums': ['55af475a27cf83a7969c808399d7bccdec8fb836a07ddbd574587593b9cdcf75'], }), - ('pynvml', '11.5.3', { - 'checksums': ['183d223ae487e5f00402d8da06c68c978ef8a9295793ee75559839c6ade7b229'], + ('nvidia-ml-py', '12.535.161', { + 'checksums': ['2bcc31ff7a0ea291ed8d7fc39b149391a42c2fb1cb4256c935e692de488b4d17'], + 'modulename': 'pynvml', }), ('mup', '1.0.0', { 'checksums': ['9639e3d19f90e754f985ed444542ed2f8a049f3c0488fcb6efe150f30922cf74'], }), - ('accelerate', '0.34.2', { - 'checksums': ['98c1ebe1f5a45c0a3af02dc60b5bb8b7d58d60c3326a326a06ce6d956b18ca5b'], - }), (name, version, { # Test suite not available on pypi 'installopts': '--global-option="build_ext" --global-option="-j%(parallel)s"', From 891d5564305fe65b92d4f35551aa35ce46d66a86 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Fri, 27 Sep 2024 07:32:18 +0000 Subject: [PATCH 10/19] Fix style --- .../d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index 93124d09480..1ab2e7568e7 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -71,7 +71,8 @@ exts_list = [ {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'}, {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'}, {'DeepSpeed-0.14.5_pic-compile.patch': '7d250f6bf57d006cab01a8763803b026f0d9029634557746c2a759893ab279b3'}, - {'DeepSpeed-0.14.5_pdsh-env-vars.patch': 'a22cf89d3eb99b78127ffc850b0d272eb21cbd09bb638fef8fd3160a63f2693d'}, + {'DeepSpeed-0.14.5_pdsh-env-vars.patch': + 'a22cf89d3eb99b78127ffc850b0d272eb21cbd09bb638fef8fd3160a63f2693d'}, ], }), ] From 5b5081a8761e514e3e5cf3bb96e6edbed9ad4ec6 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Tue, 1 Oct 2024 08:08:19 +0000 Subject: [PATCH 11/19] Add all extra envvars from module --- ...DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 2 +- .../DeepSpeed-0.14.5_pdsh-env-vars.patch | 24 ++++++++++++------- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index 1ab2e7568e7..8e2208fb5ec 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -72,7 +72,7 @@ exts_list = [ {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'}, {'DeepSpeed-0.14.5_pic-compile.patch': '7d250f6bf57d006cab01a8763803b026f0d9029634557746c2a759893ab279b3'}, {'DeepSpeed-0.14.5_pdsh-env-vars.patch': - 'a22cf89d3eb99b78127ffc850b0d272eb21cbd09bb638fef8fd3160a63f2693d'}, + 'f4703d16fa2859fdcb82445573a68df94c66dde21575e17fb17b22723afb328d'}, ], }), ] diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch index 7059f07a3cb..477ecc52273 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch @@ -1,27 +1,35 @@ -From 681049c970de6640a62e9719e1ba784b40ddf9a0 Mon Sep 17 00:00:00 2001 +From 00b832aad94e8bd003ce8d4d99b6f442678e04a0 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Wed, 25 Sep 2024 09:29:23 +0000 Subject: [PATCH] Add software relevant environment variables The multinode runner launches processes with pdsh, if LD_LIBRARY_PATH is not included in these exports then the python .so file may not be found. -Also including a selection of other variables that seem important. +Also including all environment variables added by module load DeepSpeed. -See https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098 +See + - https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098 + - https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2385060679 for more details. --- - deepspeed/launcher/runner.py | 1 + - 1 file changed, 1 insertion(+) + deepspeed/launcher/runner.py | 7 +++++++ + 1 file changed, 7 insertions(+) diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py -index 07d1713e..e49d08c8 100755 +index 07d1713e..02d077d8 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py -@@ -32,6 +32,7 @@ from deepspeed.accelerator import get_accelerator +@@ -32,6 +32,13 @@ from deepspeed.accelerator import get_accelerator DLTS_HOSTFILE = "/job/hostfile" EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX'] -+EXPORT_ENVS += ['LD_LIBRARY_PATH', 'PATH', 'EB', 'TRITON', 'CUDA'] ++EXPORT_ENVS += [ # Diff `module load DeepSpeed` vs `module purge` ++ 'LD_LIBRARY_PATH', 'PATH', 'EB', 'TRITON', 'CUDA', # needed ++ 'ACLOCAL', 'CMAKE', 'CPATH', 'LIBRARY_PATH', '_LMFILES', '__LMOD', ++ 'LOADEDMODULES', 'MANPATH', '_ModuleTable', 'MPL', 'NCCL', ++ 'PKG_CONFIG_PATH', 'PYTHONPATH', 'SLURM_MPI_TYPE', ++ 'SLURM_PMIX_DIRECT_CONN_UCX', 'UCX_MODULE', 'XDG_DATA_DIRS', ++] EXPORT_ENVS += NEBULA_EXPORT_ENVS DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env") DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.'] From aaf9fbecd3dbc0731bf296749f612853755192ba Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Tue, 1 Oct 2024 14:47:25 +0000 Subject: [PATCH 12/19] Move functionality between block and config --- ...DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 16 ++++++++----- .../DeepSpeed-0.14.5_pdsh-env-vars.patch | 24 +++++++++---------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index 8e2208fb5ec..f506b45a90a 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -50,7 +50,14 @@ exts_list = [ 'checksums': ['9639e3d19f90e754f985ed444542ed2f8a049f3c0488fcb6efe150f30922cf74'], }), (name, version, { - # Test suite not available on pypi + 'ds_build_opts_to_skip': [ + # DS_BUILD_=0 http://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops + 'SPARSE_ATTN', # requires PyTorch<2.0, triton==1.0.0 + 'EVOFORMER_ATTN', # requires PyTorch<2.0, triton==1.0.0 + 'CUTLASS_OPS', # requires dskernels + 'RAGGED_DEVICE_OPS', # requires dskernels + + ], 'installopts': '--global-option="build_ext" --global-option="-j%(parallel)s"', 'patches': [ 'DeepSpeed-0.14.2_no-ninja-dep.patch', @@ -64,6 +71,7 @@ exts_list = [ ' and not TestWandb' # requires wandb ' and not TestCometMonitor"' # requires comet ), + # Test suite not available on pypi 'source_urls': [GITHUB_SOURCE], 'sources': [{'download_filename': 'v%(version)s.tar.gz', 'filename': SOURCE_TAR_GZ}], 'testinstall': True, @@ -72,15 +80,11 @@ exts_list = [ {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'}, {'DeepSpeed-0.14.5_pic-compile.patch': '7d250f6bf57d006cab01a8763803b026f0d9029634557746c2a759893ab279b3'}, {'DeepSpeed-0.14.5_pdsh-env-vars.patch': - 'f4703d16fa2859fdcb82445573a68df94c66dde21575e17fb17b22723afb328d'}, + '02f053d8de17e4e607b223e836658d8223cb26a3a7d8c9135e67b69aaa7f83a9'}, ], }), ] -sanity_check_commands = [ - "deepspeed --help", - "python -m deepspeed.env_report", -] sanity_pip_check = True moduleclass = 'ai' diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch index 477ecc52273..9d4342f66bc 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pdsh-env-vars.patch @@ -1,34 +1,32 @@ -From 00b832aad94e8bd003ce8d4d99b6f442678e04a0 Mon Sep 17 00:00:00 2001 +From aba7406021d9ea81f7c99e5d0143ed6509acc9e9 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Wed, 25 Sep 2024 09:29:23 +0000 Subject: [PATCH] Add software relevant environment variables The multinode runner launches processes with pdsh, if LD_LIBRARY_PATH is not included in these exports then the python .so file may not be found. -Also including all environment variables added by module load DeepSpeed. +Also including what seemed important and was added from loading DeepSpeed. +(Couldn't add everything, then argumet list becomes too long). See - https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098 - - https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2385060679 for more details. --- - deepspeed/launcher/runner.py | 7 +++++++ - 1 file changed, 7 insertions(+) + deepspeed/launcher/runner.py | 5 +++++ + 1 file changed, 5 insertions(+) diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py -index 07d1713e..02d077d8 100755 +index 07d1713e..e9cd61b8 100755 --- a/deepspeed/launcher/runner.py +++ b/deepspeed/launcher/runner.py -@@ -32,6 +32,13 @@ from deepspeed.accelerator import get_accelerator +@@ -32,6 +32,11 @@ from deepspeed.accelerator import get_accelerator DLTS_HOSTFILE = "/job/hostfile" EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX'] -+EXPORT_ENVS += [ # Diff `module load DeepSpeed` vs `module purge` -+ 'LD_LIBRARY_PATH', 'PATH', 'EB', 'TRITON', 'CUDA', # needed -+ 'ACLOCAL', 'CMAKE', 'CPATH', 'LIBRARY_PATH', '_LMFILES', '__LMOD', -+ 'LOADEDMODULES', 'MANPATH', '_ModuleTable', 'MPL', 'NCCL', -+ 'PKG_CONFIG_PATH', 'PYTHONPATH', 'SLURM_MPI_TYPE', -+ 'SLURM_PMIX_DIRECT_CONN_UCX', 'UCX_MODULE', 'XDG_DATA_DIRS', ++EXPORT_ENVS += [ # Extra based on what's added by module load DeepSpeed ++ 'LD_LIBRARY_PATH', 'PATH', 'EB', 'TRITON', 'CUDA', # important ++ 'ACLOCAL', 'CMAKE', 'CPATH', 'LIBRARY_PATH', 'MPL', 'NCCL', ++ 'PKG_CONFIG_PATH', 'XDG_DATA_DIRS', +] EXPORT_ENVS += NEBULA_EXPORT_ENVS DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env") From 79c97cab3c38affabc26c6301aed5e204e7110a0 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Thu, 10 Oct 2024 14:07:21 +0000 Subject: [PATCH 13/19] Prebuild ops and bump CUTLASS version --- ...> CUTLASS-3.5.1-foss-2023a-CUDA-12.1.1.eb} | 12 +++- .../CUTLASS/CUTLASS-3.5.1_install_tools.patch | 33 +++++++++++ ...DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 12 ++-- .../DeepSpeed-0.14.5_use-eb-cutlass.patch | 55 +++++++++++++++++++ 4 files changed, 104 insertions(+), 8 deletions(-) rename easybuild/easyconfigs/c/CUTLASS/{CUTLASS-3.5.0-foss-2023a-CUDA-12.1.1.eb => CUTLASS-3.5.1-foss-2023a-CUDA-12.1.1.eb} (79%) create mode 100644 easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1_install_tools.patch create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch diff --git a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.0-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1-foss-2023a-CUDA-12.1.1.eb similarity index 79% rename from easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.0-foss-2023a-CUDA-12.1.1.eb rename to easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1-foss-2023a-CUDA-12.1.1.eb index dc0bfc9acbc..0d0caa25270 100644 --- a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.0-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1-foss-2023a-CUDA-12.1.1.eb @@ -1,7 +1,7 @@ easyblock = 'CMakeMake' name = 'CUTLASS' -version = '3.5.0' +version = '3.5.1' versionsuffix = '-CUDA-%(cudaver)s' homepage = 'https://github.com/NVIDIA/cutlass' @@ -21,8 +21,13 @@ toolchain = {'name': 'foss', 'version': '2023a'} github_account = 'NVIDIA' source_urls = [GITHUB_LOWER_SOURCE] +patches = ['CUTLASS-3.5.1_install_tools.patch'] + sources = ['v%(version)s.tar.gz'] -checksums = ['ef6af8526e3ad04f9827f35ee57eec555d09447f70a0ad0cf684a2e426ccbcb6'] +checksums = [ + {'v%(version)s.tar.gz': '20b7247cda2d257cbf8ba59ba3ca40a9211c4da61a9c9913e32b33a2c5883a36'}, + {'CUTLASS-3.5.1_install_tools.patch': '18fa5361b15848d98435b8b08bd921130718b963ca4ad47fa0db96fbe815e509'}, +] builddependencies = [ ('CMake', '3.26.3'), @@ -38,12 +43,13 @@ _copts = [ '-DCUTLASS_NVCC_ARCHS="%(cuda_cc_cmake)s"', '-DCUTLASS_ENABLE_CUBLAS=1', '-DCUTLASS_ENABLE_CUDNN=1', + '-DCUTLASS_ENABLE_TOOLS=1', ] configopts = ' '.join(_copts) sanity_check_paths = { 'files': ['include/cutlass/cutlass.h', 'lib/libcutlass.%s' % SHLIB_EXT], - 'dirs': ['lib/cmake'], + 'dirs': ['lib/cmake', 'tools/util/include'], } moduleclass = 'lib' diff --git a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1_install_tools.patch b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1_install_tools.patch new file mode 100644 index 00000000000..aede4b53fb9 --- /dev/null +++ b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1_install_tools.patch @@ -0,0 +1,33 @@ +From fd04f818d16431ee8979728d4725f63ab7f31a05 Mon Sep 17 00:00:00 2001 +From: Viktor Rehnberg +Date: Tue, 8 Oct 2024 08:24:23 +0000 +Subject: [PATCH] Optionally install tools/util/include + +DeepSpeed EvoformerAttn expects this file, see +https://github.com/microsoft/DeepSpeed/blob/v0.14.5/op_builder/evoformer_attn.py#L76 +--- + CMakeLists.txt | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 7419bdf5..1cee21ac 100755 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -689,6 +689,14 @@ install( + PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + ) + ++if(CUTLASS_ENABLE_TOOLS) ++ install( ++ DIRECTORY ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}/ ++ DESTINATION ${CMAKE_INSTALL_PREFIX}/tools/util/include ++ ) ++endif() ++ ++ + ################################################################################ + + # Doxygen is available. Generate documentation +-- +2.39.3 + diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index f506b45a90a..f6d747d2eea 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -20,7 +20,7 @@ dependencies = [ ('Python', '3.11.3'), ('CUDA', '12.1.1', '', SYSTEM), ('NCCL', '2.18.3', versionsuffix), - ('CUTLASS', '3.5.0', versionsuffix), + ('CUTLASS', '3.5.1', versionsuffix), ('PyTorch', '2.1.2', versionsuffix), ('CuPy', '13.0.0', versionsuffix), ('Triton', '2.1.0', versionsuffix), @@ -50,19 +50,19 @@ exts_list = [ 'checksums': ['9639e3d19f90e754f985ed444542ed2f8a049f3c0488fcb6efe150f30922cf74'], }), (name, version, { - 'ds_build_opts_to_skip': [ + 'ds_build_ops_to_skip': [ # DS_BUILD_=0 http://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops - 'SPARSE_ATTN', # requires PyTorch<2.0, triton==1.0.0 - 'EVOFORMER_ATTN', # requires PyTorch<2.0, triton==1.0.0 + 'SPARSE_ATTN', # requires PyTorch<2.0 + 'FP_QUANTIZER', # Untested triton version (2.1.0), only 2.3.0 and 2.3.1 are known to be compatible 'CUTLASS_OPS', # requires dskernels 'RAGGED_DEVICE_OPS', # requires dskernels - ], 'installopts': '--global-option="build_ext" --global-option="-j%(parallel)s"', 'patches': [ 'DeepSpeed-0.14.2_no-ninja-dep.patch', 'DeepSpeed-0.14.5_pic-compile.patch', 'DeepSpeed-0.14.5_pdsh-env-vars.patch', + 'DeepSpeed-0.14.5_use-eb-cutlass.patch', ], 'runtest': ( 'PATH="$PATH:$PWD/bin"' # deepspeed cli used in a lot of tests @@ -81,6 +81,8 @@ exts_list = [ {'DeepSpeed-0.14.5_pic-compile.patch': '7d250f6bf57d006cab01a8763803b026f0d9029634557746c2a759893ab279b3'}, {'DeepSpeed-0.14.5_pdsh-env-vars.patch': '02f053d8de17e4e607b223e836658d8223cb26a3a7d8c9135e67b69aaa7f83a9'}, + {'DeepSpeed-0.14.5_use-eb-cutlass.patch': + '43675f7c84fd0b0cea1050a4419020b377de414fc7f83d69b8010ab368964d8d'}, ], }), ] diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch new file mode 100644 index 00000000000..35fe2cb8b66 --- /dev/null +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_use-eb-cutlass.patch @@ -0,0 +1,55 @@ +From 27a64a22d6f84585ce9685e72ed9000f569ba941 Mon Sep 17 00:00:00 2001 +From: Viktor Rehnberg +Date: Thu, 10 Oct 2024 05:57:38 +0000 +Subject: [PATCH] Use EB env vars to search for CUTLASS + +Instead of needing to set CUTLASS_PATH to the cutlass source directory, +use EBROOTCUTLASS to find the installation directory. +--- + op_builder/evoformer_attn.py | 15 ++++++++------- + 1 file changed, 8 insertions(+), 7 deletions(-) + +diff --git a/op_builder/evoformer_attn.py b/op_builder/evoformer_attn.py +index af3aa742..5545b1aa 100644 +--- a/op_builder/evoformer_attn.py ++++ b/op_builder/evoformer_attn.py +@@ -5,6 +5,7 @@ + + from .builder import CUDAOpBuilder, installed_cuda_version + import os ++from packaging.version import Version + + + class EvoformerAttnBuilder(CUDAOpBuilder): +@@ -14,7 +15,8 @@ class EvoformerAttnBuilder(CUDAOpBuilder): + def __init__(self, name=None): + name = self.NAME if name is None else name + super().__init__(name=name) +- self.cutlass_path = os.environ.get('CUTLASS_PATH') ++ self.cutlass_path = os.environ.get('EBROOTCUTLASS') ++ self.cutlass_version = os.environ.get('EBVERSIONCUTLASS') + + def absolute_name(self): + return f'deepspeed.ops.{self.NAME}_op' +@@ -50,13 +52,12 @@ class EvoformerAttnBuilder(CUDAOpBuilder): + return False + if self.cutlass_path is None: + if verbose: +- self.warning("Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH") ++ self.warning("EBROOTCUTLASS not set, please load CUTLASS module.") ++ return False ++ if Version(self.cutlass_version) < Version('3.1.0'): ++ if verbose: ++ self.warning(f"Please use CUTLASS version >= 3.1.0, detected {self.cutlass_version}") + return False +- with open(f'{self.cutlass_path}/CHANGELOG.md', 'r') as f: +- if '3.1.0' not in f.read(): +- if verbose: +- self.warning("Please use CUTLASS version >= 3.1.0") +- return False + cuda_okay = True + if not self.is_rocm_pytorch() and torch.cuda.is_available(): #ignore-cuda + sys_cuda_major, _ = installed_cuda_version() +-- +2.39.3 + From 57d8582a2e5761ea9434f428119066f7acb69f71 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Fri, 1 Nov 2024 13:40:30 +0000 Subject: [PATCH 14/19] Add -Xcompiler to nvcc -fPIC patch --- .../easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch index a7b257ad0a4..707bc826e88 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch @@ -21,7 +21,7 @@ index ec7566aa..f08e1799 100644 Returns optional list of compiler flags to forward to nvcc when building CUDA sources ''' - return [] -+ return ['-fPIC'] ++ return ['-Xcompiler', '-fPIC'] def cxx_args(self): ''' From 9b3e07b6f63547fee82720caf9ebc677b23d0a04 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Fri, 1 Nov 2024 13:41:13 +0000 Subject: [PATCH 15/19] Fix python shebangs --- .../DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index f6d747d2eea..7e057047741 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -14,6 +14,7 @@ toolchain = {'name': 'foss', 'version': '2023a'} builddependencies = [ ('Ninja', '1.11.1'), + ('Transformers', '4.39.3'), ] dependencies = [ @@ -32,7 +33,6 @@ dependencies = [ ('pydantic', '2.5.3'), ('tqdm', '4.66.1'), ('libaio', '0.3.113'), # for async_io (builddep only?) - ('Transformers', '4.39.3'), ] use_pip = True @@ -51,12 +51,13 @@ exts_list = [ }), (name, version, { 'ds_build_ops_to_skip': [ - # DS_BUILD_=0 http://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops + # Sets DS_BUILD_=0 http://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops 'SPARSE_ATTN', # requires PyTorch<2.0 'FP_QUANTIZER', # Untested triton version (2.1.0), only 2.3.0 and 2.3.1 are known to be compatible 'CUTLASS_OPS', # requires dskernels 'RAGGED_DEVICE_OPS', # requires dskernels ], + 'fix_python_shebang_for': ['bin/*'], 'installopts': '--global-option="build_ext" --global-option="-j%(parallel)s"', 'patches': [ 'DeepSpeed-0.14.2_no-ninja-dep.patch', @@ -65,8 +66,9 @@ exts_list = [ 'DeepSpeed-0.14.5_use-eb-cutlass.patch', ], 'runtest': ( - 'PATH="$PATH:$PWD/bin"' # deepspeed cli used in a lot of tests - ' pytest tests/unit/' + 'ln -s $PWD/tests/ ../tests' + ' && cd ../' + ' && pytest tests/unit/' ' -k "not TestTensorBoard' # requires tensorboard ' and not TestWandb' # requires wandb ' and not TestCometMonitor"' # requires comet From 06250f75444d19191e5c7d4662b536f660681540 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Mon, 4 Nov 2024 14:50:16 +0000 Subject: [PATCH 16/19] Patch nvme offload test --- ...DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 5 +- .../DeepSpeed-0.14.5_test-nvme-offload.patch | 144 ++++++++++++++++++ 2 files changed, 148 insertions(+), 1 deletion(-) create mode 100644 easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index 7e057047741..61004a97493 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -64,6 +64,7 @@ exts_list = [ 'DeepSpeed-0.14.5_pic-compile.patch', 'DeepSpeed-0.14.5_pdsh-env-vars.patch', 'DeepSpeed-0.14.5_use-eb-cutlass.patch', + 'DeepSpeed-0.14.5_test-nvme-offload.patch', ], 'runtest': ( 'ln -s $PWD/tests/ ../tests' @@ -80,11 +81,13 @@ exts_list = [ 'checksums': [ {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'}, {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'}, - {'DeepSpeed-0.14.5_pic-compile.patch': '7d250f6bf57d006cab01a8763803b026f0d9029634557746c2a759893ab279b3'}, + {'DeepSpeed-0.14.5_pic-compile.patch': '1b9c070b77cf24351bff29bab7d23baacde31c7ea211a4bc75732ac38a99d6b0'}, {'DeepSpeed-0.14.5_pdsh-env-vars.patch': '02f053d8de17e4e607b223e836658d8223cb26a3a7d8c9135e67b69aaa7f83a9'}, {'DeepSpeed-0.14.5_use-eb-cutlass.patch': '43675f7c84fd0b0cea1050a4419020b377de414fc7f83d69b8010ab368964d8d'}, + {'DeepSpeed-0.14.5_test-nvme-offload.patch': + 'a5a7af9b0d8531a5d3b2fbc01f22a856bcf0fdb55d43d60b7945240e0ea826f1'}, ], }), ] diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch new file mode 100644 index 00000000000..2fd36b43c21 --- /dev/null +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch @@ -0,0 +1,144 @@ +From 2b1baf3226d82923ae9b4edc18a601d2dccf6bef Mon Sep 17 00:00:00 2001 +From: Viktor Rehnberg +Date: Mon, 4 Nov 2024 15:31:55 +0100 +Subject: [PATCH] Fix quantization tests + +NVME tests didn't always run because the hard-coded nvme_path wasn't +always writable. This commit changed to use tmp_path fixture instead and +disabled distributed test to avoid thread locks hanging. +--- + .../quantization/test_intX_quantization.py | 43 ++++++++++--------- + 1 file changed, 22 insertions(+), 21 deletions(-) + +diff --git a/tests/unit/inference/quantization/test_intX_quantization.py b/tests/unit/inference/quantization/test_intX_quantization.py +index 77b51fcd..f5a87dea 100644 +--- a/tests/unit/inference/quantization/test_intX_quantization.py ++++ b/tests/unit/inference/quantization/test_intX_quantization.py +@@ -17,6 +17,7 @@ from transformers import AutoConfig, OPTConfig, AutoModel + import pytest + from collections import OrderedDict + from typing import Dict ++from pathlib import Path + + device = get_accelerator().device_name() if get_accelerator().is_available() else 'cpu' + +@@ -53,11 +54,11 @@ def quantization_test_helper(pre_quant_type: torch.dtype, num_bits: int): + assert mean_diff < 0.15 and max_diff < 0.5, f'Numeric error exceed threshold, mean diff {mean_diff} (threshold 0.15), max diff {max_diff} (threshold 0.5)' + + +-def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int): ++def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path): + import deepspeed + from transformers.integrations.deepspeed import HfDeepSpeedConfig + +- def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int) -> Dict: ++ def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path) -> Dict: + GB = 1 << 30 + + ds_config = { +@@ -127,7 +128,7 @@ def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bo + ds_config["zero_optimization"]["offload_param"] = dict( + device="nvme", + pin_memory=True, +- nvme_path='~/tmp_offload_dir', ++ nvme_path=tmp_path / "tmp_offload_dir", + buffer_count=5, + buffer_size=1 * GB, + ) +@@ -142,7 +143,7 @@ def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bo + return ds_config + + hf_config = AutoConfig.from_pretrained('facebook/opt-125m') +- ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits) ++ ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits, tmp_path=tmp_path) + + input_ids = torch.ones(1, 16, dtype=torch.int32, device=device) + attention_mask = torch.ones(1, 16, dtype=torch.float32, device=device) +@@ -170,11 +171,11 @@ def zero3_post_init_quantization_test_helper(cpu_offload: bool, nvme_offload: bo + assert mean_diff < 0.4, f'Numeric error exceed threshold, relative error {mean_diff} (threshold 0.4)' + + +-def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int): ++def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path): + import deepspeed + from transformers.integrations.deepspeed import HfDeepSpeedConfig + +- def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int) -> Dict: ++ def get_zero3_ds_config(hf_config: OPTConfig, cpu_offload: bool, nvme_offload: bool, bits: int, tmp_path: Path) -> Dict: + GB = 1 << 30 + + ds_config = { +@@ -206,7 +207,7 @@ def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: + ds_config["zero_optimization"]["offload_param"] = dict( + device="nvme", + pin_memory=True, +- nvme_path='~/tmp_offload_dir', ++ nvme_path=tmp_path / "tmp_offload_dir", + buffer_count=5, + buffer_size=1 * GB, + ) +@@ -221,7 +222,7 @@ def zero3_quantized_initialization_test_helper(cpu_offload: bool, nvme_offload: + return ds_config + + hf_config = AutoConfig.from_pretrained('facebook/opt-125m') +- ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits) ++ ds_config = get_zero3_ds_config(hf_config=hf_config, cpu_offload=cpu_offload, nvme_offload=nvme_offload, bits=bits, tmp_path=tmp_path) + + input_ids = torch.ones(1, 16, dtype=torch.int32, device=device) + attention_mask = torch.ones(1, 16, dtype=torch.float32, device=device) +@@ -257,7 +258,7 @@ def group_dim(request): + return request.param + + +-class TestQuantizedInt(DistributedTest): ++class TestQuantizedInt(): + + def test_model_quantization(self, quantization_bits): + reset_random() +@@ -376,31 +377,31 @@ class TestQuantizedInt(DistributedTest): + quantization_test_helper(torch.float16, 8) + + @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') +- def test_zero3_int4_post_init_quant(self, quantization_bits): ++ def test_zero3_int4_post_init_quant(self, quantization_bits, tmp_path): + reset_random() +- zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits) ++ zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path) + + @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') +- def test_zero3_int4_post_init_quant_cpu_offload(self, quantization_bits): ++ def test_zero3_int4_post_init_quant_cpu_offload(self, quantization_bits, tmp_path): + reset_random() +- zero3_post_init_quantization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits) ++ zero3_post_init_quantization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path) + + @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') +- def test_zero3_int4_post_init_quant_nvme_offload(self): ++ def test_zero3_int4_post_init_quant_nvme_offload(self, tmp_path): + reset_random() +- zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=True, bits=4) ++ zero3_post_init_quantization_test_helper(cpu_offload=False, nvme_offload=True, bits=4, tmp_path=tmp_path) + + @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') +- def test_zero3_int4_quantized_initialization(self, quantization_bits): ++ def test_zero3_int4_quantized_initialization(self, quantization_bits, tmp_path): + reset_random() +- zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits) ++ zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path) + + @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') +- def test_zero3_int4_quantized_initialization_cpu_offload(self, quantization_bits): ++ def test_zero3_int4_quantized_initialization_cpu_offload(self, quantization_bits, tmp_path): + reset_random() +- zero3_quantized_initialization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits) ++ zero3_quantized_initialization_test_helper(cpu_offload=True, nvme_offload=False, bits=quantization_bits, tmp_path=tmp_path) + + @pytest.mark.skipif(device == 'cpu', reason='CPU does support FP16 GEMM') +- def test_zero3_int4_quantized_initialization_nvme_offload(self): ++ def test_zero3_int4_quantized_initialization_nvme_offload(self, tmp_path): + reset_random() +- zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=True, bits=4) ++ zero3_quantized_initialization_test_helper(cpu_offload=False, nvme_offload=True, bits=4, tmp_path=tmp_path) +-- +2.39.3 + From 9aba3e3048b9ee69376604307f0dd18f67775b96 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Fri, 8 Nov 2024 14:09:26 +0000 Subject: [PATCH 17/19] Readd DistributionTest nvme offload patch --- .../DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 3 ++- .../DeepSpeed-0.14.5_test-nvme-offload.patch | 17 ++++------------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index 61004a97493..52563639d51 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -27,6 +27,7 @@ dependencies = [ ('Triton', '2.1.0', versionsuffix), ('accelerate', '0.33.0', versionsuffix), ('PyTorch-bundle', '2.1.2', versionsuffix), # torchvision dependency for mup + ('mpi4py', '3.1.4'), ('Seaborn', '0.13.2'), # dependency for mup ('DLPack', '0.8'), ('py-cpuinfo', '9.0.0'), @@ -87,7 +88,7 @@ exts_list = [ {'DeepSpeed-0.14.5_use-eb-cutlass.patch': '43675f7c84fd0b0cea1050a4419020b377de414fc7f83d69b8010ab368964d8d'}, {'DeepSpeed-0.14.5_test-nvme-offload.patch': - 'a5a7af9b0d8531a5d3b2fbc01f22a856bcf0fdb55d43d60b7945240e0ea826f1'}, + '1592097867c5d4594a434cca727df134fcaa0e3ea8c595eb5951856a501cf422'}, ], }), ] diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch index 2fd36b43c21..dcff709f2ce 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_test-nvme-offload.patch @@ -1,4 +1,4 @@ -From 2b1baf3226d82923ae9b4edc18a601d2dccf6bef Mon Sep 17 00:00:00 2001 +From ddbf7ab23ce2e83747ff6a1482ac512e06da82ca Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Mon, 4 Nov 2024 15:31:55 +0100 Subject: [PATCH] Fix quantization tests @@ -11,7 +11,7 @@ disabled distributed test to avoid thread locks hanging. 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/tests/unit/inference/quantization/test_intX_quantization.py b/tests/unit/inference/quantization/test_intX_quantization.py -index 77b51fcd..f5a87dea 100644 +index 77b51fcd..9e0d7ac0 100644 --- a/tests/unit/inference/quantization/test_intX_quantization.py +++ b/tests/unit/inference/quantization/test_intX_quantization.py @@ -17,6 +17,7 @@ from transformers import AutoConfig, OPTConfig, AutoModel @@ -41,7 +41,7 @@ index 77b51fcd..f5a87dea 100644 device="nvme", pin_memory=True, - nvme_path='~/tmp_offload_dir', -+ nvme_path=tmp_path / "tmp_offload_dir", ++ nvme_path=str(tmp_path / "tmp_offload_dir"), buffer_count=5, buffer_size=1 * GB, ) @@ -73,7 +73,7 @@ index 77b51fcd..f5a87dea 100644 device="nvme", pin_memory=True, - nvme_path='~/tmp_offload_dir', -+ nvme_path=tmp_path / "tmp_offload_dir", ++ nvme_path=str(tmp_path / "tmp_offload_dir"), buffer_count=5, buffer_size=1 * GB, ) @@ -86,15 +86,6 @@ index 77b51fcd..f5a87dea 100644 input_ids = torch.ones(1, 16, dtype=torch.int32, device=device) attention_mask = torch.ones(1, 16, dtype=torch.float32, device=device) -@@ -257,7 +258,7 @@ def group_dim(request): - return request.param - - --class TestQuantizedInt(DistributedTest): -+class TestQuantizedInt(): - - def test_model_quantization(self, quantization_bits): - reset_random() @@ -376,31 +377,31 @@ class TestQuantizedInt(DistributedTest): quantization_test_helper(torch.float16, 8) From 44ae74477c23ac65025fab4817da669978df8ce5 Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Mon, 11 Nov 2024 15:35:59 +0000 Subject: [PATCH 18/19] Patch existing CUTLASS, instead of new --- .../CUTLASS-3.4.0-foss-2023a-CUDA-12.1.1.eb | 10 +++- ...atch => CUTLASS-3.4.0_install_tools.patch} | 6 +- .../CUTLASS-3.5.1-foss-2023a-CUDA-12.1.1.eb | 55 ------------------- ...DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 2 +- 4 files changed, 12 insertions(+), 61 deletions(-) rename easybuild/easyconfigs/c/CUTLASS/{CUTLASS-3.5.1_install_tools.patch => CUTLASS-3.4.0_install_tools.patch} (86%) delete mode 100644 easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1-foss-2023a-CUDA-12.1.1.eb diff --git a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.4.0-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.4.0-foss-2023a-CUDA-12.1.1.eb index 3322e6b4b56..08b6ff64774 100644 --- a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.4.0-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.4.0-foss-2023a-CUDA-12.1.1.eb @@ -21,8 +21,13 @@ toolchain = {'name': 'foss', 'version': '2023a'} github_account = 'NVIDIA' source_urls = [GITHUB_LOWER_SOURCE] +patches = ['CUTLASS-3.4.0_install_tools.patch'] + sources = ['v%(version)s.tar.gz'] -checksums = ['49f4b854acc2a520126ceefe4f701cfe8c2b039045873e311b1f10a8ca5d5de1'] +checksums = [ + {'v%(version)s.tar.gz': '49f4b854acc2a520126ceefe4f701cfe8c2b039045873e311b1f10a8ca5d5de1'}, + {'CUTLASS-3.4.0_install_tools.patch': '52c2f17a0fb78febf7100288a25e001a8a7fa24d52c8437225146834d6efce79'}, +] builddependencies = [ ('CMake', '3.26.3'), @@ -38,12 +43,13 @@ _copts = [ '-DCUTLASS_NVCC_ARCHS="%(cuda_cc_cmake)s"', '-DCUTLASS_ENABLE_CUBLAS=1', '-DCUTLASS_ENABLE_CUDNN=1', + '-DCUTLASS_ENABLE_TOOLS=1', ] configopts = ' '.join(_copts) sanity_check_paths = { 'files': ['include/cutlass/cutlass.h', 'lib/libcutlass.%s' % SHLIB_EXT], - 'dirs': ['lib/cmake'], + 'dirs': ['lib/cmake', 'tools/util/include'], } moduleclass = 'lib' diff --git a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1_install_tools.patch b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.4.0_install_tools.patch similarity index 86% rename from easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1_install_tools.patch rename to easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.4.0_install_tools.patch index aede4b53fb9..910fb66faa8 100644 --- a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1_install_tools.patch +++ b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.4.0_install_tools.patch @@ -1,4 +1,4 @@ -From fd04f818d16431ee8979728d4725f63ab7f31a05 Mon Sep 17 00:00:00 2001 +From cb006bb549901f5a15676d9d4f59647508fea21b Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg Date: Tue, 8 Oct 2024 08:24:23 +0000 Subject: [PATCH] Optionally install tools/util/include @@ -10,10 +10,10 @@ https://github.com/microsoft/DeepSpeed/blob/v0.14.5/op_builder/evoformer_attn.py 1 file changed, 8 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt -index 7419bdf5..1cee21ac 100755 +index 114d7936..fe506795 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -689,6 +689,14 @@ install( +@@ -623,6 +623,14 @@ install( PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) diff --git a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1-foss-2023a-CUDA-12.1.1.eb deleted file mode 100644 index 0d0caa25270..00000000000 --- a/easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.5.1-foss-2023a-CUDA-12.1.1.eb +++ /dev/null @@ -1,55 +0,0 @@ -easyblock = 'CMakeMake' - -name = 'CUTLASS' -version = '3.5.1' -versionsuffix = '-CUDA-%(cudaver)s' - -homepage = 'https://github.com/NVIDIA/cutlass' -description = """CUTLASS is a collection of CUDA C++ template -abstractions for implementing high-performance matrix-matrix -multiplication (GEMM) and related computations at all levels and scales -within CUDA. It incorporates strategies for hierarchical decomposition -and data movement similar to those used to implement cuBLAS and cuDNN. -CUTLASS decomposes these "moving parts" into reusable, modular software -components abstracted by C++ template classes. Primitives for different -levels of a conceptual parallelization hierarchy can be specialized and -tuned via custom tiling sizes, data types, and other algorithmic policy. -The resulting flexibility simplifies their use as building blocks within -custom kernels and applications.""" - -toolchain = {'name': 'foss', 'version': '2023a'} - -github_account = 'NVIDIA' -source_urls = [GITHUB_LOWER_SOURCE] -patches = ['CUTLASS-3.5.1_install_tools.patch'] - -sources = ['v%(version)s.tar.gz'] -checksums = [ - {'v%(version)s.tar.gz': '20b7247cda2d257cbf8ba59ba3ca40a9211c4da61a9c9913e32b33a2c5883a36'}, - {'CUTLASS-3.5.1_install_tools.patch': '18fa5361b15848d98435b8b08bd921130718b963ca4ad47fa0db96fbe815e509'}, -] - -builddependencies = [ - ('CMake', '3.26.3'), - ('Python', '3.11.3'), -] - -dependencies = [ - ('CUDA', '12.1.1', '', SYSTEM), - ('cuDNN', '8.9.2.26', versionsuffix, SYSTEM), -] - -_copts = [ - '-DCUTLASS_NVCC_ARCHS="%(cuda_cc_cmake)s"', - '-DCUTLASS_ENABLE_CUBLAS=1', - '-DCUTLASS_ENABLE_CUDNN=1', - '-DCUTLASS_ENABLE_TOOLS=1', -] -configopts = ' '.join(_copts) - -sanity_check_paths = { - 'files': ['include/cutlass/cutlass.h', 'lib/libcutlass.%s' % SHLIB_EXT], - 'dirs': ['lib/cmake', 'tools/util/include'], -} - -moduleclass = 'lib' diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index 52563639d51..fcf33582a52 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -21,7 +21,7 @@ dependencies = [ ('Python', '3.11.3'), ('CUDA', '12.1.1', '', SYSTEM), ('NCCL', '2.18.3', versionsuffix), - ('CUTLASS', '3.5.1', versionsuffix), + ('CUTLASS', '3.4.0', versionsuffix), ('PyTorch', '2.1.2', versionsuffix), ('CuPy', '13.0.0', versionsuffix), ('Triton', '2.1.0', versionsuffix), From 8b62456c3f98fa34f662b4a751596b8fb256528a Mon Sep 17 00:00:00 2001 From: Viktor Rehnberg <35767167+VRehnberg@users.noreply.github.com> Date: Tue, 17 Jun 2025 08:56:49 +0200 Subject: [PATCH 19/19] Updates related to EB v5.0 --- .../d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb | 4 ---- 1 file changed, 4 deletions(-) diff --git a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb index fcf33582a52..5de63df98eb 100644 --- a/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb +++ b/easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5-foss-2023a-CUDA-12.1.1.eb @@ -36,8 +36,6 @@ dependencies = [ ('libaio', '0.3.113'), # for async_io (builddep only?) ] -use_pip = True - github_account = 'microsoft' exts_list = [ ('hjson', '3.1.0', { @@ -93,6 +91,4 @@ exts_list = [ }), ] -sanity_pip_check = True - moduleclass = 'ai'