Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,13 @@ toolchain = {'name': 'foss', 'version': '2023a'}

github_account = 'NVIDIA'
source_urls = [GITHUB_LOWER_SOURCE]
patches = ['CUTLASS-3.4.0_install_tools.patch']

sources = ['v%(version)s.tar.gz']
checksums = ['49f4b854acc2a520126ceefe4f701cfe8c2b039045873e311b1f10a8ca5d5de1']
checksums = [
{'v%(version)s.tar.gz': '49f4b854acc2a520126ceefe4f701cfe8c2b039045873e311b1f10a8ca5d5de1'},
{'CUTLASS-3.4.0_install_tools.patch': '52c2f17a0fb78febf7100288a25e001a8a7fa24d52c8437225146834d6efce79'},
]

builddependencies = [
('CMake', '3.26.3'),
Expand All @@ -38,12 +43,13 @@ _copts = [
'-DCUTLASS_NVCC_ARCHS="%(cuda_cc_cmake)s"',
'-DCUTLASS_ENABLE_CUBLAS=1',
'-DCUTLASS_ENABLE_CUDNN=1',
'-DCUTLASS_ENABLE_TOOLS=1',
]
configopts = ' '.join(_copts)

sanity_check_paths = {
'files': ['include/cutlass/cutlass.h', 'lib/libcutlass.%s' % SHLIB_EXT],
'dirs': ['lib/cmake'],
'dirs': ['lib/cmake', 'tools/util/include'],
}

moduleclass = 'lib'
33 changes: 33 additions & 0 deletions easybuild/easyconfigs/c/CUTLASS/CUTLASS-3.4.0_install_tools.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
From cb006bb549901f5a15676d9d4f59647508fea21b Mon Sep 17 00:00:00 2001
From: Viktor Rehnberg <[email protected]>
Date: Tue, 8 Oct 2024 08:24:23 +0000
Subject: [PATCH] Optionally install tools/util/include

DeepSpeed EvoformerAttn expects this file, see
https://github.com/microsoft/DeepSpeed/blob/v0.14.5/op_builder/evoformer_attn.py#L76
---
CMakeLists.txt | 8 ++++++++
1 file changed, 8 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 114d7936..fe506795 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -623,6 +623,14 @@ install(
PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
)

+if(CUTLASS_ENABLE_TOOLS)
+ install(
+ DIRECTORY ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}/
+ DESTINATION ${CMAKE_INSTALL_PREFIX}/tools/util/include
+ )
+endif()
+
+
################################################################################

# Doxygen is available. Generate documentation
--
2.39.3

Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
Patch away dependency on ninja python package by falling back to checking
returncode of `ninja --version`.

Author: Viktor Rehnberg (Chalmers University of Technology)


diff --git a/deepspeed/env_report.py b/deepspeed/env_report.py
index 85a2f9b2..8bb64626 100644
--- a/deepspeed/env_report.py
+++ b/deepspeed/env_report.py
@@ -62,7 +62,7 @@ def ninja_installed():
try:
import ninja # noqa: F401 # type: ignore
except ImportError:
- return False
+ return (subprocess.run(["ninja", "--version"]).returncode == 0)
return True


diff --git a/op_builder/builder.py b/op_builder/builder.py
index 8dc825c7..970d18b2 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -487,7 +487,8 @@ class OpBuilder(ABC):
try:
import ninja # noqa: F401 # type: ignore
except ImportError:
- raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
+ if subprocess.run(["ninja", "--version"]).returncode != 0:
+ raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")

if isinstance(self, CUDAOpBuilder) and not self.is_rocm_pytorch():
self.build_for_cpu = not torch.cuda.is_available()
diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py
index 81b15f19..cf0a1cc0 100644
--- a/op_builder/xpu/builder.py
+++ b/op_builder/xpu/builder.py
@@ -89,7 +89,8 @@ class SYCLOpBuilder(OpBuilder):
try:
import ninja # noqa: F401
except ImportError:
- raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")
+ if subprocess.run(["ninja", "--version"]).returncode != 0:
+ raise RuntimeError(f"Unable to JIT load the {self.name} op due to ninja not being installed.")

self.jit_mode = True
from intel_extension_for_pytorch.xpu.cpp_extension import load
diff --git a/requirements/requirements.txt b/requirements/requirements.txt
index 80c9f9b3..eed77fa3 100755
--- a/requirements/requirements.txt
+++ b/requirements/requirements.txt
@@ -1,5 +1,4 @@
hjson
-ninja
numpy
packaging>=20.0
psutil
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# EasyBuild easyconfig: DeepSpeed 0.14.5 with CUDA support, built as a bundle of Python packages
# (DeepSpeed itself plus the extra extensions listed in exts_list) on top of foss/2023a.
easyblock = 'PythonBundle'

name = 'DeepSpeed'
version = '0.14.5'
versionsuffix = '-CUDA-%(cudaver)s'

# https:// is used rather than http://: the site is served over TLS
homepage = "https://www.deepspeed.ai/"
description = """
DeepSpeed is a deep learning optimization library that makes distributed training easy, efficient, and effective.
"""


toolchain = {'name': 'foss', 'version': '2023a'}

builddependencies = [
    # the 'ninja' Python package requirement is patched out (see the no-ninja-dep patch below),
    # so the Ninja binary itself has to be available
    ('Ninja', '1.11.1'),
    # NOTE(review): presumably only needed by the unit tests run via 'runtest' - confirm
    ('Transformers', '4.39.3'),
]

dependencies = [
    ('Python', '3.11.3'),
    ('CUDA', '12.1.1', '', SYSTEM),
    ('NCCL', '2.18.3', versionsuffix),
    ('CUTLASS', '3.4.0', versionsuffix),
    ('PyTorch', '2.1.2', versionsuffix),
    ('CuPy', '13.0.0', versionsuffix),
    ('Triton', '2.1.0', versionsuffix),
    ('accelerate', '0.33.0', versionsuffix),
    ('PyTorch-bundle', '2.1.2', versionsuffix),  # torchvision dependency for mup
    ('mpi4py', '3.1.4'),
    ('Seaborn', '0.13.2'),  # dependency for mup
    ('DLPack', '0.8'),
    ('py-cpuinfo', '9.0.0'),
    ('pydantic', '2.5.3'),
    ('tqdm', '4.66.1'),
    ('libaio', '0.3.113'),  # for async_io (builddep only?)
]

# used to resolve GITHUB_SOURCE for the DeepSpeed extension below
github_account = 'microsoft'
exts_list = [
    ('hjson', '3.1.0', {
        'checksums': ['55af475a27cf83a7969c808399d7bccdec8fb836a07ddbd574587593b9cdcf75'],
    }),
    ('nvidia-ml-py', '12.535.161', {
        'checksums': ['2bcc31ff7a0ea291ed8d7fc39b149391a42c2fb1cb4256c935e692de488b4d17'],
        # the importable module name differs from the package name
        'modulename': 'pynvml',
    }),
    ('mup', '1.0.0', {
        'checksums': ['9639e3d19f90e754f985ed444542ed2f8a049f3c0488fcb6efe150f30922cf74'],
    }),
    (name, version, {
        'ds_build_ops_to_skip': [
            # Sets DS_BUILD_<OPT>=0 https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops
            'SPARSE_ATTN',  # requires PyTorch<2.0
            'FP_QUANTIZER',  # Untested triton version (2.1.0), only 2.3.0 and 2.3.1 are known to be compatible
            'CUTLASS_OPS',  # requires dskernels
            'RAGGED_DEVICE_OPS',  # requires dskernels
        ],
        'fix_python_shebang_for': ['bin/*'],
        'installopts': '--global-option="build_ext" --global-option="-j%(parallel)s"',
        # order must match the patch entries in 'checksums' below
        'patches': [
            'DeepSpeed-0.14.2_no-ninja-dep.patch',
            'DeepSpeed-0.14.5_pic-compile.patch',
            'DeepSpeed-0.14.5_pdsh-env-vars.patch',
            'DeepSpeed-0.14.5_use-eb-cutlass.patch',
            'DeepSpeed-0.14.5_test-nvme-offload.patch',
        ],
        # symlink the test suite one directory up and run the unit tests from there,
        # deselecting test classes whose extra requirements are not part of this build
        'runtest': (
            'ln -s $PWD/tests/ ../tests'
            ' && cd ../'
            ' && pytest tests/unit/'
            ' -k "not TestTensorBoard'  # requires tensorboard
            ' and not TestWandb'  # requires wandb
            ' and not TestCometMonitor"'  # requires comet
        ),
        # Test suite not available on pypi
        'source_urls': [GITHUB_SOURCE],
        # the GitHub tag tarball is renamed on download to the standard <name>-<version>.tar.gz
        'sources': [{'download_filename': 'v%(version)s.tar.gz', 'filename': SOURCE_TAR_GZ}],
        'testinstall': True,
        'checksums': [
            {'DeepSpeed-0.14.5.tar.gz': '9f5622715cbd89c7382bfecf7fb188419ad3f2af7764dc6de35917abc6390cce'},
            {'DeepSpeed-0.14.2_no-ninja-dep.patch': '03ab528096387e7f18d2a5a6f5fc20ed86d1ca8f63f0e65f266f4dda30e11776'},
            {'DeepSpeed-0.14.5_pic-compile.patch': '1b9c070b77cf24351bff29bab7d23baacde31c7ea211a4bc75732ac38a99d6b0'},
            {'DeepSpeed-0.14.5_pdsh-env-vars.patch':
             '02f053d8de17e4e607b223e836658d8223cb26a3a7d8c9135e67b69aaa7f83a9'},
            {'DeepSpeed-0.14.5_use-eb-cutlass.patch':
             '43675f7c84fd0b0cea1050a4419020b377de414fc7f83d69b8010ab368964d8d'},
            {'DeepSpeed-0.14.5_test-nvme-offload.patch':
             '1592097867c5d4594a434cca727df134fcaa0e3ea8c595eb5951856a501cf422'},
        ],
    }),
]

moduleclass = 'ai'
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
From aba7406021d9ea81f7c99e5d0143ed6509acc9e9 Mon Sep 17 00:00:00 2001
From: Viktor Rehnberg <[email protected]>
Date: Wed, 25 Sep 2024 09:29:23 +0000
Subject: [PATCH] Add software relevant environment variables

The multinode runner launches processes with pdsh, if LD_LIBRARY_PATH is
not included in these exports then the python .so file may not be found.
Also including what seemed important and was added from loading DeepSpeed.
(Couldn't add everything, then argumet list becomes too long).

See
- https://github.com/easybuilders/easybuild-easyconfigs/pull/21438#issuecomment-2373540098
for more details.
---
deepspeed/launcher/runner.py | 5 +++++
1 file changed, 5 insertions(+)

diff --git a/deepspeed/launcher/runner.py b/deepspeed/launcher/runner.py
index 07d1713e..e9cd61b8 100755
--- a/deepspeed/launcher/runner.py
+++ b/deepspeed/launcher/runner.py
@@ -32,6 +32,11 @@ from deepspeed.accelerator import get_accelerator

DLTS_HOSTFILE = "/job/hostfile"
EXPORT_ENVS = ['MLFLOW', 'PYTHON', 'MV2', 'UCX']
+EXPORT_ENVS += [ # Extra based on what's added by module load DeepSpeed
+ 'LD_LIBRARY_PATH', 'PATH', 'EB', 'TRITON', 'CUDA', # important
+ 'ACLOCAL', 'CMAKE', 'CPATH', 'LIBRARY_PATH', 'MPL', 'NCCL',
+ 'PKG_CONFIG_PATH', 'XDG_DATA_DIRS',
+]
Comment on lines +26 to +30
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm on the fence about whether all of these should be included. The alternative is to trim this list and add variables case by case with a .deepspeed_env file: https://www.deepspeed.ai/getting-started/#multi-node-environment-variables

EXPORT_ENVS += NEBULA_EXPORT_ENVS
DEEPSPEED_ENVIRONMENT_NAME = os.getenv("DS_ENV_FILE", ".deepspeed_env")
DEEPSPEED_ENVIRONMENT_PATHS = [os.path.expanduser("~"), '.']
--
2.39.3

141 changes: 141 additions & 0 deletions easybuild/easyconfigs/d/DeepSpeed/DeepSpeed-0.14.5_pic-compile.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
From 90afd671dadf9fd6a7a221428f2c04c16d637494 Mon Sep 17 00:00:00 2001
From: Viktor Rehnberg <[email protected]>
Date: Thu, 23 May 2024 07:09:53 +0000
Subject: [PATCH] Compile with PIC

---
op_builder/builder.py | 15 ++++++++++-----
op_builder/cpu/builder.py | 3 ++-
op_builder/fused_adam.py | 4 +++-
op_builder/fused_lamb.py | 4 +++-
op_builder/fused_lion.py | 4 +++-
op_builder/xpu/builder.py | 3 ++-
6 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/op_builder/builder.py b/op_builder/builder.py
index ec7566aa..f08e1799 100644
--- a/op_builder/builder.py
+++ b/op_builder/builder.py
@@ -288,13 +288,13 @@ class OpBuilder(ABC):
'''
Returns optional list of compiler flags to forward to nvcc when building CUDA sources
'''
- return []
+ return ['-Xcompiler', '-fPIC']

def cxx_args(self):
'''
Returns optional list of compiler flags to forward to the build
'''
- return []
+ return ['-fPIC']

def is_compatible(self, verbose=True):
'''
@@ -746,15 +746,18 @@ class CUDAOpBuilder(OpBuilder):
)

def cxx_args(self):
+ args = super().cxx_args()
if sys.platform == "win32":
- return ['-O2']
+ args += ['-O2']
else:
- return ['-O3', '-std=c++17', '-g', '-Wno-reorder']
+ args += ['-O3', '-std=c++17', '-g', '-Wno-reorder']
+ return args

def nvcc_args(self):
if self.build_for_cpu:
return []
- args = ['-O3']
+ args = super().nvcc_args()
+ args += ['-O3']
if self.is_rocm_pytorch():
ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
args += [
@@ -835,6 +838,8 @@ class TorchCPUOpBuilder(CUDAOpBuilder):
'-lcublas',
'-g',
]
+ else:
+ args += super(CUDAOpBuilder, self).cxx_args()

CPU_ARCH = self.cpu_arch()
SIMD_WIDTH = self.simd_width()
diff --git a/op_builder/cpu/builder.py b/op_builder/cpu/builder.py
index d881842a..dfc5a31d 100644
--- a/op_builder/cpu/builder.py
+++ b/op_builder/cpu/builder.py
@@ -30,7 +30,8 @@ class CPUOpBuilder(OpBuilder):
return cpp_ext

def cxx_args(self):
- args = ['-O3', '-g', '-Wno-reorder']
+ args = super().cxx_args()
+ args += ['-O3', '-g', '-Wno-reorder']
CPU_ARCH = self.cpu_arch()
SIMD_WIDTH = self.simd_width()
args += [CPU_ARCH, '-fopenmp', SIMD_WIDTH]
diff --git a/op_builder/fused_adam.py b/op_builder/fused_adam.py
index ac6e4eea..0c723572 100644
--- a/op_builder/fused_adam.py
+++ b/op_builder/fused_adam.py
@@ -29,7 +29,9 @@ class FusedAdamBuilder(CUDAOpBuilder):
return args + self.version_dependent_macros()

def nvcc_args(self):
- nvcc_flags = ['-O3'] + self.version_dependent_macros()
+ nvcc_flags = super(CUDAOpBuilder, self).nvcc_args()
+ nvcc_flags += ['-O3']
+ nvcc_flags += self.version_dependent_macros()
if not self.is_rocm_pytorch():
nvcc_flags.extend(
['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] +
diff --git a/op_builder/fused_lamb.py b/op_builder/fused_lamb.py
index f0cb5577..a59b97d4 100644
--- a/op_builder/fused_lamb.py
+++ b/op_builder/fused_lamb.py
@@ -29,7 +29,9 @@ class FusedLambBuilder(CUDAOpBuilder):
return args + self.version_dependent_macros()

def nvcc_args(self):
- nvcc_flags = ['-O3'] + self.version_dependent_macros()
+ nvcc_flags = super(CUDAOpBuilder, self).nvcc_args()
+ nvcc_flags += ['-O3']
+ nvcc_flags += self.version_dependent_macros()
if self.is_rocm_pytorch():
ROCM_MAJOR, ROCM_MINOR = self.installed_rocm_version()
nvcc_flags += ['-DROCM_VERSION_MAJOR=%s' % ROCM_MAJOR, '-DROCM_VERSION_MINOR=%s' % ROCM_MINOR]
diff --git a/op_builder/fused_lion.py b/op_builder/fused_lion.py
index b900a8f2..119232b5 100644
--- a/op_builder/fused_lion.py
+++ b/op_builder/fused_lion.py
@@ -29,7 +29,9 @@ class FusedLionBuilder(CUDAOpBuilder):
return args + self.version_dependent_macros()

def nvcc_args(self):
- nvcc_flags = ['-O3'] + self.version_dependent_macros()
+ nvcc_flags = super(CUDAOpBuilder, self).nvcc_args()
+ nvcc_flags += ['-O3']
+ nvcc_flags += self.version_dependent_macros()
if not self.is_rocm_pytorch():
nvcc_flags.extend(
['-allow-unsupported-compiler' if sys.platform == "win32" else '', '-lineinfo', '--use_fast_math'] +
diff --git a/op_builder/xpu/builder.py b/op_builder/xpu/builder.py
index f430b7b6..5a1a2219 100644
--- a/op_builder/xpu/builder.py
+++ b/op_builder/xpu/builder.py
@@ -52,7 +52,8 @@ class SYCLOpBuilder(OpBuilder):
return version_ge_1_1 + version_ge_1_3 + version_ge_1_5

def cxx_args(self):
- cxx_flags = [
+ cxx_flags = super().cxx_args()
+ cxx_flags += [
'-fsycl', '-fsycl-targets=spir64_gen', '-g', '-gdwarf-4', '-O3', '-std=c++17', '-fPIC', '-DMKL_ILP64',
'-fno-strict-aliasing'
]
--
2.39.3

Loading