Skip to content

Commit edf8e0a

Browse files
authored
Merge pull request #369 from trz42/nessi-2023.06-PyTorch-2.1.2-2023a-CUDA-12.1.1
{2023.06}[foss/2023a] PyTorch v2.1.2 with CUDA/12.1.1
2 parents e4acbbb + f780ca5 commit edf8e0a

File tree

7 files changed

+84
-35
lines changed

7 files changed

+84
-35
lines changed

EESSI-install-software.sh

Lines changed: 43 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ display_help() {
1717
echo " --skip-cuda-install - disable installing a full CUDA SDK in the host_injections prefix (e.g. in CI)"
1818
}
1919

20+
# Function to check if a command exists
# Arguments: $1 - name of the command to look up
# Returns:   the exit status of `command -v "$1"` (0 when the shell can resolve the name)
# Outputs:   nothing; stdout and stderr of the lookup are redirected to /dev/null
21+
function command_exists() {
22+
command -v "$1" >/dev/null 2>&1
23+
}
24+
2025
function copy_build_log() {
2126
# copy specified build log to specified directory, with some context added
2227
build_log=${1}
@@ -147,6 +152,39 @@ else
147152
mkdir -p ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}
148153
fi
149154

155+
# We need to ensure that certain files are present or updated before we source
156+
# $TOPDIR/init/eessi_environment_variables
157+
# Particularly the files we need to have present/updated in
158+
# ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}
159+
# are:
160+
# - .lmod/lmodrc.lua
161+
# - .lmod/SitePackage.lua
162+
# We run scripts to create them if they don't exist or if the scripts have been
163+
# changed in the PR.
164+
165+
# Set base directory for software and for Lmod config files
166+
_eessi_software_path=${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}
167+
_lmod_cfg_dir=${_eessi_software_path}/.lmod
168+
169+
# We assume there's only one diff file that corresponds to the PR patch file
# NOTE(review): the file is located by parsing `ls` output for the glob [0-9]*.diff in the
# current working directory and keeping the first line. If nothing matches, pr_diff ends up
# empty and the unquoted ${pr_diff} further down would make `cat` read from stdin
# - TODO confirm a diff file is always present at this point.
170+
pr_diff=$(ls [0-9]*.diff | head -1)
171+
172+
# Create or update ${_eessi_software_path}/.lmod/lmodrc.lua
173+
_lmodrc_file=${_lmod_cfg_dir}/lmodrc.lua
174+
# The pipeline below keeps the '+++' header lines of the diff, takes the second
# space-separated field (the file path), strips a leading single-letter prefix such as "b/",
# and tests for an exact match on create_lmodrc.py. The trailing `echo $?` stores the
# resulting exit status as a string, so '0' means create_lmodrc.py is touched by the PR.
_lmodrc_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodrc.py$' > /dev/null; echo $?)
175+
# Regenerate when the file is missing or when the generator script changed in the PR
if [ ! -f "${_lmodrc_file}" ] || [ "${_lmodrc_changed}" == '0' ]; then
176+
python3 ${TOPDIR}/create_lmodrc.py ${_eessi_software_path}
177+
check_exit_code $? "${_lmodrc_file} created/updated" "Failed to create/update ${_lmodrc_file}"
178+
fi
179+
180+
# Create or update ${_eessi_software_path}/.lmod/SitePackage.lua
181+
_lmod_sitepackage_file=${_lmod_cfg_dir}/SitePackage.lua
182+
# Scan the '+++' header lines of the PR diff, reduce each to its file path (second
# space-separated field, leading single-letter prefix such as "b/" removed) and look for an
# exact match on create_lmodsitepackage.py. `echo $?` captures the exit status as a string:
# '0' means the generator script is part of the PR.
_sitepackage_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodsitepackage.py$' > /dev/null; echo $?)
183+
# Regenerate when SitePackage.lua is missing or when its generator script changed in the PR
if [ ! -f "${_lmod_sitepackage_file}" ] || [ "${_sitepackage_changed}" == '0' ]; then
184+
python3 ${TOPDIR}/create_lmodsitepackage.py ${_eessi_software_path}
185+
check_exit_code $? "${_lmod_sitepackage_file} created/updated" "Failed to create/update ${_lmod_sitepackage_file}"
186+
fi
187+
150188
# Set all the EESSI environment variables (respecting $EESSI_SOFTWARE_SUBDIR_OVERRIDE)
151189
# $EESSI_SILENT - don't print any messages
152190
# $EESSI_BASIC_ENV - give a basic set of environment variables
@@ -212,13 +250,11 @@ else
212250
echo "Skipping installation of CUDA SDK and cu* libraries in host_injections, since the --skip-cuda-install flag was passed"
213251
fi
214252

215-
# Install drivers in host_injections
216-
# TODO: this is commented out for now, because the script assumes that nvidia-smi is available and works;
217-
# if not, an error is produced, and the bot flags the whole build as failed (even when not installing GPU software)
218-
# ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
219-
220-
# Don't run the Lmod GPU driver check when doing builds (may not have a GPU, and it's not relevant for vanilla builds anyway)
221-
export EESSI_OVERRIDE_GPU_CHECK=1
253+
# Install NVIDIA drivers in host_injections (if they exist)
# The link script is only run when `nvidia-smi` can be resolved by the shell; when it cannot,
# this step is skipped without printing anything.
# NOTE(review): the exit status of link_nvidia_host_libraries.sh is not checked here
# - confirm that a failure of the script should not abort the build.
254+
if command_exists "nvidia-smi"; then
255+
echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
256+
${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
257+
fi
222258

223259
# use PR patch file to determine in which easystack files stuff was added
224260
changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing')
@@ -268,25 +304,5 @@ else
268304
done
269305
fi
270306

271-
### add packages here
272-
273-
echo ">> Creating/updating Lmod RC file..."
274-
export LMOD_CONFIG_DIR="${EASYBUILD_INSTALLPATH}/.lmod"
275-
lmod_rc_file="$LMOD_CONFIG_DIR/lmodrc.lua"
276-
lmodrc_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodrc.py$' > /dev/null; echo $?)
277-
if [ ! -f $lmod_rc_file ] || [ ${lmodrc_changed} == '0' ]; then
278-
python3 $TOPDIR/create_lmodrc.py ${EASYBUILD_INSTALLPATH}
279-
check_exit_code $? "$lmod_rc_file created" "Failed to create $lmod_rc_file"
280-
fi
281-
282-
echo ">> Creating/updating Lmod SitePackage.lua ..."
283-
export LMOD_PACKAGE_PATH="${EASYBUILD_INSTALLPATH}/.lmod"
284-
lmod_sitepackage_file="$LMOD_PACKAGE_PATH/SitePackage.lua"
285-
sitepackage_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodsitepackage.py$' > /dev/null; echo $?)
286-
if [ ! -f "$lmod_sitepackage_file" ] || [ "${sitepackage_changed}" == '0' ]; then
287-
python3 $TOPDIR/create_lmodsitepackage.py ${EASYBUILD_INSTALLPATH}
288-
check_exit_code $? "$lmod_sitepackage_file created" "Failed to create $lmod_sitepackage_file"
289-
fi
290-
291307
echo ">> Cleaning up ${TMPDIR}..."
292308
rm -r ${TMPDIR}

bot/build.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,11 @@ if [[ ! -z ${SHARED_FS_PATH} ]]; then
271271
BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections")
272272
fi
273273

274+
# Don't run the Lmod GPU driver check when doing builds (may not have a GPU, and it's not relevant for vanilla builds anyway)
275+
echo "EESSI_OVERRIDE_GPU_CHECK='${EESSI_OVERRIDE_GPU_CHECK}'"
276+
export EESSI_OVERRIDE_GPU_CHECK=1
277+
echo "EESSI_OVERRIDE_GPU_CHECK='${EESSI_OVERRIDE_GPU_CHECK}'"
278+
274279
# create tmp file for output of build step
275280
build_outerr=$(mktemp build.outerr.XXXX)
276281

easystacks/pilot.nessi.no/2023.06/eessi-2023.06-eb-4.9.1-2023a.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,3 +52,6 @@ easyconfigs:
5252
- ESPResSo-4.2.2-foss-2023a.eb:
5353
options:
5454
from-pr: 20595
55+
- PyTorch-2.1.2-foss-2023a-CUDA-12.1.1.eb:
56+
options:
57+
cuda-compute-capabilities: 6.0,6.1,7.0,7.5,8.0,8.6,8.9,9.0

eb_hooks.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,29 @@ def pre_configure_hook_openblas_optarch_generic(self, *args, **kwargs):
425425
raise EasyBuildError("OpenBLAS-specific hook triggered for non-OpenBLAS easyconfig?!")
426426

427427

428+
def pre_configure_hook_pytorch_add_cupti_libdir(self, *args, **kwargs):
429+
"""
430+
Pre-configure hook for PyTorch: add directory $EESSI_SOFTWARE_PATH/software/CUDA/12.1.1/extras/CUPTI/lib64 to LIBRARY_PATH
431+
"""
432+
# The hook only changes anything when the easyconfig is PyTorch AND its 'cudaver'
# template value is exactly '12.1.1'.
if self.name == 'PyTorch':
433+
if 'cudaver' in self.cfg.template_values and self.cfg.template_values['cudaver'] == '12.1.1':
434+
_cudaver = self.cfg.template_values['cudaver']
435+
print_msg("pre_configure_hook_pytorch_add_cupti_libdir: CUDA version: '%s'" % _cudaver)
436+
# Current value of LIBRARY_PATH (None when the variable is not set)
_library_path = os.getenv('LIBRARY_PATH')
437+
print_msg("pre_configure_hook_pytorch_add_cupti_libdir: library_path: '%s'", _library_path)
438+
# NOTE(review): os.getenv returns None when EESSI_SOFTWARE_PATH is not set, in which case
# the os.path.join call below raises TypeError - confirm the variable is always defined here.
_eessi_software_path = os.getenv('EESSI_SOFTWARE_PATH')
439+
print_msg("pre_configure_hook_pytorch_add_cupti_libdir: eessi_software_path: '%s'", _eessi_software_path)
440+
# Resulting directory: <EESSI_SOFTWARE_PATH>/software/CUDA/<cudaver>/extras/CUPTI/lib64
_cupti_lib_dir = os.path.join(_eessi_software_path, 'software', 'CUDA', _cudaver, 'extras', 'CUPTI', 'lib64')
441+
print_msg("pre_configure_hook_pytorch_add_cupti_libdir: cupti_lib_dir: '%s'", _cupti_lib_dir)
442+
# Append the CUPTI directory to an existing LIBRARY_PATH, or use it as the sole entry
# when LIBRARY_PATH is empty or unset.
if _library_path:
443+
env.setvar('LIBRARY_PATH', ':'.join([_library_path, _cupti_lib_dir]))
444+
else:
445+
env.setvar('LIBRARY_PATH', _cupti_lib_dir)
446+
print_msg("pre_configure_hook_pytorch_add_cupti_libdir: LIBRARY_PATH: '%s'", os.getenv('LIBRARY_PATH'))
447+
# NOTE(review): indentation is not visible in this view; judging by the error message, this
# `else` presumably pairs with the `self.name == 'PyTorch'` check - verify against the file.
else:
448+
raise EasyBuildError("PyTorch-specific hook triggered for non-PyTorch easyconfig?!")
449+
450+
428451
def pre_configure_hook_libfabric_disable_psm3_x86_64_generic(self, *args, **kwargs):
429452
"""Add --disable-psm3 to libfabric configure options when building with --optarch=GENERIC on x86_64."""
430453
if self.name == 'libfabric':
@@ -560,6 +583,12 @@ def pre_test_hook_increase_max_failed_tests_arm_PyTorch(self, *args, **kwargs):
560583
"""
561584
if self.name == 'PyTorch' and self.version == '2.1.2' and get_cpu_architecture() == AARCH64:
562585
self.cfg['max_failed_tests'] = 10
586+
if 'cudaver' in self.cfg.template_values and self.cfg.template_values['cudaver'] == '12.1.1':
587+
_cudaver = self.cfg.template_values['cudaver']
588+
_runtest = self.cfg['runtest']
589+
self.cfg['runtest'] = _runtest.replace(
590+
'PYTHONUNBUFFERED',
591+
'PYTORCH_TEST_RUN_EVERYTHING_IN_SERIAL=1 PYTHONUNBUFFERED')
563592

564593

565594
def pre_single_extension_hook(ext, *args, **kwargs):
@@ -851,6 +880,7 @@ def inject_gpu_property(ec):
851880
'libfabric': pre_configure_hook_libfabric_disable_psm3_x86_64_generic,
852881
'MetaBAT': pre_configure_hook_metabat_filtered_zlib_dep,
853882
'OpenBLAS': pre_configure_hook_openblas_optarch_generic,
883+
'PyTorch': pre_configure_hook_pytorch_add_cupti_libdir,
854884
'WRF': pre_configure_hook_wrf_aarch64,
855885
'at-spi2-core': pre_configure_hook_atspi2core_filter_ld_library_path,
856886
}

eessi_container.sh

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -477,12 +477,6 @@ if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
477477
mkdir -p ${EESSI_USR_LOCAL_CUDA}
478478
BIND_PATHS="${BIND_PATHS},${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda"
479479
[[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"
480-
if [[ "${NVIDIA_MODE}" == "install" ]] ; then
481-
# We need to "trick" our LMOD_RC file to allow us to load CUDA modules even without a CUDA driver
482-
# (this works because we build within a container and the LMOD_RC recognises that)
483-
touch ${EESSI_TMPDIR}/libcuda.so
484-
export SINGULARITY_CONTAINLIBS="${EESSI_TMPDIR}/libcuda.so"
485-
fi
486480
fi
487481
fi
488482

install_scripts.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,6 @@ copy_files_by_list ${TOPDIR}/scripts ${INSTALL_PREFIX}/scripts "${script_files[@
112112
nvidia_files=(
113113
eessi-2023.06-cuda-and-libraries.yml
114114
install_cuda_and_libraries.sh
115-
install_cuda_host_injections.sh
116-
install_cuDNN_host_injections.sh
117115
link_nvidia_host_libraries.sh
118116
)
119117
copy_files_by_list ${TOPDIR}/scripts/gpu_support/nvidia ${INSTALL_PREFIX}/scripts/gpu_support/nvidia "${nvidia_files[@]}"

run_in_compat_layer_env.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ fi
2929
if [ ! -z ${http_proxy} ]; then
3030
INPUT="export http_proxy=${http_proxy}; ${INPUT}"
3131
fi
32+
if [ ! -z ${EESSI_OVERRIDE_GPU_CHECK} ]; then
33+
INPUT="export EESSI_OVERRIDE_GPU_CHECK=${EESSI_OVERRIDE_GPU_CHECK}; ${INPUT}"
34+
fi
3235
if [ ! -z ${https_proxy} ]; then
3336
INPUT="export https_proxy=${https_proxy}; ${INPUT}"
3437
fi

0 commit comments

Comments
 (0)