Skip to content

Commit 75ce850

Browse files
authored
Merge pull request #1 from ocaisa/add_gpu_support
Iterate over compat libs until we find something that works
2 parents 2ba47e4 + bb5301b commit 75ce850

File tree

5 files changed

+175
-140
lines changed

5 files changed

+175
-140
lines changed

gpu_support/add_nvidia_gpu_support.sh

Lines changed: 90 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
# Drop into the prefix shell or pipe this script into a Prefix shell with
44
# $EPREFIX/startprefix <<< /path/to/this_script.sh
55

6+
install_cuda_version="11.3.1"
7+
68
# If you want to install CUDA support on login nodes (typically without GPUs),
79
# set this variable to true. This will skip all GPU-dependent checks
810
install_wo_gpu=false
@@ -29,53 +31,8 @@ else
2931
echo "This means that all GPU-dependent tests/checks will be skipped!"
3032
fi
3133

32-
# set up basic environment variables, EasyBuild and Lmod
3334
EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/versions/2021.12/init/bash
3435

35-
current_dir=$(dirname $(realpath $0))
36-
37-
# Get arch type from EESSI environment
38-
eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}"
39-
40-
# Get OS family
41-
# TODO: needs more thorough testing
42-
os_family=$(uname | tr '[:upper:]' '[:lower:]')
43-
44-
# Get OS version
45-
# TODO: needs more thorough testing, taken from https://unix.stackexchange.com/a/6348
46-
if [ -f /etc/os-release ]; then
47-
# freedesktop.org and systemd
48-
. /etc/os-release
49-
os=$NAME
50-
ver=$VERSION_ID
51-
if [[ "$os" == *"Rocky"* ]]; then
52-
os="rhel"
53-
fi
54-
if [[ "$os" == *"Debian"* ]]; then
55-
os="debian"
56-
fi
57-
elif type lsb_release >/dev/null 2>&1; then
58-
# linuxbase.org
59-
os=$(lsb_release -si)
60-
ver=$(lsb_release -sr)
61-
elif [ -f /etc/lsb-release ]; then
62-
# For some versions of Debian/Ubuntu without lsb_release command
63-
. /etc/lsb-release
64-
os=$DISTRIB_ID
65-
ver=$DISTRIB_RELEASE
66-
elif [ -f /etc/debian_version ]; then
67-
# Older Debian/Ubuntu/etc.
68-
os=Debian
69-
ver=$(cat /etc/debian_version)
70-
else
71-
# Fall back to uname, e.g. "Linux <version>", also works for BSD, etc.
72-
os=$(uname -s)
73-
ver=$(uname -r)
74-
fi
75-
# Convert OS version to major versions, e.g. rhel8.5 -> rhel8
76-
# TODO: needs testing for e.g. Ubuntu 20.04
77-
ver=${ver%.*}
78-
7936
##############################################################################################
8037
# Check that the CUDA driver version is adequate
8138
# (
@@ -95,41 +52,6 @@ if [[ "${install_wo_gpu}" != "true" ]]; then
9552
fi
9653
fi
9754

98-
99-
# Check if the CUDA compat libraries are installed and compatible with the target CUDA version
100-
# if not find the latest version of the compatibility libraries and install them
101-
102-
# get URL to latest CUDA compat libs, exit if URL is invalid
103-
latest_cuda_compat_url="$($(dirname "$BASH_SOURCE")/get_latest_cuda_compatlibs.sh ${os} ${ver} ${eessi_cpu_family})"
104-
ret=$?
105-
if [ $ret -ne 0 ]; then
106-
echo $latest_cuda_compat_url
107-
exit 1
108-
fi
109-
latest_driver_version="${latest_cuda_compat_url%-*}"
110-
latest_driver_version="${latest_driver_version##*_}"
111-
112-
install_compat_libs=false
113-
host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia"
114-
# libcuda.so points to actual cuda compat lib with driver version in its name
115-
# if this file exists, cuda compat libs are installed and we can compare the version
116-
if [ -e $host_injections_dir/latest/compat/libcuda.so ]; then
117-
eessi_driver_version=$( realpath $host_injections_dir/latest/compat/libcuda.so)
118-
eessi_driver_version="${eessi_driver_version##*so.}"
119-
else
120-
eessi_driver_version=0
121-
fi
122-
123-
if [ ${latest_driver_version//./} -gt ${eessi_driver_version//./} ]; then
124-
install_compat_libs=true
125-
else
126-
echo "CUDA compat libs are up-to-date, skip installation."
127-
fi
128-
129-
if [ "${install_compat_libs}" == true ]; then
130-
source $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh $latest_cuda_compat_url
131-
fi
132-
13355
###############################################################################################
13456
###############################################################################################
13557
# Install CUDA
@@ -141,7 +63,6 @@ if [ -d ${cuda_install_dir}/modules/all ]; then
14163
module use ${cuda_install_dir}/modules/all
14264
fi
14365
# only install CUDA if specified version is not found
144-
install_cuda_version="11.3.1"
14566
module avail 2>&1 | grep -i CUDA/${install_cuda_version} &> /dev/null
14667
if [[ $? -eq 0 ]]; then
14768
echo "CUDA module found! No need to install CUDA again, proceeding with tests"
@@ -164,27 +85,92 @@ else
16485
fi
16586
fi
16687

167-
cd $current_dir
168-
if [[ "${install_wo_gpu}" != "true" ]]; then
169-
source $(dirname "$BASH_SOURCE")/test_cuda
170-
else
171-
echo "Requested to install CUDA without GPUs present, so we skip final tests."
172-
echo "Instead we test if module load CUDA works as expected..."
173-
if [ -d ${cuda_install_dir}/modules/all ]; then
174-
module use ${cuda_install_dir}/modules/all/
175-
else
176-
echo "Cannot load CUDA, modules path does not exist, exiting now..."
177-
exit 1
178-
fi
179-
module load CUDA
180-
ret=$?
181-
if [ $ret -ne 0 ]; then
182-
echo "Could not load CUDA even though modules path exists..."
183-
exit 1
184-
else
185-
echo "Successfully loaded CUDA, you are good to go! :)"
186-
echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix"
187-
echo " - To use these modules:"
188-
echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/"
189-
fi
88+
# Check if the CUDA compat libraries are installed and compatible with the target CUDA version
89+
# if not find the latest version of the compatibility libraries and install them
90+
91+
# get URLs of all available CUDA compat libs (newest first), exit if none are found
92+
cuda_compat_urls="$($(dirname "$BASH_SOURCE")/get_cuda_compatlibs.sh)"
93+
ret=$?
94+
if [ $ret -ne 0 ]; then
95+
echo $cuda_compat_urls
96+
exit 1
19097
fi
98+
99+
# loop over the compat library versions until we get one that works for us
100+
keep_driver_check=1
101+
# Do a maximum of five attempts
102+
for value in {1..5}
103+
do
104+
latest_cuda_compat_url=$(echo $cuda_compat_urls | cut -d " " -f1)
105+
# Chomp that value out of the list
106+
cuda_compat_urls=$(echo $cuda_compat_urls | cut -d " " -f2-)
107+
latest_driver_version="${latest_cuda_compat_url%-*}"
108+
latest_driver_version="${latest_driver_version##*-}"
109+
110+
install_compat_libs=false
111+
host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia"
112+
# libcuda.so points to actual cuda compat lib with driver version in its name
113+
# if this file exists, cuda compat libs are installed and we can compare the version
114+
if [ -e $host_injections_dir/latest/compat/libcuda.so ]; then
115+
eessi_driver_version=$( realpath $host_injections_dir/latest/compat/libcuda.so)
116+
eessi_driver_version="${eessi_driver_version##*so.}"
117+
else
118+
eessi_driver_version=0
119+
fi
120+
121+
if [ $keep_driver_check -eq 1 ]
122+
then
123+
# only keep the driver check for the latest version
124+
keep_driver_check=0
125+
else
126+
eessi_driver_version=0
127+
fi
128+
129+
if [ ${latest_driver_version//./} -gt ${eessi_driver_version//./} ]; then
130+
install_compat_libs=true
131+
else
132+
echo "CUDA compat libs are up-to-date, skip installation."
133+
fi
134+
135+
if [ "${install_compat_libs}" == true ]; then
136+
source $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh $latest_cuda_compat_url
137+
fi
138+
139+
if [[ "${install_wo_gpu}" != "true" ]]; then
140+
bash $(dirname "$BASH_SOURCE")/test_cuda.sh
141+
if [ $? -eq 0 ]
142+
then
143+
exit 0
144+
else
145+
echo
146+
echo "It looks like your driver is not recent enough to work with that release of CUDA, consider updating!"
147+
echo "I'll try an older release to see if that will work..."
148+
echo
149+
fi
150+
else
151+
echo "Requested to install CUDA without GPUs present, so we skip final tests."
152+
echo "Instead we test if module load CUDA works as expected..."
153+
if [ -d ${cuda_install_dir}/modules/all ]; then
154+
module use ${cuda_install_dir}/modules/all/
155+
else
156+
echo "Cannot load CUDA, modules path does not exist, exiting now..."
157+
exit 1
158+
fi
159+
module load CUDA
160+
ret=$?
161+
if [ $ret -ne 0 ]; then
162+
echo "Could not load CUDA even though modules path exists..."
163+
exit 1
164+
else
165+
echo "Successfully loaded CUDA, you are good to go! :)"
166+
echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/versions/host_injections} as your EasyBuild prefix"
167+
echo " - To use these modules:"
168+
echo " module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/"
169+
fi
170+
break
171+
fi
172+
done
173+
174+
echo "Tried to install 5 different generations of compat libraries and none worked,"
175+
echo "this usually means your driver is very out of date!"
176+
exit 1

gpu_support/get_cuda_compatlibs.sh

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
#!/bin/bash
2+
3+
current_dir=$(dirname $(realpath $0))
4+
5+
# Get arch type from EESSI environment
6+
if [[ -z "${EESSI_CPU_FAMILY}" ]]; then
7+
# set up basic environment variables, EasyBuild and Lmod
8+
echo Here!!
9+
EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/versions/2021.12/init/bash
10+
fi
11+
eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}"
12+
13+
# Get OS family
14+
# TODO: needs more thorough testing
15+
os_family=$(uname | tr '[:upper:]' '[:lower:]')
16+
17+
# Get OS version
18+
# TODO: needs more thorough testing, taken from https://unix.stackexchange.com/a/6348
19+
if [ -f /etc/os-release ]; then
20+
# freedesktop.org and systemd
21+
. /etc/os-release
22+
os=$NAME
23+
ver=$VERSION_ID
24+
if [[ "$os" == *"Rocky"* ]]; then
25+
os="rhel"
26+
fi
27+
if [[ "$os" == *"Debian"* ]]; then
28+
os="debian"
29+
fi
30+
elif type lsb_release >/dev/null 2>&1; then
31+
# linuxbase.org
32+
os=$(lsb_release -si)
33+
ver=$(lsb_release -sr)
34+
elif [ -f /etc/lsb-release ]; then
35+
# For some versions of Debian/Ubuntu without lsb_release command
36+
. /etc/lsb-release
37+
os=$DISTRIB_ID
38+
ver=$DISTRIB_RELEASE
39+
elif [ -f /etc/debian_version ]; then
40+
# Older Debian/Ubuntu/etc.
41+
os=Debian
42+
ver=$(cat /etc/debian_version)
43+
else
44+
# Fall back to uname, e.g. "Linux <version>", also works for BSD, etc.
45+
os=$(uname -s)
46+
ver=$(uname -r)
47+
fi
48+
# Convert OS version to its major version, e.g. rhel8.5 -> rhel8
49+
# TODO: needs testing for e.g. Ubuntu 20.04
50+
ver=${ver%.*}
51+
52+
# build URL for CUDA libraries
53+
cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/"${os}${ver}"/"${eessi_cpu_family}"/"
54+
# get all versions in descending order
55+
files=$(curl -s "${cuda_url}" | grep 'cuda-compat' | sed 's/<\/\?[^>]\+>//g' | xargs -n1 | sort -r )
56+
if [[ -z "${files// }" ]]; then
57+
echo "Could not find any compat lib files under" ${cuda_url}
58+
exit 1
59+
fi
60+
for file in $files; do echo "${cuda_url}$file"; done

gpu_support/get_latest_cuda_compatlibs.sh

Lines changed: 0 additions & 21 deletions
This file was deleted.

gpu_support/install_cuda_compatlibs.sh

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,17 @@
33
libs_url=$1
44

55
current_dir=$(dirname $(realpath $0))
6+
host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia"
7+
host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections}
68

79
# Create a general space for our NVIDIA compat drivers
810
if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then
911
mkdir -p ${host_injections_dir}
10-
cd ${host_injections_dir}
1112
else
1213
echo "Cannot write to eessi host_injections space, exiting now..." >&2
1314
exit 1
1415
fi
16+
cd ${host_injections_dir}
1517

1618
# Check if we have any version installed by checking for the existence of /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest
1719

@@ -30,6 +32,7 @@ tmpdir=$(mktemp -d)
3032
cd $tmpdir
3133
compat_file=${libs_url##*/}
3234
wget ${libs_url}
35+
echo $compat_file
3336

3437
# Unpack it
3538
# (the requirements here are OS dependent, can we get around that?)
@@ -46,30 +49,33 @@ else
4649
exit 1
4750
fi
4851
cd $host_injections_dir
52+
cuda_dir=$(basename ${tmpdir}/usr/local/cuda-*)
4953
# TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir
5054
mv -n ${tmpdir}/usr/local/cuda-* .
5155
rm -r ${tmpdir}
5256

53-
# Add a symlink that points to the latest version
54-
latest_cuda_dir=$(find . -maxdepth 1 -type d | grep -i cuda | sort | tail -n1)
55-
ln -sf ${latest_cuda_dir} latest
57+
# Add a symlink that points the latest version to the version we just installed
58+
ln -sfn ${cuda_dir} latest
5659

5760
if [ ! -e latest ] ; then
5861
echo "Symlink to latest cuda compat lib version is broken, exiting now..."
5962
exit 1
6063
fi
6164

6265
# Create the space to host the libraries
63-
host_injection_libs_dir=/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family}
64-
mkdir -p ${host_injection_libs_dir}
66+
mkdir -p ${host_injection_linker_dir}
6567
# Symlink in the path to the latest libraries
66-
if [ ! -d "${host_injection_libs_dir}/lib" ]; then
67-
ln -s ${host_injections_dir}/latest/compat ${host_injection_libs_dir}/lib
68-
elif [ ! "${host_injection_libs_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then
68+
if [ ! -d "${host_injection_linker_dir}/lib" ]; then
69+
ln -s ${host_injections_dir}/latest/compat ${host_injection_linker_dir}/lib
70+
elif [ ! "${host_injection_linker_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then
6971
echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..."
70-
echo "${host_injection_libs_dir}/lib should point to ${host_injections_dir}/latest/compat"
72+
echo "${host_injection_linker_dir}/lib should point to ${host_injections_dir}/latest/compat"
7173
exit 1
7274
fi
7375

7476
# return to initial dir
7577
cd $current_dir
78+
79+
echo
80+
echo CUDA driver compatability drivers installed for CUDA version:
81+
echo ${cuda_dir/cuda-/}

0 commit comments

Comments
 (0)