33# Drop into the prefix shell or pipe this script into a Prefix shell with
44# $EPREFIX/startprefix <<< /path/to/this_script.sh
55
6+ install_cuda_version=" 11.3.1"
7+
68# If you want to install CUDA support on login nodes (typically without GPUs),
79# set this variable to true. This will skip all GPU-dependent checks
810install_wo_gpu=false
2931 echo " This means that all GPU-dependent tests/checks will be skipped!"
3032fi
3133
32- # set up basic environment variables, EasyBuild and Lmod
3334EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/versions/2021.12/init/bash
3435
35- current_dir=$( dirname $( realpath $0 ) )
36-
37- # Get arch type from EESSI environment
38- eessi_cpu_family=" ${EESSI_CPU_FAMILY:- x86_64} "
39-
40- # Get OS family
41- # TODO: needs more thorough testing
42- os_family=$( uname | tr ' [:upper:]' ' [:lower:]' )
43-
44- # Get OS version
45- # TODO: needs more thorough testing, taken from https://unix.stackexchange.com/a/6348
46- if [ -f /etc/os-release ]; then
47- # freedesktop.org and systemd
48- . /etc/os-release
49- os=$NAME
50- ver=$VERSION_ID
51- if [[ " $os " == * " Rocky" * ]]; then
52- os=" rhel"
53- fi
54- if [[ " $os " == * " Debian" * ]]; then
55- os=" debian"
56- fi
57- elif type lsb_release > /dev/null 2>&1 ; then
58- # linuxbase.org
59- os=$( lsb_release -si)
60- ver=$( lsb_release -sr)
61- elif [ -f /etc/lsb-release ]; then
62- # For some versions of Debian/Ubuntu without lsb_release command
63- . /etc/lsb-release
64- os=$DISTRIB_ID
65- ver=$DISTRIB_RELEASE
66- elif [ -f /etc/debian_version ]; then
67- # Older Debian/Ubuntu/etc.
68- os=Debian
69- ver=$( cat /etc/debian_version)
70- else
71- # Fall back to uname, e.g. "Linux <version>", also works for BSD, etc.
72- os=$( uname -s)
73- ver=$( uname -r)
74- fi
75- # Convert OS version to major versions, e.g. rhel8.5 -> rhel8
76- # TODO: needs testing for e.g. Ubuntu 20.04
77- ver=${ver% .* }
78-
7936# #############################################################################################
8037# Check that the CUDA driver version is adequate
8138# (
@@ -95,41 +52,6 @@ if [[ "${install_wo_gpu}" != "true" ]]; then
9552 fi
9653fi
9754
98-
99- # Check if the CUDA compat libraries are installed and compatible with the target CUDA version
100- # if not find the latest version of the compatibility libraries and install them
101-
102- # get URL to latest CUDA compat libs, exit if URL is invalid
103- latest_cuda_compat_url=" $( $( dirname " $BASH_SOURCE " ) /get_latest_cuda_compatlibs.sh ${os} ${ver} ${eessi_cpu_family} ) "
104- ret=$?
105- if [ $ret -ne 0 ]; then
106- echo $latest_cuda_compat_url
107- exit 1
108- fi
109- latest_driver_version=" ${latest_cuda_compat_url% -* } "
110- latest_driver_version=" ${latest_driver_version##* _} "
111-
112- install_compat_libs=false
113- host_injections_dir=" /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia"
114- # libcuda.so points to actual cuda compat lib with driver version in its name
115- # if this file exists, cuda compat libs are installed and we can compare the version
116- if [ -e $host_injections_dir /latest/compat/libcuda.so ]; then
117- eessi_driver_version=$( realpath $host_injections_dir /latest/compat/libcuda.so)
118- eessi_driver_version=" ${eessi_driver_version##* so.} "
119- else
120- eessi_driver_version=0
121- fi
122-
123- if [ ${latest_driver_version// ./ } -gt ${eessi_driver_version// ./ } ]; then
124- install_compat_libs=true
125- else
126- echo " CUDA compat libs are up-to-date, skip installation."
127- fi
128-
129- if [ " ${install_compat_libs} " == true ]; then
130- source $( dirname " $BASH_SOURCE " ) /install_cuda_compatlibs.sh $latest_cuda_compat_url
131- fi
132-
13355# ##############################################################################################
13456# ##############################################################################################
13557# Install CUDA
@@ -141,7 +63,6 @@ if [ -d ${cuda_install_dir}/modules/all ]; then
14163 module use ${cuda_install_dir} /modules/all
14264fi
14365# only install CUDA if specified version is not found
144- install_cuda_version=" 11.3.1"
14566module avail 2>&1 | grep -i CUDA/${install_cuda_version} & > /dev/null
14667if [[ $? -eq 0 ]]; then
14768 echo " CUDA module found! No need to install CUDA again, proceeding with tests"
@@ -164,27 +85,92 @@ else
16485 fi
16586fi
16687
167- cd $current_dir
168- if [[ " ${install_wo_gpu} " != " true" ]]; then
169- source $( dirname " $BASH_SOURCE " ) /test_cuda
170- else
171- echo " Requested to install CUDA without GPUs present, so we skip final tests."
172- echo " Instead we test if module load CUDA works as expected..."
173- if [ -d ${cuda_install_dir} /modules/all ]; then
174- module use ${cuda_install_dir} /modules/all/
175- else
176- echo " Cannot load CUDA, modules path does not exist, exiting now..."
177- exit 1
178- fi
179- module load CUDA
180- ret=$?
181- if [ $ret -ne 0 ]; then
182- echo " Could not load CUDA even though modules path exists..."
183- exit 1
184- else
185- echo " Successfully loaded CUDA, you are good to go! :)"
186- echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/ versions/ host_injections} as your EasyBuild prefix"
187- echo " - To use these modules:"
188- echo " module use ${EESSI_SOFTWARE_PATH/ versions/ host_injections} /modules/all/"
189- fi
88+ # Check if the CUDA compat libraries are installed and compatible with the target CUDA version;
89+ # if not, find the latest version of the compatibility libraries and install them
90+
91+ # get the space-separated list of CUDA compat lib URLs, exit if the helper script fails
92+ cuda_compat_urls=" $( $( dirname " $BASH_SOURCE " ) /get_cuda_compatlibs.sh) "
93+ ret=$?
94+ if [ $ret -ne 0 ]; then
95+ echo $cuda_compat_urls
96+ exit 1
19097fi
98+
99+ # loop over the compat library versions until we get one that works for us
100+ keep_driver_check=1
101+ # Do a maximum of five attempts
102+ for value in {1..5}
103+ do
104+ latest_cuda_compat_url=$( echo $cuda_compat_urls | cut -d " " -f1)
105+ # Chomp that value out of the list
106+ cuda_compat_urls=$( echo $cuda_compat_urls | cut -d " " -f2-)
107+ latest_driver_version=" ${latest_cuda_compat_url% -* } "
108+ latest_driver_version=" ${latest_driver_version##* -} "
109+
110+ install_compat_libs=false
111+ host_injections_dir=" /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia"
112+ # libcuda.so points to actual cuda compat lib with driver version in its name
113+ # if this file exists, cuda compat libs are installed and we can compare the version
114+ if [ -e $host_injections_dir /latest/compat/libcuda.so ]; then
115+ eessi_driver_version=$( realpath $host_injections_dir /latest/compat/libcuda.so)
116+ eessi_driver_version=" ${eessi_driver_version##* so.} "
117+ else
118+ eessi_driver_version=0
119+ fi
120+
121+ if [ $keep_driver_check -eq 1 ]
122+ then
123+ # only keep the driver check for the latest version
124+ keep_driver_check=0
125+ else
126+ eessi_driver_version=0
127+ fi
128+
129+ if [ ${latest_driver_version// ./ } -gt ${eessi_driver_version// ./ } ]; then
130+ install_compat_libs=true
131+ else
132+ echo " CUDA compat libs are up-to-date, skip installation."
133+ fi
134+
135+ if [ " ${install_compat_libs} " == true ]; then
136+ source $( dirname " $BASH_SOURCE " ) /install_cuda_compatlibs.sh $latest_cuda_compat_url
137+ fi
138+
139+ if [[ " ${install_wo_gpu} " != " true" ]]; then
140+ bash $( dirname " $BASH_SOURCE " ) /test_cuda.sh
141+ if [ $? -eq 0 ]
142+ then
143+ exit 0
144+ else
145+ echo
146+ echo " It looks like your driver is not recent enough to work with that release of CUDA, consider updating!"
147+ echo " I'll try an older release to see if that will work..."
148+ echo
149+ fi
150+ else
151+ echo " Requested to install CUDA without GPUs present, so we skip final tests."
152+ echo " Instead we test if module load CUDA works as expected..."
153+ if [ -d ${cuda_install_dir} /modules/all ]; then
154+ module use ${cuda_install_dir} /modules/all/
155+ else
156+ echo " Cannot load CUDA, modules path does not exist, exiting now..."
157+ exit 1
158+ fi
159+ module load CUDA
160+ ret=$?
161+ if [ $ret -ne 0 ]; then
162+ echo " Could not load CUDA even though modules path exists..."
163+ exit 1
164+ else
165+ echo " Successfully loaded CUDA, you are good to go! :)"
166+ echo " - To build CUDA enabled modules use ${EESSI_SOFTWARE_PATH/ versions/ host_injections} as your EasyBuild prefix"
167+ echo " - To use these modules:"
168+ echo " module use ${EESSI_SOFTWARE_PATH/ versions/ host_injections} /modules/all/"
169+ fi
170+ break
171+ fi
172+ done
173+
174+ echo " Tried to install 5 different generations of compat libraries and none worked,"
175+ echo " this usually means your driver is very out of date!"
176+ exit 1
0 commit comments