Skip to content

Commit 2cc5ce9

Browse files
committed
Move (conditional) installation of cuda compat libs to external script
Only install cuda compat libs when either they are not installed yet or they are outdated
1 parent 7d6af69 commit 2cc5ce9

File tree

2 files changed

+87
-60
lines changed

2 files changed

+87
-60
lines changed

gpu_support/add_gpu_support.sh

Lines changed: 14 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -90,76 +90,30 @@ if [ $ret -ne 0 ]; then
9090
echo $latest_cuda_compat_url
9191
exit 1
9292
fi
93+
latest_driver_version="${latest_cuda_compat_url%-*}"
94+
latest_driver_version="${latest_driver_version##*_}"
9395

94-
# Create a general space for our NVIDIA compat drivers
96+
install_compat_libs=false
9597
host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia"
96-
if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then
97-
mkdir -p ${host_injections_dir}
98-
cd ${host_injections_dir}
98+
# libcuda.so is a symlink to the actual CUDA compat library, whose file name ends with the driver version
99+
# If this file exists, the CUDA compat libs are installed and we can compare their driver version with the latest available one
100+
if [ -e $host_injections_dir/latest/compat/libcuda.so ]; then
101+
eessi_driver_version=$( realpath $host_injections_dir/latest/compat/libcuda.so)
102+
eessi_driver_version="${eessi_driver_version##*so.}"
99103
else
100-
echo "Cannot write to eessi host_injections space, exiting now..." >&2
101-
exit 1
104+
eessi_driver_version=0
102105
fi
103106

104-
# Check if we have any version installed by checking for the existence of /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest
105-
106-
driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//)
107-
eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//)
108-
if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA compatability libraries"; fi
109-
110-
# Check if our target CUDA is satisfied by what is installed already
111-
# TODO: Find required CUDA version and see if we need an update
112-
113-
# If not, grab the latest compat library RPM or deb
114-
# download and unpack in temporary directory, easier cleanup after installation
115-
tmpdir=$(mktemp -d)
116-
cd $tmpdir
117-
compat_file=${latest_cuda_compat_url##*/}
118-
wget ${latest_cuda_compat_url}
119-
120-
# Unpack it
121-
# (the requirements here are OS dependent, can we get around that?)
122-
# (for rpms looks like we can use https://gitweb.gentoo.org/repo/proj/prefix.git/tree/eclass/rpm.eclass?id=d7fc8cf65c536224bace1d22c0cd85a526490a1e)
123-
# (deb files can be unpacked with ar and tar)
124-
file_extension=${compat_file##*.}
125-
if [[ ${file_extension} == "rpm" ]]; then
126-
rpm2cpio ${compat_file} | cpio -idmv
127-
elif [[ ${file_extension} == "deb" ]]; then
128-
ar x ${compat_file}
129-
tar xf data.tar.*
107+
# Compare the dotted driver versions with a version-aware sort. Stripping the dots
# and comparing the results as integers misorders versions whose components differ
# in length, e.g. 510.47.03 (5104703) would count as older than 470.103.01 (47010301).
newest_driver_version=$(printf '%s\n' "${eessi_driver_version}" "${latest_driver_version}" | sort -V | tail -n1)
if [ "${latest_driver_version}" != "${eessi_driver_version}" ] && [ "${newest_driver_version}" = "${latest_driver_version}" ]; then
108+
install_compat_libs=true
130109
else
131-
echo "File extension of cuda compat lib not supported, exiting now..." >&2
132-
exit 1
110+
echo "CUDA compat libs are up-to-date, skip installation."
133111
fi
134-
cd $host_injections_dir
135-
# TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir
136-
mv -n ${tmpdir}/usr/local/cuda-* .
137-
rm -r ${tmpdir}
138-
139-
# Add a symlink that points to the latest version
140-
latest_cuda_dir=$(find . -maxdepth 1 -type d | grep -i cuda | sort | tail -n1)
141-
ln -sf ${latest_cuda_dir} latest
142112

143-
if [ ! -e latest ] ; then
144-
echo "Symlink to latest cuda compat lib version is broken, exiting now..."
145-
exit 1
146-
fi
147-
148-
# Create the space to host the libraries
149-
host_injection_libs_dir=/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family}
150-
mkdir -p ${host_injection_libs_dir}
151-
# Symlink in the path to the latest libraries
152-
if [ ! -d "${host_injection_libs_dir}/lib" ]; then
153-
ln -s ${host_injections_dir}/latest/compat ${host_injection_libs_dir}/lib
154-
elif [ ! "${host_injection_libs_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then
155-
echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..."
156-
echo "${host_injection_libs_dir}/lib should point to ${host_injections_dir}/latest/compat"
157-
exit 1
113+
if [ "${install_compat_libs}" == true ]; then
114+
source $(dirname "$BASH_SOURCE")/install_cuda_compatlibs.sh $latest_cuda_compat_url
158115
fi
159116

160-
# return to initial dir
161-
cd $current_dir
162-
163117
###############################################################################################
164118
###############################################################################################
165119
# Install CUDA
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#!/bin/bash
2+
3+
libs_url=$1
4+
5+
current_dir=$(dirname $(realpath $0))
6+
7+
# Create a general space for our NVIDIA compat drivers
8+
if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then
9+
mkdir -p ${host_injections_dir}
10+
cd ${host_injections_dir}
11+
else
12+
echo "Cannot write to eessi host_injections space, exiting now..." >&2
13+
exit 1
14+
fi
15+
16+
# Compare the CUDA version nvidia-smi reports with the host driver libraries against the version it reports when the compat libs in /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest are put first on LD_LIBRARY_PATH
17+
18+
driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//)
19+
eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//)
20+
# Warn when the host driver reports a newer CUDA version than the compat libs do.
# Both values are parsed from nvidia-smi output and can be empty or non-numeric
# (e.g. when nvidia-smi fails); `[ ... -gt ... ]` would then fail with
# "integer expression expected", so only compare when both are plain integers.
if [[ "$driver_cuda_version" =~ ^[0-9]+$ && "$eessi_cuda_version" =~ ^[0-9]+$ ]]; then
  if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA compatability libraries"; fi
else
  echo "Could not determine the CUDA version from nvidia-smi, skipping CUDA version comparison" >&2
fi
21+
22+
# Check if our target CUDA is satisfied by what is installed already
23+
# TODO: Find required CUDA version and see if we need an update
24+
25+
# If not, grab the latest compat library RPM or deb
26+
# download and unpack in temporary directory, easier cleanup after installation
27+
tmpdir=$(mktemp -d)
28+
cd $tmpdir
29+
compat_file=${libs_url##*/}
30+
wget ${libs_url}
31+
32+
# Unpack it
33+
# (the requirements here are OS dependent, can we get around that?)
34+
# (for rpms looks like we can use https://gitweb.gentoo.org/repo/proj/prefix.git/tree/eclass/rpm.eclass?id=d7fc8cf65c536224bace1d22c0cd85a526490a1e)
35+
# (deb files can be unpacked with ar and tar)
36+
file_extension=${compat_file##*.}
37+
if [[ ${file_extension} == "rpm" ]]; then
38+
rpm2cpio ${compat_file} | cpio -idmv
39+
elif [[ ${file_extension} == "deb" ]]; then
40+
ar x ${compat_file}
41+
tar xf data.tar.*
42+
else
43+
echo "File extension of cuda compat lib not supported, exiting now..." >&2
44+
exit 1
45+
fi
46+
cd $host_injections_dir
47+
# TODO: `mv -n` prevents error messages if the folder already exists, but could be problematic if only some files are missing in the destination dir
48+
mv -n ${tmpdir}/usr/local/cuda-* .
49+
rm -r ${tmpdir}
50+
51+
# Add a symlink that points to the latest version
52+
# Pick the directory with the highest CUDA version. A version-aware sort is needed
# because a plain lexicographic sort would rank e.g. cuda-9.2 above cuda-11.4.
latest_cuda_dir=$(find . -maxdepth 1 -type d | grep -i cuda | sort -V | tail -n1)
53+
# -n stops ln from following an existing `latest` symlink: without it, `ln -sf`
# would create the new link inside the directory `latest` currently points to
# instead of replacing `latest` itself, so an update would never take effect.
ln -sfn ${latest_cuda_dir} latest
54+
55+
if [ ! -e latest ] ; then
56+
echo "Symlink to latest cuda compat lib version is broken, exiting now..."
57+
exit 1
58+
fi
59+
60+
# Create the space to host the libraries
61+
host_injection_libs_dir=/cvmfs/pilot.eessi-hpc.org/host_injections/${EESSI_PILOT_VERSION}/compat/${os_family}/${eessi_cpu_family}
62+
mkdir -p ${host_injection_libs_dir}
63+
# Symlink in the path to the latest libraries
64+
if [ ! -d "${host_injection_libs_dir}/lib" ]; then
65+
ln -s ${host_injections_dir}/latest/compat ${host_injection_libs_dir}/lib
66+
elif [ ! "${host_injection_libs_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then
67+
echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..."
68+
echo "${host_injection_libs_dir}/lib should point to ${host_injections_dir}/latest/compat"
69+
exit 1
70+
fi
71+
72+
# return to initial dir
73+
cd $current_dir

0 commit comments

Comments
 (0)