Changes from all commits (50 commits)
065efd1
Add scripts to support CUDA
huebner-m Apr 14, 2022
caa43bf
Fix dump location of check whether CUDA module is installed
huebner-m May 12, 2022
d7212a0
Remove setup script, use shipped init script to set env vars etc. ins…
huebner-m May 12, 2022
48f4455
Check return values and path existence in CUDA tests
huebner-m May 13, 2022
c50daa2
Check return value of eb install, improve source of other scripts
huebner-m May 13, 2022
d4e85cc
Use mktemp to create temporary directory to install compat libs
huebner-m May 13, 2022
590e042
Fix echo
huebner-m May 13, 2022
7b9bb49
Replace explicit dir names with variables, check symlink destination
huebner-m May 16, 2022
01844c6
Install CUDA in modified version of EESSI_SOFTWARE_PATH
huebner-m May 16, 2022
0e8861f
If CUDA install dir exists, add it to EESSI_MODULE_PATH
huebner-m May 16, 2022
7d6af69
Use env var to check for GPU support and add this to module path
huebner-m May 16, 2022
2cc5ce9
Move (conditional) installation of cuda compat libs to external script
huebner-m May 18, 2022
d53e80e
Consistently use EESSI_SITE_MODULEPATH to set up GPU support for Lmod
huebner-m May 18, 2022
850c20e
Rename script to add (NVIDIA) GPU support, add dummy script for AMD GPUs
huebner-m May 19, 2022
9b2e72f
Add shebang
huebner-m May 19, 2022
5f82658
Add option to disable checks, enables installation on nodes w/o GPUs
huebner-m May 19, 2022
16e87af
Allow using an environment variable to skip GPU checks
huebner-m May 19, 2022
cf65a37
Update list of CUDA enabled toolchains
huebner-m May 19, 2022
7319db2
Tell users to use the updated path to enable CUDA support
huebner-m May 19, 2022
6537725
Add protection against warning if CUDA is not installed on host
huebner-m May 20, 2022
2ba47e4
Add README for GPU support
huebner-m Jun 2, 2022
ac268b1
Iterate over compat libs until we find something that works
Jun 3, 2022
bb5301b
Don't use source when we don't need to
Jun 3, 2022
75ce850
Merge pull request #1 from ocaisa/add_gpu_support
huebner-m Jun 8, 2022
17b7662
Small adjustments to make things work on Debian10, remove debug state…
huebner-m Jun 8, 2022
03b01f1
Make installed CUDA version configurable via env var with a default
huebner-m Jun 8, 2022
dadb170
Use generic latest symlink when sourcing init/bash instead specific v…
huebner-m Jun 8, 2022
0f5884f
Implement suggested changes (don't source when not needed, update REA…
huebner-m Jun 13, 2022
5f2c1f6
Add exit code and more detailed message when installing without GPUs
huebner-m Jun 17, 2022
efe5f88
Merge branch 'main' of github.com:EESSI/software-layer into add_gpu_s…
huebner-m Jul 19, 2022
ab95873
Update error message when nvidia-smi returns an error code
huebner-m Jul 19, 2022
63fded6
Convert OS version for Ubuntu systems when getting CUDA compat libs
huebner-m Jul 25, 2022
d3cadb5
Use rpm files for all OSes and p7zip to unpack them
huebner-m Jul 27, 2022
81e4135
Rename driver_version to driver_major_version
huebner-m Jul 27, 2022
ec9dd69
Prepare shipping CUDA module file with EESSI
huebner-m Jul 29, 2022
0c1004d
Remove loading of CUDA specific module locations
huebner-m Sep 7, 2022
7a9827b
Check for full CUDA software path (incl. version) when loading the mo…
huebner-m Sep 7, 2022
6e86649
Refine install of p7zip, keep it until software layer provides it
huebner-m Sep 7, 2022
e38391b
Prepend lmod module path only if the dir actually exists
huebner-m Sep 13, 2022
b2a4865
Make printout of CUDA installation more accurate
huebner-m Sep 13, 2022
fb73d12
Only install CUDA module in tmpdir if it's already shipped in EESSI
huebner-m Sep 13, 2022
8c8a227
Load correct module env as long as p7zip is not part of software layer
huebner-m Sep 13, 2022
f90dd66
Load CUDA version specified to for installation when testing
huebner-m Sep 13, 2022
a24e09c
Load GCCcore module when building test executable for CUDA
huebner-m Sep 13, 2022
3d0ebad
Add EasyBuild configuration for p7zip installation
huebner-m Sep 13, 2022
fe1843f
Ship whitelisted CUDA libs and rework scripts accordingly
huebner-m Sep 27, 2022
70e5dec
If EULA file exists, CUDA is inst. in host_injections + some fixes
huebner-m Sep 29, 2022
1075d0b
Add check if CUDA compat lib version is sufficient for module
huebner-m Sep 29, 2022
d65fe30
Pass CUDA version from eb hook to compat lib script + fix test dir rm
huebner-m Sep 30, 2022
2ac2671
Update documentation and merge both Lmod load hooks
huebner-m Oct 20, 2022
6 changes: 6 additions & 0 deletions EESSI-pilot-install-software.sh
@@ -160,6 +160,12 @@ fail_msg="Installation of ${GCC_EC} failed!"
$EB ${GCC_EC} --robot --from-pr 14453 GCCcore-9.3.0.eb
check_exit_code $? "${ok_msg}" "${fail_msg}"

# install CUDA
ok_msg="CUDA installed, off to a good (?) start!"
fail_msg="Failed to install CUDA, woopsie..."
$EB CUDA-11.3.1.eb --robot
check_exit_code $? "${ok_msg}" "${fail_msg}"

# install CMake with custom easyblock that patches CMake when --sysroot is used
echo ">> Install CMake with fixed easyblock to take into account --sysroot"
ok_msg="CMake installed!"
78 changes: 77 additions & 1 deletion eb_hooks.py
@@ -7,7 +7,7 @@
from easybuild.tools.systemtools import AARCH64, POWER, get_cpu_architecture

EESSI_RPATH_OVERRIDE_ATTR = 'orig_rpath_override_dirs'

CUDA_ENABLED_TOOLCHAINS = ["fosscuda", "gcccuda", "gimpic", "giolfc", "gmklc", "golfc", "gomklc", "gompic", "goolfc", "iccifortcuda", "iimklc", "iimpic", "intelcuda", "iomklc", "iompic", "nvompic", "nvpsmpic"]

def get_eessi_envvar(eessi_envvar):
"""Get an EESSI environment variable from the environment"""
@@ -41,13 +41,38 @@ def get_rpath_override_dirs(software_name):

    return rpath_injection_dirs

def inject_gpu_property(ec):
    ec_dict = ec.asdict()
    # Check if CUDA is in the dependencies, if so add the GPU Lmod tag
    if (
        "CUDA" in [dep[0] for dep in iter(ec_dict["dependencies"])]
        or ec_dict["toolchain"]["name"] in CUDA_ENABLED_TOOLCHAINS
    ):
        ec.log.info("[parse hook] Injecting gpu as Lmod arch property and envvar with CUDA version")
        key = "modluafooter"
        value = 'add_property("arch","gpu")'
Comment (Member):
I think gpu is a recognised property in Lmod so a good choice for now. Once we add AMD support it will get more complicated.

Comment (Contributor Author):
We can add a new property by extending the property table propT. To do so, we could add a file init/lmodrc.lua with a new property. This file can be loaded using the env var $LMOD_RC. Unfortunately, we do not seem to be able to add entries to arch but rather have to add a new property (or find a way to extend arch that I'm missing).
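To make that concrete, a minimal, untested sketch of such an `init/lmodrc.lua` (the `gpu_vendor` property name and its values are hypothetical, not part of this PR):

```
cat > init/lmodrc.lua << 'EOF'
-- hypothetical new entry for Lmod's property table propT
propT = {
    gpu_vendor = {
        validT = { nvidia = 1, amd = 1 },
        displayT = {
            nvidia = { short = "(gpu:nvidia)", long = "gpu:nvidia", doc = "built for NVIDIA GPUs", },
            amd    = { short = "(gpu:amd)",    long = "gpu:amd",    doc = "built for AMD GPUs", },
        },
    },
}
EOF
# tell Lmod to pick up the extra property table
export LMOD_RC="${PWD}/init/lmodrc.lua"
```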

        cuda_version = 0
        for dep in list(ec_dict["dependencies"]):
            if "CUDA" in dep[0]:
                cuda_version = dep[1]
                # move CUDA from the runtime dependencies to the build dependencies
                ec_dict["dependencies"].remove(dep)
                if dep not in ec_dict["builddependencies"]:
                    ec_dict["builddependencies"].append(dep)
        value = "\n".join([value, 'setenv("EESSICUDAVERSION","%s")' % cuda_version])
        if key in ec_dict:
            if value not in ec_dict[key]:
                ec[key] = "\n".join([ec_dict[key], value])
        else:
            ec[key] = value
    return ec

def parse_hook(ec, *args, **kwargs):
    """Main parse hook: trigger custom functions based on software name."""

    # determine path to Prefix installation in compat layer via $EPREFIX
    eprefix = get_eessi_envvar('EPREFIX')

    ec = inject_gpu_property(ec)

    if ec.name in PARSE_HOOKS:
        PARSE_HOOKS[ec.name](ec, eprefix)

@@ -103,6 +128,57 @@ def cgal_toolchainopts_precise(ec, eprefix):
        raise EasyBuildError("CGAL-specific hook triggered for non-CGAL easyconfig?!")


def post_package_hook(self, *args, **kwargs):
    """Delete CUDA files we are not allowed to ship and replace them with a symlink to a possible installation under host_injections."""
    if self.name == 'CUDA':
        cuda_version = self.installdir.split('/')[-1]
        print_msg("Attempt to get compat libs for CUDA version: %s" % (cuda_version))
        # install compat libraries and run test
        # if the test works, move it to EESSI_SOFTWARE_PATH so we can ship the compiled test
        os.system("export INSTALL_CUDA_VERSION=%s && export SAVE_COMPILED_TEST=true && ./gpu_support/add_nvidia_gpu_support.sh" % (cuda_version))
        print_msg("Replacing CUDA files we cannot ship with symlinks...")
        # read CUDA EULA
        eula_path = os.path.join(self.installdir, 'EULA.txt')
        tmp_buffer = []
        with open(eula_path) as infile:
            copy = False
            for line in infile:
                if line.strip() == '2.6. Attachment A':
                    copy = True
                    continue
                elif line.strip() == '2.7. Attachment B':
                    copy = False
                    continue
                elif copy:
                    tmp_buffer.append(line)
        # create whitelist without file extensions; they are not really needed and only complicate things
        whitelist = []
        file_extensions = ['.so', '.a', '.h', '.bc']
        for tmp in tmp_buffer:
            for word in tmp.split():
                if any(ext in word for ext in file_extensions):
                    whitelist.append(word.split('.')[0])
        # add compiled test to whitelist so we can ship it with EESSI
        whitelist.append('deviceQuery')
        whitelist = list(set(whitelist))
        # iterate over all files in the CUDA path
        for root, dirs, files in os.walk(self.installdir):
            for filename in files:
                # we only really care about real files, i.e. not symlinks
                if not os.path.islink(os.path.join(root, filename)):
                    # check if the current file is part of the whitelist
                    basename = filename.split('.')[0]
                    if basename not in whitelist:
                        # if it is not in the whitelist, delete the file and create a symlink to host_injections
                        source = os.path.join(root, filename)
                        target = source.replace('versions', 'host_injections')
                        os.remove(source)
                        # have to create subdirs if they don't exist, otherwise the symlink creation fails
                        if not os.path.isdir(os.path.dirname(target)):
                            os.makedirs(os.path.dirname(target))
                        os.symlink(target, source)


def fontconfig_add_fonts(ec, eprefix):
    """Inject --with-add-fonts configure option for fontconfig."""
    if ec.name == 'fontconfig':
27 changes: 27 additions & 0 deletions gpu_support/README.md
@@ -0,0 +1,27 @@
# How to add GPU support
The collection of scripts in this directory enables you to add GPU support to your setup.
Currently this means CUDA support for Nvidia GPUs; AMD GPUs are not yet supported (feel free to contribute that though!).
To enable use of the CUDA runtime in your setup, simply run the following script:
```
./add_nvidia_gpu_support.sh
```
This script installs the compatibility libraries (and, by default, only those!) that you need to use the CUDA runtime shipped with EESSI.

If you plan on using the full CUDA suite, i.e. if you want to load the CUDA module, you will have to modify the script execution as follows:
```
export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh
```
This installs the needed compatibility libraries as well as the full CUDA suite.

If you need a different CUDA version than what is shipped with EESSI, you can also specify that particular version for the script:
```
export INSTALL_CUDA_VERSION=xx.y.z && export INSTALL_CUDA=true && ./add_nvidia_gpu_support.sh
```
Please note, however, that CUDA versions whose runtime is not shipped with EESSI are not installed in the default modules path.
Thus, you will have to add the following to your module search path to get access to your custom CUDA version:
```
module use ${EESSI_SOFTWARE_PATH/versions/host_injections}/modules/all/
```
## Prerequisites and tips
* You need write permissions to `/cvmfs/pilot.eessi-hpc.org/host_injections` (which by default is a symlink to `/opt/eessi`, but can be configured in your CVMFS config file to point somewhere else). If you would like to make a system-wide installation, you should change this in your configuration to point somewhere on a shared filesystem; see the sketch after this list.
* If you want to install CUDA on a node without GPUs (e.g. on a login node where you want to be able to compile your CUDA-enabled code), you should `export INSTALL_WO_GPU=true` in order to skip checks and tests that can only succeed if you have access to a GPU. This approach is not recommended as there is a chance the CUDA compatibility library installed is not compatible with the existing CUDA driver on GPU nodes (and this will not be detected).
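For the first bullet, a sketch of the corresponding CVMFS client configuration change (the `EESSI_HOST_INJECTIONS` variable follows the EESSI documentation; the exact file location is an assumption):
```
# /etc/cvmfs/domain.d/eessi-hpc.org.local (assumed location)
# point the host_injections variant symlink at a shared filesystem
# so a single CUDA installation is visible on all nodes
EESSI_HOST_INJECTIONS=/shared_fs/eessi
```
After changing the configuration, run `sudo cvmfs_config reload` on the clients.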
14 changes: 14 additions & 0 deletions gpu_support/add_amd_gpu_support.sh
@@ -0,0 +1,14 @@
#!/bin/bash

cat << EOF
This is not implemented yet :(

If you would like to contribute this support there are a few things you will
need to consider:
- We will need to change the Lmod property added to GPU software so we can
distinguish AMD and Nvidia GPUs
- Support should be implemented in user space, if this is not possible (e.g.,
requires a driver update) you need to tell the user what to do
- Support needs to be _verified_ and a trigger put in place (like the existence
of a particular path) so we can tell Lmod to display the associated modules
EOF
75 changes: 75 additions & 0 deletions gpu_support/add_nvidia_gpu_support.sh
@@ -0,0 +1,75 @@
#!/bin/bash

# Drop into the prefix shell or pipe this script into a Prefix shell with
# $EPREFIX/startprefix <<< /path/to/this_script.sh

install_cuda="${INSTALL_CUDA:=false}"
install_cuda_version="${INSTALL_CUDA_VERSION:=11.3.1}"
install_p7zip_version="${INSTALL_P7ZIP_VERSION:=17.04-GCCcore-10.3.0}"

# If you want to install CUDA support on login nodes (typically without GPUs),
# set this variable to true. This will skip all GPU-dependent checks
install_wo_gpu=false
[ "$INSTALL_WO_GPU" = true ] && install_wo_gpu=true

# verify existence of nvidia-smi or this is a waste of time
# Check if nvidia-smi exists and can be executed without error
if [[ "${install_wo_gpu}" != "true" ]]; then
if command -v nvidia-smi > /dev/null 2>&1; then
nvidia-smi > /dev/null 2>&1
if [ $? -ne 0 ]; then
echo "nvidia-smi was found but returned error code, exiting now..." >&2
echo "If you do not have a GPU on this device but wish to force the installation,"
echo "please set the environment variable INSTALL_WO_GPU=true"
exit 1
fi
echo "nvidia-smi found, continue setup."
else
echo "nvidia-smi not found, exiting now..." >&2
echo "If you do not have a GPU on this device but wish to force the installation,"
echo "please set the environment variable INSTALL_WO_GPU=true"
exit 1
fi
else
echo "You requested to install CUDA without GPUs present."
echo "This means that all GPU-dependent tests/checks will be skipped!"
fi

EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/latest/init/bash

##############################################################################################
# Check that the CUDA driver version is adequate
# (needs to be r450 or r470, which are LTS; other production branches are acceptable but not
# recommended, and below r450 is not compatible [with an exception we will not explore, see
# https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers])
# only check a single GPU's driver version in case of multiple GPUs
if [[ "${install_wo_gpu}" != "true" ]]; then
driver_major_version=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | tail -n1)
driver_major_version="${driver_major_version%%.*}"
# Now check driver_major_version for compatibility
# Check driver is at least LTS driver R450, see https://docs.nvidia.com/datacenter/tesla/drivers/#cuda-drivers
if (( $driver_major_version < 450 )); then
echo "Your NVIDIA driver version is too old, please update it first..."
exit 1
fi
fi

###############################################################################################
# Install CUDA
cuda_install_dir="${EESSI_SOFTWARE_PATH/versions/host_injections}"
mkdir -p ${cuda_install_dir}
if [ "${install_cuda}" != false ]; then
bash $(dirname "$BASH_SOURCE")/cuda_utils/install_cuda.sh ${install_cuda_version} ${cuda_install_dir}
fi
Comment on lines +59 to +65
Comment (Member):
Let's break this into separate script (and PR) since it will be needed by #212

You also need to check the exit code on the creation of cuda_install_dir since this may fail
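A minimal sketch of that check, assuming the `check_exit_code` helper from the repository's `utils.sh` (as used in `EESSI-pilot-install-software.sh` above) is sourced and exits on a non-zero code:
```
mkdir -p ${cuda_install_dir}
check_exit_code $? "Created ${cuda_install_dir}" "Cannot create ${cuda_install_dir}, exiting now..."
```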

###############################################################################################
# Prepare installation of CUDA compat libraries, i.e. install p7zip if it is missing
Comment (Member):
You can drop stuff related to p7zip because of #212 (and that also means we can drop prepare_cuda_compatlibs.sh entirely)

$(dirname "$BASH_SOURCE")/cuda_utils/prepare_cuda_compatlibs.sh ${install_p7zip_version} ${cuda_install_dir}
###############################################################################################
# Try installing five different versions of CUDA compat libraries until the test works.
# Otherwise, give up
bash $(dirname "$BASH_SOURCE")/cuda_utils/install_cuda_compatlibs_loop.sh ${cuda_install_dir} ${install_cuda_version}
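The loop script itself is not part of this diff; a rough sketch of the idea, assuming `get_cuda_compatlibs.sh` (below) prints candidate URLs in descending version order, and using a hypothetical `test_cuda.sh` that returns zero on success:
```
script_dir=$(dirname "$BASH_SOURCE")
for libs_url in $(${script_dir}/cuda_utils/get_cuda_compatlibs.sh | head -n 5); do
  bash ${script_dir}/cuda_utils/install_cuda_compatlibs.sh ${libs_url} ${cuda_install_dir}
  # stop at the first compat lib version for which the test passes
  if bash ${script_dir}/cuda_utils/test_cuda.sh ${install_cuda_version}; then
    exit 0
  fi
done
echo "Tried five compat library versions without success, giving up..." >&2
exit 1
```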

cuda_version_file="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/version.txt"
Comment (Member):
Suggested change:
- cuda_version_file="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/version.txt"
+ cuda_version_file="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/eessi_compat_version.txt"

I also think that this creation should be part of install_cuda_compatlibs_loop.sh and we should put the supported CUDA version in there according to the compat libs, not the version we need (will help us to avoid unnecessary updates in the future).
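A sketch of that promotion step at the end of `install_cuda_compatlibs_loop.sh` (assuming the `.tmp` staging suggested further down this page):
```
# promote the staged version file once the compat libs have been verified
version_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest"
mv ${version_dir}/eessi_compat_version.txt.tmp ${version_dir}/eessi_compat_version.txt
```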

echo ${install_cuda_version} > ${cuda_version_file}
19 changes: 19 additions & 0 deletions gpu_support/cuda_utils/get_cuda_compatlibs.sh
@@ -0,0 +1,19 @@
#!/bin/bash

# Get arch type from EESSI environment
if [[ -z "${EESSI_CPU_FAMILY}" ]]; then
# set up basic environment variables, EasyBuild and Lmod
EESSI_SILENT=1 source /cvmfs/pilot.eessi-hpc.org/latest/init/bash
fi
eessi_cpu_family="${EESSI_CPU_FAMILY:-x86_64}"

# build URL for CUDA libraries
# take rpm file for compat libs from rhel8 folder, deb and rpm files contain the same libraries
cuda_url="https://developer.download.nvidia.com/compute/cuda/repos/rhel8/"${eessi_cpu_family}"/"
# get all versions in descending order
files=$(curl -s "${cuda_url}" | grep 'cuda-compat' | sed 's/<\/\?[^>]\+>//g' | xargs -n1 | /cvmfs/pilot.eessi-hpc.org/latest/compat/linux/${eessi_cpu_family}/bin/sort -r --version-sort )
if [[ -z "${files// }" ]]; then
echo "Could not find any compat lib files under" ${cuda_url}
exit 1
fi
for file in $files; do echo "${cuda_url}$file"; done
39 changes: 39 additions & 0 deletions gpu_support/cuda_utils/install_cuda.sh
@@ -0,0 +1,39 @@
#!/bin/bash

install_cuda_version=$1
cuda_install_dir=$2
Comment (Member):
General CUDA installation is done via #212 now so I don't think you need this argument. This script is only about installing the CUDA package under host_injections...but changing the name of the script to reflect that is probably a good idea.


# TODO: Can we do a trimmed install?
Comment (Member):
This is done now via your hook

# Only install CUDA if specified version is not found.
# This is only relevant for users, the shipped CUDA installation will
# always be in versions instead of host_injections and have symlinks pointing
# to host_injections for everything we're not allowed to ship
if [ -f ${cuda_install_dir}/software/CUDA/${install_cuda_version}/EULA.txt ]; then
Comment (Member):
The if/else is still good, except we should be checking under the host_injections path. This will allow us to skip any check on available space etc.

Comment (Member):
You should construct cuda_install_dir rather than take it as an argument

Comment (Member):
Also, we should allow for a forced installation to override this check

Comment (@ocaisa, Dec 18, 2022):
I'd prefer that we ship the EULA text, so I think we should check for an expected broken symlink here:

if [[ -L "${cuda_install_dir}/software/CUDA/bin/nvcc" && -e "${cuda_install_dir}/software/CUDA/bin/nvcc" ]]; then

echo "CUDA software found! No need to install CUDA again, proceeding with tests"
else
# - as an installation location just use $EESSI_SOFTWARE_PATH but replacing `versions` with `host_injections`
# (CUDA is a binary installation so no need to worry too much about this)
# The install is pretty fat, you need lots of space for download/unpack/install (~3*5GB), need to do a space check before we proceed
avail_space=$(df --output=avail ${cuda_install_dir}/ | tail -n 1 | awk '{print $1}')
if (( ${avail_space} < 16000000 )); then
echo "Need more disk space to install CUDA, exiting now..."
exit 1
Comment on lines +17 to +20
Comment (Member):
This is a tricky one, we need space for sources, space for the build, space for the install but people can choose where to put all these. I guess we leave it as is for now, but allow people to set an envvar to override this check (and tell them that envvar in the error message)
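A sketch of that suggestion, using a hypothetical `EESSI_IGNORE_SPACE_CHECK` variable:
```
if (( ${avail_space} < 16000000 )) && [ "${EESSI_IGNORE_SPACE_CHECK:-false}" != "true" ]; then
  echo "Need about 16GB free under ${cuda_install_dir} to install CUDA, exiting now..." >&2
  echo "Set EESSI_IGNORE_SPACE_CHECK=true to skip this check (e.g. if sources/build live elsewhere)."
  exit 1
fi
```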

fi
# install cuda in host_injections
module load EasyBuild
# we need the --rebuild option and a random dir for the module if the module file is shipped with EESSI
if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If this script is standalone, we'll need to guarantee EESSI_SOFTWARE_PATH is defined.

tmpdir=$(mktemp -d)
extra_args="--rebuild --installpath-modules=${tmpdir}"
fi
eb ${extra_args} --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb
ret=$?
Comment (Member):
Let's import the bash functions defined in utils.sh and use them throughout (where appropriate)
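For example (a sketch; the relative path to `utils.sh` is an assumption):
```
source $(dirname "$BASH_SOURCE")/../../utils.sh
eb ${extra_args} --installpath=${cuda_install_dir}/ CUDA-${install_cuda_version}.eb
check_exit_code $? "CUDA installed under ${cuda_install_dir}" "CUDA installation failed, please check EasyBuild logs..."
```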

if [ $ret -ne 0 ]; then
echo "CUDA installation failed, please check EasyBuild logs..."
exit 1
fi
# clean up tmpdir if it exists
if [ -f ${EESSI_SOFTWARE_PATH}/modules/all/CUDA/${install_cuda_version}.lua ]; then
rm -rf ${tmpdir}
fi
fi
92 changes: 92 additions & 0 deletions gpu_support/cuda_utils/install_cuda_compatlibs.sh
@@ -0,0 +1,92 @@
#!/bin/bash
Comment (Member):
Shouldn't this be a compat layer bash?

Comment (Member):
It's fine as is, as long as the first thing we do is source the EESSI environment


libs_url=$1
cuda_install_dir=$2

current_dir=$(dirname $(realpath $0))
host_injections_dir="/cvmfs/pilot.eessi-hpc.org/host_injections/nvidia"
host_injection_linker_dir=${EESSI_EPREFIX/versions/host_injections}
Comment (Member):
The assumption here is that the EESSI environment has already been sourced


# Create a general space for our NVIDIA compat drivers
if [ -w /cvmfs/pilot.eessi-hpc.org/host_injections ]; then
mkdir -p ${host_injections_dir}
else
echo "Cannot write to eessi host_injections space, exiting now..." >&2
Comment (Member):
Let's start using utils.sh here

exit 1
fi
cd ${host_injections_dir}

# Check if our target CUDA is satisfied by what is installed already
Comment (Member):
Do we know what our target CUDA version is at this point? And if the nvidia-smi result is good enough, what then? I guess we should check that this version comes from an installation of the compat libs, otherwise we still need to install compat libs.

Comment (Member):
If the supported CUDA version is new enough and comes from an EESSI installation of the CUDA compat libs, we can already exit gracefully.

Comment (Member):
We should leverage the contents of /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/eessi_compat_version.txt here
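A sketch of such a graceful exit, assuming the target version is available as `${install_cuda_version}` and the version file proposed above holds a plain version string:
```
compat_version_file="${host_injections_dir}/latest/eessi_compat_version.txt"
if [ -f ${compat_version_file} ]; then
  installed_version=$(cat ${compat_version_file})
  # highest of target and installed version; if that is the installed one, we are done
  newest=$(printf '%s\n' "${install_cuda_version}" "${installed_version}" | sort -V | tail -n 1)
  if [ "${newest}" = "${installed_version}" ]; then
    echo "EESSI-installed compat libs already support CUDA ${installed_version}, nothing to do."
    exit 0
  fi
fi
```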

# TODO: Find required CUDA version and see if we need an update
driver_cuda_version=$(nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//)
eessi_cuda_version=$(LD_LIBRARY_PATH=${host_injections_dir}/latest/compat/:$LD_LIBRARY_PATH nvidia-smi -q --display=COMPUTE | grep CUDA | awk 'NF>1{print $NF}' | sed s/\\.//)
if [[ $driver_cuda_version =~ ^[0-9]+$ ]]; then
if [ "$driver_cuda_version" -gt "$eessi_cuda_version" ]; then echo "You need to update your CUDA compatability libraries"; fi
fi

# If not, grab the latest compat library RPM or deb
# Download and unpack in temporary directory, easier cleanup after installation
tmpdir=$(mktemp -d)
cd $tmpdir
compat_file=${libs_url##*/}
wget ${libs_url}
echo $compat_file

# Unpack it
# rpm files are the default for all OSes
# Keep support for deb files in case it is needed in the future
file_extension=${compat_file##*.}
if [[ ${file_extension} == "rpm" ]]; then
# p7zip is installed under host_injections for now, make that known to the environment
if [ -d ${cuda_install_dir}/modules/all ]; then
module use ${cuda_install_dir}/modules/all/
fi
Comment on lines +40 to +43
Comment (Member):
You can drop this

# Load p7zip to extract files from rpm file
module load p7zip
Comment (Member):
The assumption here is that the EESSI environment has already been sourced

# Extract .cpio
7z x ${compat_file}
# Extract lib*
7z x ${compat_file/rpm/cpio}
Comment on lines +47 to +49
Comment (Member):
We should pipe the output of these to /dev/null (or perhaps adopt the behaviour of EESSI_SILENT). We should also be checking the exit codes (there's some bash function out there for that).
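A sketch of that, again leaning on `check_exit_code` from `utils.sh` (assumed to be sourced):
```
7z x ${compat_file} > /dev/null 2>&1
check_exit_code $? "Extracted ${compat_file}" "Failed to extract ${compat_file}, exiting now..."
7z x ${compat_file/rpm/cpio} > /dev/null 2>&1
check_exit_code $? "Extracted cpio archive" "Failed to extract cpio archive, exiting now..."
```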

# Restore symlinks
# (the cpio extraction turns symlinks into plain text files whose content is
# the link target, so read each file and recreate the symlink it describes)
cd usr/local/cuda-*/compat
ls *.so *.so.? | xargs -i -I % sh -c '{ echo -n ln -sf" "; cat %; echo " "%; }'| xargs -i sh -c "{}"
cd -
elif [[ ${file_extension} == "deb" ]]; then
ar x ${compat_file}
tar xf data.tar.*
else
echo "File extension of cuda compat lib not supported, exiting now..." >&2
exit 1
fi
cd $host_injections_dir
cuda_dir=$(basename ${tmpdir}/usr/local/cuda-*)
# TODO: This would prevent error messages if folder already exists, but could be problematic if only some files are missing in destination dir
rm -rf ${cuda_dir}
mv -n ${tmpdir}/usr/local/cuda-* .
rm -r ${tmpdir}

# Add a symlink that points the latest version to the version we just installed
ln -sfn ${cuda_dir} latest

if [ ! -e latest ] ; then
echo "Symlink to latest cuda compat lib version is broken, exiting now..."
exit 1
fi

# Create the space to host the libraries
mkdir -p ${host_injection_linker_dir}
Comment (@ocaisa, Dec 16, 2022):
We should always check exit codes on our commands, seems like a function that does that for us is needed

# Symlink in the path to the latest libraries
if [ ! -d "${host_injection_linker_dir}/lib" ]; then
ln -s ${host_injections_dir}/latest/compat ${host_injection_linker_dir}/lib
elif [ ! "${host_injection_linker_dir}/lib" -ef "${host_injections_dir}/latest/compat" ]; then
echo "CUDA compat libs symlink exists but points to the wrong location, please fix this..."
echo "${host_injection_linker_dir}/lib should point to ${host_injections_dir}/latest/compat"
exit 1
fi

# return to initial dir
cd $current_dir

echo
echo "CUDA driver compatibility libraries installed for CUDA version:"
echo "${cuda_dir/cuda-/}"
Comment (Member):
I'd drop the CUDA version supported into /cvmfs/pilot.eessi-hpc.org/host_injections/nvidia/latest/eessi_compat_version.txt.tmp in a way that makes it easy to compare to the target CUDA version

If we verify the installation in install_cuda_compatlibs_loop.sh, then we can remove the .tmp extension

Comment (Member):
We could also use the .tmp files to check whether we need to bother installing a certain compat lib or not.
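A sketch of that staging step at the end of this script (the verified rename would then happen in `install_cuda_compatlibs_loop.sh`, as sketched earlier):
```
# stage the CUDA version these compat libs support; the loop script renames
# this to eessi_compat_version.txt once the installation has been verified
echo ${cuda_dir/cuda-/} > ${host_injections_dir}/latest/eessi_compat_version.txt.tmp
```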
