Skip to content

Commit 22e87e7

Browse files
authored
Merge pull request #306 from trz42/split_lmod_hooks
Split Lmod hooks into a separate SitePackage.lua; merged despite a failing CI workflow.
2 parents 5325fda + f331263 commit 22e87e7

File tree

4 files changed

+169
-111
lines changed

4 files changed

+169
-111
lines changed

EESSI-install-software.sh

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -248,11 +248,21 @@ fi
248248
### add packages here
249249

250250
echo ">> Creating/updating Lmod cache..."
251-
export LMOD_RC="${EASYBUILD_INSTALLPATH}/.lmod/lmodrc.lua"
251+
export LMOD_CONFIG_DIR="${EASYBUILD_INSTALLPATH}/.lmod"
252+
lmod_rc_file="$LMOD_CONFIG_DIR/lmodrc.lua"
252253
lmodrc_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodrc.py$' > /dev/null; echo $?)
253-
if [ ! -f $LMOD_RC ] || [ ${lmodrc_changed} == '0' ]; then
254+
if [ ! -f $lmod_rc_file ] || [ ${lmodrc_changed} == '0' ]; then
254255
python3 $TOPDIR/create_lmodrc.py ${EASYBUILD_INSTALLPATH}
255-
check_exit_code $? "$LMOD_RC created" "Failed to create $LMOD_RC"
256+
check_exit_code $? "$lmod_rc_file created" "Failed to create $lmod_rc_file"
257+
fi
258+
259+
echo ">> Creating/updating Lmod SitePackage.lua ..."
260+
export LMOD_PACKAGE_PATH="${EASYBUILD_INSTALLPATH}/.lmod"
261+
lmod_sitepackage_file="$LMOD_PACKAGE_PATH/SitePackage.lua"
262+
sitepackage_changed=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^create_lmodsitepackage.py$' > /dev/null; echo $?)
263+
if [ ! -f "$lmod_sitepackage_file" ] || [ "${sitepackage_changed}" == '0' ]; then
264+
python3 $TOPDIR/create_lmodsitepackage.py ${EASYBUILD_INSTALLPATH}
265+
check_exit_code $? "$lmod_sitepackage_file created" "Failed to create $lmod_sitepackage_file"
256266
fi
257267

258268
$TOPDIR/update_lmod_cache.sh ${EPREFIX} ${EASYBUILD_INSTALLPATH}

create_lmodrc.py

Lines changed: 0 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -17,113 +17,6 @@
1717
}
1818
"""
1919

20-
GPU_LMOD_RC ="""require("strict")
21-
local hook = require("Hook")
22-
local open = io.open
23-
24-
local function read_file(path)
25-
local file = open(path, "rb") -- r read mode and b binary mode
26-
if not file then return nil end
27-
local content = file:read "*a" -- *a or *all reads the whole file
28-
file:close()
29-
return content
30-
end
31-
32-
local function eessi_cuda_enabled_load_hook(t)
33-
local frameStk = require("FrameStk"):singleton()
34-
local mt = frameStk:mt()
35-
local simpleName = string.match(t.modFullName, "(.-)/")
36-
-- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
37-
-- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
38-
-- to load the CUDA module and print an informative message on how to set up GPU support for NESSI
39-
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"
40-
if simpleName == 'CUDA' then
41-
-- get the full host_injections path
42-
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
43-
-- build final path where the CUDA software should be installed
44-
local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
45-
local cudaDirExists = isDir(cudaEasyBuildDir)
46-
if not cudaDirExists then
47-
local advice = "but while the module file exists, the actual software is not entirely shipped with NESSI "
48-
advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where NESSI "
49-
advice = advice .. "can find it.\\n"
50-
advice = advice .. refer_to_docs
51-
LmodError("\\nYou requested to load ", simpleName, " ", advice)
52-
end
53-
end
54-
-- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the NESSI linker,
55-
-- otherwise, refuse to load the requested module and print error message
56-
local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
57-
if haveGpu then
58-
local arch = os.getenv("EESSI_CPU_FAMILY") or ""
59-
local cudaVersionFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
60-
local cudaDriverFile = "/cvmfs/pilot.nessi.no/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
61-
local cudaDriverExists = isFile(cudaDriverFile)
62-
local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
63-
if not (cudaDriverExists or singularityCudaExists) then
64-
local advice = "which relies on the CUDA runtime environment and driver libraries. "
65-
advice = advice .. "In order to be able to use the module, you will need "
66-
advice = advice .. "to make sure NESSI can find the GPU driver libraries on your host system.\\n"
67-
advice = advice .. refer_to_docs
68-
LmodError("\\nYou requested to load ", simpleName, " ", advice)
69-
else
70-
-- CUDA driver exists, now we check its version to see if an update is needed
71-
if cudaDriverExists then
72-
local cudaVersion = read_file(cudaVersionFile)
73-
local cudaVersion_req = os.getenv("EESSICUDAVERSION")
74-
-- driver CUDA versions don't give a patch version for CUDA
75-
local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
76-
local major_req, minor_req, patch_req = string.match(cudaVersion_req, "(%d+)%.(%d+)%.(%d+)")
77-
local driver_libs_need_update = false
78-
if major < major_req then
79-
driver_libs_need_update = true
80-
elseif major == major_req then
81-
if minor < minor_req then
82-
driver_libs_need_update = true
83-
end
84-
end
85-
if driver_libs_need_update == true then
86-
local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
87-
advice = advice .. "Please update your CUDA driver libraries and then "
88-
advice = advice .. "let NESSI know about the update.\\n"
89-
advice = advice .. refer_to_docs
90-
LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
91-
end
92-
end
93-
end
94-
end
95-
end
96-
97-
local function eessi_openmpi_load_hook(t)
98-
-- disable smcuda BTL when loading OpenMPI module for aarch64/neoverse_v1,
99-
-- to work around hang/crash due to bug in OpenMPI;
100-
-- see https://gitlab.com/eessi/support/-/issues/41
101-
local frameStk = require("FrameStk"):singleton()
102-
local mt = frameStk:mt()
103-
local moduleName = string.match(t.modFullName, "(.-)/")
104-
local cpuTarget = os.getenv("EESSI_SOFTWARE_SUBDIR") or ""
105-
if (moduleName == "OpenMPI") and (cpuTarget == "aarch64/neoverse_v1") then
106-
local msg = "Adding '^smcuda' to $OMPI_MCA_btl to work around bug in OpenMPI"
107-
LmodMessage(msg .. " (see https://gitlab.com/eessi/support/-/issues/41)")
108-
local ompiMcaBtl = os.getenv("OMPI_MCA_btl")
109-
if ompiMcaBtl == nil then
110-
setenv("OMPI_MCA_btl", "^smcuda")
111-
else
112-
setenv("OMPI_MCA_btl", ompiMcaBtl .. ",^smcuda")
113-
end
114-
end
115-
end
116-
117-
-- Combine both functions into a single one, as we can only register one function as load hook in lmod
118-
-- Also: make it non-local, so it can be imported and extended by other lmodrc files if needed
119-
function eessi_load_hook(t)
120-
eessi_cuda_enabled_load_hook(t)
121-
eessi_openmpi_load_hook(t)
122-
end
123-
124-
125-
hook.register("load", eessi_load_hook)
126-
"""
12720

12821
def error(msg):
12922
sys.stderr.write("ERROR: %s\n" % msg)
@@ -143,7 +36,6 @@ def error(msg):
14336
'dot_lmod': DOT_LMOD,
14437
'prefix': prefix,
14538
}
146-
lmodrc_txt += '\n' + GPU_LMOD_RC
14739
try:
14840
os.makedirs(os.path.dirname(lmodrc_path), exist_ok=True)
14941
with open(lmodrc_path, 'w') as fp:

create_lmodsitepackage.py

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
#!/usr/bin/env python3
2+
#
3+
# Create SitePackage.lua configuration file for Lmod.
4+
#
5+
import os
6+
import sys
7+
8+
DOT_LMOD = '.lmod'
9+
10+
hook_txt ="""require("strict")
11+
local hook = require("Hook")
12+
local open = io.open
13+
14+
--- Read the complete contents of a file.
-- @param path path of the file to read
-- @return the file contents as a string, or nil when the file cannot be opened
local function read_file(path)
    local handle = open(path, "rb") -- "rb": read mode, binary
    if handle then
        local data = handle:read("*a") -- "*a" reads the whole file
        handle:close()
        return data
    end
    return nil
end
21+
22+
--- Load hook guarding the CUDA module and CUDA-enabled modules.
-- Raises LmodError when:
--   * the module being loaded is CUDA itself and the full CUDA SDK directory
--     is not present under host_injections;
--   * the module has the "arch"/"gpu" property and no libcuda.so is found in
--     host_injections or in /.singularity.d/libs;
--   * the driver CUDA version recorded in cuda_version.txt is older than the
--     version requested via $EESSICUDAVERSION.
-- NOTE: this Lua text is embedded in a regular Python string, which is why
-- newline escapes inside string literals are written with a doubled backslash.
-- @param t table handed to load hooks by Lmod; only t.modFullName is read
local function eessi_cuda_enabled_load_hook(t)
    local frameStk = require("FrameStk"):singleton()
    local mt = frameStk:mt()
    local simpleName = string.match(t.modFullName, "(.-)/")
    local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/gpu/.\\n"

    -- If we try to load CUDA itself, check if the full CUDA SDK was installed on the host in host_injections.
    -- This is required for end users to build additional CUDA software. If the full SDK isn't present, refuse
    -- to load the CUDA module and print an informative message on how to set up GPU support for EESSI
    if simpleName == 'CUDA' then
        -- get the full host_injections path (only the first result of gsub is kept)
        local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
        -- build final path where the CUDA software should be installed
        local cudaEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
        local cudaDirExists = isDir(cudaEasyBuildDir)
        if not cudaDirExists then
            local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
            advice = advice .. "due to licencing. You will need to install a full copy of the CUDA SDK where EESSI "
            advice = advice .. "can find it.\\n"
            advice = advice .. refer_to_docs
            LmodError("\\nYou requested to load ", simpleName, " ", advice)
        end
    end

    -- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
    -- otherwise, refuse to load the requested module and print error message
    local haveGpu = mt:haveProperty(simpleName, "arch", "gpu")
    if not haveGpu then
        return
    end

    local arch = os.getenv("EESSI_CPU_FAMILY") or ""
    local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
    local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
    local cudaDriverExists = isFile(cudaDriverFile)
    local singularityCudaExists = isFile("/.singularity.d/libs/libcuda.so")
    if not (cudaDriverExists or singularityCudaExists) then
        local advice = "which relies on the CUDA runtime environment and driver libraries. "
        advice = advice .. "In order to be able to use the module, you will need "
        advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system.\\n"
        advice = advice .. refer_to_docs
        LmodError("\\nYou requested to load ", simpleName, " ", advice)
        return
    end

    -- The version check only applies to the driver found in host_injections.
    if not cudaDriverExists then
        return
    end

    -- CUDA driver exists, now we check its version to see if an update is needed
    local cudaVersion = read_file(cudaVersionFile)
    local cudaVersion_req = os.getenv("EESSICUDAVERSION")
    -- Without both versions there is nothing to compare; skip the check
    -- instead of failing with a Lua error inside string.match.
    if cudaVersion == nil or cudaVersion_req == nil then
        return
    end

    -- driver CUDA versions don't give a patch version for CUDA, so only
    -- major.minor is extracted from both strings
    local major, minor = string.match(cudaVersion, "(%d+)%.(%d+)")
    local major_req, minor_req = string.match(cudaVersion_req, "(%d+)%.(%d+)")
    if not (major and major_req) then
        -- at least one version string could not be parsed; skip the check
        return
    end

    -- string.match returns strings; convert them so that the comparison is
    -- numeric (as strings, "9" < "12" is false).
    major, minor = tonumber(major), tonumber(minor)
    major_req, minor_req = tonumber(major_req), tonumber(minor_req)

    local driver_libs_need_update = major < major_req
        or (major == major_req and minor < minor_req)
    if driver_libs_need_update then
        local advice = "but the module you want to load requires CUDA " .. cudaVersion_req .. ". "
        advice = advice .. "Please update your CUDA driver libraries and then "
        advice = advice .. "let EESSI know about the update.\\n"
        advice = advice .. refer_to_docs
        LmodError("\\nYour driver CUDA version is ", cudaVersion, " ", advice)
    end
end
86+
87+
--- Load hook that disables the smcuda BTL for OpenMPI on aarch64/neoverse_v1.
-- Works around a hang/crash due to a bug in OpenMPI;
-- see https://gitlab.com/eessi/support/-/issues/41
-- @param t table handed to load hooks by Lmod; only t.modFullName is read
local function eessi_openmpi_load_hook(t)
    local frame_stack = require("FrameStk"):singleton()
    local module_table = frame_stack:mt()
    local loaded_name = string.match(t.modFullName, "(.-)/")
    local cpu_target = os.getenv("EESSI_SOFTWARE_SUBDIR") or ""

    -- Nothing to do unless this is OpenMPI on the affected CPU target.
    if loaded_name ~= "OpenMPI" or cpu_target ~= "aarch64/neoverse_v1" then
        return
    end

    local notice = "Adding '^smcuda' to $OMPI_MCA_btl to work around bug in OpenMPI"
    LmodMessage(notice .. " (see https://gitlab.com/eessi/support/-/issues/41)")

    -- Append to an existing $OMPI_MCA_btl value, otherwise start a new one.
    local btl_value = "^smcuda"
    local current_btl = os.getenv("OMPI_MCA_btl")
    if current_btl ~= nil then
        btl_value = current_btl .. ",^smcuda"
    end
    setenv("OMPI_MCA_btl", btl_value)
end
106+
107+
-- Only a single function is registered as the "load" hook below, so this
-- wrapper runs the CUDA hook followed by the OpenMPI hook.
-- It is deliberately a global (non-local) function, so that other Lua files
-- can pick it up and extend it if needed.
function eessi_load_hook(t)
    local sub_hooks = { eessi_cuda_enabled_load_hook, eessi_openmpi_load_hook }
    for _, sub_hook in ipairs(sub_hooks) do
        sub_hook(t)
    end
end


hook.register("load", eessi_load_hook)
116+
"""
117+
118+
def error(msg):
    """Print `msg` to stderr, prefixed with "ERROR: ", and exit with status 1."""
    print("ERROR: %s" % msg, file=sys.stderr)
    sys.exit(1)
121+
122+
123+
# Exactly one command-line argument is expected: the software prefix.
if len(sys.argv) != 2:
    error("Usage: %s <software prefix>" % sys.argv[0])

prefix = sys.argv[1]

# The prefix must already exist; only the .lmod subdirectory is created here.
if not os.path.exists(prefix):
    error("Prefix directory %s does not exist!" % prefix)

# Write the Lua hook text to <prefix>/.lmod/SitePackage.lua.
sitepackage_path = os.path.join(prefix, DOT_LMOD, 'SitePackage.lua')
try:
    os.makedirs(os.path.dirname(sitepackage_path), exist_ok=True)
    with open(sitepackage_path, 'w') as sitepackage_file:
        sitepackage_file.write(hook_txt)
except OSError as err:  # IOError is an alias of OSError in Python 3
    error("Failed to create %s: %s" % (sitepackage_path, err))

# Report the path of the generated file on stdout.
print(sitepackage_path)

init/eessi_environment_variables

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,22 @@ if [ -d $EESSI_PREFIX ]; then
8585
false
8686
fi
8787

88+
export LMOD_CONFIG_DIR="$EESSI_SOFTWARE_PATH/.lmod"
89+
lmod_rc_file="$LMOD_CONFIG_DIR/lmodrc.lua"
90+
if [ -f $lmod_rc_file ]; then
91+
show_msg "Found Lmod configuration file at $lmod_rc_file"
92+
else
93+
error "Lmod configuration file not found at $lmod_rc_file"
94+
fi
95+
96+
export LMOD_PACKAGE_PATH="$EESSI_SOFTWARE_PATH/.lmod"
97+
lmod_sitepackage_file="$LMOD_PACKAGE_PATH/SitePackage.lua"
98+
if [ -f $lmod_sitepackage_file ]; then
99+
show_msg "Found Lmod SitePackage.lua file at $lmod_sitepackage_file"
100+
else
101+
error "Lmod SitePackage.lua file not found at $lmod_sitepackage_file"
102+
fi
103+
88104
else
89105
error "NESSI software layer at $EESSI_SOFTWARE_PATH not found!"
90106
fi

0 commit comments

Comments
 (0)