10 changes: 4 additions & 6 deletions cwltool/cuda.py
@@ -18,7 +18,7 @@ def cuda_version_and_device_count() -> Tuple[str, int]:
return (cv.data, int(ag.data))


def cuda_check(cuda_req: CWLObjectType) -> int:
def cuda_check(cuda_req: CWLObjectType, requestCount: int) -> int:
try:
vmin = float(str(cuda_req["cudaVersionMin"]))
version, devices = cuda_version_and_device_count()
@@ -31,14 +31,12 @@ def cuda_check(cuda_req: CWLObjectType) -> int:
"CUDA version '%s' is less than minimum version '%s'", version, vmin
)
return 0
dmin = cast(int, cuda_req.get("deviceCountMin", 1))
dmax = cast(int, cuda_req.get("deviceCountMax", dmin))
if devices < dmin:
if requestCount > devices:
_logger.warning(
"Requested at least %d GPU devices but only %d available", dmin, devices
"Requested %d GPU devices but only %d available", requestCount, devices
)
return 0
return min(dmax, devices)
return requestCount
except Exception as e:
_logger.warning("Error checking CUDA requirements: %s", e)
return 0
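
For context, a minimal usage sketch of the new `cuda_check` contract. The requirement keys mirror the `cwltool:CUDARequirement` fields elsewhere in this PR, but the surrounding driver code and the concrete values are hypothetical, and `cuda_check` still probes `nvidia-smi` internally:

```python
from cwltool.cuda import cuda_check
from cwltool.utils import CWLObjectType

# Requirement as it looks after expression evaluation (values are examples).
cuda_req: CWLObjectType = {
    "class": "http://commonwl.org/cwltool#CUDARequirement",
    "cudaVersionMin": "11.4",
    "cudaComputeCapability": "3.0",
}

# The caller now decides how many devices it wants (e.g. from
# builder.resources["cudaDeviceCount"]); cuda_check only verifies that the
# local node can satisfy that count and the minimum CUDA version.
requested = 2
granted = cuda_check(cuda_req, requested)
if granted == 0:
    # Version too old, not enough devices, or nvidia-smi probing failed.
    raise RuntimeError("Could not satisfy CUDARequirement")
assert granted == requested  # on success the requested count is returned
```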
9 changes: 2 additions & 7 deletions cwltool/docker.py
@@ -397,13 +397,8 @@ def create_runtime(
if runtimeContext.rm_container:
runtime.append("--rm")

cuda_req, _ = self.builder.get_requirement(
"http://commonwl.org/cwltool#CUDARequirement"
)
if cuda_req:
# Checked earlier that the device count is non-zero in _setup
count = cuda_check(cuda_req)
runtime.append("--gpus=" + str(count))
if self.builder.resources.get("cudaDeviceCount"):
runtime.append("--gpus=" + str(self.builder.resources["cudaDeviceCount"]))

cidfile_path = None # type: Optional[str]
# add parameters to docker to write a container ID file
3 changes: 3 additions & 0 deletions cwltool/executors.py
@@ -305,6 +305,9 @@ def select_resources(
result["tmpdirSize"] = math.ceil(request["tmpdirMin"])
result["outdirSize"] = math.ceil(request["outdirMin"])

if "cudaDeviceCount" in request:
result["cudaDeviceCount"] = request["cudaDeviceCount"]

return result

def _runner(self, job, runtime_context, TMPDIR_LOCK):
34 changes: 25 additions & 9 deletions cwltool/extensions-v1.1.yml
@@ -93,13 +93,29 @@ $graph:

See https://docs.nvidia.com/deploy/cuda-compatibility/ for
details.
cudaComputeCapabilityMin:
type: string
doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
deviceCountMin:
type: int?
cudaComputeCapability:
type:
- 'string'
- 'string[]'
doc: |
CUDA hardware capability required to run the software, in X.Y
format.

* If this is a single value, it defines only the minimum
compute capability. GPUs with higher capability are also
accepted.

* If it is an array value, then only select GPUs with compute
capabilities that explicitly appear in the array.
cudaDeviceCountMin:
type: ['null', int, cwl:Expression]
default: 1
doc: Minimum number of GPU devices to request, default 1.
deviceCountMax:
type: int?
doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`.
doc: |
Minimum number of GPU devices to request. If not specified,
same as `cudaDeviceCountMax`. If neither are specified,
default 1.
cudaDeviceCountMax:
type: ['null', int, cwl:Expression]
doc: |
Maximum number of GPU devices to request. If not specified,
same as `cudaDeviceCountMin`.
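
To make the field semantics above concrete, here is an illustrative requirement in the same `CWLObjectType` dict form the new tests use; the field names come from this schema, while the specific values are invented for the example:

```python
from cwltool.utils import CWLObjectType

cuda_req: CWLObjectType = {
    "class": "http://commonwl.org/cwltool#CUDARequirement",
    "cudaVersionMin": "11.4",
    # A list selects only GPUs whose compute capability appears in it;
    # a single string such as "5.2" would instead mean "5.2 or newer".
    "cudaComputeCapability": ["5.2", "6.1", "7.5"],
    # Ask for between 2 and 4 devices; either bound may also be a CWL
    # expression.  Omitting both falls back to a single device, and
    # omitting only one makes it track the other.
    "cudaDeviceCountMin": 2,
    "cudaDeviceCountMax": 4,
}
```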
34 changes: 25 additions & 9 deletions cwltool/extensions.yml
@@ -203,13 +203,29 @@ $graph:

See https://docs.nvidia.com/deploy/cuda-compatibility/ for
details.
cudaComputeCapabilityMin:
type: string
doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
deviceCountMin:
type: int?
cudaComputeCapability:
type:
- 'string'
- 'string[]'
doc: |
CUDA hardware capability required to run the software, in X.Y
format.

* If this is a single value, it defines only the minimum
compute capability. GPUs with higher capability are also
accepted.

* If it is an array value, then only select GPUs with compute
capabilities that explicitly appear in the array.
cudaDeviceCountMin:
type: ['null', int, cwl:Expression]
default: 1
doc: Minimum number of GPU devices to request, default 1.
deviceCountMax:
type: int?
doc: Maximum number of GPU devices to request. If not specified, same as `deviceCountMin`.
doc: |
Minimum number of GPU devices to request. If not specified,
same as `cudaDeviceCountMax`. If neither are specified,
default 1.
cudaDeviceCountMax:
type: ['null', int, cwl:Expression]
doc: |
Maximum number of GPU devices to request. If not specified,
same as `cudaDeviceCountMin`.
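
Since both count bounds also accept `cwl:Expression`, the device count can be driven by the job order; a hedged sketch (the input name `gpus` is invented for illustration):

```python
from cwltool.utils import CWLObjectType

# Hypothetical hint letting the job order choose the GPU count at runtime.
cuda_req_expr: CWLObjectType = {
    "class": "http://commonwl.org/cwltool#CUDARequirement",
    "cudaVersionMin": "11.4",
    "cudaComputeCapability": "6.0",
    "cudaDeviceCountMin": "$(inputs.gpus)",
    "cudaDeviceCountMax": "$(inputs.gpus)",
}
```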
5 changes: 4 additions & 1 deletion cwltool/job.py
@@ -2,6 +2,7 @@
import functools
import itertools
import logging
import math
import os
import re
import shutil
@@ -180,7 +181,9 @@ def _setup(self, runtimeContext: RuntimeContext) -> None:
"http://commonwl.org/cwltool#CUDARequirement"
)
if cuda_req:
count = cuda_check(cuda_req)
count = cuda_check(
cuda_req, math.ceil(self.builder.resources["cudaDeviceCount"])
)
if count == 0:
raise WorkflowException("Could not satisfy CUDARequirement")

32 changes: 26 additions & 6 deletions cwltool/process.py
@@ -980,6 +980,7 @@ def evalResources(
resourceReq, _ = self.get_requirement("ResourceRequirement")
if resourceReq is None:
resourceReq = {}

cwl_version = self.metadata.get(ORIGINAL_CWLVERSION, None)
if cwl_version == "v1.0":
ram = 1024
@@ -995,20 +996,34 @@
"outdirMin": 1024,
"outdirMax": 1024,
}
for a in ("cores", "ram", "tmpdir", "outdir"):

cudaReq, _ = self.get_requirement("http://commonwl.org/cwltool#CUDARequirement")
if cudaReq:
request["cudaDeviceCountMin"] = 1
request["cudaDeviceCountMax"] = 1

for rsc, a in (
(resourceReq, "cores"),
(resourceReq, "ram"),
(resourceReq, "tmpdir"),
(resourceReq, "outdir"),
(cudaReq, "cudaDeviceCount"),
):
if rsc is None:
continue
mn = mx = None # type: Optional[Union[int, float]]
if resourceReq.get(a + "Min"):
if rsc.get(a + "Min"):
mn = cast(
Union[int, float],
eval_resource(
builder, cast(Union[str, int, float], resourceReq[a + "Min"])
builder, cast(Union[str, int, float], rsc[a + "Min"])
),
)
if resourceReq.get(a + "Max"):
if rsc.get(a + "Max"):
mx = cast(
Union[int, float],
eval_resource(
builder, cast(Union[str, int, float], resourceReq[a + "Max"])
builder, cast(Union[str, int, float], rsc[a + "Max"])
),
)
if mn is None:
@@ -1022,13 +1037,18 @@

request_evaluated = cast(Dict[str, Union[int, float]], request)
if runtimeContext.select_resources is not None:
# Call select resources hook
return runtimeContext.select_resources(request_evaluated, runtimeContext)
return {

defaultReq = {
"cores": request_evaluated["coresMin"],
"ram": math.ceil(request_evaluated["ramMin"]),
"tmpdirSize": math.ceil(request_evaluated["tmpdirMin"]),
"outdirSize": math.ceil(request_evaluated["outdirMin"]),
}
if cudaReq:
defaultReq["cudaDeviceCount"] = request_evaluated["cudaDeviceCountMin"]
return defaultReq

def validate_hints(
self, avsc_names: Names, hints: List[CWLObjectType], strict: bool
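
A condensed, standalone sketch of the min/max resolution that `evalResources` now also applies to `cudaDeviceCount`. It is simplified: the real method evaluates CWL expressions via `eval_resource`, and the bound-defaulting lines are partly elided from this hunk, so the fallback below follows the schema docs rather than the exact source:

```python
from typing import Dict, Optional, Union

Num = Union[int, float]

def resolve_bounds(mn: Optional[Num], mx: Optional[Num], default: Num) -> Dict[str, Num]:
    # Each bound defaults to the other; if neither is set, use the default
    # (1 for cudaDeviceCount, per the CUDARequirement schema).
    if mn is None:
        mn = mx if mx is not None else default
    if mx is None:
        mx = mn
    return {"Min": mn, "Max": mx}

# With no select_resources hook, the default allocation takes the Min bound,
# which is what defaultReq["cudaDeviceCount"] picks up above.
request = resolve_bounds(2, 4, default=1)
cuda_device_count = request["Min"]  # -> 2
```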
6 changes: 1 addition & 5 deletions cwltool/singularity.py
@@ -434,11 +434,7 @@ def create_runtime(
else:
runtime.extend(["--net", "--network", "none"])

cuda_req, _ = self.builder.get_requirement(
"http://commonwl.org/cwltool#CUDARequirement"
)
if cuda_req:
# Checked earlier that the device count is non-zero in _setup
if self.builder.resources.get("cudaDeviceCount"):
runtime.append("--nv")

for name, value in self.environment.items():
136 changes: 135 additions & 1 deletion tests/test_cuda.py
@@ -1,10 +1,24 @@
import mock
import pytest
from schema_salad.avro import schema

from cwltool.builder import Builder
from cwltool.context import LoadingContext, RuntimeContext
from cwltool.cuda import cuda_version_and_device_count
from cwltool.errors import WorkflowException
from cwltool.job import CommandLineJob
from cwltool.load_tool import load_tool
from cwltool.main import main
from cwltool.pathmapper import MapperEnt, PathMapper
from cwltool.process import use_custom_schema, use_standard_schema
from cwltool.stdfsaccess import StdFsAccess
from cwltool.update import INTERNAL_VERSION, ORIGINAL_CWLVERSION
from cwltool.utils import CWLObjectType

from .util import get_data, needs_docker, needs_singularity_3_or_newer

from unittest.mock import MagicMock

cuda_version = cuda_version_and_device_count()


@@ -39,7 +53,127 @@ def test_cuda_singularity() -> None:
def test_cuda_no_container() -> None:
params = [
"--enable-ext",
"--singularity",
get_data("tests/wf/nvidia-smi.cwl"),
]
assert main(params) == 0


@pytest.mark.skipif(
cuda_version[0] == "", reason="nvidia-smi required for CUDA not detected"
)
def test_cuda_cc_list() -> None:
params = [
"--enable-ext",
get_data("tests/wf/nvidia-smi-cc.cwl"),
]
assert main(params) == 0


def _makebuilder(cudaReq: CWLObjectType) -> Builder:
return Builder(
{},
[],
[],
{},
schema.Names(),
[cudaReq],
[],
{"cudaDeviceCount": 1},
None,
None,
StdFsAccess,
StdFsAccess(""),
None,
0.1,
False,
False,
False,
"",
"",
"",
"",
INTERNAL_VERSION,
"docker",
)


@mock.patch("subprocess.check_output")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check(makedirs: MagicMock, check_output: MagicMock) -> None:

runtime_context = RuntimeContext({})

cudaReq: CWLObjectType = {
"class": "http://commonwl.org/cwltool#CUDARequirement",
"cudaVersionMin": "1.0",
"cudaComputeCapability": "1.0",
}
builder = _makebuilder(cudaReq)

check_output.return_value = """
<nvidia>
<attached_gpus>1</attached_gpus>
<cuda_version>1.0</cuda_version>
</nvidia>
"""

jb = CommandLineJob(builder, {}, PathMapper, [], [], "")
jb._setup(runtime_context)


@mock.patch("subprocess.check_output")
@mock.patch("os.makedirs")
def test_cuda_job_setup_check_err(makedirs: MagicMock, check_output: MagicMock) -> None:

runtime_context = RuntimeContext({})

cudaReq: CWLObjectType = {
"class": "http://commonwl.org/cwltool#CUDARequirement",
"cudaVersionMin": "2.0",
"cudaComputeCapability": "1.0",
}
builder = _makebuilder(cudaReq)

check_output.return_value = """
<nvidia>
<attached_gpus>1</attached_gpus>
<cuda_version>1.0</cuda_version>
</nvidia>
"""
jb = CommandLineJob(builder, {}, PathMapper, [], [], "")
with pytest.raises(WorkflowException):
jb._setup(runtime_context)


def test_cuda_eval_resource_range() -> None:
with open(get_data("cwltool/extensions-v1.1.yml")) as res:
use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read())

joborder = {} # type: CWLObjectType
loadingContext = LoadingContext({"do_update": True})
runtime_context = RuntimeContext({})

tool = load_tool(get_data("tests/wf/nvidia-smi-range.cwl"), loadingContext)
builder = _makebuilder(tool.requirements[0])
builder.job = joborder

resources = tool.evalResources(builder, runtime_context)

assert resources["cudaDeviceCount"] == 2


def test_cuda_eval_resource_max() -> None:
with open(get_data("cwltool/extensions-v1.1.yml")) as res:
use_custom_schema("v1.2", "http://commonwl.org/cwltool", res.read())

joborder = {} # type: CWLObjectType
loadingContext = LoadingContext({"do_update": True})
runtime_context = RuntimeContext({})

tool = load_tool(get_data("tests/wf/nvidia-smi-max.cwl"), loadingContext)
builder = _makebuilder(tool.requirements[0])
builder.job = joborder

resources = tool.evalResources(builder, runtime_context)

assert resources["cudaDeviceCount"] == 4