Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions cwltool/cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def cuda_version_and_device_count() -> Tuple[str, int]:
return (cv.data, int(ag.data))


def cuda_check(cuda_req: CWLObjectType) -> int:
def cuda_check(cuda_req: CWLObjectType, requestCount: int) -> int:
try:
vmin = float(str(cuda_req["cudaVersionMin"]))
version, devices = cuda_version_and_device_count()
Expand All @@ -31,14 +31,12 @@ def cuda_check(cuda_req: CWLObjectType) -> int:
"CUDA version '%s' is less than minimum version '%s'", version, vmin
)
return 0
dmin = cast(int, cuda_req.get("deviceCountMin", 1))
dmax = cast(int, cuda_req.get("deviceCountMax", dmin))
if devices < dmin:
if requestCount > devices:
_logger.warning(
"Requested at least %d GPU devices but only %d available", dmin, devices
"Requested %d GPU devices but only %d available", requestCount, devices
)
return 0
return min(dmax, devices)
return requestCount
except Exception as e:
_logger.warning("Error checking CUDA requirements: %s", e)
return 0
9 changes: 2 additions & 7 deletions cwltool/docker.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,13 +397,8 @@ def create_runtime(
if runtimeContext.rm_container:
runtime.append("--rm")

cuda_req, _ = self.builder.get_requirement(
"http://commonwl.org/cwltool#CUDARequirement"
)
if cuda_req:
# Checked earlier that the device count is non-zero in _setup
count = cuda_check(cuda_req)
runtime.append("--gpus=" + str(count))
if self.builder.resources.get("cudaDeviceCount"):
runtime.append("--gpus=" + str(self.builder.resources["cudaDeviceCount"]))

cidfile_path = None # type: Optional[str]
# add parameters to docker to write a container ID file
Expand Down
3 changes: 3 additions & 0 deletions cwltool/executors.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,9 @@ def select_resources(
result["tmpdirSize"] = math.ceil(request["tmpdirMin"])
result["outdirSize"] = math.ceil(request["outdirMin"])

if "cudaDeviceCount" in request:
result["cudaDeviceCount"] = request["cudaDeviceCount"]

return result

def _runner(self, job, runtime_context, TMPDIR_LOCK):
Expand Down
25 changes: 18 additions & 7 deletions cwltool/extensions-v1.1.yml
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,24 @@ $graph:

See https://docs.nvidia.com/deploy/cuda-compatibility/ for
details.
cudaComputeCapabilityMin:
type: string
doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
deviceCountMin:
type: int?
cudaComputeCapability:
type:
- 'string'
- 'string[]'
doc: |
CUDA hardware capability required to run the software, in X.Y
format.

* If this is a single value, it defines only the minimum
compute capability. GPUs with higher capability are also
accepted.

* If it is an array value, then only select GPUs with compute
capabilities that explicitly appear in the array.
cudaDeviceCountMin:
type: ['null', int, cwl:Expression]
default: 1
doc: Minimum number of GPU devices to request, default 1.
deviceCountMax:
type: int?
cudaDeviceCountMax:
type: ['null', int, cwl:Expression]
doc: Maximum number of GPU devices to request. If not specified, same as `cudaDeviceCountMin`.
25 changes: 18 additions & 7 deletions cwltool/extensions.yml
Original file line number Diff line number Diff line change
Expand Up @@ -203,13 +203,24 @@ $graph:

See https://docs.nvidia.com/deploy/cuda-compatibility/ for
details.
cudaComputeCapabilityMin:
type: string
doc: Minimum CUDA hardware capability required to run the software, in X.Y format.
deviceCountMin:
type: int?
cudaComputeCapability:
type:
- 'string'
- 'string[]'
doc: |
CUDA hardware capability required to run the software, in X.Y
format.

* If this is a single value, it defines only the minimum
compute capability. GPUs with higher capability are also
accepted.

* If it is an array value, then only select GPUs with compute
capabilities that explicitly appear in the array.
cudaDeviceCountMin:
type: ['null', int, cwl:Expression]
default: 1
doc: Minimum number of GPU devices to request, default 1.
deviceCountMax:
type: int?
cudaDeviceCountMax:
type: ['null', int, cwl:Expression]
doc: Maximum number of GPU devices to request. If not specified, same as `cudaDeviceCountMin`.
5 changes: 4 additions & 1 deletion cwltool/job.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import functools
import itertools
import logging
import math
import os
import re
import shutil
Expand Down Expand Up @@ -180,7 +181,9 @@ def _setup(self, runtimeContext: RuntimeContext) -> None:
"http://commonwl.org/cwltool#CUDARequirement"
)
if cuda_req:
count = cuda_check(cuda_req)
count = cuda_check(
cuda_req, math.ceil(self.builder.resources["cudaDeviceCount"])
)
if count == 0:
raise WorkflowException("Could not satisfy CUDARequirement")

Expand Down
32 changes: 26 additions & 6 deletions cwltool/process.py
Original file line number Diff line number Diff line change
Expand Up @@ -980,6 +980,7 @@ def evalResources(
resourceReq, _ = self.get_requirement("ResourceRequirement")
if resourceReq is None:
resourceReq = {}

cwl_version = self.metadata.get(ORIGINAL_CWLVERSION, None)
if cwl_version == "v1.0":
ram = 1024
Expand All @@ -995,20 +996,34 @@ def evalResources(
"outdirMin": 1024,
"outdirMax": 1024,
}
for a in ("cores", "ram", "tmpdir", "outdir"):

cudaReq, _ = self.get_requirement("http://commonwl.org/cwltool#CUDARequirement")
if cudaReq:
request["cudaDeviceCountMin"] = 1
request["cudaDeviceCountMax"] = 1

for rsc, a in (
(resourceReq, "cores"),
(resourceReq, "ram"),
(resourceReq, "tmpdir"),
(resourceReq, "outdir"),
(cudaReq, "cudaDeviceCount"),
):
if rsc is None:
continue
mn = mx = None # type: Optional[Union[int, float]]
if resourceReq.get(a + "Min"):
if rsc.get(a + "Min"):
mn = cast(
Union[int, float],
eval_resource(
builder, cast(Union[str, int, float], resourceReq[a + "Min"])
builder, cast(Union[str, int, float], rsc[a + "Min"])
),
)
if resourceReq.get(a + "Max"):
if rsc.get(a + "Max"):
mx = cast(
Union[int, float],
eval_resource(
builder, cast(Union[str, int, float], resourceReq[a + "Max"])
builder, cast(Union[str, int, float], rsc[a + "Max"])
),
)
if mn is None:
Expand All @@ -1022,13 +1037,18 @@ def evalResources(

request_evaluated = cast(Dict[str, Union[int, float]], request)
if runtimeContext.select_resources is not None:
# Call select resources hook
return runtimeContext.select_resources(request_evaluated, runtimeContext)
return {

defaultReq = {
"cores": request_evaluated["coresMin"],
"ram": math.ceil(request_evaluated["ramMin"]),
"tmpdirSize": math.ceil(request_evaluated["tmpdirMin"]),
"outdirSize": math.ceil(request_evaluated["outdirMin"]),
}
if cudaReq:
defaultReq["cudaDeviceCount"] = request_evaluated["cudaDeviceCountMin"]
return defaultReq

def validate_hints(
self, avsc_names: Names, hints: List[CWLObjectType], strict: bool
Expand Down
6 changes: 1 addition & 5 deletions cwltool/singularity.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,11 +434,7 @@ def create_runtime(
else:
runtime.extend(["--net", "--network", "none"])

cuda_req, _ = self.builder.get_requirement(
"http://commonwl.org/cwltool#CUDARequirement"
)
if cuda_req:
# Checked earlier that the device count is non-zero in _setup
if self.builder.resources.get("cudaDeviceCount"):
runtime.append("--nv")

for name, value in self.environment.items():
Expand Down
2 changes: 1 addition & 1 deletion tests/wf/nvidia-smi-container.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ $namespaces:
requirements:
cwltool:CUDARequirement:
cudaVersionMin: "1.0"
cudaComputeCapabilityMin: "1.0"
cudaComputeCapability: "1.0"
DockerRequirement:
dockerPull: "nvidia/cuda:11.4.2-runtime-ubuntu20.04"
inputs: []
Expand Down
8 changes: 6 additions & 2 deletions tests/wf/nvidia-smi.cwl
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@ $namespaces:
requirements:
cwltool:CUDARequirement:
cudaVersionMin: "1.0"
cudaComputeCapabilityMin: "1.0"
inputs: []
cudaComputeCapability: "1.0"
cudaDeviceCountMin: $(inputs.gpus)
inputs:
gpus:
type: int
default: 1
outputs: []
# Assume this will exit non-zero (resulting in a failing test case) if
# nvidia-smi doesn't detect any devices.
Expand Down