diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b2b5a2a6..b0434017 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -19,13 +19,11 @@ jobs: - compute-matrix - build-conda - test-conda - - test-conda-ctypes-binding - test-simulator - build-wheels - build-wheels-windows - test-wheels-windows - test-wheels - - test-wheels-ctypes-binding - test-wheels-deps-wheels - test-thirdparty - build-docs @@ -80,18 +78,6 @@ jobs: script: "ci/test_conda.sh" run_codecov: false matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - test-conda-ctypes-binding: - needs: - - build-conda - - compute-matrix - uses: ./.github/workflows/conda-python-tests.yaml - with: - build_type: pull-request - script: "ci/test_conda_ctypes_binding.sh" - run_codecov: false - # This selects "ARCH=amd64 and CUDA >=12, with the latest supported Python for each CUDA major version". - matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER | split(".") | .[0] | tonumber >= 12))) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) test-simulator: needs: - build-conda @@ -124,17 +110,6 @@ jobs: build_type: pull-request script: "ci/test_wheel.sh false" matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - test-wheels-ctypes-binding: - needs: - - build-wheels - - compute-matrix - uses: ./.github/workflows/wheels-test.yaml - with: - build_type: pull-request - script: "ci/test_wheel_ctypes_binding.sh" - # This selects "ARCH=amd64 and CUDA >=12, with the latest supported Python for each CUDA major version". - matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }} - matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER | split(".") | .[0] | tonumber >= 12))) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) test-wheels-deps-wheels: needs: - build-wheels diff --git a/ci/test_conda_ctypes_binding.sh b/ci/test_conda_ctypes_binding.sh deleted file mode 100755 index eaedde07..00000000 --- a/ci/test_conda_ctypes_binding.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: BSD-2-Clause - -set -euo pipefail - -. /opt/conda/etc/profile.d/conda.sh - -CTK_PACKAGE_DEPENDENCIES=( - "cuda-nvcc-impl" - "cuda-nvrtc" - "cuda-cuobjdump" - "libcurand-dev" -) - -rapids-logger "Install testing dependencies" -# TODO: Replace with rapids-dependency-file-generator -DEPENDENCIES=( - "c-compiler" - "cxx-compiler" - "${CTK_PACKAGE_DEPENDENCIES[@]}" - "cuda-python" - "cuda-version=${CUDA_VER%.*}" - "make" - "psutil" - "pytest" - "pytest-xdist" - "cffi" - "ml_dtypes" - "python=${RAPIDS_PY_VERSION}" - "numba-cuda" -) -rapids-mamba-retry create \ - -n test \ - --strict-channel-priority \ - --channel "`pwd`/conda-repo" \ - --channel conda-forge \ - "${DEPENDENCIES[@]}" - -# Temporarily allow unbound variables for conda activation. 
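Note on the `matrix_filter` expression removed from both jobs above: the jq program keeps only amd64 entries with CUDA 12 or newer, then picks, for each CUDA major version, the entry with the newest supported Python. A rough Python equivalent, for illustration only — the `ARCH`/`CUDA_VER`/`PY_VER` entry shape is inferred from the filter itself, and this sketch is not part of the change:

```python
# Illustrative re-implementation of the removed jq matrix_filter.
from itertools import groupby


def _ver(s):
    # "12.8.0" -> (12, 8, 0)
    return tuple(int(p) for p in s.split("."))


def filter_matrix(entries):
    # map(select(.ARCH == "amd64" and CUDA major >= 12))
    kept = [
        e for e in entries
        if e["ARCH"] == "amd64" and _ver(e["CUDA_VER"])[0] >= 12
    ]
    # group_by(CUDA major); jq's group_by sorts by the grouping key
    kept.sort(key=lambda e: _ver(e["CUDA_VER"])[0])
    # max_by([PY_VER, CUDA_VER]) within each CUDA major group
    return [
        max(grp, key=lambda e: (_ver(e["PY_VER"]), _ver(e["CUDA_VER"])))
        for _, grp in groupby(kept, key=lambda e: _ver(e["CUDA_VER"])[0])
    ]
```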
-set +u -conda activate test -set -u - -pip install filecheck - -rapids-print-env - -rapids-logger "Check GPU usage" -nvidia-smi - -rapids-logger "Build test binaries" -export NUMBA_CUDA_TEST_BIN_DIR=`pwd`/testing -pushd $NUMBA_CUDA_TEST_BIN_DIR -make -j $(nproc) - -rapids-logger "Show Numba system info" -python -m numba --sysinfo - -EXITCODE=0 -trap "EXITCODE=1" ERR -set +e - -rapids-logger "Run Tests" -NUMBA_CUDA_USE_NVIDIA_BINDING=0 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR pytest -v - -popd - -rapids-logger "Test script exiting with value: $EXITCODE" -exit ${EXITCODE} diff --git a/ci/test_wheel_ctypes_binding.sh b/ci/test_wheel_ctypes_binding.sh deleted file mode 100755 index e6795c8a..00000000 --- a/ci/test_wheel_ctypes_binding.sh +++ /dev/null @@ -1,37 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: BSD-2-Clause - -set -euo pipefail - -CUDA_VER_MAJOR=${CUDA_VER%.*.*} - -rapids-logger "Install wheel with testing dependencies" -package=$(realpath wheel/numba_cuda*.whl) -echo "Package path: $package" -python -m pip install \ - "${package}" \ - cuda-python \ - --group test - -# FIXME: Find a way to build the tests that does not depend on the CUDA Python bindings -#rapids-logger "Build tests" -rapids-logger "Copy and cd into test binaries dir" -export NUMBA_CUDA_TEST_BIN_DIR=`pwd`/testing -pushd $NUMBA_CUDA_TEST_BIN_DIR -# make -j $(nproc) - -# Prevent the testsuite trying to use the test binaries -unset NUMBA_CUDA_TEST_BIN_DIR - -rapids-logger "Check GPU usage" -nvidia-smi - -rapids-logger "Show Numba system info" -NUMBA_CUDA_USE_NVIDIA_BINDING=0 python -m numba --sysinfo - -rapids-logger "Run Tests" -# NUMBA_CUDA_USE_NVIDIA_BINDING=0 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m pytest --pyargs numba.cuda.tests -v -NUMBA_CUDA_USE_NVIDIA_BINDING=0 python -m pytest -v - -popd diff --git a/docs/source/reference/envvars.rst b/docs/source/reference/envvars.rst index 1076764e..1bcee9a7 100644 --- a/docs/source/reference/envvars.rst +++ b/docs/source/reference/envvars.rst @@ -103,12 +103,11 @@ target. Enable warnings if a kernel is launched with host memory which forces a copy to and from the device. This option is on by default (default value is 1). -.. envvar:: NUMBA_CUDA_USE_NVIDIA_BINDING +.. note:: - When set to 1, Numba will attempt to use the `NVIDIA CUDA Python binding - `_ to make calls to the driver API - instead of using its own ctypes binding. This defaults to 1 (on). Set to - 0 to use the ctypes bindings. + Numba-CUDA always uses the NVIDIA CUDA Python bindings. The legacy ctypes + bindings and the ``NUMBA_CUDA_USE_NVIDIA_BINDING`` environment variable have + been removed. .. envvar:: NUMBA_CUDA_INCLUDE_PATH diff --git a/docs/source/user/bindings.rst b/docs/source/user/bindings.rst index d675e4a2..244c0f08 100644 --- a/docs/source/user/bindings.rst +++ b/docs/source/user/bindings.rst @@ -5,25 +5,22 @@ CUDA Bindings ============= -Numba supports two bindings to the CUDA Driver APIs: its own internal bindings -based on ctypes, and the official `NVIDIA CUDA Python bindings -`_. Functionality is equivalent between -the two bindings. - -The internal bindings are used by default. If the NVIDIA bindings are installed, -then they can be used by setting the environment variable -``NUMBA_CUDA_USE_NVIDIA_BINDING`` to ``1`` prior to the import of Numba. Once -Numba has been imported, the selected binding cannot be changed. 
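Because ``NUMBA_CUDA_USE_NVIDIA_BINDING`` is now silently ignored rather than honored (see the envvars note above), downstream code that still sets it gets no feedback. A minimal, hypothetical migration guard a downstream project could add — this is not part of the change itself:

```python
import os
import warnings

# numba-cuda no longer reads this variable; the ctypes bindings are gone.
if os.environ.get("NUMBA_CUDA_USE_NVIDIA_BINDING") == "0":
    warnings.warn(
        "NUMBA_CUDA_USE_NVIDIA_BINDING is ignored: numba-cuda always uses "
        "the NVIDIA CUDA Python bindings."
    )
```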
+Numba-CUDA uses the official `NVIDIA CUDA Python bindings +`_ for all CUDA Driver interactions. +Numba-CUDA previously provided its own internal ctypes-based bindings; the +public APIs exposing those bindings are kept for compatibility, but if you +need to interact directly with the CUDA Driver or other CUDA libraries we +recommend using the `cuda-python `_ +package directly. Per-Thread Default Streams -------------------------- Responsibility for handling Per-Thread Default Streams (PTDS) is delegated to -the NVIDIA bindings when they are in use. To use PTDS with the NVIDIA bindings, -set the environment variable ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` to -``1`` instead of Numba's environmnent variable -:envvar:`NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`. +the NVIDIA bindings. To use PTDS, set the environment variable +``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` to ``1`` instead of Numba's +environment variable :envvar:`NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`. .. seealso:: @@ -35,13 +32,5 @@ set the environment variable ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` to Roadmap ------- -In Numba 0.56, the NVIDIA Bindings will be used by default, if they are -installed. - -In future versions of Numba: - -- The internal bindings will be deprecated. -- The internal bindings will be removed. - -At present, no specific release is planned for the deprecation or removal of -the internal bindings. +The ctypes-based internal bindings have been removed in favor of the NVIDIA +bindings. Future work focuses on expanding usage of ``cuda.core`` APIs. diff --git a/docs/source/user/installation.rst b/docs/source/user/installation.rst index 8acfb775..82b3cd21 100644 --- a/docs/source/user/installation.rst +++ b/docs/source/user/installation.rst @@ -61,14 +61,12 @@ Configuration CUDA Bindings ------------- -Numba supports interacting with the CUDA Driver API via either the `NVIDIA CUDA -Python bindings `_ or its own ctypes-based -bindings. Functionality is equivalent between the two binding choices. The -NVIDIA bindings are the default, and the ctypes bindings are now deprecated. - -If you do not want to use the NVIDIA bindings, the (deprecated) ctypes bindings -can be enabled by setting the environment variable -:envvar:`NUMBA_CUDA_USE_NVIDIA_BINDING` to ``"0"``. +Numba-CUDA uses the `NVIDIA CUDA Python bindings `_ +for interacting with the CUDA Driver API. Numba-CUDA previously provided its own +internal ctypes-based bindings; the public APIs exposing those bindings are kept +for compatibility, but if you need to interact directly with the CUDA Driver or +other CUDA libraries we recommend using the `cuda-python `_ +package directly. .. _cudatoolkit-lookup: @@ -79,22 +77,8 @@ CUDA Driver and Toolkit search paths Default behavior ~~~~~~~~~~~~~~~~ -When using the NVIDIA bindings, searches for the CUDA driver and toolkit -libraries use its `built-in path-finding logic `_. - -Ctypes bindings (deprecated) behavior -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -When using the ctypes bindings, Numba searches for a CUDA toolkit installation -in the following order: - -1. Conda-installed CUDA Toolkit packages -2. Pip-installed CUDA Toolkit packages -3. The environment variable ``CUDA_HOME``, which points to the directory of the - installed CUDA toolkit (i.e. ``/home/user/cuda-12``) -4. System-wide installation at exactly ``/usr/local/cuda`` on Linux platforms. - Versioned installation paths (i.e. ``/usr/local/cuda-12.0``) are intentionally - ignored. Users can use ``CUDA_HOME`` to select specific versions. 
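To make the PTDS guidance above concrete, a minimal usage sketch; the only assumption beyond the documented variable is that it must be set before the bindings are first imported:

```python
import os

# Set before importing numba.cuda so the NVIDIA bindings pick it up.
os.environ["CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM"] = "1"

from numba import cuda  # noqa: E402


@cuda.jit
def axpy(r, a, x, y):
    i = cuda.grid(1)
    if i < r.size:
        r[i] = a * x[i] + y[i]
```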
+The CUDA driver and toolkit libraries are located using the NVIDIA bindings'
+`built-in path-finding logic `_.

 In addition to the CUDA toolkit libraries, which can be installed by conda
 into an environment or installed system-wide by the `CUDA SDK installer
diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py
index a27c8fbb..9362985c 100644
--- a/numba_cuda/numba/cuda/__init__.py
+++ b/numba_cuda/numba/cuda/__init__.py
@@ -8,65 +8,15 @@
 import sys

-# Enable pynvjitlink based on the following precedence:
-# 1. Config setting "CUDA_ENABLE_PYNVJITLINK" (highest priority)
-# 2. Environment variable "NUMBA_CUDA_ENABLE_PYNVJITLINK"
-# 3. Auto-detection of pynvjitlink module (lowest priority)
-
-pynvjitlink_auto_enabled = False
-
-if getattr(config, "CUDA_ENABLE_PYNVJITLINK", None) is None:
-    if (
-        _pynvjitlink_enabled_in_env := _readenv(
-            "NUMBA_CUDA_ENABLE_PYNVJITLINK", bool, None
-        )
-    ) is not None:
-        config.CUDA_ENABLE_PYNVJITLINK = _pynvjitlink_enabled_in_env
-    else:
-        pynvjitlink_auto_enabled = (
-            importlib.util.find_spec("pynvjitlink") is not None
-        )
-        config.CUDA_ENABLE_PYNVJITLINK = pynvjitlink_auto_enabled
-
-# Upstream numba sets CUDA_USE_NVIDIA_BINDING to 0 by default, so it always
-# exists. Override, but not if explicitly set to 0 in the envioronment.
-_nvidia_binding_enabled_in_env = _readenv(
-    "NUMBA_CUDA_USE_NVIDIA_BINDING", bool, None
-)
-if _nvidia_binding_enabled_in_env is False:
-    USE_NV_BINDING = False
-else:
-    USE_NV_BINDING = True
-    config.CUDA_USE_NVIDIA_BINDING = USE_NV_BINDING
-if config.CUDA_USE_NVIDIA_BINDING:
-    if not (
-        importlib.util.find_spec("cuda")
-        and importlib.util.find_spec("cuda.bindings")
-    ):
-        raise ImportError(
-            "CUDA bindings not found. Please pip install the "
-            "cuda-bindings package. Alternatively, install "
-            "numba-cuda[cuXY], where XY is the required CUDA "
-            "version, to install the binding automatically. "
-            "If no CUDA bindings are desired, set the env var "
-            "NUMBA_CUDA_USE_NVIDIA_BINDING=0 to enable ctypes "
-            "bindings."
-        )
-
-if config.CUDA_ENABLE_PYNVJITLINK:
-    if USE_NV_BINDING and not pynvjitlink_auto_enabled:
-        warnings.warn(
-            "Explicitly enabling pynvjitlink is no longer necessary. "
-            "NVIDIA bindings are enabled. cuda.core will be used "
-            "in place of pynvjitlink."
-        )
-    elif pynvjitlink_auto_enabled:
-        # Ignore the fact that pynvjitlink is enabled, because that was an
-        # automatic decision based on discovering pynvjitlink was present; the
-        # user didn't ask for it
-        pass
-    else:
-        raise RuntimeError("nvJitLink requires the NVIDIA CUDA bindings. ")
+# Require NVIDIA CUDA bindings at import time
+if not (
+    importlib.util.find_spec("cuda")
+    and importlib.util.find_spec("cuda.bindings")
+):
+    raise ImportError(
+        "NVIDIA CUDA Python bindings not found. Install the cuda-bindings "
+        "package (e.g. pip install cuda-bindings or numba-cuda[cuXY])."
+ ) if config.ENABLE_CUDASIM: from .simulator_init import * diff --git a/numba_cuda/numba/cuda/api.py b/numba_cuda/numba/cuda/api.py index 9fe8cd7e..dbb2d94c 100644 --- a/numba_cuda/numba/cuda/api.py +++ b/numba_cuda/numba/cuda/api.py @@ -8,7 +8,6 @@ import contextlib import os -from numba.cuda.cudadrv import drvapi import numpy as np from .cudadrv import devicearray, devices, driver @@ -48,10 +47,7 @@ def from_cuda_array_interface(desc, owner=None, sync=True): ) size = driver.memory_size_from_info(shape, strides, dtype.itemsize) - if config.CUDA_USE_NVIDIA_BINDING: - cudevptr_class = driver.binding.CUdeviceptr - else: - cudevptr_class = drvapi.cu_device_ptr + cudevptr_class = driver.binding.CUdeviceptr devptr = cudevptr_class(desc["data"][0]) data = driver.MemoryPointer( current_context(), devptr, size=size, owner=owner diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py index c40253c1..3ad26b36 100644 --- a/numba_cuda/numba/cuda/codegen.py +++ b/numba_cuda/numba/cuda/codegen.py @@ -26,10 +26,7 @@ def run_nvdisasm(cubin, flags): try: fd, fname = tempfile.mkstemp() with open(fname, "wb") as f: - if config.CUDA_USE_NVIDIA_BINDING: - f.write(cubin.code) - else: - f.write(cubin) + f.write(cubin.code) try: cp = subprocess.run( diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py index d0e863be..1337d77e 100644 --- a/numba_cuda/numba/cuda/cudadrv/driver.py +++ b/numba_cuda/numba/cuda/cudadrv/driver.py @@ -37,8 +37,6 @@ c_char_p, addressof, c_void_p, - c_float, - c_uint, c_uint8, ) import contextlib @@ -52,26 +50,27 @@ from numba.cuda import utils, serialize from .error import CudaSupportError, CudaDriverError from .drvapi import API_PROTOTYPES -from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid +from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj from .mappings import FILE_EXTENSION_MAP from .linkable_code import LinkableCode, LTOIR, Fatbin, Object from numba.cuda.utils import cached_file_read from numba.cuda.cudadrv import enums, drvapi, nvrtc -USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING +from cuda.bindings import driver as binding +from cuda.core.experimental import ( + Linker, + LinkerOptions, + ObjectCode, +) -if USE_NV_BINDING: - from cuda.bindings import driver as binding - from cuda.core.experimental import ( - Linker, - LinkerOptions, - ObjectCode, - ) +# For backwards compatibility: indicate that the NVIDIA CUDA Python bindings are +# in use. Older code checks this flag to branch on binding-specific behavior. +USE_NV_BINDING = True - # There is no definition of the default stream in the Nvidia bindings (nor - # is there at the C/C++ level), so we define it here so we don't need to - # use a magic number 0 in places where we want the default stream. - CU_STREAM_DEFAULT = 0 +# There is no definition of the default stream in the Nvidia bindings (nor +# is there at the C/C++ level), so we define it here so we don't need to +# use a magic number 0 in places where we want the default stream. +CU_STREAM_DEFAULT = 0 MIN_REQUIRED_CC = (3, 5) @@ -83,16 +82,6 @@ _py_decref.argtypes = [ctypes.py_object] _py_incref.argtypes = [ctypes.py_object] -USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING - -if USE_NV_BINDING: - from cuda.bindings import driver as binding - - # There is no definition of the default stream in the Nvidia bindings (nor - # is there at the C/C++ level), so we define it here so we don't need to - # use a magic number 0 in places where we want the default stream. 
- CU_STREAM_DEFAULT = 0 - def make_logger(): logger = logging.getLogger(__name__) @@ -120,20 +109,27 @@ def make_logger(): @functools.cache def _have_nvjitlink(): - if not USE_NV_BINDING: - return False try: from cuda.bindings._internal import nvjitlink as nvjitlink_internal from cuda.bindings._internal.utils import NotSupportedError except ImportError: return False + try: - return ( + if ( nvjitlink_internal._inspect_function_pointer("__nvJitLinkVersion") - != 0 - ) + == 0 + ): + return False + try: + from cuda.bindings import nvjitlink + + if nvjitlink.version() < (12, 3): + return False + except Exception: + return False + return True except (RuntimeError, NotSupportedError): - # no driver return False @@ -312,10 +308,7 @@ def __getattr__(self, fname): "Error at driver init: \n%s:" % self.initialization_error ) - if USE_NV_BINDING: - return self._cuda_python_wrap_fn(fname) - else: - return self._ctypes_wrap_fn(fname) + return self._cuda_python_wrap_fn(fname) def _ctypes_wrap_fn(self, fname, libfn=None): # Wrap a CUDA driver function by default @@ -375,12 +368,8 @@ def safe_cuda_api_call(*args): def _find_api(self, fname): # We use alternatively-named functions for PTDS with the Numba ctypes - # binding. For the NVidia binding, it handles linking to the correct - # variant. - if config.CUDA_PER_THREAD_DEFAULT_STREAM and not USE_NV_BINDING: - variants = ("_v2_ptds", "_v2_ptsz", "_ptds", "_ptsz", "_v2", "") - else: - variants = ("_v2", "") + # binding. It handles linking to the correct variant. + variants = ("_v2", "") if fname in ("cuCtxGetDevice", "cuCtxSynchronize"): return getattr(self.lib, fname) @@ -437,12 +426,7 @@ def get_device(self, devnum=0): return weakref.proxy(dev) def get_device_count(self): - if USE_NV_BINDING: - return self.cuDeviceGetCount() - - count = c_int() - self.cuDeviceGetCount(byref(count)) - return count.value + return self.cuDeviceGetCount() def list_devices(self): """Returns a list of active devices""" @@ -459,11 +443,7 @@ def pop_active_context(self): """ with self.get_active_context() as ac: if ac.devnum is not None: - if USE_NV_BINDING: - popped = drvapi.cu_context(int(driver.cuCtxPopCurrent())) - else: - popped = drvapi.cu_context() - driver.cuCtxPopCurrent(byref(popped)) + popped = drvapi.cu_context(int(driver.cuCtxPopCurrent())) return popped def get_active_context(self): @@ -474,14 +454,8 @@ def get_version(self): """ Returns the CUDA Driver version as a tuple (major, minor). """ - if USE_NV_BINDING: - version = driver.cuDriverGetVersion() - else: - dv = ctypes.c_int(0) - driver.cuDriverGetVersion(ctypes.byref(dv)) - version = dv.value - # The version is encoded as (1000 * major) + (10 * minor) + version = driver.cuDriverGetVersion() major = version // 1000 minor = (version - (major * 1000)) // 10 return (major, minor) @@ -504,26 +478,16 @@ def __enter__(self): hctx, devnum = self._tls_cache.ctx_devnum # Not cached. Query the driver API. 
else: - if USE_NV_BINDING: - hctx = driver.cuCtxGetCurrent() - if int(hctx) == 0: - hctx = None - else: - hctx = drvapi.cu_context(int(hctx)) + hctx = driver.cuCtxGetCurrent() + if int(hctx) == 0: + hctx = None else: - hctx = drvapi.cu_context(0) - driver.cuCtxGetCurrent(byref(hctx)) - hctx = hctx if hctx.value else None + hctx = drvapi.cu_context(int(hctx)) if hctx is None: devnum = None else: - if USE_NV_BINDING: - devnum = int(driver.cuCtxGetDevice()) - else: - hdevice = drvapi.cu_device() - driver.cuCtxGetDevice(byref(hdevice)) - devnum = hdevice.value + devnum = int(driver.cuCtxGetDevice()) self._tls_cache.ctx_devnum = (hctx, devnum) is_top = True @@ -582,15 +546,9 @@ def from_identity(self, identity): raise RuntimeError(errmsg) def __init__(self, devnum): - if USE_NV_BINDING: - result = driver.cuDeviceGet(devnum) - self.id = result - got_devnum = int(result) - else: - result = c_int() - driver.cuDeviceGet(byref(result), devnum) - got_devnum = result.value - self.id = got_devnum + result = driver.cuDeviceGet(devnum) + self.id = result + got_devnum = int(result) msg = f"Driver returned device {got_devnum} instead of {devnum}" if devnum != got_devnum: @@ -606,25 +564,14 @@ def __init__(self, devnum): # Read name bufsz = 128 - - if USE_NV_BINDING: - buf = driver.cuDeviceGetName(bufsz, self.id) - name = buf.split(b"\x00")[0] - else: - buf = (c_char * bufsz)() - driver.cuDeviceGetName(buf, bufsz, self.id) - name = buf.value + buf = driver.cuDeviceGetName(bufsz, self.id) + name = buf.split(b"\x00")[0] self.name = name # Read UUID - if USE_NV_BINDING: - uuid = driver.cuDeviceGetUuid(self.id) - uuid_vals = tuple(uuid.bytes) - else: - uuid = cu_uuid() - driver.cuDeviceGetUuid(byref(uuid), self.id) - uuid_vals = tuple(bytes(uuid)) + uuid = driver.cuDeviceGetUuid(self.id) + uuid_vals = tuple(uuid.bytes) b = "%02x" b2 = b * 2 @@ -647,20 +594,10 @@ def __repr__(self): def __getattr__(self, attr): """Read attributes lazily""" - if USE_NV_BINDING: - code = getattr( - binding.CUdevice_attribute, f"CU_DEVICE_ATTRIBUTE_{attr}" - ) - value = driver.cuDeviceGetAttribute(code, self.id) - else: - try: - code = DEVICE_ATTRIBUTES[attr] - except KeyError: - raise AttributeError(attr) - - result = c_int() - driver.cuDeviceGetAttribute(byref(result), code, self.id) - value = result.value + code = getattr( + binding.CUdevice_attribute, f"CU_DEVICE_ATTRIBUTE_{attr}" + ) + value = driver.cuDeviceGetAttribute(code, self.id) setattr(self, attr, value) return value @@ -686,12 +623,8 @@ def get_primary_context(self): met_requirement_for_device(self) # create primary context - if USE_NV_BINDING: - hctx = driver.cuDevicePrimaryCtxRetain(self.id) - hctx = drvapi.cu_context(int(hctx)) - else: - hctx = drvapi.cu_context() - driver.cuDevicePrimaryCtxRetain(byref(hctx), self.id) + hctx = driver.cuDevicePrimaryCtxRetain(self.id) + hctx = drvapi.cu_context(int(hctx)) ctx = Context(weakref.proxy(self), hctx) self.primary_context = ctx @@ -879,11 +812,7 @@ def _attempt_allocation(self, allocator): return allocator() except CudaAPIError as e: # is out-of-memory? 
- if USE_NV_BINDING: - oom_code = binding.CUresult.CUDA_ERROR_OUT_OF_MEMORY - else: - oom_code = enums.CUDA_ERROR_OUT_OF_MEMORY - + oom_code = binding.CUresult.CUDA_ERROR_OUT_OF_MEMORY if e.code == oom_code: # clear pending deallocations self.deallocations.clear() @@ -906,29 +835,15 @@ def memhostalloc(self, size, mapped=False, portable=False, wc=False): if wc: flags |= enums.CU_MEMHOSTALLOC_WRITECOMBINED - if USE_NV_BINDING: - - def allocator(): - return driver.cuMemHostAlloc(size, flags) - - if mapped: - pointer = self._attempt_allocation(allocator) - else: - pointer = allocator() + def allocator(): + return driver.cuMemHostAlloc(size, flags) - alloc_key = pointer + if mapped: + pointer = self._attempt_allocation(allocator) else: - pointer = c_void_p() + pointer = allocator() - def allocator(): - driver.cuMemHostAlloc(byref(pointer), size, flags) - - if mapped: - self._attempt_allocation(allocator) - else: - allocator() - - alloc_key = pointer.value + alloc_key = pointer finalizer = _hostalloc_finalizer(self, pointer, alloc_key, size, mapped) ctx = weakref.proxy(self.context) @@ -946,13 +861,7 @@ def mempin(self, owner, pointer, size, mapped=False): It is recommended that this method is not overridden by EMM Plugin implementations - instead, use the :class:`BaseCUDAMemoryManager`. """ - if isinstance(pointer, int) and not USE_NV_BINDING: - pointer = c_void_p(pointer) - - if USE_NV_BINDING: - alloc_key = pointer - else: - alloc_key = pointer.value + alloc_key = pointer # possible flags are "portable" (between context) # and "device-map" (map host memory to device thus no need @@ -985,37 +894,19 @@ def allocator(): ) def memallocmanaged(self, size, attach_global): - if USE_NV_BINDING: - - def allocator(): - ma_flags = binding.CUmemAttach_flags - - if attach_global: - flags = ma_flags.CU_MEM_ATTACH_GLOBAL.value - else: - flags = ma_flags.CU_MEM_ATTACH_HOST.value - - return driver.cuMemAllocManaged(size, flags) - - ptr = self._attempt_allocation(allocator) - - alloc_key = ptr - - else: - ptr = drvapi.cu_device_ptr() + def allocator(): + ma_flags = binding.CUmemAttach_flags - def allocator(): - flags = c_uint() - if attach_global: - flags = enums.CU_MEM_ATTACH_GLOBAL - else: - flags = enums.CU_MEM_ATTACH_HOST + if attach_global: + flags = ma_flags.CU_MEM_ATTACH_GLOBAL.value + else: + flags = ma_flags.CU_MEM_ATTACH_HOST.value - driver.cuMemAllocManaged(byref(ptr), size, flags) + return driver.cuMemAllocManaged(size, flags) - self._attempt_allocation(allocator) + ptr = self._attempt_allocation(allocator) - alloc_key = ptr.value + alloc_key = ptr finalizer = _alloc_finalizer(self, ptr, alloc_key, size) ctx = weakref.proxy(self.context) @@ -1055,13 +946,8 @@ def get_ipc_handle(self, memory): populated with the underlying ``ipc_mem_handle``. 
""" base, end = device_extents(memory) - if USE_NV_BINDING: - ipchandle = driver.cuIpcGetMemHandle(base) - offset = int(memory.handle) - int(base) - else: - ipchandle = drvapi.cu_ipc_mem_handle() - driver.cuIpcGetMemHandle(byref(ipchandle), base) - offset = memory.handle.value - base + ipchandle = driver.cuIpcGetMemHandle(base) + offset = int(memory.handle) - int(base) source_info = self.context.device.get_device_identity() return IpcHandle( @@ -1080,21 +966,11 @@ def initialize(self): self.deallocations.memory_capacity = self.get_memory_info().total def memalloc(self, size): - if USE_NV_BINDING: - - def allocator(): - return driver.cuMemAlloc(size) - - ptr = self._attempt_allocation(allocator) - alloc_key = ptr - else: - ptr = drvapi.cu_device_ptr() - - def allocator(): - driver.cuMemAlloc(byref(ptr), size) + def allocator(): + return driver.cuMemAlloc(size) - self._attempt_allocation(allocator) - alloc_key = ptr.value + ptr = self._attempt_allocation(allocator) + alloc_key = ptr finalizer = _alloc_finalizer(self, ptr, alloc_key, size) ctx = weakref.proxy(self.context) @@ -1103,15 +979,7 @@ def allocator(): return mem.own() def get_memory_info(self): - if USE_NV_BINDING: - free, total = driver.cuMemGetInfo() - else: - free = c_size_t() - total = c_size_t() - driver.cuMemGetInfo(byref(free), byref(total)) - free = free.value - total = total.value - + free, total = driver.cuMemGetInfo() return MemoryInfo(free=free, total=total) @property @@ -1309,10 +1177,7 @@ def get_active_blocks_per_multiprocessor( :param memsize: per-block dynamic shared memory usage intended, in bytes """ args = (func, blocksize, memsize, flags) - if USE_NV_BINDING: - return self._cuda_python_active_blocks_per_multiprocessor(*args) - else: - return self._ctypes_active_blocks_per_multiprocessor(*args) + return self._cuda_python_active_blocks_per_multiprocessor(*args) def _cuda_python_active_blocks_per_multiprocessor( self, func, blocksize, memsize, flags @@ -1352,10 +1217,7 @@ def get_max_potential_block_size( handle """ args = (func, b2d_func, memsize, blocksizelimit, flags) - if USE_NV_BINDING: - return self._cuda_python_max_potential_block_size(*args) - else: - return self._ctypes_max_potential_block_size(*args) + return self._cuda_python_max_potential_block_size(*args) def _ctypes_max_potential_block_size( self, func, b2d_func, memsize, blocksizelimit, flags @@ -1404,10 +1266,7 @@ def push(self): """ Pushes this context on the current CPU Thread. """ - if USE_NV_BINDING: - driver.cuCtxPushCurrent(self.handle.value) - else: - driver.cuCtxPushCurrent(self.handle) + driver.cuCtxPushCurrent(self.handle.value) self.prepare_for_use() def pop(self): @@ -1443,11 +1302,7 @@ def get_ipc_handle(self, memory): def open_ipc_handle(self, handle, size): # open the IPC handle to get the device pointer flags = 1 # CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS - if USE_NV_BINDING: - dptr = driver.cuIpcOpenMemHandle(handle, flags) - else: - dptr = drvapi.cu_device_ptr() - driver.cuIpcOpenMemHandle(byref(dptr), handle, flags) + dptr = driver.cuIpcOpenMemHandle(handle, flags) # wrap it return MemoryPointer( @@ -1463,28 +1318,17 @@ def can_access_peer(self, peer_device): """Returns a bool indicating whether the peer access between the current and peer device is possible. 
""" - if USE_NV_BINDING: - peer_device = binding.CUdevice(peer_device) - can_access_peer = driver.cuDeviceCanAccessPeer( - self.device.id, peer_device - ) - else: - can_access_peer = c_int() - driver.cuDeviceCanAccessPeer( - byref(can_access_peer), - self.device.id, - peer_device, - ) + peer_device = binding.CUdevice(peer_device) + can_access_peer = driver.cuDeviceCanAccessPeer( + self.device.id, peer_device + ) return bool(can_access_peer) def create_module_ptx(self, ptx): if isinstance(ptx, str): ptx = ptx.encode("utf8") - if USE_NV_BINDING: - image = ObjectCode.from_ptx(ptx) - else: - image = c_char_p(ptx) + image = ObjectCode.from_ptx(ptx) return self.create_module_image(image) def create_module_image( @@ -1493,56 +1337,37 @@ def create_module_image( module = load_module_image( self, image, setup_callbacks, teardown_callbacks ) - if USE_NV_BINDING: - key = module.handle - else: - key = module.handle.value + key = module.handle self.modules[key] = module return weakref.proxy(module) def unload_module(self, module): - if USE_NV_BINDING: - key = module.handle - else: - key = module.handle.value + key = module.handle del self.modules[key] def get_default_stream(self): - if USE_NV_BINDING: - handle = drvapi.cu_stream(int(binding.CUstream(CU_STREAM_DEFAULT))) - else: - handle = drvapi.cu_stream(drvapi.CU_STREAM_DEFAULT) + handle = drvapi.cu_stream(int(binding.CUstream(CU_STREAM_DEFAULT))) return Stream(weakref.proxy(self), handle, None) def get_legacy_default_stream(self): - if USE_NV_BINDING: - handle = drvapi.cu_stream( - int(binding.CUstream(binding.CU_STREAM_LEGACY)) - ) - else: - handle = drvapi.cu_stream(drvapi.CU_STREAM_LEGACY) + handle = drvapi.cu_stream( + int(binding.CUstream(binding.CU_STREAM_LEGACY)) + ) return Stream(weakref.proxy(self), handle, None) def get_per_thread_default_stream(self): - if USE_NV_BINDING: - handle = drvapi.cu_stream( - int(binding.CUstream(binding.CU_STREAM_PER_THREAD)) - ) - else: - handle = drvapi.cu_stream(drvapi.CU_STREAM_PER_THREAD) + handle = drvapi.cu_stream( + int(binding.CUstream(binding.CU_STREAM_PER_THREAD)) + ) return Stream(weakref.proxy(self), handle, None) def create_stream(self): - if USE_NV_BINDING: - # The default stream creation flag, specifying that the created - # stream synchronizes with stream 0 (this is different from the - # default stream, which we define also as CU_STREAM_DEFAULT when - # the NV binding is in use). - flags = binding.CUstream_flags.CU_STREAM_DEFAULT.value - handle = drvapi.cu_stream(int(driver.cuStreamCreate(flags))) - else: - handle = drvapi.cu_stream() - driver.cuStreamCreate(byref(handle), 0) + # The default stream creation flag, specifying that the created + # stream synchronizes with stream 0 (this is different from the + # default stream, which we define also as CU_STREAM_DEFAULT when + # the NV binding is in use). 
+ flags = binding.CUstream_flags.CU_STREAM_DEFAULT.value + handle = drvapi.cu_stream(int(driver.cuStreamCreate(flags))) return Stream( weakref.proxy(self), handle, @@ -1552,21 +1377,14 @@ def create_stream(self): def create_external_stream(self, ptr): if not isinstance(ptr, int): raise TypeError("ptr for external stream must be an int") - if USE_NV_BINDING: - handle = drvapi.cu_stream(int(binding.CUstream(ptr))) - else: - handle = drvapi.cu_stream(ptr) + handle = drvapi.cu_stream(int(binding.CUstream(ptr))) return Stream(weakref.proxy(self), handle, None, external=True) def create_event(self, timing=True): flags = 0 if not timing: flags |= enums.CU_EVENT_DISABLE_TIMING - if USE_NV_BINDING: - handle = drvapi.cu_event(int(driver.cuEventCreate(flags))) - else: - handle = drvapi.cu_event() - driver.cuEventCreate(byref(handle), flags) + handle = drvapi.cu_event(int(driver.cuEventCreate(flags))) return Event( weakref.proxy(self), handle, @@ -1601,14 +1419,9 @@ def load_module_image( """ image must be a pointer """ - if USE_NV_BINDING: - return load_module_image_cuda_python( - context, image, setup_callbacks, teardown_callbacks - ) - else: - return load_module_image_ctypes( - context, image, setup_callbacks, teardown_callbacks - ) + return load_module_image_cuda_python( + context, image, setup_callbacks, teardown_callbacks + ) def load_module_image_ctypes( @@ -1769,11 +1582,7 @@ def core(): def _module_finalizer(context, handle): dealloc = context.deallocations modules = context.modules - - if USE_NV_BINDING: - key = handle - else: - key = handle.value + key = handle def core(): shutting_down = utils.shutting_down # early bind @@ -1843,10 +1652,7 @@ def open(self, context): from numba import cuda srcdev = Device.from_identity(self.source_info) - if USE_NV_BINDING: - srcdev_id = int(srcdev.id) - else: - srcdev_id = srcdev.id + srcdev_id = int(srcdev.id) impl = _CudaIpcImpl(parent=self.parent) # Open context on the source device. @@ -1968,10 +1774,7 @@ def close(self): def __reduce__(self): # Preprocess the IPC handle, which is defined as a byte array. 
- if USE_NV_BINDING: - preprocessed_handle = self.handle.reserved - else: - preprocessed_handle = tuple(self.handle.reserved) + preprocessed_handle = self.handle.reserved args = ( self.__class__, preprocessed_handle, @@ -1983,10 +1786,7 @@ def __reduce__(self): @classmethod def _rebuild(cls, handle_ary, size, source_info, offset): - if USE_NV_BINDING: - handle = binding.CUipcMemHandle() - else: - handle = drvapi.cu_ipc_mem_handle() + handle = binding.CUipcMemHandle() handle.reserved = handle_ary return cls( base=None, @@ -2032,7 +1832,7 @@ class MemoryPointer(object): __cuda_memory__ = True def __init__(self, context, pointer, size, owner=None, finalizer=None): - if USE_NV_BINDING and isinstance(pointer, ctypes.c_void_p): + if isinstance(pointer, ctypes.c_void_p): pointer = binding.CUdeviceptr(pointer.value) self.context = context @@ -2067,10 +1867,7 @@ def free(self): def memset(self, byte, count=None, stream=0): count = self.size if count is None else count if stream: - if USE_NV_BINDING: - handle = stream.handle.value - else: - handle = stream.handle + handle = stream.handle.value driver.cuMemsetD8Async(self.device_pointer, byte, count, handle) else: driver.cuMemsetD8(self.device_pointer, byte, count) @@ -2091,12 +1888,9 @@ def view(self, start, stop=None): base = self.device_pointer_value + start if size < 0: raise RuntimeError("size cannot be negative") - if USE_NV_BINDING: - pointer = binding.CUdeviceptr() - ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr()) - ctypes_ptr.value = base - else: - pointer = drvapi.cu_device_ptr(base) + pointer = binding.CUdeviceptr() + ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr()) + ctypes_ptr.value = base view = MemoryPointer(self.context, pointer, size, owner=self.owner) if isinstance(self.owner, (MemoryPointer, OwnedPointer)): @@ -2108,16 +1902,11 @@ def view(self, start, stop=None): @property def device_ctypes_pointer(self): - if USE_NV_BINDING: - return drvapi.cu_device_ptr(int(self.device_pointer)) - return self.device_pointer + return drvapi.cu_device_ptr(int(self.device_pointer)) @property def device_pointer_value(self): - if USE_NV_BINDING: - return int(self.device_pointer) or None - else: - return self.device_pointer.value + return int(self.device_pointer) or None class AutoFreePointer(MemoryPointer): @@ -2162,13 +1951,8 @@ def __init__(self, context, pointer, size, owner=None, finalizer=None): self.owned = owner self.host_pointer = pointer - if USE_NV_BINDING: - devptr = driver.cuMemHostGetDevicePointer(pointer, 0) - self._bufptr_ = self.host_pointer - else: - devptr = drvapi.cu_device_ptr() - driver.cuMemHostGetDevicePointer(byref(devptr), pointer, 0) - self._bufptr_ = self.host_pointer.value + devptr = driver.cuMemHostGetDevicePointer(pointer, 0) + self._bufptr_ = self.host_pointer self.device_pointer = devptr super(MappedMemory, self).__init__( @@ -2212,10 +1996,7 @@ def __init__(self, context, pointer, size, owner=None, finalizer=None): # For buffer interface self._buflen_ = self.size - if USE_NV_BINDING: - self._bufptr_ = self.host_pointer - else: - self._bufptr_ = self.host_pointer.value + self._bufptr_ = self.host_pointer if finalizer is not None: weakref.finalize(self, finalizer) @@ -2253,10 +2034,7 @@ def __init__(self, context, pointer, size, owner=None, finalizer=None): # For buffer interface self._buflen_ = self.size - if USE_NV_BINDING: - self._bufptr_ = self.device_pointer - else: - self._bufptr_ = self.device_pointer.value + self._bufptr_ = self.device_pointer def own(self): return 
ManagedOwnedPointer(weakref.proxy(self)) @@ -2332,10 +2110,7 @@ def synchronize(self): Wait for all commands in this stream to execute. This will commit any pending memory transfers. """ - if USE_NV_BINDING: - handle = self.handle.value - else: - handle = self.handle + handle = self.handle.value driver.cuStreamSynchronize(handle) @contextlib.contextmanager @@ -2379,15 +2154,11 @@ def add_callback(self, callback, arg=None): """ data = (self, callback, arg) _py_incref(data) - if USE_NV_BINDING: - ptr = int.from_bytes(self._stream_callback, byteorder="little") - stream_callback = binding.CUstreamCallback(ptr) - # The callback needs to receive a pointer to the data PyObject - data = id(data) - handle = self.handle.value - else: - stream_callback = self._stream_callback - handle = self.handle + ptr = int.from_bytes(self._stream_callback, byteorder="little") + stream_callback = binding.CUstreamCallback(ptr) + # The callback needs to receive a pointer to the data PyObject + data = id(data) + handle = self.handle.value driver.cuStreamAddCallback(handle, stream_callback, data, 0) @staticmethod @@ -2465,34 +2236,23 @@ def record(self, stream=0): queued in the stream at the time of the call to ``record()`` has been completed. """ - if USE_NV_BINDING: - hstream = stream.handle.value if stream else binding.CUstream(0) - handle = self.handle.value - else: - hstream = stream.handle if stream else 0 - handle = self.handle + hstream = stream.handle.value if stream else binding.CUstream(0) + handle = self.handle.value driver.cuEventRecord(handle, hstream) def synchronize(self): """ Synchronize the host thread for the completion of the event. """ - if USE_NV_BINDING: - handle = self.handle.value - else: - handle = self.handle + handle = self.handle.value driver.cuEventSynchronize(handle) def wait(self, stream=0): """ All future works submitted to stream will wait util the event completes. """ - if USE_NV_BINDING: - hstream = stream.handle.value if stream else binding.CUstream(0) - handle = self.handle.value - else: - hstream = stream.handle if stream else 0 - handle = self.handle + hstream = stream.handle.value if stream else binding.CUstream(0) + handle = self.handle.value flags = 0 driver.cuStreamWaitEvent(hstream, handle, flags) @@ -2504,14 +2264,7 @@ def event_elapsed_time(evtstart, evtend): """ Compute the elapsed time between two events in milliseconds. """ - if USE_NV_BINDING: - return driver.cuEventElapsedTime( - evtstart.handle.value, evtend.handle.value - ) - else: - msec = c_float() - driver.cuEventElapsedTime(byref(msec), evtstart.handle, evtend.handle) - return msec.value + return driver.cuEventElapsedTime(evtstart.handle.value, evtend.handle.value) class Module(metaclass=ABCMeta): @@ -2729,12 +2482,8 @@ def launch_kernel( param_ptrs = [addressof(arg) for arg in args] params = (c_void_p * len(param_ptrs))(*param_ptrs) - if USE_NV_BINDING: - params_for_launch = addressof(params) - extra = 0 - else: - params_for_launch = params - extra = None + params_for_launch = addressof(params) + extra = 0 if cooperative: driver.cuLaunchCooperativeKernel( @@ -2777,10 +2526,7 @@ def new( lto=None, additional_flags=None, ): - if USE_NV_BINDING: - linker = _Linker - else: - linker = CtypesLinker + linker = _Linker params = (max_registers, lineinfo, cc) if linker is _Linker: @@ -3217,21 +2963,12 @@ def get_devptr_for_active_ctx(ptr): pointer. 
""" if ptr != 0: - if USE_NV_BINDING: - ptr_attrs = binding.CUpointer_attribute - attr = ptr_attrs.CU_POINTER_ATTRIBUTE_DEVICE_POINTER - ptrobj = binding.CUdeviceptr(ptr) - return driver.cuPointerGetAttribute(attr, ptrobj) - else: - devptr = drvapi.cu_device_ptr() - attr = enums.CU_POINTER_ATTRIBUTE_DEVICE_POINTER - driver.cuPointerGetAttribute(byref(devptr), attr, ptr) - return devptr + ptr_attrs = binding.CUpointer_attribute + attr = ptr_attrs.CU_POINTER_ATTRIBUTE_DEVICE_POINTER + ptrobj = binding.CUdeviceptr(ptr) + return driver.cuPointerGetAttribute(attr, ptrobj) else: - if USE_NV_BINDING: - return binding.CUdeviceptr() - else: - return drvapi.cu_device_ptr() + return binding.CUdeviceptr() def device_extents(devmem): @@ -3242,15 +2979,8 @@ def device_extents(devmem): of the device memory view that can be a subsection of the entire allocation. """ devptr = device_ctypes_pointer(devmem) - if USE_NV_BINDING: - s, n = driver.cuMemGetAddressRange(devptr.value) - return int(s), int(binding.CUdeviceptr(int(s) + n)) - else: - s = drvapi.cu_device_ptr() - n = c_size_t() - driver.cuMemGetAddressRange(byref(s), byref(n), devptr) - s, n = s.value, n.value - return s, s + n + s, n = driver.cuMemGetAddressRange(devptr.value) + return int(s), int(binding.CUdeviceptr(int(s) + n)) def device_memory_size(devmem): @@ -3376,10 +3106,7 @@ def host_to_device(dst, src, size, stream=0): if stream: assert isinstance(stream, Stream) fn = driver.cuMemcpyHtoDAsync - if USE_NV_BINDING: - handle = stream.handle.value - else: - handle = stream.handle + handle = stream.handle.value varargs.append(handle) else: fn = driver.cuMemcpyHtoD @@ -3398,10 +3125,7 @@ def device_to_host(dst, src, size, stream=0): if stream: assert isinstance(stream, Stream) fn = driver.cuMemcpyDtoHAsync - if USE_NV_BINDING: - handle = stream.handle.value - else: - handle = stream.handle + handle = stream.handle.value varargs.append(handle) else: fn = driver.cuMemcpyDtoH @@ -3420,10 +3144,7 @@ def device_to_device(dst, src, size, stream=0): if stream: assert isinstance(stream, Stream) fn = driver.cuMemcpyDtoDAsync - if USE_NV_BINDING: - handle = stream.handle.value - else: - handle = stream.handle + handle = stream.handle.value varargs.append(handle) else: fn = driver.cuMemcpyDtoD @@ -3447,10 +3168,7 @@ def device_memset(dst, val, size, stream=0): if stream: assert isinstance(stream, Stream) fn = driver.cuMemsetD8Async - if USE_NV_BINDING: - handle = stream.handle.value - else: - handle = stream.handle + handle = stream.handle.value varargs.append(handle) else: fn = driver.cuMemsetD8 diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py index 5c45299d..d74fe6e4 100644 --- a/numba_cuda/numba/cuda/cudadrv/mappings.py +++ b/numba_cuda/numba/cuda/cudadrv/mappings.py @@ -1,28 +1,14 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-2-Clause -from numba.cuda import config -from . 
import enums +from cuda.bindings.driver import CUjitInputType -if config.CUDA_USE_NVIDIA_BINDING: - from cuda.bindings.driver import CUjitInputType - - FILE_EXTENSION_MAP = { - "o": CUjitInputType.CU_JIT_INPUT_OBJECT, - "ptx": CUjitInputType.CU_JIT_INPUT_PTX, - "a": CUjitInputType.CU_JIT_INPUT_LIBRARY, - "lib": CUjitInputType.CU_JIT_INPUT_LIBRARY, - "cubin": CUjitInputType.CU_JIT_INPUT_CUBIN, - "fatbin": CUjitInputType.CU_JIT_INPUT_FATBINARY, - "ltoir": CUjitInputType.CU_JIT_INPUT_NVVM, - } -else: - FILE_EXTENSION_MAP = { - "o": enums.CU_JIT_INPUT_OBJECT, - "ptx": enums.CU_JIT_INPUT_PTX, - "a": enums.CU_JIT_INPUT_LIBRARY, - "lib": enums.CU_JIT_INPUT_LIBRARY, - "cubin": enums.CU_JIT_INPUT_CUBIN, - "fatbin": enums.CU_JIT_INPUT_FATBINARY, - "ltoir": enums.CU_JIT_INPUT_NVVM, - } +FILE_EXTENSION_MAP = { + "o": CUjitInputType.CU_JIT_INPUT_OBJECT, + "ptx": CUjitInputType.CU_JIT_INPUT_PTX, + "a": CUjitInputType.CU_JIT_INPUT_LIBRARY, + "lib": CUjitInputType.CU_JIT_INPUT_LIBRARY, + "cubin": CUjitInputType.CU_JIT_INPUT_CUBIN, + "fatbin": CUjitInputType.CU_JIT_INPUT_FATBINARY, + "ltoir": CUjitInputType.CU_JIT_INPUT_NVVM, +} diff --git a/numba_cuda/numba/cuda/cudadrv/nvrtc.py b/numba_cuda/numba/cuda/cudadrv/nvrtc.py index cef0b576..8d861fcc 100644 --- a/numba_cuda/numba/cuda/cudadrv/nvrtc.py +++ b/numba_cuda/numba/cuda/cudadrv/nvrtc.py @@ -1,23 +1,19 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: BSD-2-Clause -from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER -from enum import IntEnum from numba.cuda.cudadrv.error import ( CCSupportError, - NvrtcError, - NvrtcBuiltinOperationFailure, - NvrtcCompilationError, - NvrtcSupportError, ) from numba.cuda import config from numba.cuda.cuda_paths import get_cuda_paths from numba.cuda.utils import _readenv -import functools import os -import threading import warnings +import functools + +from cuda.core.experimental import Program, ProgramOptions +from cuda.bindings import nvrtc as bindings_nvrtc NVRTC_EXTRA_SEARCH_PATHS = _readenv( "NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS", str, "" @@ -25,268 +21,13 @@ if not hasattr(config, "CUDA_NVRTC_EXTRA_SEARCH_PATHS"): config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = NVRTC_EXTRA_SEARCH_PATHS -# Opaque handle for compilation unit -nvrtc_program = c_void_p - -# Result code -nvrtc_result = c_int - -if config.CUDA_USE_NVIDIA_BINDING: - from cuda.bindings import nvrtc as bindings_nvrtc - from cuda.core.experimental import Program, ProgramOptions - - -class NvrtcResult(IntEnum): - NVRTC_SUCCESS = 0 - NVRTC_ERROR_OUT_OF_MEMORY = 1 - NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2 - NVRTC_ERROR_INVALID_INPUT = 3 - NVRTC_ERROR_INVALID_PROGRAM = 4 - NVRTC_ERROR_INVALID_OPTION = 5 - NVRTC_ERROR_COMPILATION = 6 - NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7 - NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8 - NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9 - NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10 - NVRTC_ERROR_INTERNAL_ERROR = 11 - - -_nvrtc_lock = threading.Lock() - -class NvrtcProgram: - """ - A class for managing the lifetime of nvrtcProgram instances. Instances of - the class own an nvrtcProgram; when an instance is deleted, the underlying - nvrtcProgram is destroyed using the appropriate NVRTC API. 
- """ - - def __init__(self, nvrtc, handle): - self._nvrtc = nvrtc - self._handle = handle - - @property - def handle(self): - return self._handle - - def __del__(self): - if self._handle: - self._nvrtc.destroy_program(self) - - -class NVRTC: - """ - Provides a Pythonic interface to the NVRTC APIs, abstracting away the C API - calls. - - The sole instance of this class is a process-wide singleton, similar to the - NVVM interface. Initialization is protected by a lock and uses the standard - (for Numba) open_cudalib function to load the NVRTC library. - """ - - _PROTOTYPES = { - # nvrtcResult nvrtcVersion(int *major, int *minor) - "nvrtcVersion": (nvrtc_result, POINTER(c_int), POINTER(c_int)), - # nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, - # const char *src, - # const char *name, - # int numHeaders, - # const char * const *headers, - # const char * const *includeNames) - "nvrtcCreateProgram": ( - nvrtc_result, - nvrtc_program, - c_char_p, - c_char_p, - c_int, - POINTER(c_char_p), - POINTER(c_char_p), - ), - # nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog); - "nvrtcDestroyProgram": (nvrtc_result, POINTER(nvrtc_program)), - # nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, - # int numOptions, - # const char * const *options) - "nvrtcCompileProgram": ( - nvrtc_result, - nvrtc_program, - c_int, - POINTER(c_char_p), - ), - # nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet); - "nvrtcGetPTXSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)), - # nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx); - "nvrtcGetPTX": (nvrtc_result, nvrtc_program, c_char_p), - # nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, - # size_t *cubinSizeRet); - "nvrtcGetCUBINSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)), - # nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin); - "nvrtcGetCUBIN": (nvrtc_result, nvrtc_program, c_char_p), - # nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, - # size_t *logSizeRet); - "nvrtcGetProgramLogSize": ( - nvrtc_result, - nvrtc_program, - POINTER(c_size_t), - ), - # nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log); - "nvrtcGetProgramLog": (nvrtc_result, nvrtc_program, c_char_p), - # nvrtcResult nvrtcGetNumSupportedArchs(int *numArchs); - "nvrtcGetNumSupportedArchs": (nvrtc_result, POINTER(c_int)), - # nvrtcResult nvrtcGetSupportedArchs(int *supportedArchs); - "nvrtcGetSupportedArchs": (nvrtc_result, POINTER(c_int)), - # nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *ltoSizeRet); - "nvrtcGetLTOIRSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)), - # nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *lto); - "nvrtcGetLTOIR": (nvrtc_result, nvrtc_program, c_char_p), - } - - # Singleton reference - __INSTANCE = None - - def __new__(cls): - with _nvrtc_lock: - if config.CUDA_USE_NVIDIA_BINDING: - raise RuntimeError( - "NVRTC objects should not be used with cuda-python bindings" - ) - if cls.__INSTANCE is None: - from numba.cuda.cudadrv.libs import open_cudalib - - cls.__INSTANCE = inst = object.__new__(cls) - try: - lib = open_cudalib("nvrtc") - except OSError as e: - cls.__INSTANCE = None - raise NvrtcSupportError("NVRTC cannot be loaded") from e - - # Find & populate functions - for name, proto in inst._PROTOTYPES.items(): - func = getattr(lib, name) - func.restype = proto[0] - func.argtypes = proto[1:] - - @functools.wraps(func) - def checked_call(*args, func=func, name=name): - error = func(*args) - if error == NvrtcResult.NVRTC_ERROR_COMPILATION: - raise NvrtcCompilationError() - elif ( - error - 
== NvrtcResult.NVRTC_ERROR_BUILTIN_OPERATION_FAILURE - ): - raise NvrtcBuiltinOperationFailure() - elif error != NvrtcResult.NVRTC_SUCCESS: - try: - error_name = NvrtcResult(error).name - except ValueError: - error_name = ( - "Unknown nvrtc_result " - f"(error code: {error})" - ) - msg = f"Failed to call {name}: {error_name}" - raise NvrtcError(msg) - - setattr(inst, name, checked_call) - - return cls.__INSTANCE - - @functools.cache - def get_supported_archs(self): - """ - Get Supported Architectures by NVRTC as list of arch tuples. - """ - num = c_int() - self.nvrtcGetNumSupportedArchs(byref(num)) - archs = (c_int * num.value)() - self.nvrtcGetSupportedArchs(archs) - return [(archs[i] // 10, archs[i] % 10) for i in range(num.value)] - - def get_version(self): - """ - Get the NVRTC version as a tuple (major, minor). - """ - major = c_int() - minor = c_int() - self.nvrtcVersion(byref(major), byref(minor)) - return major.value, minor.value - - def create_program(self, src, name): - """ - Create an NVRTC program with managed lifetime. - """ - if isinstance(src, str): - src = src.encode() - if isinstance(name, str): - name = name.encode() - - handle = nvrtc_program() - - # The final three arguments are for passing the contents of headers - - # this is not supported, so there are 0 headers and the header names - # and contents are null. - self.nvrtcCreateProgram(byref(handle), src, name, 0, None, None) - return NvrtcProgram(self, handle) - - def compile_program(self, program, options): - """ - Compile an NVRTC program. Compilation may fail due to a user error in - the source; this function returns ``True`` if there is a compilation - error and ``False`` on success. - """ - # We hold a list of encoded options to ensure they can't be collected - # prior to the call to nvrtcCompileProgram - encoded_options = [opt.encode() for opt in options] - option_pointers = [c_char_p(opt) for opt in encoded_options] - c_options_type = c_char_p * len(options) - c_options = c_options_type(*option_pointers) - try: - self.nvrtcCompileProgram(program.handle, len(options), c_options) - return False - except (NvrtcCompilationError, NvrtcBuiltinOperationFailure): - return True - - def destroy_program(self, program): - """ - Destroy an NVRTC program. - """ - self.nvrtcDestroyProgram(byref(program.handle)) - - def get_compile_log(self, program): - """ - Get the compile log as a Python string. - """ - log_size = c_size_t() - self.nvrtcGetProgramLogSize(program.handle, byref(log_size)) - - log = (c_char * log_size.value)() - self.nvrtcGetProgramLog(program.handle, log) - - return log.value.decode() - - def get_ptx(self, program): - """ - Get the compiled PTX as a Python string. - """ - ptx_size = c_size_t() - self.nvrtcGetPTXSize(program.handle, byref(ptx_size)) - - ptx = (c_char * ptx_size.value)() - self.nvrtcGetPTX(program.handle, ptx) - - return ptx.value.decode() - - def get_lto(self, program): - """ - Get the compiled LTOIR as a Python bytes object. 
- """ - lto_size = c_size_t() - self.nvrtcGetLTOIRSize(program.handle, byref(lto_size)) - - lto = b" " * lto_size.value - self.nvrtcGetLTOIR(program.handle, lto) - - return lto +@functools.cache +def _get_nvrtc_version(): + retcode, major, minor = bindings_nvrtc.nvrtcVersion() + if retcode != bindings_nvrtc.nvrtcResult.NVRTC_SUCCESS: + raise RuntimeError(f"{retcode.name} when calling nvrtcVersion()") + return (major, minor) def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False): @@ -308,17 +49,7 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False): :return: The compiled PTX or LTOIR and compilation log :rtype: tuple """ - - if config.CUDA_USE_NVIDIA_BINDING: - retcode, *version = bindings_nvrtc.nvrtcVersion() - if retcode != bindings_nvrtc.nvrtcResult.NVRTC_SUCCESS: - raise RuntimeError( - f"{retcode.name} when calling nvrtcGetSupportedArchs()" - ) - version = tuple(version) - else: - nvrtc = NVRTC() - version = nvrtc.get_version() + version = _get_nvrtc_version() ver_str = lambda version: ".".join(str(v) for v in version) supported_ccs = get_supported_ccs() @@ -345,10 +76,7 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False): # being optimized away. major, minor = found - if config.CUDA_USE_NVIDIA_BINDING: - arch = f"sm_{major}{minor}" - else: - arch = f"--gpu-architecture=compute_{major}{minor}" + arch = f"sm_{major}{minor}" cuda_include_dir = get_cuda_paths()["include_dir"].info cuda_includes = [f"{cuda_include_dir}"] @@ -381,76 +109,35 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False): includes = [numba_include, *cuda_includes, nrt_include, *extra_includes] - if config.CUDA_USE_NVIDIA_BINDING: - options = ProgramOptions( - arch=arch, - include_path=includes, - relocatable_device_code=True, - link_time_optimization=ltoir, - name=name, - debug=debug, - lineinfo=lineinfo, - ) - - class Logger: - def __init__(self): - self.log = [] - - def write(self, msg): - self.log.append(msg) - - logger = Logger() - if isinstance(src, bytes): - src = src.decode("utf8") - - prog = Program(src, "c++", options=options) - result = prog.compile("ltoir" if ltoir else "ptx", logs=logger) - log = "" - if logger.log: - log = logger.log - joined_logs = "\n".join(log) - warnings.warn(f"NVRTC log messages: {joined_logs}") - return result, log - - else: - program = nvrtc.create_program(src, name) - includes = [f"-I{path}" for path in includes] - options = [ - arch, - *includes, - "-rdc", - "true", - ] - - if ltoir: - options.append("-dlto") - if lineinfo: - options.append("-lineinfo") - if debug: - options.append("-G") - - # Compile the program - compile_error = nvrtc.compile_program(program, options) - - # Get log from compilation - log = nvrtc.get_compile_log(program) - - # If the compile failed, provide the log in an exception - if compile_error: - msg = f"NVRTC Compilation failure whilst compiling {name}:\n\n{log}" - raise NvrtcError(msg) - - # Otherwise, if there's any content in the log, present it as a warning - if log: - msg = f"NVRTC log messages whilst compiling {name}:\n\n{log}" - warnings.warn(msg) - - if ltoir: - ltoir = nvrtc.get_lto(program) - return ltoir, log - else: - ptx = nvrtc.get_ptx(program) - return ptx, log + options = ProgramOptions( + arch=arch, + include_path=includes, + relocatable_device_code=True, + link_time_optimization=ltoir, + name=name, + debug=debug, + lineinfo=lineinfo, + ) + + class Logger: + def __init__(self): + self.log = [] + + def write(self, msg): + self.log.append(msg) + + logger = Logger() + if 
@@ -308,17 +49,7 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
     :return: The compiled PTX or LTOIR and compilation log
     :rtype: tuple
     """
-
-    if config.CUDA_USE_NVIDIA_BINDING:
-        retcode, *version = bindings_nvrtc.nvrtcVersion()
-        if retcode != bindings_nvrtc.nvrtcResult.NVRTC_SUCCESS:
-            raise RuntimeError(
-                f"{retcode.name} when calling nvrtcGetSupportedArchs()"
-            )
-        version = tuple(version)
-    else:
-        nvrtc = NVRTC()
-        version = nvrtc.get_version()
+    version = _get_nvrtc_version()
 
     ver_str = lambda version: ".".join(str(v) for v in version)
     supported_ccs = get_supported_ccs()
@@ -345,10 +76,7 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
     # being optimized away.
     major, minor = found
 
-    if config.CUDA_USE_NVIDIA_BINDING:
-        arch = f"sm_{major}{minor}"
-    else:
-        arch = f"--gpu-architecture=compute_{major}{minor}"
+    arch = f"sm_{major}{minor}"
 
     cuda_include_dir = get_cuda_paths()["include_dir"].info
     cuda_includes = [f"{cuda_include_dir}"]
@@ -381,76 +109,35 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
 
     includes = [numba_include, *cuda_includes, nrt_include, *extra_includes]
 
-    if config.CUDA_USE_NVIDIA_BINDING:
-        options = ProgramOptions(
-            arch=arch,
-            include_path=includes,
-            relocatable_device_code=True,
-            link_time_optimization=ltoir,
-            name=name,
-            debug=debug,
-            lineinfo=lineinfo,
-        )
-
-        class Logger:
-            def __init__(self):
-                self.log = []
-
-            def write(self, msg):
-                self.log.append(msg)
-
-        logger = Logger()
-        if isinstance(src, bytes):
-            src = src.decode("utf8")
-
-        prog = Program(src, "c++", options=options)
-        result = prog.compile("ltoir" if ltoir else "ptx", logs=logger)
-        log = ""
-        if logger.log:
-            log = logger.log
-            joined_logs = "\n".join(log)
-            warnings.warn(f"NVRTC log messages: {joined_logs}")
-        return result, log
-
-    else:
-        program = nvrtc.create_program(src, name)
-        includes = [f"-I{path}" for path in includes]
-        options = [
-            arch,
-            *includes,
-            "-rdc",
-            "true",
-        ]
-
-        if ltoir:
-            options.append("-dlto")
-        if lineinfo:
-            options.append("-lineinfo")
-        if debug:
-            options.append("-G")
-
-        # Compile the program
-        compile_error = nvrtc.compile_program(program, options)
-
-        # Get log from compilation
-        log = nvrtc.get_compile_log(program)
-
-        # If the compile failed, provide the log in an exception
-        if compile_error:
-            msg = f"NVRTC Compilation failure whilst compiling {name}:\n\n{log}"
-            raise NvrtcError(msg)
-
-        # Otherwise, if there's any content in the log, present it as a warning
-        if log:
-            msg = f"NVRTC log messages whilst compiling {name}:\n\n{log}"
-            warnings.warn(msg)
-
-        if ltoir:
-            ltoir = nvrtc.get_lto(program)
-            return ltoir, log
-        else:
-            ptx = nvrtc.get_ptx(program)
-            return ptx, log
+    options = ProgramOptions(
+        arch=arch,
+        include_path=includes,
+        relocatable_device_code=True,
+        link_time_optimization=ltoir,
+        name=name,
+        debug=debug,
+        lineinfo=lineinfo,
+    )
+
+    # File-like sink that collects NVRTC diagnostics via the logs= argument.
+    class Logger:
+        def __init__(self):
+            self.log = []
+
+        def write(self, msg):
+            self.log.append(msg)
+
+    logger = Logger()
+    if isinstance(src, bytes):
+        src = src.decode("utf8")
+
+    prog = Program(src, "c++", options=options)
+    result = prog.compile("ltoir" if ltoir else "ptx", logs=logger)
+    log = ""
+    if logger.log:
+        log = "\n".join(logger.log)
+        warnings.warn(f"NVRTC log messages: {log}")
+    return result, log
 
 
 def find_closest_arch(mycc):
@@ -498,12 +185,9 @@ def get_lowest_supported_cc():
 
 
 def get_supported_ccs():
-    if config.CUDA_USE_NVIDIA_BINDING:
-        retcode, archs = bindings_nvrtc.nvrtcGetSupportedArchs()
-        if retcode != bindings_nvrtc.nvrtcResult.NVRTC_SUCCESS:
-            raise RuntimeError(
-                f"{retcode.name} when calling nvrtcGetSupportedArchs()"
-            )
-        return [(arch // 10, arch % 10) for arch in archs]
-    else:
-        return NVRTC().get_supported_archs()
+    retcode, archs = bindings_nvrtc.nvrtcGetSupportedArchs()
+    if retcode != bindings_nvrtc.nvrtcResult.NVRTC_SUCCESS:
+        raise RuntimeError(
+            f"{retcode.name} when calling nvrtcGetSupportedArchs()"
+        )
+    return [(arch // 10, arch % 10) for arch in archs]
diff --git a/numba_cuda/numba/cuda/cudadrv/runtime.py b/numba_cuda/numba/cuda/cudadrv/runtime.py
index ebe615e0..004d4184 100644
--- a/numba_cuda/numba/cuda/cudadrv/runtime.py
+++ b/numba_cuda/numba/cuda/cudadrv/runtime.py
@@ -8,23 +8,12 @@
 to the runtime anymore. This file is provided to maintain the existing API.
 """
 
-from numba.cuda import config
-from numba.cuda.cudadrv.nvrtc import NVRTC
+from numba.cuda.cudadrv.nvrtc import _get_nvrtc_version
 
 
 class Runtime:
     def get_version(self):
-        if config.CUDA_USE_NVIDIA_BINDING:
-            from cuda.bindings import nvrtc
-
-            retcode, *version = nvrtc.nvrtcVersion()
-            if retcode != nvrtc.nvrtcResult.NVRTC_SUCCESS:
-                raise RuntimeError(
-                    f"{retcode.name} when calling nvrtcGetVersion()"
-                )
-            return tuple(version)
-        else:
-            return NVRTC().get_version()
+        return _get_nvrtc_version()
 
 
 runtime = Runtime()
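With the ctypes branch gone, every compilation goes through cuda.core's Program object. A minimal standalone sketch of that path, using only calls that appear in this patch; the kernel source and the sm_80 arch string are made-up example inputs, and a CUDA toolkit with NVRTC is assumed to be available:

```python
# Standalone sketch of the unified cuda.core compile path (illustrative only).
import io

from cuda.core.experimental import Program, ProgramOptions

src = 'extern "C" __global__ void k(int *x) { *x = 1; }'
options = ProgramOptions(arch="sm_80", relocatable_device_code=True)
prog = Program(src, "c++", options=options)

log_sink = io.StringIO()                  # any object with write() works
mod = prog.compile("ptx", logs=log_sink)  # "ltoir" would produce LTO-IR

print(mod.code.decode()[:40])             # .code holds the PTX bytes
print(log_sink.getvalue())                # NVRTC diagnostics, if any
```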
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
index c3671266..972c869e 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
@@ -3,7 +3,6 @@
 import numpy as np
 import warnings
-from numba.cuda import config
 from numba.cuda.testing import unittest
 from numba.cuda.testing import (
     skip_on_cudasim,
@@ -15,7 +14,6 @@
 from numba.cuda import require_context
 from numba.cuda.tests.support import ignore_internal_warnings
 from numba import cuda, void, float64, int64, int32, typeof, float32
-from numba.cuda.cudadrv.error import NvrtcError
 
 CONST1D = np.arange(10, dtype=np.float64)
 
@@ -179,23 +177,27 @@ def test_linking_cu_log_warning(self):
         def kernel(x):
             bar(x)
 
-        self.assertEqual(len(w), 1, "Expected warnings from NVRTC")
+        nvrtc_log_warnings = [
+            wi for wi in w if "NVRTC log messages" in str(wi.message)
+        ]
+        self.assertEqual(
+            len(nvrtc_log_warnings), 1, "Expected warnings from NVRTC"
+        )
         # Check the warning refers to the log messages
-        self.assertIn("NVRTC log messages", str(w[0].message))
+        self.assertIn("NVRTC log messages", str(nvrtc_log_warnings[0].message))
         # Check the message pertaining to the unused variable is provided
-        self.assertIn("declared but never referenced", str(w[0].message))
+        self.assertIn(
+            "declared but never referenced", str(nvrtc_log_warnings[0].message)
+        )
 
     def test_linking_cu_error(self):
         bar = cuda.declare_device("bar", "int32(int32)")
 
         link = str(test_data_dir / "error.cu")
-        if config.CUDA_USE_NVIDIA_BINDING:
-            from cuda.core.experimental._utils.cuda_utils import NVRTCError
+        from cuda.core.experimental._utils.cuda_utils import NVRTCError
 
-            errty = NVRTCError
-        else:
-            errty = NvrtcError
+        errty = NVRTCError
 
         with self.assertRaises(errty) as e:
 
             @cuda.jit("void(int32)", link=[link])
@@ -204,11 +206,7 @@ def kernel(x):
 
         msg = e.exception.args[0]
         # Check the error message refers to the NVRTC compile
-        nvrtc_err_str = (
-            "NVRTC_ERROR_COMPILATION"
-            if config.CUDA_USE_NVIDIA_BINDING
-            else "NVRTC Compilation failure"
-        )
+        nvrtc_err_str = "NVRTC_ERROR_COMPILATION"
         self.assertIn(nvrtc_err_str, msg)
         # Check the expected error in the CUDA source is reported
         self.assertIn('identifier "SYNTAX" is undefined', msg)
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py b/numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py
index 8a4f5b44..e74d97e9 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py
@@ -17,10 +17,7 @@
 if not config.ENABLE_CUDASIM:
     from cuda.bindings.driver import cuModuleGetGlobal, cuMemcpyHtoD
 
-    if config.CUDA_USE_NVIDIA_BINDING:
-        from cuda.bindings.driver import CUmodule as cu_module_type
-    else:
-        from numba.cuda.cudadrv.drvapi import cu_module as cu_module_type
+    from cuda.bindings.driver import CUmodule as cu_module_type
 
 
 def wipe_all_modules_in_context():
@@ -35,8 +32,6 @@ def wipe_all_modules_in_context():
 
 
 def get_hashable_handle_value(handle):
-    if not config.CUDA_USE_NVIDIA_BINDING:
-        handle = handle.value
     return handle
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
index 17b54d85..d7344541 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
@@ -45,10 +45,8 @@
 
 
 @unittest.skipIf(
-    not config.CUDA_USE_NVIDIA_BINDING
-    or not TEST_BIN_DIR
-    or not _have_nvjitlink(),
-    "NVIDIA cuda bindings not enabled or nvJitLink not installed or new enough (>12.3)",
+    not TEST_BIN_DIR or not _have_nvjitlink(),
+    "Test binaries not available, or nvJitLink not installed or not new enough (>12.3)",
 )
 @skip_on_cudasim("Linking unsupported in the simulator")
 class TestLinker(CUDATestCase):
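The reworked assertions above filter the recorded warnings by message rather than assuming the NVRTC warning is the only one captured. A minimal sketch of that pattern, with a stand-in warning in place of a real kernel compilation:

```python
# Sketch of the warning-filtering pattern used in the updated test.
# catch_warnings(record=True) captures every warning raised in the block,
# so filter by message instead of assuming w[0] is the NVRTC one.
import warnings

with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    warnings.warn("NVRTC log messages: demo diagnostic")  # stand-in trigger
    warnings.warn("some unrelated warning")

nvrtc_log_warnings = [
    wi for wi in w if "NVRTC log messages" in str(wi.message)
]
assert len(nvrtc_log_warnings) == 1
```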
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
index 5142c539..45f5634c 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
@@ -557,12 +557,9 @@ def f(z, x, y):
             link_obj = LinkableCode.from_path(link)
             if link_obj.kind == "cu":
                 # if link is a cu file, result contains a compiled object code
-                if cuda.config.CUDA_USE_NVIDIA_BINDING:
-                    from cuda.core.experimental import ObjectCode
+                from cuda.core.experimental import ObjectCode
 
-                    assert isinstance(code_list[1], ObjectCode)
-                else:
-                    assert isinstance(code_list[1], bytes)
+                assert isinstance(code_list[1], ObjectCode)
             else:
                 assert code_list[1].kind == link_obj.kind
 
@@ -581,13 +578,10 @@ def f(z, x, y):
         )
 
         assert len(code_list) == 2
-        if cuda.config.CUDA_USE_NVIDIA_BINDING:
-            self.assertRegex(
-                str(code_list[1].code.decode()),
-                r"\.file.*test_device_functions",
-            )
-        else:
-            self.assertRegex(code_list[1], r"\.file.*test_device_functions")
+        self.assertRegex(
+            str(code_list[1].code.decode()),
+            r"\.file.*test_device_functions",
+        )
 
     @unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
     def test_compile_all_debug(self):
@@ -604,12 +598,9 @@ def f(z, x, y):
         )
 
         assert len(code_list) == 2
-        if cuda.config.CUDA_USE_NVIDIA_BINDING:
-            self.assertRegex(
-                str(code_list[1].code.decode()), r"\.section\s+\.debug_info"
-            )
-        else:
-            self.assertRegex(code_list[1], r"\.section\s+\.debug_info")
+        self.assertRegex(
+            str(code_list[1].code.decode()), r"\.section\s+\.debug_info"
+        )
 
 
 @skip_on_cudasim("Compilation unsupported in the simulator")
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py
index 48669e7d..3ea51a1a 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py
@@ -4,7 +4,7 @@
 from numba import cuda
 from numba.core.errors import TypingError
 from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
-from numba.cuda import config
+from numba.cuda.cudadrv import driver
 
 
 def noop(x):
@@ -95,7 +95,7 @@ def kernel_func():
 
     @skip_on_cudasim("Simulator does not use nvjitlink")
     @unittest.skipIf(
-        config.CUDA_USE_NVIDIA_BINDING, "NVIDIA cuda bindings enabled"
+        driver._have_nvjitlink(), "nvJitLink available; LTO should not error"
     )
     def test_lto_without_nvjitlink_error(self):
         with self.assertRaisesRegex(RuntimeError, "LTO requires nvjitlink"):
diff --git a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
index b87f4d35..2757d576 100644
--- a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
+++ b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
@@ -173,16 +173,7 @@ def test_nrt_detect_linked_ptx_file(self):
         cc = get_current_device().compute_capability
         ptx, _ = compile(src, "external_nrt.cu", cc)
 
-        @cuda.jit(
-            link=[
-                PTXSource(
-                    ptx.code
-                    if config.CUDA_USE_NVIDIA_BINDING
-                    else ptx.encode(),
-                    nrt=True,
-                )
-            ]
-        )
+        @cuda.jit(link=[PTXSource(ptx.code, nrt=True)])
         def kernel():
             allocate_deallocate_handle()
 
diff --git a/pyproject.toml b/pyproject.toml
index 54ee6511..12ed8284 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ authors = [
 license = "BSD-2-clause"
 license-files = ["LICENSE", "LICENSE.numba"]
 requires-python = ">=3.9"
-dependencies = ["numba>=0.60.0"]
+dependencies = ["numba>=0.60.0", "cuda-bindings>=12.9.1,<14.0.0", "cuda-core>=0.3.2,<0.4.0dev0"]
 
 [project.optional-dependencies]
 cu12 = [