diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index b2b5a2a6..b0434017 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -19,13 +19,11 @@ jobs:
- compute-matrix
- build-conda
- test-conda
- - test-conda-ctypes-binding
- test-simulator
- build-wheels
- build-wheels-windows
- test-wheels-windows
- test-wheels
- - test-wheels-ctypes-binding
- test-wheels-deps-wheels
- test-thirdparty
- build-docs
@@ -80,18 +78,6 @@ jobs:
script: "ci/test_conda.sh"
run_codecov: false
matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }}
- test-conda-ctypes-binding:
- needs:
- - build-conda
- - compute-matrix
- uses: ./.github/workflows/conda-python-tests.yaml
- with:
- build_type: pull-request
- script: "ci/test_conda_ctypes_binding.sh"
- run_codecov: false
- # This selects "ARCH=amd64 and CUDA >=12, with the latest supported Python for each CUDA major version".
- matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }}
- matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER | split(".") | .[0] | tonumber >= 12))) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
test-simulator:
needs:
- build-conda
@@ -124,17 +110,6 @@ jobs:
build_type: pull-request
script: "ci/test_wheel.sh false"
matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }}
- test-wheels-ctypes-binding:
- needs:
- - build-wheels
- - compute-matrix
- uses: ./.github/workflows/wheels-test.yaml
- with:
- build_type: pull-request
- script: "ci/test_wheel_ctypes_binding.sh"
- # This selects "ARCH=amd64 and CUDA >=12, with the latest supported Python for each CUDA major version".
- matrix: ${{ needs.compute-matrix.outputs.TEST_MATRIX }}
- matrix_filter: map(select(.ARCH == "amd64" and (.CUDA_VER | split(".") | .[0] | tonumber >= 12))) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
test-wheels-deps-wheels:
needs:
- build-wheels
diff --git a/ci/test_conda_ctypes_binding.sh b/ci/test_conda_ctypes_binding.sh
deleted file mode 100755
index eaedde07..00000000
--- a/ci/test_conda_ctypes_binding.sh
+++ /dev/null
@@ -1,70 +0,0 @@
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-2-Clause
-
-set -euo pipefail
-
-. /opt/conda/etc/profile.d/conda.sh
-
-CTK_PACKAGE_DEPENDENCIES=(
- "cuda-nvcc-impl"
- "cuda-nvrtc"
- "cuda-cuobjdump"
- "libcurand-dev"
-)
-
-rapids-logger "Install testing dependencies"
-# TODO: Replace with rapids-dependency-file-generator
-DEPENDENCIES=(
- "c-compiler"
- "cxx-compiler"
- "${CTK_PACKAGE_DEPENDENCIES[@]}"
- "cuda-python"
- "cuda-version=${CUDA_VER%.*}"
- "make"
- "psutil"
- "pytest"
- "pytest-xdist"
- "cffi"
- "ml_dtypes"
- "python=${RAPIDS_PY_VERSION}"
- "numba-cuda"
-)
-rapids-mamba-retry create \
- -n test \
- --strict-channel-priority \
- --channel "`pwd`/conda-repo" \
- --channel conda-forge \
- "${DEPENDENCIES[@]}"
-
-# Temporarily allow unbound variables for conda activation.
-set +u
-conda activate test
-set -u
-
-pip install filecheck
-
-rapids-print-env
-
-rapids-logger "Check GPU usage"
-nvidia-smi
-
-rapids-logger "Build test binaries"
-export NUMBA_CUDA_TEST_BIN_DIR=`pwd`/testing
-pushd $NUMBA_CUDA_TEST_BIN_DIR
-make -j $(nproc)
-
-rapids-logger "Show Numba system info"
-python -m numba --sysinfo
-
-EXITCODE=0
-trap "EXITCODE=1" ERR
-set +e
-
-rapids-logger "Run Tests"
-NUMBA_CUDA_USE_NVIDIA_BINDING=0 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR pytest -v
-
-popd
-
-rapids-logger "Test script exiting with value: $EXITCODE"
-exit ${EXITCODE}
diff --git a/ci/test_wheel_ctypes_binding.sh b/ci/test_wheel_ctypes_binding.sh
deleted file mode 100755
index e6795c8a..00000000
--- a/ci/test_wheel_ctypes_binding.sh
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/bin/bash
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: BSD-2-Clause
-
-set -euo pipefail
-
-CUDA_VER_MAJOR=${CUDA_VER%.*.*}
-
-rapids-logger "Install wheel with testing dependencies"
-package=$(realpath wheel/numba_cuda*.whl)
-echo "Package path: $package"
-python -m pip install \
- "${package}" \
- cuda-python \
- --group test
-
-# FIXME: Find a way to build the tests that does not depend on the CUDA Python bindings
-#rapids-logger "Build tests"
-rapids-logger "Copy and cd into test binaries dir"
-export NUMBA_CUDA_TEST_BIN_DIR=`pwd`/testing
-pushd $NUMBA_CUDA_TEST_BIN_DIR
-# make -j $(nproc)
-
-# Prevent the testsuite trying to use the test binaries
-unset NUMBA_CUDA_TEST_BIN_DIR
-
-rapids-logger "Check GPU usage"
-nvidia-smi
-
-rapids-logger "Show Numba system info"
-NUMBA_CUDA_USE_NVIDIA_BINDING=0 python -m numba --sysinfo
-
-rapids-logger "Run Tests"
-# NUMBA_CUDA_USE_NVIDIA_BINDING=0 NUMBA_CUDA_TEST_BIN_DIR=$NUMBA_CUDA_TEST_BIN_DIR python -m pytest --pyargs numba.cuda.tests -v
-NUMBA_CUDA_USE_NVIDIA_BINDING=0 python -m pytest -v
-
-popd
diff --git a/docs/source/reference/envvars.rst b/docs/source/reference/envvars.rst
index 1076764e..1bcee9a7 100644
--- a/docs/source/reference/envvars.rst
+++ b/docs/source/reference/envvars.rst
@@ -103,12 +103,11 @@ target.
Enable warnings if a kernel is launched with host memory which forces a copy to and
from the device. This option is on by default (default value is 1).
-.. envvar:: NUMBA_CUDA_USE_NVIDIA_BINDING
+.. note::
- When set to 1, Numba will attempt to use the `NVIDIA CUDA Python binding
- `_ to make calls to the driver API
- instead of using its own ctypes binding. This defaults to 1 (on). Set to
- 0 to use the ctypes bindings.
+ Numba-CUDA always uses the NVIDIA CUDA Python bindings. The legacy ctypes
+ bindings and the ``NUMBA_CUDA_USE_NVIDIA_BINDING`` environment variable have
+ been removed.
.. envvar:: NUMBA_CUDA_INCLUDE_PATH
diff --git a/docs/source/user/bindings.rst b/docs/source/user/bindings.rst
index d675e4a2..244c0f08 100644
--- a/docs/source/user/bindings.rst
+++ b/docs/source/user/bindings.rst
@@ -5,25 +5,22 @@
CUDA Bindings
=============
-Numba supports two bindings to the CUDA Driver APIs: its own internal bindings
-based on ctypes, and the official `NVIDIA CUDA Python bindings
-`_. Functionality is equivalent between
-the two bindings.
-
-The internal bindings are used by default. If the NVIDIA bindings are installed,
-then they can be used by setting the environment variable
-``NUMBA_CUDA_USE_NVIDIA_BINDING`` to ``1`` prior to the import of Numba. Once
-Numba has been imported, the selected binding cannot be changed.
+Numba-CUDA uses the official `NVIDIA CUDA Python bindings
+<https://nvidia.github.io/cuda-python/>`_ for all CUDA Driver interactions.
+Numba-CUDA previously provided its own internal ctypes-based bindings; the
+public APIs exposing those bindings are kept for compatibility, but if you
+need to interact directly with the CUDA Driver or other CUDA libraries we
+recommend using the `cuda-python <https://nvidia.github.io/cuda-python/>`_
+package directly.
Per-Thread Default Streams
--------------------------
Responsibility for handling Per-Thread Default Streams (PTDS) is delegated to
-the NVIDIA bindings when they are in use. To use PTDS with the NVIDIA bindings,
-set the environment variable ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` to
-``1`` instead of Numba's environmnent variable
-:envvar:`NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`.
+the NVIDIA bindings. To use PTDS, set the environment variable
+``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` to ``1`` instead of Numba's
+environment variable :envvar:`NUMBA_CUDA_PER_THREAD_DEFAULT_STREAM`.
.. seealso::
@@ -35,13 +32,5 @@ set the environment variable ``CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM`` to
Roadmap
-------
-In Numba 0.56, the NVIDIA Bindings will be used by default, if they are
-installed.
-
-In future versions of Numba:
-
-- The internal bindings will be deprecated.
-- The internal bindings will be removed.
-
-At present, no specific release is planned for the deprecation or removal of
-the internal bindings.
+The ctypes-based internal bindings have been removed in favor of the NVIDIA
+bindings. Future work focuses on expanding usage of ``cuda.core`` APIs.
diff --git a/docs/source/user/installation.rst b/docs/source/user/installation.rst
index 8acfb775..82b3cd21 100644
--- a/docs/source/user/installation.rst
+++ b/docs/source/user/installation.rst
@@ -61,14 +61,12 @@ Configuration
CUDA Bindings
-------------
-Numba supports interacting with the CUDA Driver API via either the `NVIDIA CUDA
-Python bindings `_ or its own ctypes-based
-bindings. Functionality is equivalent between the two binding choices. The
-NVIDIA bindings are the default, and the ctypes bindings are now deprecated.
-
-If you do not want to use the NVIDIA bindings, the (deprecated) ctypes bindings
-can be enabled by setting the environment variable
-:envvar:`NUMBA_CUDA_USE_NVIDIA_BINDING` to ``"0"``.
+Numba-CUDA uses the `NVIDIA CUDA Python bindings <https://nvidia.github.io/cuda-python/>`_
+for interacting with the CUDA Driver API. Numba-CUDA previously provided its own
+internal ctypes-based bindings; the public APIs exposing those bindings are kept
+for compatibility, but if you need to interact directly with the CUDA Driver or
+other CUDA libraries we recommend using the `cuda-python <https://nvidia.github.io/cuda-python/>`_
+package directly.
.. _cudatoolkit-lookup:
@@ -79,22 +77,8 @@ CUDA Driver and Toolkit search paths
Default behavior
~~~~~~~~~~~~~~~~
-When using the NVIDIA bindings, searches for the CUDA driver and toolkit
-libraries use its `built-in path-finding logic `_.
-
-Ctypes bindings (deprecated) behavior
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-When using the ctypes bindings, Numba searches for a CUDA toolkit installation
-in the following order:
-
-1. Conda-installed CUDA Toolkit packages
-2. Pip-installed CUDA Toolkit packages
-3. The environment variable ``CUDA_HOME``, which points to the directory of the
- installed CUDA toolkit (i.e. ``/home/user/cuda-12``)
-4. System-wide installation at exactly ``/usr/local/cuda`` on Linux platforms.
- Versioned installation paths (i.e. ``/usr/local/cuda-12.0``) are intentionally
- ignored. Users can use ``CUDA_HOME`` to select specific versions.
+Searches for the CUDA driver and toolkit libraries use the NVIDIA bindings'
+`built-in path-finding logic `_.
In addition to the CUDA toolkit libraries, which can be installed by conda into
an environment or installed system-wide by the `CUDA SDK installer
diff --git a/numba_cuda/numba/cuda/__init__.py b/numba_cuda/numba/cuda/__init__.py
index a27c8fbb..9362985c 100644
--- a/numba_cuda/numba/cuda/__init__.py
+++ b/numba_cuda/numba/cuda/__init__.py
@@ -8,65 +8,15 @@
import sys
-# Enable pynvjitlink based on the following precedence:
-# 1. Config setting "CUDA_ENABLE_PYNVJITLINK" (highest priority)
-# 2. Environment variable "NUMBA_CUDA_ENABLE_PYNVJITLINK"
-# 3. Auto-detection of pynvjitlink module (lowest priority)
-
-pynvjitlink_auto_enabled = False
-
-if getattr(config, "CUDA_ENABLE_PYNVJITLINK", None) is None:
- if (
- _pynvjitlink_enabled_in_env := _readenv(
- "NUMBA_CUDA_ENABLE_PYNVJITLINK", bool, None
- )
- ) is not None:
- config.CUDA_ENABLE_PYNVJITLINK = _pynvjitlink_enabled_in_env
- else:
- pynvjitlink_auto_enabled = (
- importlib.util.find_spec("pynvjitlink") is not None
- )
- config.CUDA_ENABLE_PYNVJITLINK = pynvjitlink_auto_enabled
-
-# Upstream numba sets CUDA_USE_NVIDIA_BINDING to 0 by default, so it always
-# exists. Override, but not if explicitly set to 0 in the envioronment.
-_nvidia_binding_enabled_in_env = _readenv(
- "NUMBA_CUDA_USE_NVIDIA_BINDING", bool, None
-)
-if _nvidia_binding_enabled_in_env is False:
- USE_NV_BINDING = False
-else:
- USE_NV_BINDING = True
- config.CUDA_USE_NVIDIA_BINDING = USE_NV_BINDING
-if config.CUDA_USE_NVIDIA_BINDING:
- if not (
- importlib.util.find_spec("cuda")
- and importlib.util.find_spec("cuda.bindings")
- ):
- raise ImportError(
- "CUDA bindings not found. Please pip install the "
- "cuda-bindings package. Alternatively, install "
- "numba-cuda[cuXY], where XY is the required CUDA "
- "version, to install the binding automatically. "
- "If no CUDA bindings are desired, set the env var "
- "NUMBA_CUDA_USE_NVIDIA_BINDING=0 to enable ctypes "
- "bindings."
- )
-
-if config.CUDA_ENABLE_PYNVJITLINK:
- if USE_NV_BINDING and not pynvjitlink_auto_enabled:
- warnings.warn(
- "Explicitly enabling pynvjitlink is no longer necessary. "
- "NVIDIA bindings are enabled. cuda.core will be used "
- "in place of pynvjitlink."
- )
- elif pynvjitlink_auto_enabled:
- # Ignore the fact that pynvjitlink is enabled, because that was an
- # automatic decision based on discovering pynvjitlink was present; the
- # user didn't ask for it
- pass
- else:
- raise RuntimeError("nvJitLink requires the NVIDIA CUDA bindings. ")
+# Require NVIDIA CUDA bindings at import time
+if not (
+ importlib.util.find_spec("cuda")
+ and importlib.util.find_spec("cuda.bindings")
+):
+ raise ImportError(
+ "NVIDIA CUDA Python bindings not found. Install the 'cuda' package "
+ "(e.g. pip install nvidia-cuda-python or numba-cuda[cuXY])."
+ )
if config.ENABLE_CUDASIM:
from .simulator_init import *
diff --git a/numba_cuda/numba/cuda/api.py b/numba_cuda/numba/cuda/api.py
index 9fe8cd7e..dbb2d94c 100644
--- a/numba_cuda/numba/cuda/api.py
+++ b/numba_cuda/numba/cuda/api.py
@@ -8,7 +8,6 @@
import contextlib
import os
-from numba.cuda.cudadrv import drvapi
import numpy as np
from .cudadrv import devicearray, devices, driver
@@ -48,10 +47,7 @@ def from_cuda_array_interface(desc, owner=None, sync=True):
)
size = driver.memory_size_from_info(shape, strides, dtype.itemsize)
- if config.CUDA_USE_NVIDIA_BINDING:
- cudevptr_class = driver.binding.CUdeviceptr
- else:
- cudevptr_class = drvapi.cu_device_ptr
+ cudevptr_class = driver.binding.CUdeviceptr
devptr = cudevptr_class(desc["data"][0])
data = driver.MemoryPointer(
current_context(), devptr, size=size, owner=owner
diff --git a/numba_cuda/numba/cuda/codegen.py b/numba_cuda/numba/cuda/codegen.py
index c40253c1..3ad26b36 100644
--- a/numba_cuda/numba/cuda/codegen.py
+++ b/numba_cuda/numba/cuda/codegen.py
@@ -26,10 +26,7 @@ def run_nvdisasm(cubin, flags):
try:
fd, fname = tempfile.mkstemp()
with open(fname, "wb") as f:
- if config.CUDA_USE_NVIDIA_BINDING:
- f.write(cubin.code)
- else:
- f.write(cubin)
+ f.write(cubin.code)
try:
cp = subprocess.run(
diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
index d0e863be..1337d77e 100644
--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
@@ -37,8 +37,6 @@
c_char_p,
addressof,
c_void_p,
- c_float,
- c_uint,
c_uint8,
)
import contextlib
@@ -52,26 +50,27 @@
from numba.cuda import utils, serialize
from .error import CudaSupportError, CudaDriverError
from .drvapi import API_PROTOTYPES
-from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj, cu_uuid
+from .drvapi import cu_occupancy_b2d_size, cu_stream_callback_pyobj
from .mappings import FILE_EXTENSION_MAP
from .linkable_code import LinkableCode, LTOIR, Fatbin, Object
from numba.cuda.utils import cached_file_read
from numba.cuda.cudadrv import enums, drvapi, nvrtc
-USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING
+from cuda.bindings import driver as binding
+from cuda.core.experimental import (
+ Linker,
+ LinkerOptions,
+ ObjectCode,
+)
-if USE_NV_BINDING:
- from cuda.bindings import driver as binding
- from cuda.core.experimental import (
- Linker,
- LinkerOptions,
- ObjectCode,
- )
+# For backwards compatibility: indicate that the NVIDIA CUDA Python bindings are
+# in use. Older code checks this flag to branch on binding-specific behavior.
+USE_NV_BINDING = True
- # There is no definition of the default stream in the Nvidia bindings (nor
- # is there at the C/C++ level), so we define it here so we don't need to
- # use a magic number 0 in places where we want the default stream.
- CU_STREAM_DEFAULT = 0
+# There is no definition of the default stream in the Nvidia bindings (nor
+# is there at the C/C++ level), so we define it here so we don't need to
+# use a magic number 0 in places where we want the default stream.
+CU_STREAM_DEFAULT = 0
MIN_REQUIRED_CC = (3, 5)
@@ -83,16 +82,6 @@
_py_decref.argtypes = [ctypes.py_object]
_py_incref.argtypes = [ctypes.py_object]
-USE_NV_BINDING = config.CUDA_USE_NVIDIA_BINDING
-
-if USE_NV_BINDING:
- from cuda.bindings import driver as binding
-
- # There is no definition of the default stream in the Nvidia bindings (nor
- # is there at the C/C++ level), so we define it here so we don't need to
- # use a magic number 0 in places where we want the default stream.
- CU_STREAM_DEFAULT = 0
-
def make_logger():
logger = logging.getLogger(__name__)
@@ -120,20 +109,27 @@ def make_logger():
@functools.cache
def _have_nvjitlink():
- if not USE_NV_BINDING:
- return False
try:
from cuda.bindings._internal import nvjitlink as nvjitlink_internal
from cuda.bindings._internal.utils import NotSupportedError
except ImportError:
return False
+
try:
- return (
+ if (
nvjitlink_internal._inspect_function_pointer("__nvJitLinkVersion")
- != 0
- )
+ == 0
+ ):
+ return False
+ try:
+ from cuda.bindings import nvjitlink
+
+ if nvjitlink.version() < (12, 3):
+ return False
+ except Exception:
+ return False
+ return True
except (RuntimeError, NotSupportedError):
- # no driver
return False
@@ -312,10 +308,7 @@ def __getattr__(self, fname):
"Error at driver init: \n%s:" % self.initialization_error
)
- if USE_NV_BINDING:
- return self._cuda_python_wrap_fn(fname)
- else:
- return self._ctypes_wrap_fn(fname)
+ return self._cuda_python_wrap_fn(fname)
def _ctypes_wrap_fn(self, fname, libfn=None):
# Wrap a CUDA driver function by default
@@ -375,12 +368,8 @@ def safe_cuda_api_call(*args):
def _find_api(self, fname):
# We use alternatively-named functions for PTDS with the Numba ctypes
- # binding. For the NVidia binding, it handles linking to the correct
- # variant.
- if config.CUDA_PER_THREAD_DEFAULT_STREAM and not USE_NV_BINDING:
- variants = ("_v2_ptds", "_v2_ptsz", "_ptds", "_ptsz", "_v2", "")
- else:
- variants = ("_v2", "")
+ # binding only. The NVIDIA binding links to the correct variant itself.
+ variants = ("_v2", "")
if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
return getattr(self.lib, fname)
@@ -437,12 +426,7 @@ def get_device(self, devnum=0):
return weakref.proxy(dev)
def get_device_count(self):
- if USE_NV_BINDING:
- return self.cuDeviceGetCount()
-
- count = c_int()
- self.cuDeviceGetCount(byref(count))
- return count.value
+ return self.cuDeviceGetCount()
def list_devices(self):
"""Returns a list of active devices"""
@@ -459,11 +443,7 @@ def pop_active_context(self):
"""
with self.get_active_context() as ac:
if ac.devnum is not None:
- if USE_NV_BINDING:
- popped = drvapi.cu_context(int(driver.cuCtxPopCurrent()))
- else:
- popped = drvapi.cu_context()
- driver.cuCtxPopCurrent(byref(popped))
+ popped = drvapi.cu_context(int(driver.cuCtxPopCurrent()))
return popped
def get_active_context(self):
@@ -474,14 +454,8 @@ def get_version(self):
"""
Returns the CUDA Driver version as a tuple (major, minor).
"""
- if USE_NV_BINDING:
- version = driver.cuDriverGetVersion()
- else:
- dv = ctypes.c_int(0)
- driver.cuDriverGetVersion(ctypes.byref(dv))
- version = dv.value
-
# The version is encoded as (1000 * major) + (10 * minor)
+ version = driver.cuDriverGetVersion()
major = version // 1000
minor = (version - (major * 1000)) // 10
return (major, minor)
@@ -504,26 +478,16 @@ def __enter__(self):
hctx, devnum = self._tls_cache.ctx_devnum
# Not cached. Query the driver API.
else:
- if USE_NV_BINDING:
- hctx = driver.cuCtxGetCurrent()
- if int(hctx) == 0:
- hctx = None
- else:
- hctx = drvapi.cu_context(int(hctx))
+ hctx = driver.cuCtxGetCurrent()
+ if int(hctx) == 0:
+ hctx = None
else:
- hctx = drvapi.cu_context(0)
- driver.cuCtxGetCurrent(byref(hctx))
- hctx = hctx if hctx.value else None
+ hctx = drvapi.cu_context(int(hctx))
if hctx is None:
devnum = None
else:
- if USE_NV_BINDING:
- devnum = int(driver.cuCtxGetDevice())
- else:
- hdevice = drvapi.cu_device()
- driver.cuCtxGetDevice(byref(hdevice))
- devnum = hdevice.value
+ devnum = int(driver.cuCtxGetDevice())
self._tls_cache.ctx_devnum = (hctx, devnum)
is_top = True
@@ -582,15 +546,9 @@ def from_identity(self, identity):
raise RuntimeError(errmsg)
def __init__(self, devnum):
- if USE_NV_BINDING:
- result = driver.cuDeviceGet(devnum)
- self.id = result
- got_devnum = int(result)
- else:
- result = c_int()
- driver.cuDeviceGet(byref(result), devnum)
- got_devnum = result.value
- self.id = got_devnum
+ result = driver.cuDeviceGet(devnum)
+ self.id = result
+ got_devnum = int(result)
msg = f"Driver returned device {got_devnum} instead of {devnum}"
if devnum != got_devnum:
@@ -606,25 +564,14 @@ def __init__(self, devnum):
# Read name
bufsz = 128
-
- if USE_NV_BINDING:
- buf = driver.cuDeviceGetName(bufsz, self.id)
- name = buf.split(b"\x00")[0]
- else:
- buf = (c_char * bufsz)()
- driver.cuDeviceGetName(buf, bufsz, self.id)
- name = buf.value
+ buf = driver.cuDeviceGetName(bufsz, self.id)
+ name = buf.split(b"\x00")[0]
self.name = name
# Read UUID
- if USE_NV_BINDING:
- uuid = driver.cuDeviceGetUuid(self.id)
- uuid_vals = tuple(uuid.bytes)
- else:
- uuid = cu_uuid()
- driver.cuDeviceGetUuid(byref(uuid), self.id)
- uuid_vals = tuple(bytes(uuid))
+ uuid = driver.cuDeviceGetUuid(self.id)
+ uuid_vals = tuple(uuid.bytes)
b = "%02x"
b2 = b * 2
@@ -647,20 +594,10 @@ def __repr__(self):
def __getattr__(self, attr):
"""Read attributes lazily"""
- if USE_NV_BINDING:
- code = getattr(
- binding.CUdevice_attribute, f"CU_DEVICE_ATTRIBUTE_{attr}"
- )
- value = driver.cuDeviceGetAttribute(code, self.id)
- else:
- try:
- code = DEVICE_ATTRIBUTES[attr]
- except KeyError:
- raise AttributeError(attr)
-
- result = c_int()
- driver.cuDeviceGetAttribute(byref(result), code, self.id)
- value = result.value
+ code = getattr(
+ binding.CUdevice_attribute, f"CU_DEVICE_ATTRIBUTE_{attr}"
+ )
+ value = driver.cuDeviceGetAttribute(code, self.id)
setattr(self, attr, value)
return value
@@ -686,12 +623,8 @@ def get_primary_context(self):
met_requirement_for_device(self)
# create primary context
- if USE_NV_BINDING:
- hctx = driver.cuDevicePrimaryCtxRetain(self.id)
- hctx = drvapi.cu_context(int(hctx))
- else:
- hctx = drvapi.cu_context()
- driver.cuDevicePrimaryCtxRetain(byref(hctx), self.id)
+ hctx = driver.cuDevicePrimaryCtxRetain(self.id)
+ hctx = drvapi.cu_context(int(hctx))
ctx = Context(weakref.proxy(self), hctx)
self.primary_context = ctx
@@ -879,11 +812,7 @@ def _attempt_allocation(self, allocator):
return allocator()
except CudaAPIError as e:
# is out-of-memory?
- if USE_NV_BINDING:
- oom_code = binding.CUresult.CUDA_ERROR_OUT_OF_MEMORY
- else:
- oom_code = enums.CUDA_ERROR_OUT_OF_MEMORY
-
+ oom_code = binding.CUresult.CUDA_ERROR_OUT_OF_MEMORY
if e.code == oom_code:
# clear pending deallocations
self.deallocations.clear()
@@ -906,29 +835,15 @@ def memhostalloc(self, size, mapped=False, portable=False, wc=False):
if wc:
flags |= enums.CU_MEMHOSTALLOC_WRITECOMBINED
- if USE_NV_BINDING:
-
- def allocator():
- return driver.cuMemHostAlloc(size, flags)
-
- if mapped:
- pointer = self._attempt_allocation(allocator)
- else:
- pointer = allocator()
+ def allocator():
+ return driver.cuMemHostAlloc(size, flags)
- alloc_key = pointer
+ if mapped:
+ pointer = self._attempt_allocation(allocator)
else:
- pointer = c_void_p()
+ pointer = allocator()
- def allocator():
- driver.cuMemHostAlloc(byref(pointer), size, flags)
-
- if mapped:
- self._attempt_allocation(allocator)
- else:
- allocator()
-
- alloc_key = pointer.value
+ alloc_key = pointer
finalizer = _hostalloc_finalizer(self, pointer, alloc_key, size, mapped)
ctx = weakref.proxy(self.context)
@@ -946,13 +861,7 @@ def mempin(self, owner, pointer, size, mapped=False):
It is recommended that this method is not overridden by EMM Plugin
implementations - instead, use the :class:`BaseCUDAMemoryManager`.
"""
- if isinstance(pointer, int) and not USE_NV_BINDING:
- pointer = c_void_p(pointer)
-
- if USE_NV_BINDING:
- alloc_key = pointer
- else:
- alloc_key = pointer.value
+ alloc_key = pointer
# possible flags are "portable" (between context)
# and "device-map" (map host memory to device thus no need
@@ -985,37 +894,19 @@ def allocator():
)
def memallocmanaged(self, size, attach_global):
- if USE_NV_BINDING:
-
- def allocator():
- ma_flags = binding.CUmemAttach_flags
-
- if attach_global:
- flags = ma_flags.CU_MEM_ATTACH_GLOBAL.value
- else:
- flags = ma_flags.CU_MEM_ATTACH_HOST.value
-
- return driver.cuMemAllocManaged(size, flags)
-
- ptr = self._attempt_allocation(allocator)
-
- alloc_key = ptr
-
- else:
- ptr = drvapi.cu_device_ptr()
+ def allocator():
+ ma_flags = binding.CUmemAttach_flags
- def allocator():
- flags = c_uint()
- if attach_global:
- flags = enums.CU_MEM_ATTACH_GLOBAL
- else:
- flags = enums.CU_MEM_ATTACH_HOST
+ if attach_global:
+ flags = ma_flags.CU_MEM_ATTACH_GLOBAL.value
+ else:
+ flags = ma_flags.CU_MEM_ATTACH_HOST.value
- driver.cuMemAllocManaged(byref(ptr), size, flags)
+ return driver.cuMemAllocManaged(size, flags)
- self._attempt_allocation(allocator)
+ ptr = self._attempt_allocation(allocator)
- alloc_key = ptr.value
+ alloc_key = ptr
finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
ctx = weakref.proxy(self.context)
@@ -1055,13 +946,8 @@ def get_ipc_handle(self, memory):
populated with the underlying ``ipc_mem_handle``.
"""
base, end = device_extents(memory)
- if USE_NV_BINDING:
- ipchandle = driver.cuIpcGetMemHandle(base)
- offset = int(memory.handle) - int(base)
- else:
- ipchandle = drvapi.cu_ipc_mem_handle()
- driver.cuIpcGetMemHandle(byref(ipchandle), base)
- offset = memory.handle.value - base
+ ipchandle = driver.cuIpcGetMemHandle(base)
+ offset = int(memory.handle) - int(base)
source_info = self.context.device.get_device_identity()
return IpcHandle(
@@ -1080,21 +966,11 @@ def initialize(self):
self.deallocations.memory_capacity = self.get_memory_info().total
def memalloc(self, size):
- if USE_NV_BINDING:
-
- def allocator():
- return driver.cuMemAlloc(size)
-
- ptr = self._attempt_allocation(allocator)
- alloc_key = ptr
- else:
- ptr = drvapi.cu_device_ptr()
-
- def allocator():
- driver.cuMemAlloc(byref(ptr), size)
+ def allocator():
+ return driver.cuMemAlloc(size)
- self._attempt_allocation(allocator)
- alloc_key = ptr.value
+ ptr = self._attempt_allocation(allocator)
+ alloc_key = ptr
finalizer = _alloc_finalizer(self, ptr, alloc_key, size)
ctx = weakref.proxy(self.context)
@@ -1103,15 +979,7 @@ def allocator():
return mem.own()
def get_memory_info(self):
- if USE_NV_BINDING:
- free, total = driver.cuMemGetInfo()
- else:
- free = c_size_t()
- total = c_size_t()
- driver.cuMemGetInfo(byref(free), byref(total))
- free = free.value
- total = total.value
-
+ free, total = driver.cuMemGetInfo()
return MemoryInfo(free=free, total=total)
@property
@@ -1309,10 +1177,7 @@ def get_active_blocks_per_multiprocessor(
:param memsize: per-block dynamic shared memory usage intended, in bytes
"""
args = (func, blocksize, memsize, flags)
- if USE_NV_BINDING:
- return self._cuda_python_active_blocks_per_multiprocessor(*args)
- else:
- return self._ctypes_active_blocks_per_multiprocessor(*args)
+ return self._cuda_python_active_blocks_per_multiprocessor(*args)
def _cuda_python_active_blocks_per_multiprocessor(
self, func, blocksize, memsize, flags
@@ -1352,10 +1217,7 @@ def get_max_potential_block_size(
handle
"""
args = (func, b2d_func, memsize, blocksizelimit, flags)
- if USE_NV_BINDING:
- return self._cuda_python_max_potential_block_size(*args)
- else:
- return self._ctypes_max_potential_block_size(*args)
+ return self._cuda_python_max_potential_block_size(*args)
def _ctypes_max_potential_block_size(
self, func, b2d_func, memsize, blocksizelimit, flags
@@ -1404,10 +1266,7 @@ def push(self):
"""
Pushes this context on the current CPU Thread.
"""
- if USE_NV_BINDING:
- driver.cuCtxPushCurrent(self.handle.value)
- else:
- driver.cuCtxPushCurrent(self.handle)
+ driver.cuCtxPushCurrent(self.handle.value)
self.prepare_for_use()
def pop(self):
@@ -1443,11 +1302,7 @@ def get_ipc_handle(self, memory):
def open_ipc_handle(self, handle, size):
# open the IPC handle to get the device pointer
flags = 1 # CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS
- if USE_NV_BINDING:
- dptr = driver.cuIpcOpenMemHandle(handle, flags)
- else:
- dptr = drvapi.cu_device_ptr()
- driver.cuIpcOpenMemHandle(byref(dptr), handle, flags)
+ dptr = driver.cuIpcOpenMemHandle(handle, flags)
# wrap it
return MemoryPointer(
@@ -1463,28 +1318,17 @@ def can_access_peer(self, peer_device):
"""Returns a bool indicating whether the peer access between the
current and peer device is possible.
"""
- if USE_NV_BINDING:
- peer_device = binding.CUdevice(peer_device)
- can_access_peer = driver.cuDeviceCanAccessPeer(
- self.device.id, peer_device
- )
- else:
- can_access_peer = c_int()
- driver.cuDeviceCanAccessPeer(
- byref(can_access_peer),
- self.device.id,
- peer_device,
- )
+ peer_device = binding.CUdevice(peer_device)
+ can_access_peer = driver.cuDeviceCanAccessPeer(
+ self.device.id, peer_device
+ )
return bool(can_access_peer)
def create_module_ptx(self, ptx):
if isinstance(ptx, str):
ptx = ptx.encode("utf8")
- if USE_NV_BINDING:
- image = ObjectCode.from_ptx(ptx)
- else:
- image = c_char_p(ptx)
+ image = ObjectCode.from_ptx(ptx)
return self.create_module_image(image)
def create_module_image(
@@ -1493,56 +1337,37 @@ def create_module_image(
module = load_module_image(
self, image, setup_callbacks, teardown_callbacks
)
- if USE_NV_BINDING:
- key = module.handle
- else:
- key = module.handle.value
+ key = module.handle
self.modules[key] = module
return weakref.proxy(module)
def unload_module(self, module):
- if USE_NV_BINDING:
- key = module.handle
- else:
- key = module.handle.value
+ key = module.handle
del self.modules[key]
def get_default_stream(self):
- if USE_NV_BINDING:
- handle = drvapi.cu_stream(int(binding.CUstream(CU_STREAM_DEFAULT)))
- else:
- handle = drvapi.cu_stream(drvapi.CU_STREAM_DEFAULT)
+ handle = drvapi.cu_stream(int(binding.CUstream(CU_STREAM_DEFAULT)))
return Stream(weakref.proxy(self), handle, None)
def get_legacy_default_stream(self):
- if USE_NV_BINDING:
- handle = drvapi.cu_stream(
- int(binding.CUstream(binding.CU_STREAM_LEGACY))
- )
- else:
- handle = drvapi.cu_stream(drvapi.CU_STREAM_LEGACY)
+ handle = drvapi.cu_stream(
+ int(binding.CUstream(binding.CU_STREAM_LEGACY))
+ )
return Stream(weakref.proxy(self), handle, None)
def get_per_thread_default_stream(self):
- if USE_NV_BINDING:
- handle = drvapi.cu_stream(
- int(binding.CUstream(binding.CU_STREAM_PER_THREAD))
- )
- else:
- handle = drvapi.cu_stream(drvapi.CU_STREAM_PER_THREAD)
+ handle = drvapi.cu_stream(
+ int(binding.CUstream(binding.CU_STREAM_PER_THREAD))
+ )
return Stream(weakref.proxy(self), handle, None)
def create_stream(self):
- if USE_NV_BINDING:
- # The default stream creation flag, specifying that the created
- # stream synchronizes with stream 0 (this is different from the
- # default stream, which we define also as CU_STREAM_DEFAULT when
- # the NV binding is in use).
- flags = binding.CUstream_flags.CU_STREAM_DEFAULT.value
- handle = drvapi.cu_stream(int(driver.cuStreamCreate(flags)))
- else:
- handle = drvapi.cu_stream()
- driver.cuStreamCreate(byref(handle), 0)
+ # The default stream creation flag, specifying that the created
+ # stream synchronizes with stream 0 (this is different from the
+ # default stream, which we also define as CU_STREAM_DEFAULT in this
+ # module).
+ flags = binding.CUstream_flags.CU_STREAM_DEFAULT.value
+ handle = drvapi.cu_stream(int(driver.cuStreamCreate(flags)))
return Stream(
weakref.proxy(self),
handle,
@@ -1552,21 +1377,14 @@ def create_stream(self):
def create_external_stream(self, ptr):
if not isinstance(ptr, int):
raise TypeError("ptr for external stream must be an int")
- if USE_NV_BINDING:
- handle = drvapi.cu_stream(int(binding.CUstream(ptr)))
- else:
- handle = drvapi.cu_stream(ptr)
+ handle = drvapi.cu_stream(int(binding.CUstream(ptr)))
return Stream(weakref.proxy(self), handle, None, external=True)
def create_event(self, timing=True):
flags = 0
if not timing:
flags |= enums.CU_EVENT_DISABLE_TIMING
- if USE_NV_BINDING:
- handle = drvapi.cu_event(int(driver.cuEventCreate(flags)))
- else:
- handle = drvapi.cu_event()
- driver.cuEventCreate(byref(handle), flags)
+ handle = drvapi.cu_event(int(driver.cuEventCreate(flags)))
return Event(
weakref.proxy(self),
handle,
@@ -1601,14 +1419,9 @@ def load_module_image(
"""
image must be a pointer
"""
- if USE_NV_BINDING:
- return load_module_image_cuda_python(
- context, image, setup_callbacks, teardown_callbacks
- )
- else:
- return load_module_image_ctypes(
- context, image, setup_callbacks, teardown_callbacks
- )
+ return load_module_image_cuda_python(
+ context, image, setup_callbacks, teardown_callbacks
+ )
def load_module_image_ctypes(
@@ -1769,11 +1582,7 @@ def core():
def _module_finalizer(context, handle):
dealloc = context.deallocations
modules = context.modules
-
- if USE_NV_BINDING:
- key = handle
- else:
- key = handle.value
+ key = handle
def core():
shutting_down = utils.shutting_down # early bind
@@ -1843,10 +1652,7 @@ def open(self, context):
from numba import cuda
srcdev = Device.from_identity(self.source_info)
- if USE_NV_BINDING:
- srcdev_id = int(srcdev.id)
- else:
- srcdev_id = srcdev.id
+ srcdev_id = int(srcdev.id)
impl = _CudaIpcImpl(parent=self.parent)
# Open context on the source device.
@@ -1968,10 +1774,7 @@ def close(self):
def __reduce__(self):
# Preprocess the IPC handle, which is defined as a byte array.
- if USE_NV_BINDING:
- preprocessed_handle = self.handle.reserved
- else:
- preprocessed_handle = tuple(self.handle.reserved)
+ preprocessed_handle = self.handle.reserved
args = (
self.__class__,
preprocessed_handle,
@@ -1983,10 +1786,7 @@ def __reduce__(self):
@classmethod
def _rebuild(cls, handle_ary, size, source_info, offset):
- if USE_NV_BINDING:
- handle = binding.CUipcMemHandle()
- else:
- handle = drvapi.cu_ipc_mem_handle()
+ handle = binding.CUipcMemHandle()
handle.reserved = handle_ary
return cls(
base=None,
@@ -2032,7 +1832,7 @@ class MemoryPointer(object):
__cuda_memory__ = True
def __init__(self, context, pointer, size, owner=None, finalizer=None):
- if USE_NV_BINDING and isinstance(pointer, ctypes.c_void_p):
+ if isinstance(pointer, ctypes.c_void_p):
pointer = binding.CUdeviceptr(pointer.value)
self.context = context
@@ -2067,10 +1867,7 @@ def free(self):
def memset(self, byte, count=None, stream=0):
count = self.size if count is None else count
if stream:
- if USE_NV_BINDING:
- handle = stream.handle.value
- else:
- handle = stream.handle
+ handle = stream.handle.value
driver.cuMemsetD8Async(self.device_pointer, byte, count, handle)
else:
driver.cuMemsetD8(self.device_pointer, byte, count)
@@ -2091,12 +1888,9 @@ def view(self, start, stop=None):
base = self.device_pointer_value + start
if size < 0:
raise RuntimeError("size cannot be negative")
- if USE_NV_BINDING:
- pointer = binding.CUdeviceptr()
- ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr())
- ctypes_ptr.value = base
- else:
- pointer = drvapi.cu_device_ptr(base)
+ pointer = binding.CUdeviceptr()
+ ctypes_ptr = drvapi.cu_device_ptr.from_address(pointer.getPtr())
+ ctypes_ptr.value = base
view = MemoryPointer(self.context, pointer, size, owner=self.owner)
if isinstance(self.owner, (MemoryPointer, OwnedPointer)):
@@ -2108,16 +1902,11 @@ def view(self, start, stop=None):
@property
def device_ctypes_pointer(self):
- if USE_NV_BINDING:
- return drvapi.cu_device_ptr(int(self.device_pointer))
- return self.device_pointer
+ return drvapi.cu_device_ptr(int(self.device_pointer))
@property
def device_pointer_value(self):
- if USE_NV_BINDING:
- return int(self.device_pointer) or None
- else:
- return self.device_pointer.value
+ return int(self.device_pointer) or None
class AutoFreePointer(MemoryPointer):
@@ -2162,13 +1951,8 @@ def __init__(self, context, pointer, size, owner=None, finalizer=None):
self.owned = owner
self.host_pointer = pointer
- if USE_NV_BINDING:
- devptr = driver.cuMemHostGetDevicePointer(pointer, 0)
- self._bufptr_ = self.host_pointer
- else:
- devptr = drvapi.cu_device_ptr()
- driver.cuMemHostGetDevicePointer(byref(devptr), pointer, 0)
- self._bufptr_ = self.host_pointer.value
+ devptr = driver.cuMemHostGetDevicePointer(pointer, 0)
+ self._bufptr_ = self.host_pointer
self.device_pointer = devptr
super(MappedMemory, self).__init__(
@@ -2212,10 +1996,7 @@ def __init__(self, context, pointer, size, owner=None, finalizer=None):
# For buffer interface
self._buflen_ = self.size
- if USE_NV_BINDING:
- self._bufptr_ = self.host_pointer
- else:
- self._bufptr_ = self.host_pointer.value
+ self._bufptr_ = self.host_pointer
if finalizer is not None:
weakref.finalize(self, finalizer)
@@ -2253,10 +2034,7 @@ def __init__(self, context, pointer, size, owner=None, finalizer=None):
# For buffer interface
self._buflen_ = self.size
- if USE_NV_BINDING:
- self._bufptr_ = self.device_pointer
- else:
- self._bufptr_ = self.device_pointer.value
+ self._bufptr_ = self.device_pointer
def own(self):
return ManagedOwnedPointer(weakref.proxy(self))
@@ -2332,10 +2110,7 @@ def synchronize(self):
Wait for all commands in this stream to execute. This will commit any
pending memory transfers.
"""
- if USE_NV_BINDING:
- handle = self.handle.value
- else:
- handle = self.handle
+ handle = self.handle.value
driver.cuStreamSynchronize(handle)
@contextlib.contextmanager
@@ -2379,15 +2154,11 @@ def add_callback(self, callback, arg=None):
"""
data = (self, callback, arg)
_py_incref(data)
- if USE_NV_BINDING:
- ptr = int.from_bytes(self._stream_callback, byteorder="little")
- stream_callback = binding.CUstreamCallback(ptr)
- # The callback needs to receive a pointer to the data PyObject
- data = id(data)
- handle = self.handle.value
- else:
- stream_callback = self._stream_callback
- handle = self.handle
+ ptr = int.from_bytes(self._stream_callback, byteorder="little")
+ stream_callback = binding.CUstreamCallback(ptr)
+ # The callback needs to receive a pointer to the data PyObject
+ data = id(data)
+ handle = self.handle.value
driver.cuStreamAddCallback(handle, stream_callback, data, 0)
@staticmethod
@@ -2465,34 +2236,23 @@ def record(self, stream=0):
queued in the stream at the time of the call to ``record()`` has been
completed.
"""
- if USE_NV_BINDING:
- hstream = stream.handle.value if stream else binding.CUstream(0)
- handle = self.handle.value
- else:
- hstream = stream.handle if stream else 0
- handle = self.handle
+ hstream = stream.handle.value if stream else binding.CUstream(0)
+ handle = self.handle.value
driver.cuEventRecord(handle, hstream)
def synchronize(self):
"""
Synchronize the host thread for the completion of the event.
"""
- if USE_NV_BINDING:
- handle = self.handle.value
- else:
- handle = self.handle
+ handle = self.handle.value
driver.cuEventSynchronize(handle)
def wait(self, stream=0):
"""
All future works submitted to stream will wait util the event completes.
"""
- if USE_NV_BINDING:
- hstream = stream.handle.value if stream else binding.CUstream(0)
- handle = self.handle.value
- else:
- hstream = stream.handle if stream else 0
- handle = self.handle
+ hstream = stream.handle.value if stream else binding.CUstream(0)
+ handle = self.handle.value
flags = 0
driver.cuStreamWaitEvent(hstream, handle, flags)
@@ -2504,14 +2264,7 @@ def event_elapsed_time(evtstart, evtend):
"""
Compute the elapsed time between two events in milliseconds.
"""
- if USE_NV_BINDING:
- return driver.cuEventElapsedTime(
- evtstart.handle.value, evtend.handle.value
- )
- else:
- msec = c_float()
- driver.cuEventElapsedTime(byref(msec), evtstart.handle, evtend.handle)
- return msec.value
+ return driver.cuEventElapsedTime(evtstart.handle.value, evtend.handle.value)
class Module(metaclass=ABCMeta):
@@ -2729,12 +2482,8 @@ def launch_kernel(
param_ptrs = [addressof(arg) for arg in args]
params = (c_void_p * len(param_ptrs))(*param_ptrs)
- if USE_NV_BINDING:
- params_for_launch = addressof(params)
- extra = 0
- else:
- params_for_launch = params
- extra = None
+ params_for_launch = addressof(params)
+ extra = 0
if cooperative:
driver.cuLaunchCooperativeKernel(
@@ -2777,10 +2526,7 @@ def new(
lto=None,
additional_flags=None,
):
- if USE_NV_BINDING:
- linker = _Linker
- else:
- linker = CtypesLinker
+ linker = _Linker
params = (max_registers, lineinfo, cc)
if linker is _Linker:
@@ -3217,21 +2963,12 @@ def get_devptr_for_active_ctx(ptr):
pointer.
"""
if ptr != 0:
- if USE_NV_BINDING:
- ptr_attrs = binding.CUpointer_attribute
- attr = ptr_attrs.CU_POINTER_ATTRIBUTE_DEVICE_POINTER
- ptrobj = binding.CUdeviceptr(ptr)
- return driver.cuPointerGetAttribute(attr, ptrobj)
- else:
- devptr = drvapi.cu_device_ptr()
- attr = enums.CU_POINTER_ATTRIBUTE_DEVICE_POINTER
- driver.cuPointerGetAttribute(byref(devptr), attr, ptr)
- return devptr
+ ptr_attrs = binding.CUpointer_attribute
+ attr = ptr_attrs.CU_POINTER_ATTRIBUTE_DEVICE_POINTER
+ ptrobj = binding.CUdeviceptr(ptr)
+ return driver.cuPointerGetAttribute(attr, ptrobj)
else:
- if USE_NV_BINDING:
- return binding.CUdeviceptr()
- else:
- return drvapi.cu_device_ptr()
+ return binding.CUdeviceptr()
def device_extents(devmem):
@@ -3242,15 +2979,8 @@ def device_extents(devmem):
of the device memory view that can be a subsection of the entire allocation.
"""
devptr = device_ctypes_pointer(devmem)
- if USE_NV_BINDING:
- s, n = driver.cuMemGetAddressRange(devptr.value)
- return int(s), int(binding.CUdeviceptr(int(s) + n))
- else:
- s = drvapi.cu_device_ptr()
- n = c_size_t()
- driver.cuMemGetAddressRange(byref(s), byref(n), devptr)
- s, n = s.value, n.value
- return s, s + n
+ s, n = driver.cuMemGetAddressRange(devptr.value)
+ return int(s), int(binding.CUdeviceptr(int(s) + n))
def device_memory_size(devmem):
@@ -3376,10 +3106,7 @@ def host_to_device(dst, src, size, stream=0):
if stream:
assert isinstance(stream, Stream)
fn = driver.cuMemcpyHtoDAsync
- if USE_NV_BINDING:
- handle = stream.handle.value
- else:
- handle = stream.handle
+ handle = stream.handle.value
varargs.append(handle)
else:
fn = driver.cuMemcpyHtoD
@@ -3398,10 +3125,7 @@ def device_to_host(dst, src, size, stream=0):
if stream:
assert isinstance(stream, Stream)
fn = driver.cuMemcpyDtoHAsync
- if USE_NV_BINDING:
- handle = stream.handle.value
- else:
- handle = stream.handle
+ handle = stream.handle.value
varargs.append(handle)
else:
fn = driver.cuMemcpyDtoH
@@ -3420,10 +3144,7 @@ def device_to_device(dst, src, size, stream=0):
if stream:
assert isinstance(stream, Stream)
fn = driver.cuMemcpyDtoDAsync
- if USE_NV_BINDING:
- handle = stream.handle.value
- else:
- handle = stream.handle
+ handle = stream.handle.value
varargs.append(handle)
else:
fn = driver.cuMemcpyDtoD
@@ -3447,10 +3168,7 @@ def device_memset(dst, val, size, stream=0):
if stream:
assert isinstance(stream, Stream)
fn = driver.cuMemsetD8Async
- if USE_NV_BINDING:
- handle = stream.handle.value
- else:
- handle = stream.handle
+ handle = stream.handle.value
varargs.append(handle)
else:
fn = driver.cuMemsetD8
diff --git a/numba_cuda/numba/cuda/cudadrv/mappings.py b/numba_cuda/numba/cuda/cudadrv/mappings.py
index 5c45299d..d74fe6e4 100644
--- a/numba_cuda/numba/cuda/cudadrv/mappings.py
+++ b/numba_cuda/numba/cuda/cudadrv/mappings.py
@@ -1,28 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-2-Clause
-from numba.cuda import config
-from . import enums
+from cuda.bindings.driver import CUjitInputType
-if config.CUDA_USE_NVIDIA_BINDING:
- from cuda.bindings.driver import CUjitInputType
-
- FILE_EXTENSION_MAP = {
- "o": CUjitInputType.CU_JIT_INPUT_OBJECT,
- "ptx": CUjitInputType.CU_JIT_INPUT_PTX,
- "a": CUjitInputType.CU_JIT_INPUT_LIBRARY,
- "lib": CUjitInputType.CU_JIT_INPUT_LIBRARY,
- "cubin": CUjitInputType.CU_JIT_INPUT_CUBIN,
- "fatbin": CUjitInputType.CU_JIT_INPUT_FATBINARY,
- "ltoir": CUjitInputType.CU_JIT_INPUT_NVVM,
- }
-else:
- FILE_EXTENSION_MAP = {
- "o": enums.CU_JIT_INPUT_OBJECT,
- "ptx": enums.CU_JIT_INPUT_PTX,
- "a": enums.CU_JIT_INPUT_LIBRARY,
- "lib": enums.CU_JIT_INPUT_LIBRARY,
- "cubin": enums.CU_JIT_INPUT_CUBIN,
- "fatbin": enums.CU_JIT_INPUT_FATBINARY,
- "ltoir": enums.CU_JIT_INPUT_NVVM,
- }
+FILE_EXTENSION_MAP = {
+ "o": CUjitInputType.CU_JIT_INPUT_OBJECT,
+ "ptx": CUjitInputType.CU_JIT_INPUT_PTX,
+ "a": CUjitInputType.CU_JIT_INPUT_LIBRARY,
+ "lib": CUjitInputType.CU_JIT_INPUT_LIBRARY,
+ "cubin": CUjitInputType.CU_JIT_INPUT_CUBIN,
+ "fatbin": CUjitInputType.CU_JIT_INPUT_FATBINARY,
+ "ltoir": CUjitInputType.CU_JIT_INPUT_NVVM,
+}
diff --git a/numba_cuda/numba/cuda/cudadrv/nvrtc.py b/numba_cuda/numba/cuda/cudadrv/nvrtc.py
index cef0b576..8d861fcc 100644
--- a/numba_cuda/numba/cuda/cudadrv/nvrtc.py
+++ b/numba_cuda/numba/cuda/cudadrv/nvrtc.py
@@ -1,23 +1,19 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: BSD-2-Clause
-from ctypes import byref, c_char, c_char_p, c_int, c_size_t, c_void_p, POINTER
-from enum import IntEnum
from numba.cuda.cudadrv.error import (
CCSupportError,
- NvrtcError,
- NvrtcBuiltinOperationFailure,
- NvrtcCompilationError,
- NvrtcSupportError,
)
from numba.cuda import config
from numba.cuda.cuda_paths import get_cuda_paths
from numba.cuda.utils import _readenv
-import functools
import os
-import threading
import warnings
+import functools
+
+from cuda.core.experimental import Program, ProgramOptions
+from cuda.bindings import nvrtc as bindings_nvrtc
NVRTC_EXTRA_SEARCH_PATHS = _readenv(
"NUMBA_CUDA_NVRTC_EXTRA_SEARCH_PATHS", str, ""
@@ -25,268 +21,13 @@
if not hasattr(config, "CUDA_NVRTC_EXTRA_SEARCH_PATHS"):
config.CUDA_NVRTC_EXTRA_SEARCH_PATHS = NVRTC_EXTRA_SEARCH_PATHS
-# Opaque handle for compilation unit
-nvrtc_program = c_void_p
-
-# Result code
-nvrtc_result = c_int
-
-if config.CUDA_USE_NVIDIA_BINDING:
- from cuda.bindings import nvrtc as bindings_nvrtc
- from cuda.core.experimental import Program, ProgramOptions
-
-
-class NvrtcResult(IntEnum):
- NVRTC_SUCCESS = 0
- NVRTC_ERROR_OUT_OF_MEMORY = 1
- NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2
- NVRTC_ERROR_INVALID_INPUT = 3
- NVRTC_ERROR_INVALID_PROGRAM = 4
- NVRTC_ERROR_INVALID_OPTION = 5
- NVRTC_ERROR_COMPILATION = 6
- NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7
- NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8
- NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9
- NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10
- NVRTC_ERROR_INTERNAL_ERROR = 11
-
-
-_nvrtc_lock = threading.Lock()
-
-class NvrtcProgram:
- """
- A class for managing the lifetime of nvrtcProgram instances. Instances of
- the class own an nvrtcProgram; when an instance is deleted, the underlying
- nvrtcProgram is destroyed using the appropriate NVRTC API.
- """
-
- def __init__(self, nvrtc, handle):
- self._nvrtc = nvrtc
- self._handle = handle
-
- @property
- def handle(self):
- return self._handle
-
- def __del__(self):
- if self._handle:
- self._nvrtc.destroy_program(self)
-
-
-class NVRTC:
- """
- Provides a Pythonic interface to the NVRTC APIs, abstracting away the C API
- calls.
-
- The sole instance of this class is a process-wide singleton, similar to the
- NVVM interface. Initialization is protected by a lock and uses the standard
- (for Numba) open_cudalib function to load the NVRTC library.
- """
-
- _PROTOTYPES = {
- # nvrtcResult nvrtcVersion(int *major, int *minor)
- "nvrtcVersion": (nvrtc_result, POINTER(c_int), POINTER(c_int)),
- # nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
- # const char *src,
- # const char *name,
- # int numHeaders,
- # const char * const *headers,
- # const char * const *includeNames)
- "nvrtcCreateProgram": (
- nvrtc_result,
- nvrtc_program,
- c_char_p,
- c_char_p,
- c_int,
- POINTER(c_char_p),
- POINTER(c_char_p),
- ),
- # nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
- "nvrtcDestroyProgram": (nvrtc_result, POINTER(nvrtc_program)),
- # nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
- # int numOptions,
- # const char * const *options)
- "nvrtcCompileProgram": (
- nvrtc_result,
- nvrtc_program,
- c_int,
- POINTER(c_char_p),
- ),
- # nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
- "nvrtcGetPTXSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
- # nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
- "nvrtcGetPTX": (nvrtc_result, nvrtc_program, c_char_p),
- # nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog,
- # size_t *cubinSizeRet);
- "nvrtcGetCUBINSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
- # nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
- "nvrtcGetCUBIN": (nvrtc_result, nvrtc_program, c_char_p),
- # nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog,
- # size_t *logSizeRet);
- "nvrtcGetProgramLogSize": (
- nvrtc_result,
- nvrtc_program,
- POINTER(c_size_t),
- ),
- # nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
- "nvrtcGetProgramLog": (nvrtc_result, nvrtc_program, c_char_p),
- # nvrtcResult nvrtcGetNumSupportedArchs(int *numArchs);
- "nvrtcGetNumSupportedArchs": (nvrtc_result, POINTER(c_int)),
- # nvrtcResult nvrtcGetSupportedArchs(int *supportedArchs);
- "nvrtcGetSupportedArchs": (nvrtc_result, POINTER(c_int)),
- # nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *ltoSizeRet);
- "nvrtcGetLTOIRSize": (nvrtc_result, nvrtc_program, POINTER(c_size_t)),
- # nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *lto);
- "nvrtcGetLTOIR": (nvrtc_result, nvrtc_program, c_char_p),
- }
-
- # Singleton reference
- __INSTANCE = None
-
- def __new__(cls):
- with _nvrtc_lock:
- if config.CUDA_USE_NVIDIA_BINDING:
- raise RuntimeError(
- "NVRTC objects should not be used with cuda-python bindings"
- )
- if cls.__INSTANCE is None:
- from numba.cuda.cudadrv.libs import open_cudalib
-
- cls.__INSTANCE = inst = object.__new__(cls)
- try:
- lib = open_cudalib("nvrtc")
- except OSError as e:
- cls.__INSTANCE = None
- raise NvrtcSupportError("NVRTC cannot be loaded") from e
-
- # Find & populate functions
- for name, proto in inst._PROTOTYPES.items():
- func = getattr(lib, name)
- func.restype = proto[0]
- func.argtypes = proto[1:]
-
- @functools.wraps(func)
- def checked_call(*args, func=func, name=name):
- error = func(*args)
- if error == NvrtcResult.NVRTC_ERROR_COMPILATION:
- raise NvrtcCompilationError()
- elif (
- error
- == NvrtcResult.NVRTC_ERROR_BUILTIN_OPERATION_FAILURE
- ):
- raise NvrtcBuiltinOperationFailure()
- elif error != NvrtcResult.NVRTC_SUCCESS:
- try:
- error_name = NvrtcResult(error).name
- except ValueError:
- error_name = (
- "Unknown nvrtc_result "
- f"(error code: {error})"
- )
- msg = f"Failed to call {name}: {error_name}"
- raise NvrtcError(msg)
-
- setattr(inst, name, checked_call)
-
- return cls.__INSTANCE
-
- @functools.cache
- def get_supported_archs(self):
- """
- Get Supported Architectures by NVRTC as list of arch tuples.
- """
- num = c_int()
- self.nvrtcGetNumSupportedArchs(byref(num))
- archs = (c_int * num.value)()
- self.nvrtcGetSupportedArchs(archs)
- return [(archs[i] // 10, archs[i] % 10) for i in range(num.value)]
-
- def get_version(self):
- """
- Get the NVRTC version as a tuple (major, minor).
- """
- major = c_int()
- minor = c_int()
- self.nvrtcVersion(byref(major), byref(minor))
- return major.value, minor.value
-
- def create_program(self, src, name):
- """
- Create an NVRTC program with managed lifetime.
- """
- if isinstance(src, str):
- src = src.encode()
- if isinstance(name, str):
- name = name.encode()
-
- handle = nvrtc_program()
-
- # The final three arguments are for passing the contents of headers -
- # this is not supported, so there are 0 headers and the header names
- # and contents are null.
- self.nvrtcCreateProgram(byref(handle), src, name, 0, None, None)
- return NvrtcProgram(self, handle)
-
- def compile_program(self, program, options):
- """
- Compile an NVRTC program. Compilation may fail due to a user error in
- the source; this function returns ``True`` if there is a compilation
- error and ``False`` on success.
- """
- # We hold a list of encoded options to ensure they can't be collected
- # prior to the call to nvrtcCompileProgram
- encoded_options = [opt.encode() for opt in options]
- option_pointers = [c_char_p(opt) for opt in encoded_options]
- c_options_type = c_char_p * len(options)
- c_options = c_options_type(*option_pointers)
- try:
- self.nvrtcCompileProgram(program.handle, len(options), c_options)
- return False
- except (NvrtcCompilationError, NvrtcBuiltinOperationFailure):
- return True
-
- def destroy_program(self, program):
- """
- Destroy an NVRTC program.
- """
- self.nvrtcDestroyProgram(byref(program.handle))
-
- def get_compile_log(self, program):
- """
- Get the compile log as a Python string.
- """
- log_size = c_size_t()
- self.nvrtcGetProgramLogSize(program.handle, byref(log_size))
-
- log = (c_char * log_size.value)()
- self.nvrtcGetProgramLog(program.handle, log)
-
- return log.value.decode()
-
- def get_ptx(self, program):
- """
- Get the compiled PTX as a Python string.
- """
- ptx_size = c_size_t()
- self.nvrtcGetPTXSize(program.handle, byref(ptx_size))
-
- ptx = (c_char * ptx_size.value)()
- self.nvrtcGetPTX(program.handle, ptx)
-
- return ptx.value.decode()
-
- def get_lto(self, program):
- """
- Get the compiled LTOIR as a Python bytes object.
- """
- lto_size = c_size_t()
- self.nvrtcGetLTOIRSize(program.handle, byref(lto_size))
-
- lto = b" " * lto_size.value
- self.nvrtcGetLTOIR(program.handle, lto)
-
- return lto
+@functools.cache
+def _get_nvrtc_version():
+ retcode, major, minor = bindings_nvrtc.nvrtcVersion()
+ if retcode != bindings_nvrtc.nvrtcResult.NVRTC_SUCCESS:
+ raise RuntimeError(f"{retcode.name} when calling nvrtcVersion()")
+ return (major, minor)
def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
@@ -308,17 +49,7 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
:return: The compiled PTX or LTOIR and compilation log
:rtype: tuple
"""
-
- if config.CUDA_USE_NVIDIA_BINDING:
- retcode, *version = bindings_nvrtc.nvrtcVersion()
- if retcode != bindings_nvrtc.nvrtcResult.NVRTC_SUCCESS:
- raise RuntimeError(
- f"{retcode.name} when calling nvrtcGetSupportedArchs()"
- )
- version = tuple(version)
- else:
- nvrtc = NVRTC()
- version = nvrtc.get_version()
+ version = _get_nvrtc_version()
ver_str = lambda version: ".".join(str(v) for v in version)
supported_ccs = get_supported_ccs()
@@ -345,10 +76,7 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
# being optimized away.
major, minor = found
- if config.CUDA_USE_NVIDIA_BINDING:
- arch = f"sm_{major}{minor}"
- else:
- arch = f"--gpu-architecture=compute_{major}{minor}"
+ arch = f"sm_{major}{minor}"
cuda_include_dir = get_cuda_paths()["include_dir"].info
cuda_includes = [f"{cuda_include_dir}"]
@@ -381,76 +109,35 @@ def compile(src, name, cc, ltoir=False, lineinfo=False, debug=False):
includes = [numba_include, *cuda_includes, nrt_include, *extra_includes]
- if config.CUDA_USE_NVIDIA_BINDING:
- options = ProgramOptions(
- arch=arch,
- include_path=includes,
- relocatable_device_code=True,
- link_time_optimization=ltoir,
- name=name,
- debug=debug,
- lineinfo=lineinfo,
- )
-
- class Logger:
- def __init__(self):
- self.log = []
-
- def write(self, msg):
- self.log.append(msg)
-
- logger = Logger()
- if isinstance(src, bytes):
- src = src.decode("utf8")
-
- prog = Program(src, "c++", options=options)
- result = prog.compile("ltoir" if ltoir else "ptx", logs=logger)
- log = ""
- if logger.log:
- log = logger.log
- joined_logs = "\n".join(log)
- warnings.warn(f"NVRTC log messages: {joined_logs}")
- return result, log
-
- else:
- program = nvrtc.create_program(src, name)
- includes = [f"-I{path}" for path in includes]
- options = [
- arch,
- *includes,
- "-rdc",
- "true",
- ]
-
- if ltoir:
- options.append("-dlto")
- if lineinfo:
- options.append("-lineinfo")
- if debug:
- options.append("-G")
-
- # Compile the program
- compile_error = nvrtc.compile_program(program, options)
-
- # Get log from compilation
- log = nvrtc.get_compile_log(program)
-
- # If the compile failed, provide the log in an exception
- if compile_error:
- msg = f"NVRTC Compilation failure whilst compiling {name}:\n\n{log}"
- raise NvrtcError(msg)
-
- # Otherwise, if there's any content in the log, present it as a warning
- if log:
- msg = f"NVRTC log messages whilst compiling {name}:\n\n{log}"
- warnings.warn(msg)
-
- if ltoir:
- ltoir = nvrtc.get_lto(program)
- return ltoir, log
- else:
- ptx = nvrtc.get_ptx(program)
- return ptx, log
+ options = ProgramOptions(
+ arch=arch,
+ include_path=includes,
+ relocatable_device_code=True,
+ link_time_optimization=ltoir,
+ name=name,
+ debug=debug,
+ lineinfo=lineinfo,
+ )
+
+ class Logger:
+ def __init__(self):
+ self.log = []
+
+ def write(self, msg):
+ self.log.append(msg)
+
+ logger = Logger()
+ if isinstance(src, bytes):
+ src = src.decode("utf8")
+
+ prog = Program(src, "c++", options=options)
+ result = prog.compile("ltoir" if ltoir else "ptx", logs=logger)
+ log = ""
+ if logger.log:
+ log = logger.log
+ joined_logs = "\n".join(log)
+ warnings.warn(f"NVRTC log messages: {joined_logs}")
+ return result, log
def find_closest_arch(mycc):
@@ -498,12 +185,9 @@ def get_lowest_supported_cc():
def get_supported_ccs():
- if config.CUDA_USE_NVIDIA_BINDING:
- retcode, archs = bindings_nvrtc.nvrtcGetSupportedArchs()
- if retcode != bindings_nvrtc.nvrtcResult.NVRTC_SUCCESS:
- raise RuntimeError(
- f"{retcode.name} when calling nvrtcGetSupportedArchs()"
- )
- return [(arch // 10, arch % 10) for arch in archs]
- else:
- return NVRTC().get_supported_archs()
+ retcode, archs = bindings_nvrtc.nvrtcGetSupportedArchs()
+ if retcode != bindings_nvrtc.nvrtcResult.NVRTC_SUCCESS:
+ raise RuntimeError(
+ f"{retcode.name} when calling nvrtcGetSupportedArchs()"
+ )
+ return [(arch // 10, arch % 10) for arch in archs]
diff --git a/numba_cuda/numba/cuda/cudadrv/runtime.py b/numba_cuda/numba/cuda/cudadrv/runtime.py
index ebe615e0..004d4184 100644
--- a/numba_cuda/numba/cuda/cudadrv/runtime.py
+++ b/numba_cuda/numba/cuda/cudadrv/runtime.py
@@ -8,23 +8,12 @@
to the runtime anymore. This file is provided to maintain the existing API.
"""
-from numba.cuda import config
-from numba.cuda.cudadrv.nvrtc import NVRTC
+from numba.cuda.cudadrv.nvrtc import _get_nvrtc_version
class Runtime:
def get_version(self):
- if config.CUDA_USE_NVIDIA_BINDING:
- from cuda.bindings import nvrtc
-
- retcode, *version = nvrtc.nvrtcVersion()
- if retcode != nvrtc.nvrtcResult.NVRTC_SUCCESS:
- raise RuntimeError(
- f"{retcode.name} when calling nvrtcGetVersion()"
- )
- return tuple(version)
- else:
- return NVRTC().get_version()
+ return _get_nvrtc_version()
runtime = Runtime()
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
index c3671266..972c869e 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_linker.py
@@ -3,7 +3,6 @@
import numpy as np
import warnings
-from numba.cuda import config
from numba.cuda.testing import unittest
from numba.cuda.testing import (
skip_on_cudasim,
@@ -15,7 +14,6 @@
from numba.cuda import require_context
from numba.cuda.tests.support import ignore_internal_warnings
from numba import cuda, void, float64, int64, int32, typeof, float32
-from numba.cuda.cudadrv.error import NvrtcError
CONST1D = np.arange(10, dtype=np.float64)
@@ -179,23 +177,27 @@ def test_linking_cu_log_warning(self):
def kernel(x):
bar(x)
- self.assertEqual(len(w), 1, "Expected warnings from NVRTC")
+ nvrtc_log_warnings = [
+ wi for wi in w if "NVRTC log messages" in str(wi.message)
+ ]
+ self.assertEqual(
+ len(nvrtc_log_warnings), 1, "Expected warnings from NVRTC"
+ )
# Check the warning refers to the log messages
- self.assertIn("NVRTC log messages", str(w[0].message))
+ self.assertIn("NVRTC log messages", str(nvrtc_log_warnings[0].message))
# Check the message pertaining to the unused variable is provided
- self.assertIn("declared but never referenced", str(w[0].message))
+ self.assertIn(
+ "declared but never referenced", str(nvrtc_log_warnings[0].message)
+ )
def test_linking_cu_error(self):
bar = cuda.declare_device("bar", "int32(int32)")
link = str(test_data_dir / "error.cu")
- if config.CUDA_USE_NVIDIA_BINDING:
- from cuda.core.experimental._utils.cuda_utils import NVRTCError
+ from cuda.core.experimental._utils.cuda_utils import NVRTCError
- errty = NVRTCError
- else:
- errty = NvrtcError
+ errty = NVRTCError
with self.assertRaises(errty) as e:
@cuda.jit("void(int32)", link=[link])
@@ -204,11 +206,7 @@ def kernel(x):
msg = e.exception.args[0]
# Check the error message refers to the NVRTC compile
- nvrtc_err_str = (
- "NVRTC_ERROR_COMPILATION"
- if config.CUDA_USE_NVIDIA_BINDING
- else "NVRTC Compilation failure"
- )
+ nvrtc_err_str = "NVRTC_ERROR_COMPILATION"
self.assertIn(nvrtc_err_str, msg)
# Check the expected error in the CUDA source is reported
self.assertIn('identifier "SYNTAX" is undefined', msg)
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py b/numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py
index 8a4f5b44..e74d97e9 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_module_callbacks.py
@@ -17,10 +17,7 @@
if not config.ENABLE_CUDASIM:
from cuda.bindings.driver import cuModuleGetGlobal, cuMemcpyHtoD
- if config.CUDA_USE_NVIDIA_BINDING:
- from cuda.bindings.driver import CUmodule as cu_module_type
- else:
- from numba.cuda.cudadrv.drvapi import cu_module as cu_module_type
+ from cuda.bindings.driver import CUmodule as cu_module_type
def wipe_all_modules_in_context():
@@ -35,8 +32,6 @@ def wipe_all_modules_in_context():
def get_hashable_handle_value(handle):
- if not config.CUDA_USE_NVIDIA_BINDING:
- handle = handle.value
return handle
diff --git a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
index 17b54d85..d7344541 100644
--- a/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
+++ b/numba_cuda/numba/cuda/tests/cudadrv/test_nvjitlink.py
@@ -45,10 +45,8 @@
@unittest.skipIf(
- not config.CUDA_USE_NVIDIA_BINDING
- or not TEST_BIN_DIR
- or not _have_nvjitlink(),
- "NVIDIA cuda bindings not enabled or nvJitLink not installed or new enough (>12.3)",
+ not TEST_BIN_DIR or not _have_nvjitlink(),
+ "nvJitLink not installed or new enough (>12.3)",
)
@skip_on_cudasim("Linking unsupported in the simulator")
class TestLinker(CUDATestCase):
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
index 5142c539..45f5634c 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_compiler.py
@@ -557,12 +557,9 @@ def f(z, x, y):
link_obj = LinkableCode.from_path(link)
if link_obj.kind == "cu":
# if link is a cu file, result contains a compiled object code
- if cuda.config.CUDA_USE_NVIDIA_BINDING:
- from cuda.core.experimental import ObjectCode
+ from cuda.core.experimental import ObjectCode
- assert isinstance(code_list[1], ObjectCode)
- else:
- assert isinstance(code_list[1], bytes)
+ assert isinstance(code_list[1], ObjectCode)
else:
assert code_list[1].kind == link_obj.kind
@@ -581,13 +578,10 @@ def f(z, x, y):
)
assert len(code_list) == 2
- if cuda.config.CUDA_USE_NVIDIA_BINDING:
- self.assertRegex(
- str(code_list[1].code.decode()),
- r"\.file.*test_device_functions",
- )
- else:
- self.assertRegex(code_list[1], r"\.file.*test_device_functions")
+ self.assertRegex(
+ str(code_list[1].code.decode()),
+ r"\.file.*test_device_functions",
+ )
@unittest.skipIf(not TEST_BIN_DIR, "necessary binaries not generated.")
def test_compile_all_debug(self):
@@ -604,12 +598,9 @@ def f(z, x, y):
)
assert len(code_list) == 2
- if cuda.config.CUDA_USE_NVIDIA_BINDING:
- self.assertRegex(
- str(code_list[1].code.decode()), r"\.section\s+\.debug_info"
- )
- else:
- self.assertRegex(code_list[1], r"\.section\s+\.debug_info")
+ self.assertRegex(
+ str(code_list[1].code.decode()), r"\.section\s+\.debug_info"
+ )
@skip_on_cudasim("Compilation unsupported in the simulator")
diff --git a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py
index 48669e7d..3ea51a1a 100644
--- a/numba_cuda/numba/cuda/tests/cudapy/test_errors.py
+++ b/numba_cuda/numba/cuda/tests/cudapy/test_errors.py
@@ -4,7 +4,7 @@
from numba import cuda
from numba.core.errors import TypingError
from numba.cuda.testing import unittest, CUDATestCase, skip_on_cudasim
-from numba.cuda import config
+from numba.cuda.cudadrv import driver
def noop(x):
@@ -95,7 +95,7 @@ def kernel_func():
@skip_on_cudasim("Simulator does not use nvjitlink")
@unittest.skipIf(
- config.CUDA_USE_NVIDIA_BINDING, "NVIDIA cuda bindings enabled"
+ driver._have_nvjitlink(), "nvJitLink available; LTO should not error"
)
def test_lto_without_nvjitlink_error(self):
with self.assertRaisesRegex(RuntimeError, "LTO requires nvjitlink"):
diff --git a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
index b87f4d35..2757d576 100644
--- a/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
+++ b/numba_cuda/numba/cuda/tests/nrt/test_nrt.py
@@ -173,16 +173,7 @@ def test_nrt_detect_linked_ptx_file(self):
cc = get_current_device().compute_capability
ptx, _ = compile(src, "external_nrt.cu", cc)
- @cuda.jit(
- link=[
- PTXSource(
- ptx.code
- if config.CUDA_USE_NVIDIA_BINDING
- else ptx.encode(),
- nrt=True,
- )
- ]
- )
+ @cuda.jit(link=[PTXSource(ptx.code, nrt=True)])
def kernel():
allocate_deallocate_handle()
diff --git a/pyproject.toml b/pyproject.toml
index 54ee6511..12ed8284 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -21,7 +21,7 @@ authors = [
license = "BSD-2-clause"
license-files = ["LICENSE", "LICENSE.numba"]
requires-python = ">=3.9"
-dependencies = ["numba>=0.60.0"]
+dependencies = ["numba>=0.60.0", "cuda-bindings>=12.9.1,<14.0.0", "cuda-core>=0.3.2,<0.4.0dev0"]
[project.optional-dependencies]
cu12 = [