diff --git a/cuda_core/cuda/core/experimental/_context.py b/cuda_core/cuda/core/experimental/_context.py
index e59e94db3..ad2e4b94c 100644
--- a/cuda_core/cuda/core/experimental/_context.py
+++ b/cuda_core/cuda/core/experimental/_context.py
@@ -4,7 +4,8 @@
 
 from dataclasses import dataclass
 
-from cuda.core.experimental._utils import driver
+from cuda.core.experimental._utils.clear_error_support import assert_type
+from cuda.core.experimental._utils.cuda_utils import driver
 
 
 @dataclass
@@ -20,7 +21,7 @@ def __new__(self, *args, **kwargs):
 
     @classmethod
     def _from_ctx(cls, obj, dev_id):
-        assert isinstance(obj, driver.CUcontext)
+        assert_type(obj, driver.CUcontext)
         ctx = super().__new__(cls)
         ctx._handle = obj
         ctx._id = dev_id
diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 2eb827d38..1a98794f9 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -8,7 +8,15 @@
 from cuda.core.experimental._context import Context, ContextOptions
 from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource
 from cuda.core.experimental._stream import Stream, StreamOptions, default_stream
-from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime
+from cuda.core.experimental._utils.clear_error_support import assert_type
+from cuda.core.experimental._utils.cuda_utils import (
+    ComputeCapability,
+    CUDAError,
+    driver,
+    handle_return,
+    precondition,
+    runtime,
+)
 
 _tls = threading.local()
 _lock = threading.Lock()
@@ -949,10 +957,11 @@ def __new__(cls, device_id=None):
         # important: creating a Device instance does not initialize the GPU!
         if device_id is None:
             device_id = handle_return(runtime.cudaGetDevice())
-            assert isinstance(device_id, int), f"{device_id=}"
+            assert_type(device_id, int)
         else:
             total = handle_return(runtime.cudaGetDeviceCount())
-            if not isinstance(device_id, int) or not (0 <= device_id < total):
+            assert_type(device_id, int)
+            if not (0 <= device_id < total):
                 raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
 
         # ensure Device is singleton
@@ -981,7 +990,9 @@ def __new__(cls, device_id=None):
 
     def _check_context_initialized(self, *args, **kwargs):
         if not self._has_inited:
-            raise CUDAError("the device is not yet initialized, perhaps you forgot to call .set_current() first?")
+            raise CUDAError(
+                f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?"
+            )
 
     @property
     def device_id(self) -> int:
@@ -1053,7 +1064,8 @@ def context(self) -> Context:
 
         """
         ctx = handle_return(driver.cuCtxGetCurrent())
-        assert int(ctx) != 0
+        if int(ctx) == 0:
+            raise CUDAError("No context is bound to the calling CPU thread.")
         return Context._from_ctx(ctx, self._id)
 
     @property
@@ -1063,8 +1075,7 @@ def memory_resource(self) -> MemoryResource:
 
     @memory_resource.setter
     def memory_resource(self, mr):
-        if not isinstance(mr, MemoryResource):
-            raise TypeError
+        assert_type(mr, MemoryResource)
         self._mr = mr
 
     @property
@@ -1118,12 +1129,11 @@ def set_current(self, ctx: Context = None) -> Union[Context, None]:
 
         """
         if ctx is not None:
-            if not isinstance(ctx, Context):
-                raise TypeError("a Context object is required")
+            assert_type(ctx, Context)
             if ctx._id != self._id:
                 raise RuntimeError(
-                    "the provided context was created on a different "
-                    f"device {ctx._id} other than the target {self._id}"
+                    "the provided context was created on the device with"
+                    f" id={ctx._id}, which is different from the target id={self._id}"
                 )
             prev_ctx = handle_return(driver.cuCtxPopCurrent())
             handle_return(driver.cuCtxPushCurrent(ctx._handle))
@@ -1165,7 +1175,7 @@ def create_context(self, options: ContextOptions = None) -> Context:
             Newly created context object.
 
         """
-        raise NotImplementedError("TODO")
+        raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189")
 
     @precondition(_check_context_initialized)
     def create_stream(self, obj=None, options: StreamOptions = None) -> Stream:
diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py
index bd97305a6..54468f238 100644
--- a/cuda_core/cuda/core/experimental/_event.py
+++ b/cuda_core/cuda/core/experimental/_event.py
@@ -8,7 +8,7 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
-from cuda.core.experimental._utils import CUDAError, check_or_create_options, driver, handle_return
+from cuda.core.experimental._utils.cuda_utils import check_or_create_options, driver, handle_return
 
 if TYPE_CHECKING:
     import cuda.bindings
@@ -88,7 +88,7 @@ def _init(cls, options: Optional[EventOptions] = None):
                 flags |= driver.CUevent_flags.CU_EVENT_BLOCKING_SYNC
                 self._busy_waited = True
             if options.support_ipc:
-                raise NotImplementedError("TODO")
+                raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103")
         self._mnff.handle = handle_return(driver.cuEventCreate(flags))
         return self
 
@@ -109,7 +109,7 @@ def is_sync_busy_waited(self) -> bool:
     @property
     def is_ipc_supported(self) -> bool:
         """Return True if this event can be used as an interprocess event, otherwise False."""
-        raise NotImplementedError("TODO")
+        raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103")
 
     def sync(self):
         """Synchronize until the event completes.
@@ -129,10 +129,9 @@ def is_done(self) -> bool:
         (result,) = driver.cuEventQuery(self._mnff.handle)
         if result == driver.CUresult.CUDA_SUCCESS:
             return True
-        elif result == driver.CUresult.CUDA_ERROR_NOT_READY:
+        if result == driver.CUresult.CUDA_ERROR_NOT_READY:
             return False
-        else:
-            raise CUDAError(f"unexpected error: {result}")
+        handle_return((result,))
 
     @property
     def handle(self) -> cuda.bindings.driver.CUevent:
diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py
index bdf498fb0..71f55bb23 100644
--- a/cuda_core/cuda/core/experimental/_launcher.py
+++ b/cuda_core/cuda/core/experimental/_launcher.py
@@ -9,7 +9,15 @@
 from cuda.core.experimental._kernel_arg_handler import ParamHolder
 from cuda.core.experimental._module import Kernel
 from cuda.core.experimental._stream import Stream
-from cuda.core.experimental._utils import CUDAError, check_or_create_options, driver, get_binding_version, handle_return
+from cuda.core.experimental._utils.clear_error_support import assert_type
+from cuda.core.experimental._utils.cuda_utils import (
+    CUDAError,
+    cast_to_3_tuple,
+    check_or_create_options,
+    driver,
+    get_binding_version,
+    handle_return,
+)
 
 # TODO: revisit this treatment for py313t builds
 _inited = False
@@ -59,41 +67,23 @@ class LaunchConfig:
 
     def __post_init__(self):
         _lazy_init()
-        self.grid = self._cast_to_3_tuple(self.grid)
-        self.block = self._cast_to_3_tuple(self.block)
+        self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
+        self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
         # thread block clusters are supported starting H100
         if self.cluster is not None:
            if not _use_ex:
-                raise CUDAError("thread block clusters require cuda.bindings & driver 11.8+")
-            if Device().compute_capability < (9, 0):
-                raise CUDAError("thread block clusters are not supported on devices with compute capability < 9.0")
-            self.cluster = self._cast_to_3_tuple(self.cluster)
+                err, drvers = driver.cuDriverGetVersion()
+                drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
+                raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
+            cc = Device().compute_capability
+            if cc < (9, 0):
+                raise CUDAError(
+                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
+                )
+            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
         if self.shmem_size is None:
             self.shmem_size = 0
 
-    def _cast_to_3_tuple(self, cfg):
-        if isinstance(cfg, int):
-            if cfg < 1:
-                raise ValueError
-            return (cfg, 1, 1)
-        elif isinstance(cfg, tuple):
-            size = len(cfg)
-            if size == 1:
-                cfg = cfg[0]
-                if cfg < 1:
-                    raise ValueError
-                return (cfg, 1, 1)
-            elif size == 2:
-                if cfg[0] < 1 or cfg[1] < 1:
-                    raise ValueError
-                return (*cfg, 1)
-            elif size == 3:
-                if cfg[0] < 1 or cfg[1] < 1 or cfg[2] < 1:
-                    raise ValueError
-                return cfg
-            else:
-                raise ValueError
-
 
 def launch(stream, config, kernel, *kernel_args):
     """Launches a :obj:`~_module.Kernel`
@@ -120,9 +110,10 @@
         try:
             stream = Stream._init(stream)
         except Exception as e:
-            raise ValueError("stream must either be a Stream object or support __cuda_stream__") from e
-    if not isinstance(kernel, Kernel):
-        raise ValueError
+            raise ValueError(
+                f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})"
+            ) from e
+    assert_type(kernel, Kernel)
     config = check_or_create_options(LaunchConfig, config, "launch config")
 
     # TODO: can we ensure kernel_args is valid/safe to use here?
diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index 43d1eb3c6..42dd2c0ba 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -16,7 +16,8 @@
 
 from cuda.core.experimental._device import Device
 from cuda.core.experimental._module import ObjectCode
-from cuda.core.experimental._utils import check_or_create_options, driver, handle_return, is_sequence
+from cuda.core.experimental._utils.clear_error_support import assert_type
+from cuda.core.experimental._utils.cuda_utils import check_or_create_options, driver, handle_return, is_sequence
 
 # TODO: revisit this treatment for py313t builds
 _driver = None  # populated if nvJitLink cannot be used
@@ -382,12 +383,12 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None):
         self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink)
 
         for code in object_codes:
-            assert isinstance(code, ObjectCode)
+            assert_type(code, ObjectCode)
             self._add_code_object(code)
 
     def _add_code_object(self, object_code: ObjectCode):
         data = object_code._module
-        assert isinstance(data, bytes)
+        assert_type(data, bytes)
         with _exception_manager(self):
             if _nvjitlink:
                 _nvjitlink.add_data(
diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py
index 403ee0842..6a0c611d3 100644
--- a/cuda_core/cuda/core/experimental/_memory.py
+++ b/cuda_core/cuda/core/experimental/_memory.py
@@ -10,7 +10,7 @@
 
 from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
 from cuda.core.experimental._stream import default_stream
-from cuda.core.experimental._utils import driver, handle_return
+from cuda.core.experimental._utils.cuda_utils import driver, handle_return
 
 PyCapsule = TypeVar("PyCapsule")
 
@@ -100,21 +100,21 @@ def is_device_accessible(self) -> bool:
         """Return True if this buffer can be accessed by the GPU, otherwise False."""
         if self._mnff.mr is not None:
             return self._mnff.mr.is_device_accessible
-        raise NotImplementedError
+        raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
 
     @property
     def is_host_accessible(self) -> bool:
         """Return True if this buffer can be accessed by the CPU, otherwise False."""
         if self._mnff.mr is not None:
             return self._mnff.mr.is_host_accessible
-        raise NotImplementedError
+        raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
 
     @property
     def device_id(self) -> int:
         """Return the device ordinal of this buffer."""
         if self._mnff.mr is not None:
             return self._mnff.mr.device_id
-        raise NotImplementedError
+        raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
 
     def copy_to(self, dst: Buffer = None, *, stream) -> Buffer:
         """Copy from this buffer to the dst buffer asynchronously on the given stream.
@@ -136,10 +136,12 @@ def copy_to(self, dst: Buffer = None, *, stream) -> Buffer:
             raise ValueError("stream must be provided")
         if dst is None:
             if self._mnff.mr is None:
-                raise ValueError("a destination buffer must be provided")
+                raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)")
             dst = self._mnff.mr.allocate(self._mnff.size, stream)
         if dst._mnff.size != self._mnff.size:
-            raise ValueError("buffer sizes mismatch between src and dst")
+            raise ValueError(
+                f"buffer sizes mismatch between src and dst (sizes are: src={self._mnff.size}, dst={dst._mnff.size})"
+            )
         handle_return(driver.cuMemcpyAsync(dst._mnff.ptr, self._mnff.ptr, self._mnff.size, stream.handle))
         return dst
 
@@ -158,7 +160,9 @@ def copy_from(self, src: Buffer, *, stream):
         if stream is None:
             raise ValueError("stream must be provided")
         if src._mnff.size != self._mnff.size:
-            raise ValueError("buffer sizes mismatch between src and dst")
+            raise ValueError(
+                f"buffer sizes mismatch between src and dst (sizes are: src={src._mnff.size}, dst={self._mnff.size})"
+            )
         handle_return(driver.cuMemcpyAsync(self._mnff.ptr, src._mnff.ptr, self._mnff.size, stream.handle))
 
     def __dlpack__(
@@ -171,37 +175,40 @@ def __dlpack__(
     ) -> PyCapsule:
         # Note: we ignore the stream argument entirely (as if it is -1).
         # It is the user's responsibility to maintain stream order.
-        if dl_device is not None or copy is True:
-            raise BufferError
+        if dl_device is not None:
+            raise BufferError("Sorry, not supported: dl_device other than None")
+        if copy is True:
+            raise BufferError("Sorry, not supported: copy=True")
         if max_version is None:
             versioned = False
         else:
-            assert len(max_version) == 2
+            if not isinstance(max_version, tuple) or len(max_version) != 2:
+                raise BufferError(f"Expected max_version Tuple[int, int], got {max_version}")
             versioned = max_version >= (1, 0)
         capsule = make_py_capsule(self, versioned)
         return capsule
 
     def __dlpack_device__(self) -> Tuple[int, int]:
-        if self.is_device_accessible and not self.is_host_accessible:
+        d_h = (bool(self.is_device_accessible), bool(self.is_host_accessible))
+        if d_h == (True, False):
             return (DLDeviceType.kDLCUDA, self.device_id)
-        elif self.is_device_accessible and self.is_host_accessible:
+        if d_h == (True, True):
             # TODO: this can also be kDLCUDAManaged, we need more fine-grained checks
             return (DLDeviceType.kDLCUDAHost, 0)
-        elif not self.is_device_accessible and self.is_host_accessible:
+        if d_h == (False, True):
             return (DLDeviceType.kDLCPU, 0)
-        else:  # not self.is_device_accessible and not self.is_host_accessible
-            raise BufferError("invalid buffer")
+        raise BufferError("buffer is neither device-accessible nor host-accessible")
 
     def __buffer__(self, flags: int, /) -> memoryview:
         # Support for Python-level buffer protocol as per PEP 688.
         # This raises a BufferError unless:
         # 1. Python is 3.12+
         # 2. This Buffer object is host accessible
-        raise NotImplementedError("TODO")
+        raise NotImplementedError("WIP: Buffer.__buffer__ hasn't been implemented yet.")
 
     def __release_buffer__(self, buffer: memoryview, /):
         # Supporting method paired with __buffer__.
- raise NotImplementedError("TODO") + raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.") class MemoryResource(abc.ABC): @@ -291,7 +298,7 @@ def is_host_accessible(self) -> bool: @property def device_id(self) -> int: - raise RuntimeError("the pinned memory resource is not bound to any GPU") + raise RuntimeError("a pinned memory resource is not bound to any GPU") class _SynchronousMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 7ebfa4806..04a0ca35b 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -11,7 +11,7 @@ from typing import Any, Optional import numpy -from cuda.core.experimental._utils import handle_return, driver +from cuda.core.experimental._utils.cuda_utils import handle_return, driver # TODO(leofang): support NumPy structured dtypes diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 39a7c0f5b..320b5b603 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -5,7 +5,12 @@ from typing import Optional, Union from warnings import warn -from cuda.core.experimental._utils import driver, get_binding_version, handle_return, precondition +from cuda.core.experimental._utils.clear_error_support import ( + assert_type, + assert_type_str_or_bytes, + raise_code_path_meant_to_be_unreachable, +) +from cuda.core.experimental._utils.cuda_utils import driver, get_binding_version, handle_return, precondition _backend = { "old": { @@ -195,8 +200,8 @@ def __new__(self, *args, **kwargs): @classmethod def _from_obj(cls, obj, mod): - assert isinstance(obj, _kernel_ctypes) - assert isinstance(mod, ObjectCode) + assert_type(obj, _kernel_ctypes) + assert_type(mod, ObjectCode) ker = super().__new__(cls) ker._handle = obj ker._module = mod @@ -300,17 +305,20 @@ def _lazy_load_module(self, *args, **kwargs): if self._handle is not None: return module = self._module + assert_type_str_or_bytes(module) if isinstance(module, str): if self._backend_version == "new": self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0)) else: # "old" backend self._handle = handle_return(self._loader["file"](module.encode())) - else: - assert isinstance(module, bytes) + return + if isinstance(module, bytes): if self._backend_version == "new": self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0)) else: # "old" backend self._handle = handle_return(self._loader["data"](module, 0, [], [])) + return + raise_code_path_meant_to_be_unreachable() @precondition(_lazy_load_module) def get_kernel(self, name) -> Kernel: @@ -327,8 +335,9 @@ def get_kernel(self, name) -> Kernel: Newly created kernel object. 
""" - if self._code_type not in ("cubin", "ptx", "fatbin"): - raise RuntimeError(f"get_kernel() is not supported for {self._code_type}") + supported_code_types = ("cubin", "ptx", "fatbin") + if self._code_type not in supported_code_types: + raise RuntimeError(f'Unsupported code type "{self._code_type}" ({supported_code_types=})') try: name = self._sym_map[name] except KeyError: diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 8e7fea245..ab9362531 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -15,7 +15,8 @@ from cuda.core.experimental._device import Device from cuda.core.experimental._linker import Linker, LinkerHandleT, LinkerOptions from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils import ( +from cuda.core.experimental._utils.clear_error_support import assert_type +from cuda.core.experimental._utils.cuda_utils import ( _handle_boolean_option, check_or_create_options, driver, @@ -26,6 +27,30 @@ ) +def _process_define_macro_inner(formatted_options, macro): + if isinstance(macro, str): + formatted_options.append(f"--define-macro={macro}") + return True + if isinstance(macro, tuple): + if len(macro) != 2 or any(not isinstance(val, str) for val in macro): + raise RuntimeError(f"Expected define_macro Tuple[str, str], got {macro}") + formatted_options.append(f"--define-macro={macro[0]}={macro[1]}") + return True + return False + + +def _process_define_macro(formatted_options, macro): + union_type = "Union[str, Tuple[str, str]]" + if _process_define_macro_inner(formatted_options, macro): + return + if is_nested_sequence(macro): + for seq_macro in macro: + if not _process_define_macro_inner(formatted_options, seq_macro): + raise RuntimeError(f"Expected define_macro {union_type}, got {seq_macro}") + return + raise RuntimeError(f"Expected define_macro {union_type}, List[{union_type}], got {macro}") + + @dataclass class ProgramOptions: """Customizable options for configuring `Program`. 
@@ -242,19 +267,7 @@ def __post_init__(self):
         if self.gen_opt_lto is not None and self.gen_opt_lto:
             self._formatted_options.append("--gen-opt-lto")
         if self.define_macro is not None:
-            if isinstance(self.define_macro, str):
-                self._formatted_options.append(f"--define-macro={self.define_macro}")
-            elif isinstance(self.define_macro, tuple):
-                assert len(self.define_macro) == 2
-                self._formatted_options.append(f"--define-macro={self.define_macro[0]}={self.define_macro[1]}")
-            elif is_nested_sequence(self.define_macro):
-                for macro in self.define_macro:
-                    if isinstance(macro, tuple):
-                        assert len(macro) == 2
-                        self._formatted_options.append(f"--define-macro={macro[0]}={macro[1]}")
-                    else:
-                        self._formatted_options.append(f"--define-macro={macro}")
-
+            _process_define_macro(self._formatted_options, self.define_macro)
         if self.undefine_macro is not None:
             if isinstance(self.undefine_macro, str):
                 self._formatted_options.append(f"--undefine-macro={self.undefine_macro}")
@@ -371,8 +384,6 @@ def close(self):
             self.handle = None
 
     __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options")
-    _supported_code_type = ("c++", "ptx")
-    _supported_target_type = ("ptx", "cubin", "ltoir")
 
     def __init__(self, code, code_type, options: ProgramOptions = None):
         self._mnff = Program._MembersNeededForFinalize(self, None)
@@ -380,12 +391,8 @@
         self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
 
         code_type = code_type.lower()
-        if code_type not in self._supported_code_type:
-            raise NotImplementedError
-
         if code_type == "c++":
-            if not isinstance(code, str):
-                raise TypeError("c++ Program expects code argument to be a string")
+            assert_type(code, str)
             # TODO: support pre-loaded headers & include names
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
@@ -394,14 +401,15 @@
             self._linker = None
 
         elif code_type == "ptx":
-            if not isinstance(code, str):
-                raise TypeError("ptx Program expects code argument to be a string")
+            assert_type(code, str)
             self._linker = Linker(
                 ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
             )
             self._backend = self._linker.backend
         else:
-            raise NotImplementedError
+            supported_code_types = ("c++", "ptx")
+            assert code_type not in supported_code_types, f"{code_type=}"
+            raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})")
 
     def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions:
         return LinkerOptions(
@@ -453,8 +461,9 @@ def compile(self, target_type, name_expressions=(), logs=None):
             Newly created code object.
 
""" - if target_type not in self._supported_target_type: - raise ValueError(f"the target type {target_type} is not supported") + supported_target_types = ("ptx", "cubin", "ltoir") + if target_type not in supported_target_types: + raise ValueError(f'Unsupported target_type="{target_type}" ({supported_target_types=})') if self._backend == "NVRTC": if target_type == "ptx" and not self._can_load_generated_ptx(): @@ -498,7 +507,9 @@ def compile(self, target_type, name_expressions=(), logs=None): return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping) - assert self._backend in ("nvJitLink", "driver") + supported_backends = ("nvJitLink", "driver") + if self._backend not in supported_backends: + raise ValueError(f'Unsupported backend="{self._backend}" ({supported_backends=})') return self._linker.link(target_type) @property diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 95b59595d..237bcf92b 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -15,7 +15,14 @@ from cuda.core.experimental._device import Device from cuda.core.experimental._context import Context from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._utils import check_or_create_options, driver, get_device_from_ctx, handle_return, runtime +from cuda.core.experimental._utils.clear_error_support import assert_type +from cuda.core.experimental._utils.cuda_utils import ( + check_or_create_options, + driver, + get_device_from_ctx, + handle_return, + runtime, +) @dataclass @@ -107,20 +114,34 @@ def _init(cls, obj=None, *, options: Optional[StreamOptions] = None): if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - try: - info = obj.__cuda_stream__() - except AttributeError as e: - raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' method") from e - except TypeError: - info = obj.__cuda_stream__ + cuda_stream_attr = getattr(obj, "__cuda_stream__", None) + if cuda_stream_attr is None: + raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") + if callable(cuda_stream_attr): + info = cuda_stream_attr() + else: + info = cuda_stream_attr warnings.simplefilter("once", DeprecationWarning) warnings.warn( "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method", stacklevel=3, category=DeprecationWarning, ) + try: + len_info = len(info) + except Exception as e: + raise RuntimeError( + f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}" + ) from e + if len_info != 2: + raise RuntimeError( + f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements" + ) + if info[0] != 0: + raise RuntimeError( + f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" + ) - assert info[0] == 0 self._mnff.handle = driver.CUstream(info[1]) # TODO: check if obj is created under the current context/device self._mnff.owner = obj @@ -218,8 +239,7 @@ def record(self, event: Event = None, options: EventOptions = None) -> Event: # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. 
         if event is None:
             event = Event._init(options)
-        elif not isinstance(event, Event):
-            raise TypeError("record only takes an Event object")
+        assert_type(event, Event)
         handle_return(driver.cuEventRecord(event.handle, self._mnff.handle))
         return event
 
@@ -237,13 +257,16 @@ def wait(self, event_or_stream: Union[Event, Stream]):
             event = event_or_stream.handle
             discard_event = False
         else:
-            if not isinstance(event_or_stream, Stream):
+            if isinstance(event_or_stream, Stream):
+                stream = event_or_stream
+            else:
                 try:
                     stream = Stream._init(event_or_stream)
                 except Exception as e:
-                    raise ValueError("only an Event, Stream, or object supporting __cuda_stream__ can be waited") from e
-            else:
-                stream = event_or_stream
+                    raise ValueError(
+                        "only an Event, Stream, or object supporting __cuda_stream__ can be waited,"
+                        f" got {type(event_or_stream)}"
+                    ) from e
             event = handle_return(driver.cuEventCreate(driver.CUevent_flags.CU_EVENT_DISABLE_TIMING))
             handle_return(driver.cuEventRecord(event, stream.handle))
             discard_event = True
diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/experimental/_system.py
index 548701be5..a74b15ed7 100644
--- a/cuda_core/cuda/core/experimental/_system.py
+++ b/cuda_core/cuda/core/experimental/_system.py
@@ -5,7 +5,7 @@
 from typing import Tuple
 
 from cuda.core.experimental._device import Device
-from cuda.core.experimental._utils import driver, handle_return, runtime
+from cuda.core.experimental._utils.cuda_utils import driver, handle_return, runtime
 
 
 class System:
diff --git a/cuda_core/cuda/core/experimental/_utils/__init__.py b/cuda_core/cuda/core/experimental/_utils/__init__.py
new file mode 100644
index 000000000..174c85e9b
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_utils/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
diff --git a/cuda_core/cuda/core/experimental/_utils/clear_error_support.py b/cuda_core/cuda/core/experimental/_utils/clear_error_support.py
new file mode 100644
index 000000000..b430e6ccb
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_utils/clear_error_support.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+
+def assert_type(obj, expected_type):
+    """Ensure obj is of expected_type, else raise TypeError with a clear message."""
+    if not isinstance(obj, expected_type):
+        raise TypeError(f"Expected type {expected_type.__name__}, but got {type(obj).__name__}")
+
+
+def assert_type_str_or_bytes(obj):
+    """Ensure obj is of type str or bytes, else raise TypeError with a clear message."""
+    if not isinstance(obj, (str, bytes)):
+        raise TypeError(f"Expected type str or bytes, but got {type(obj).__name__}")
+
+
+def raise_code_path_meant_to_be_unreachable():
+    raise RuntimeError("This code path is meant to be unreachable.")
diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils/cuda_utils.py
similarity index 87%
rename from cuda_core/cuda/core/experimental/_utils.py
rename to cuda_core/cuda/core/experimental/_utils/cuda_utils.py
index 3538ae6c1..1bc66a24a 100644
--- a/cuda_core/cuda/core/experimental/_utils.py
+++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
@@ -27,6 +27,24 @@ class NVRTCError(CUDAError):
 ComputeCapability = namedtuple("ComputeCapability", ("major", "minor"))
 
 
+def cast_to_3_tuple(label, cfg):
+    cfg_orig = cfg
+    if isinstance(cfg, int):
+        cfg = (cfg,)
+    else:
+        common = "must be an int, or a tuple with up to 3 ints"
+        if not isinstance(cfg, tuple):
+            raise ValueError(f"{label} {common} (got {type(cfg)})")
+        if len(cfg) > 3:
+            raise ValueError(f"{label} {common} (got tuple with length {len(cfg)})")
+        if any(not isinstance(val, int) for val in cfg):
+            raise ValueError(f"{label} {common} (got {cfg})")
+    if any(val < 1 for val in cfg):
+        plural_s = "" if len(cfg) == 1 else "s"
+        raise ValueError(f"{label} value{plural_s} must be >= 1 (got {cfg_orig})")
+    return cfg + (1,) * (3 - len(cfg))
+
+
 def _check_error(error, handle=None):
     if isinstance(error, driver.CUresult):
         if error == driver.CUresult.CUDA_SUCCESS:
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 934a0e852..889372417 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -17,7 +17,7 @@
 import pytest
 
 from cuda.core.experimental import Device, _device
-from cuda.core.experimental._utils import handle_return
+from cuda.core.experimental._utils.cuda_utils import handle_return
 
 
 @pytest.fixture(scope="session", autouse=True)
diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py
index 1d20adfed..60fd75b01 100644
--- a/cuda_core/tests/test_device.py
+++ b/cuda_core/tests/test_device.py
@@ -15,7 +15,7 @@
 
 import cuda.core.experimental
 from cuda.core.experimental import Device
-from cuda.core.experimental._utils import ComputeCapability, get_binding_version, handle_return
+from cuda.core.experimental._utils.cuda_utils import ComputeCapability, get_binding_version, handle_return
 
 
 def test_device_init_disabled():
diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index 984ef7ad0..43fc9e5fa 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -23,18 +23,6 @@ def test_launch_config_init(init_cuda):
     assert config.shmem_size == 1024
 
 
-def test_launch_config_cast_to_3_tuple():
-    config = LaunchConfig(grid=1, block=1)
-    assert config._cast_to_3_tuple(1) == (1, 1, 1)
-    assert config._cast_to_3_tuple((1, 2)) == (1, 2, 1)
-    assert config._cast_to_3_tuple((1, 2, 3)) == (1, 2, 3)
-
-    # Edge cases
-    assert config._cast_to_3_tuple(999) == (999, 1, 1)
-    assert config._cast_to_3_tuple((999, 888)) == (999, 888, 1)
-    assert config._cast_to_3_tuple((999, 888, 777)) == (999, 888, 777)
-
-
 def test_launch_config_invalid_values():
     with pytest.raises(ValueError):
         LaunchConfig(grid=0, block=1)
@@ -69,10 +57,10 @@ def test_launch_invalid_values(init_cuda):
     with pytest.raises(ValueError):
         launch(None, ker, config)
 
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         launch(stream, None, config)
 
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         launch(stream, ker, None)
 
     launch(stream, config, ker)
diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py
index 613afdb28..78195c2dc 100644
--- a/cuda_core/tests/test_linker.py
+++ b/cuda_core/tests/test_linker.py
@@ -6,7 +6,7 @@
 
 from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker
 from cuda.core.experimental._module import ObjectCode
-from cuda.core.experimental._utils import CUDAError
+from cuda.core.experimental._utils.cuda_utils import CUDAError
ARCH = "sm_" + "".join(f"{i}" for i in Device().compute_capability) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index a48db69b5..1ff728c64 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -13,9 +13,11 @@ import ctypes +import pytest + from cuda.core.experimental import Device -from cuda.core.experimental._memory import Buffer, MemoryResource -from cuda.core.experimental._utils import handle_return +from cuda.core.experimental._memory import Buffer, DLDeviceType, MemoryResource +from cuda.core.experimental._utils.cuda_utils import handle_return class DummyDeviceMemoryResource(MemoryResource): @@ -116,6 +118,12 @@ def device_id(self) -> int: raise RuntimeError("the pinned memory resource is not bound to any GPU") +class NullMemoryResource(DummyHostMemoryResource): + @property + def is_host_accessible(self) -> bool: + return False + + def buffer_initialization(dummy_mr: MemoryResource): buffer = dummy_mr.allocate(size=1024) assert buffer.handle != 0 @@ -211,3 +219,46 @@ def test_buffer_close(): buffer_close(DummyHostMemoryResource()) buffer_close(DummyUnifiedMemoryResource(device)) buffer_close(DummyPinnedMemoryResource(device)) + + +def test_buffer_dunder_dlpack(): + device = Device() + device.set_current() + dummy_mr = DummyDeviceMemoryResource(device) + buffer = dummy_mr.allocate(size=1024) + capsule = buffer.__dlpack__() + assert "dltensor" in repr(capsule) + capsule = buffer.__dlpack__(max_version=(1, 0)) + assert "dltensor" in repr(capsule) + with pytest.raises(BufferError, match=r"^Sorry, not supported: dl_device other than None$"): + buffer.__dlpack__(dl_device=[]) + with pytest.raises(BufferError, match=r"^Sorry, not supported: copy=True$"): + buffer.__dlpack__(copy=True) + with pytest.raises(BufferError, match=r"^Expected max_version Tuple\[int, int\], got \[\]$"): + buffer.__dlpack__(max_version=[]) + with pytest.raises(BufferError, match=r"^Expected max_version Tuple\[int, int\], got \(9, 8, 7\)$"): + buffer.__dlpack__(max_version=(9, 8, 7)) + + +@pytest.mark.parametrize( + ("DummyMR", "expected"), + [ + (DummyDeviceMemoryResource, (DLDeviceType.kDLCUDA, 0)), + (DummyHostMemoryResource, (DLDeviceType.kDLCPU, 0)), + (DummyUnifiedMemoryResource, (DLDeviceType.kDLCUDAHost, 0)), + (DummyPinnedMemoryResource, (DLDeviceType.kDLCUDAHost, 0)), + ], +) +def test_buffer_dunder_dlpack_device_success(DummyMR, expected): + device = Device() + device.set_current() + dummy_mr = DummyMR() if DummyMR is DummyHostMemoryResource else DummyMR(device) + buffer = dummy_mr.allocate(size=1024) + assert buffer.__dlpack_device__() == expected + + +def test_buffer_dunder_dlpack_device_failure(): + dummy_mr = NullMemoryResource() + buffer = dummy_mr.allocate(size=1024) + with pytest.raises(BufferError, match=r"^buffer is neither device-accessible nor host-accessible$"): + buffer.__dlpack_device__() diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 05132a201..eaa212a13 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -87,9 +87,11 @@ def test_program_init_valid_code_type(): def test_program_init_invalid_code_type(): - code = 'extern "C" __global__ void my_kernel() {}' - with pytest.raises(NotImplementedError): - Program(code, "python") + code = "goto 100" + with pytest.raises( + RuntimeError, match=r"^Unsupported code_type='fortran' \(supported_code_types=\('c\+\+', 'ptx'\)\)$" + ): + Program(code, "FORTRAN") def test_program_init_invalid_code_format(): diff --git 
index c1ebc3d2e..11cf02fa4 100644
--- a/cuda_core/tests/test_stream.py
+++ b/cuda_core/tests/test_stream.py
@@ -11,7 +11,7 @@
 from cuda.core.experimental import Device, Stream, StreamOptions
 from cuda.core.experimental._event import Event
 from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream
-from cuda.core.experimental._utils import driver
+from cuda.core.experimental._utils.cuda_utils import driver
 
 
 def test_stream_init_disabled():
diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py
index 7a39388ff..e8d3d8355 100644
--- a/cuda_core/tests/test_system.py
+++ b/cuda_core/tests/test_system.py
@@ -5,7 +5,7 @@
 from typing import Tuple
 
 from cuda import cudart as runtime
 from cuda.core.experimental import Device, system
-from cuda.core.experimental._utils import handle_return
+from cuda.core.experimental._utils.cuda_utils import handle_return
 
 def test_system_singleton():
diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py
index 0926a549d..eea7bd1e5 100644
--- a/cuda_core/tests/test_utils.py
+++ b/cuda_core/tests/test_utils.py
@@ -13,10 +13,39 @@
 import numpy as np
 import pytest
 
+import cuda.core.experimental
 from cuda.core.experimental import Device
 from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory
 
 
+def test_cast_to_3_tuple_success():
+    c3t = cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple
+    assert c3t("", ()) == (1, 1, 1)
+    assert c3t("", 2) == (2, 1, 1)
+    assert c3t("", (2,)) == (2, 1, 1)
+    assert c3t("", (2, 3)) == (2, 3, 1)
+    assert c3t("", (2, 3, 4)) == (2, 3, 4)
+
+
+_cast_to_3_tuple_value_error_test_cases = {
+    "not tuple": ([], r"^Lbl must be an int, or a tuple with up to 3 ints \(got .*\)$"),
+    "len 4": ((1, 2, 3, 4), r"^Lbl must be an int, or a tuple with up to 3 ints \(got tuple with length 4\)$"),
+    "not int": (("bAd",), r"^Lbl must be an int, or a tuple with up to 3 ints \(got \('bAd',\)\)$"),
+    "isolated negative": (-9, r"^Lbl value must be >= 1 \(got -9\)$"),
+    "tuple negative": ((-9,), r"^Lbl value must be >= 1 \(got \(-9,\)\)$"),
+}
+
+
+@pytest.mark.parametrize(
+    ("cfg", "expected"),
+    _cast_to_3_tuple_value_error_test_cases.values(),
+    ids=_cast_to_3_tuple_value_error_test_cases.keys(),
+)
+def test_cast_to_3_tuple_value_error(cfg, expected):
+    with pytest.raises(ValueError, match=expected):
+        cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple("Lbl", cfg)
+
+
 def convert_strides_to_counts(strides, itemsize):
     return tuple(s // itemsize for s in strides)