diff --git a/cuda_core/cuda/core/experimental/_context.py b/cuda_core/cuda/core/experimental/_context.py
index e59e94db3..ad2e4b94c 100644
--- a/cuda_core/cuda/core/experimental/_context.py
+++ b/cuda_core/cuda/core/experimental/_context.py
@@ -4,7 +4,8 @@
 
 from dataclasses import dataclass
 
-from cuda.core.experimental._utils import driver
+from cuda.core.experimental._utils.clear_error_support import assert_type
+from cuda.core.experimental._utils.cuda_utils import driver
 
 
 @dataclass
@@ -20,7 +21,7 @@ def __new__(self, *args, **kwargs):
 
     @classmethod
     def _from_ctx(cls, obj, dev_id):
-        assert isinstance(obj, driver.CUcontext)
+        assert_type(obj, driver.CUcontext)
         ctx = super().__new__(cls)
         ctx._handle = obj
         ctx._id = dev_id
diff --git a/cuda_core/cuda/core/experimental/_device.py b/cuda_core/cuda/core/experimental/_device.py
index 2eb827d38..1a98794f9 100644
--- a/cuda_core/cuda/core/experimental/_device.py
+++ b/cuda_core/cuda/core/experimental/_device.py
@@ -8,7 +8,15 @@
 from cuda.core.experimental._context import Context, ContextOptions
 from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource
 from cuda.core.experimental._stream import Stream, StreamOptions, default_stream
-from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime
+from cuda.core.experimental._utils.clear_error_support import assert_type
+from cuda.core.experimental._utils.cuda_utils import (
+    ComputeCapability,
+    CUDAError,
+    driver,
+    handle_return,
+    precondition,
+    runtime,
+)
 
 _tls = threading.local()
 _lock = threading.Lock()
@@ -949,10 +957,11 @@ def __new__(cls, device_id=None):
         # important: creating a Device instance does not initialize the GPU!
         if device_id is None:
             device_id = handle_return(runtime.cudaGetDevice())
-            assert isinstance(device_id, int), f"{device_id=}"
+            assert_type(device_id, int)
         else:
             total = handle_return(runtime.cudaGetDeviceCount())
-            if not isinstance(device_id, int) or not (0 <= device_id < total):
+            assert_type(device_id, int)
+            if not (0 <= device_id < total):
                 raise ValueError(f"device_id must be within [0, {total}), got {device_id}")
 
         # ensure Device is singleton
@@ -981,7 +990,9 @@ def __new__(cls, device_id=None):
 
     def _check_context_initialized(self, *args, **kwargs):
         if not self._has_inited:
-            raise CUDAError("the device is not yet initialized, perhaps you forgot to call .set_current() first?")
+            raise CUDAError(
+                f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?"
+            )
 
     @property
     def device_id(self) -> int:
@@ -1053,7 +1064,8 @@ def context(self) -> Context:
 
         """
         ctx = handle_return(driver.cuCtxGetCurrent())
-        assert int(ctx) != 0
+        if int(ctx) == 0:
+            raise CUDAError("No context is bound to the calling CPU thread.")
         return Context._from_ctx(ctx, self._id)
 
     @property
@@ -1063,8 +1075,7 @@ def memory_resource(self) -> MemoryResource:
 
     @memory_resource.setter
     def memory_resource(self, mr):
-        if not isinstance(mr, MemoryResource):
-            raise TypeError
+        assert_type(mr, MemoryResource)
         self._mr = mr
 
     @property
@@ -1118,12 +1129,11 @@ def set_current(self, ctx: Context = None) -> Union[Context, None]:
 
         """
         if ctx is not None:
-            if not isinstance(ctx, Context):
-                raise TypeError("a Context object is required")
+            assert_type(ctx, Context)
             if ctx._id != self._id:
                 raise RuntimeError(
-                    "the provided context was created on a different "
-                    f"device {ctx._id} other than the target {self._id}"
+                    "the provided context was created on the device with"
+                    f" id={ctx._id}, which is different from the target id={self._id}"
                 )
             prev_ctx = handle_return(driver.cuCtxPopCurrent())
             handle_return(driver.cuCtxPushCurrent(ctx._handle))
@@ -1165,7 +1175,7 @@ def create_context(self, options: ContextOptions = None) -> Context:
             Newly created context object.
 
         """
-        raise NotImplementedError("TODO")
+        raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189")
 
     @precondition(_check_context_initialized)
     def create_stream(self, obj=None, options: StreamOptions = None) -> Stream:
diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py
index bd97305a6..54468f238 100644
--- a/cuda_core/cuda/core/experimental/_event.py
+++ b/cuda_core/cuda/core/experimental/_event.py
@@ -8,7 +8,7 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
-from cuda.core.experimental._utils import CUDAError, check_or_create_options, driver, handle_return
+from cuda.core.experimental._utils.cuda_utils import check_or_create_options, driver, handle_return
 
 if TYPE_CHECKING:
     import cuda.bindings
@@ -88,7 +88,7 @@ def _init(cls, options: Optional[EventOptions] = None):
                 flags |= driver.CUevent_flags.CU_EVENT_BLOCKING_SYNC
                 self._busy_waited = True
             if options.support_ipc:
-                raise NotImplementedError("TODO")
+                raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103")
         self._mnff.handle = handle_return(driver.cuEventCreate(flags))
         return self
 
@@ -109,7 +109,7 @@ def is_sync_busy_waited(self) -> bool:
     @property
     def is_ipc_supported(self) -> bool:
         """Return True if this event can be used as an interprocess event, otherwise False."""
-        raise NotImplementedError("TODO")
+        raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103")
 
     def sync(self):
         """Synchronize until the event completes.
@@ -129,10 +129,9 @@ def is_done(self) -> bool:
         (result,) = driver.cuEventQuery(self._mnff.handle)
         if result == driver.CUresult.CUDA_SUCCESS:
             return True
-        elif result == driver.CUresult.CUDA_ERROR_NOT_READY:
+        if result == driver.CUresult.CUDA_ERROR_NOT_READY:
             return False
-        else:
-            raise CUDAError(f"unexpected error: {result}")
+        handle_return((result,))
 
     @property
     def handle(self) -> cuda.bindings.driver.CUevent:
diff --git a/cuda_core/cuda/core/experimental/_launcher.py b/cuda_core/cuda/core/experimental/_launcher.py
index bdf498fb0..71f55bb23 100644
--- a/cuda_core/cuda/core/experimental/_launcher.py
+++ b/cuda_core/cuda/core/experimental/_launcher.py
@@ -9,7 +9,15 @@
 from cuda.core.experimental._kernel_arg_handler import ParamHolder
 from cuda.core.experimental._module import Kernel
 from cuda.core.experimental._stream import Stream
-from cuda.core.experimental._utils import CUDAError, check_or_create_options, driver, get_binding_version, handle_return
+from cuda.core.experimental._utils.clear_error_support import assert_type
+from cuda.core.experimental._utils.cuda_utils import (
+    CUDAError,
+    cast_to_3_tuple,
+    check_or_create_options,
+    driver,
+    get_binding_version,
+    handle_return,
+)
 
 # TODO: revisit this treatment for py313t builds
 _inited = False
@@ -59,41 +67,23 @@ class LaunchConfig:
 
     def __post_init__(self):
         _lazy_init()
-        self.grid = self._cast_to_3_tuple(self.grid)
-        self.block = self._cast_to_3_tuple(self.block)
+        self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
+        self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
         # thread block clusters are supported starting H100
         if self.cluster is not None:
            if not _use_ex:
-                raise CUDAError("thread block clusters require cuda.bindings & driver 11.8+")
-            if Device().compute_capability < (9, 0):
-                raise CUDAError("thread block clusters are not supported on devices with compute capability < 9.0")
-            self.cluster = self._cast_to_3_tuple(self.cluster)
+                err, drvers = driver.cuDriverGetVersion()
+                drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
+                raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
+            cc = Device().compute_capability
+            if cc < (9, 0):
+                raise CUDAError(
+                    f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
+                )
+            self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
         if self.shmem_size is None:
             self.shmem_size = 0
 
-    def _cast_to_3_tuple(self, cfg):
-        if isinstance(cfg, int):
-            if cfg < 1:
-                raise ValueError
-            return (cfg, 1, 1)
-        elif isinstance(cfg, tuple):
-            size = len(cfg)
-            if size == 1:
-                cfg = cfg[0]
-                if cfg < 1:
-                    raise ValueError
-                return (cfg, 1, 1)
-            elif size == 2:
-                if cfg[0] < 1 or cfg[1] < 1:
-                    raise ValueError
-                return (*cfg, 1)
-            elif size == 3:
-                if cfg[0] < 1 or cfg[1] < 1 or cfg[2] < 1:
-                    raise ValueError
-                return cfg
-            else:
-                raise ValueError
-
 
 def launch(stream, config, kernel, *kernel_args):
     """Launches a :obj:`~_module.Kernel`
@@ -120,9 +110,10 @@
         try:
             stream = Stream._init(stream)
         except Exception as e:
-            raise ValueError("stream must either be a Stream object or support __cuda_stream__") from e
-    if not isinstance(kernel, Kernel):
-        raise ValueError
+            raise ValueError(
+                f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})"
+            ) from e
+    assert_type(kernel, Kernel)
     config = check_or_create_options(LaunchConfig, config, "launch config")
 
     # TODO: can we ensure kernel_args is valid/safe to use here?
diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
index 43d1eb3c6..42dd2c0ba 100644
--- a/cuda_core/cuda/core/experimental/_linker.py
+++ b/cuda_core/cuda/core/experimental/_linker.py
@@ -16,7 +16,8 @@
 
 from cuda.core.experimental._device import Device
 from cuda.core.experimental._module import ObjectCode
-from cuda.core.experimental._utils import check_or_create_options, driver, handle_return, is_sequence
+from cuda.core.experimental._utils.clear_error_support import assert_type
+from cuda.core.experimental._utils.cuda_utils import check_or_create_options, driver, handle_return, is_sequence
 
 # TODO: revisit this treatment for py313t builds
 _driver = None  # populated if nvJitLink cannot be used
@@ -382,12 +383,12 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None):
         self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink)
 
         for code in object_codes:
-            assert isinstance(code, ObjectCode)
+            assert_type(code, ObjectCode)
             self._add_code_object(code)
 
     def _add_code_object(self, object_code: ObjectCode):
         data = object_code._module
-        assert isinstance(data, bytes)
+        assert_type(data, bytes)
         with _exception_manager(self):
             if _nvjitlink:
                 _nvjitlink.add_data(
diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py
index 403ee0842..6a0c611d3 100644
--- a/cuda_core/cuda/core/experimental/_memory.py
+++ b/cuda_core/cuda/core/experimental/_memory.py
@@ -10,7 +10,7 @@
 
 from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
 from cuda.core.experimental._stream import default_stream
-from cuda.core.experimental._utils import driver, handle_return
+from cuda.core.experimental._utils.cuda_utils import driver, handle_return
 
 PyCapsule = TypeVar("PyCapsule")
 
@@ -100,21 +100,21 @@ def is_device_accessible(self) -> bool:
         """Return True if this buffer can be accessed by the GPU, otherwise False."""
         if self._mnff.mr is not None:
             return self._mnff.mr.is_device_accessible
-        raise NotImplementedError
+        raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
 
     @property
     def is_host_accessible(self) -> bool:
         """Return True if this buffer can be accessed by the CPU, otherwise False."""
         if self._mnff.mr is not None:
             return self._mnff.mr.is_host_accessible
-        raise NotImplementedError
+        raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
 
     @property
     def device_id(self) -> int:
         """Return the device ordinal of this buffer."""
         if self._mnff.mr is not None:
             return self._mnff.mr.device_id
-        raise NotImplementedError
+        raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")
 
     def copy_to(self, dst: Buffer = None, *, stream) -> Buffer:
         """Copy from this buffer to the dst buffer asynchronously on the given stream.
@@ -136,10 +136,12 @@ def copy_to(self, dst: Buffer = None, *, stream) -> Buffer:
             raise ValueError("stream must be provided")
         if dst is None:
             if self._mnff.mr is None:
-                raise ValueError("a destination buffer must be provided")
+                raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)")
             dst = self._mnff.mr.allocate(self._mnff.size, stream)
         if dst._mnff.size != self._mnff.size:
-            raise ValueError("buffer sizes mismatch between src and dst")
+            raise ValueError(
+                f"buffer sizes mismatch between src and dst (sizes are: src={self._mnff.size}, dst={dst._mnff.size})"
+            )
         handle_return(driver.cuMemcpyAsync(dst._mnff.ptr, self._mnff.ptr, self._mnff.size, stream.handle))
         return dst
 
@@ -158,7 +160,9 @@ def copy_from(self, src: Buffer, *, stream):
         if stream is None:
             raise ValueError("stream must be provided")
         if src._mnff.size != self._mnff.size:
-            raise ValueError("buffer sizes mismatch between src and dst")
+            raise ValueError(
+                f"buffer sizes mismatch between src and dst (sizes are: src={src._mnff.size}, dst={self._mnff.size})"
+            )
         handle_return(driver.cuMemcpyAsync(self._mnff.ptr, src._mnff.ptr, self._mnff.size, stream.handle))
 
     def __dlpack__(
@@ -171,37 +175,40 @@ def __dlpack__(
     ) -> PyCapsule:
         # Note: we ignore the stream argument entirely (as if it is -1).
         # It is the user's responsibility to maintain stream order.
-        if dl_device is not None or copy is True:
-            raise BufferError
+        if dl_device is not None:
+            raise BufferError("Sorry, not supported: dl_device other than None")
+        if copy is True:
+            raise BufferError("Sorry, not supported: copy=True")
         if max_version is None:
             versioned = False
         else:
-            assert len(max_version) == 2
+            if not isinstance(max_version, tuple) or len(max_version) != 2:
+                raise BufferError(f"Expected max_version Tuple[int, int], got {max_version}")
             versioned = max_version >= (1, 0)
         capsule = make_py_capsule(self, versioned)
         return capsule
 
     def __dlpack_device__(self) -> Tuple[int, int]:
-        if self.is_device_accessible and not self.is_host_accessible:
+        d_h = (bool(self.is_device_accessible), bool(self.is_host_accessible))
+        if d_h == (True, False):
             return (DLDeviceType.kDLCUDA, self.device_id)
-        elif self.is_device_accessible and self.is_host_accessible:
+        if d_h == (True, True):
             # TODO: this can also be kDLCUDAManaged, we need more fine-grained checks
             return (DLDeviceType.kDLCUDAHost, 0)
-        elif not self.is_device_accessible and self.is_host_accessible:
+        if d_h == (False, True):
             return (DLDeviceType.kDLCPU, 0)
-        else:  # not self.is_device_accessible and not self.is_host_accessible
-            raise BufferError("invalid buffer")
+        raise BufferError("buffer is neither device-accessible nor host-accessible")
 
     def __buffer__(self, flags: int, /) -> memoryview:
         # Support for Python-level buffer protocol as per PEP 688.
         # This raises a BufferError unless:
         # 1. Python is 3.12+
         # 2. This Buffer object is host accessible
-        raise NotImplementedError("TODO")
+        raise NotImplementedError("WIP: Buffer.__buffer__ hasn't been implemented yet.")
 
     def __release_buffer__(self, buffer: memoryview, /):
         # Supporting method paired with __buffer__.
- raise NotImplementedError("TODO") + raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.") class MemoryResource(abc.ABC): @@ -291,7 +298,7 @@ def is_host_accessible(self) -> bool: @property def device_id(self) -> int: - raise RuntimeError("the pinned memory resource is not bound to any GPU") + raise RuntimeError("a pinned memory resource is not bound to any GPU") class _SynchronousMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index 7ebfa4806..04a0ca35b 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -11,7 +11,7 @@ from typing import Any, Optional import numpy -from cuda.core.experimental._utils import handle_return, driver +from cuda.core.experimental._utils.cuda_utils import handle_return, driver # TODO(leofang): support NumPy structured dtypes diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py index 39a7c0f5b..320b5b603 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/experimental/_module.py @@ -5,7 +5,12 @@ from typing import Optional, Union from warnings import warn -from cuda.core.experimental._utils import driver, get_binding_version, handle_return, precondition +from cuda.core.experimental._utils.clear_error_support import ( + assert_type, + assert_type_str_or_bytes, + raise_code_path_meant_to_be_unreachable, +) +from cuda.core.experimental._utils.cuda_utils import driver, get_binding_version, handle_return, precondition _backend = { "old": { @@ -195,8 +200,8 @@ def __new__(self, *args, **kwargs): @classmethod def _from_obj(cls, obj, mod): - assert isinstance(obj, _kernel_ctypes) - assert isinstance(mod, ObjectCode) + assert_type(obj, _kernel_ctypes) + assert_type(mod, ObjectCode) ker = super().__new__(cls) ker._handle = obj ker._module = mod @@ -300,17 +305,20 @@ def _lazy_load_module(self, *args, **kwargs): if self._handle is not None: return module = self._module + assert_type_str_or_bytes(module) if isinstance(module, str): if self._backend_version == "new": self._handle = handle_return(self._loader["file"](module.encode(), [], [], 0, [], [], 0)) else: # "old" backend self._handle = handle_return(self._loader["file"](module.encode())) - else: - assert isinstance(module, bytes) + return + if isinstance(module, bytes): if self._backend_version == "new": self._handle = handle_return(self._loader["data"](module, [], [], 0, [], [], 0)) else: # "old" backend self._handle = handle_return(self._loader["data"](module, 0, [], [])) + return + raise_code_path_meant_to_be_unreachable() @precondition(_lazy_load_module) def get_kernel(self, name) -> Kernel: @@ -327,8 +335,9 @@ def get_kernel(self, name) -> Kernel: Newly created kernel object. 
""" - if self._code_type not in ("cubin", "ptx", "fatbin"): - raise RuntimeError(f"get_kernel() is not supported for {self._code_type}") + supported_code_types = ("cubin", "ptx", "fatbin") + if self._code_type not in supported_code_types: + raise RuntimeError(f'Unsupported code type "{self._code_type}" ({supported_code_types=})') try: name = self._sym_map[name] except KeyError: diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py index 8e7fea245..ab9362531 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/experimental/_program.py @@ -15,7 +15,8 @@ from cuda.core.experimental._device import Device from cuda.core.experimental._linker import Linker, LinkerHandleT, LinkerOptions from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils import ( +from cuda.core.experimental._utils.clear_error_support import assert_type +from cuda.core.experimental._utils.cuda_utils import ( _handle_boolean_option, check_or_create_options, driver, @@ -26,6 +27,30 @@ ) +def _process_define_macro_inner(formatted_options, macro): + if isinstance(macro, str): + formatted_options.append(f"--define-macro={macro}") + return True + if isinstance(macro, tuple): + if len(macro) != 2 or any(not isinstance(val, str) for val in macro): + raise RuntimeError(f"Expected define_macro Tuple[str, str], got {macro}") + formatted_options.append(f"--define-macro={macro[0]}={macro[1]}") + return True + return False + + +def _process_define_macro(formatted_options, macro): + union_type = "Union[str, Tuple[str, str]]" + if _process_define_macro_inner(formatted_options, macro): + return + if is_nested_sequence(macro): + for seq_macro in macro: + if not _process_define_macro_inner(formatted_options, seq_macro): + raise RuntimeError(f"Expected define_macro {union_type}, got {seq_macro}") + return + raise RuntimeError(f"Expected define_macro {union_type}, List[{union_type}], got {macro}") + + @dataclass class ProgramOptions: """Customizable options for configuring `Program`. 
@@ -242,19 +267,7 @@ def __post_init__(self):
         if self.gen_opt_lto is not None and self.gen_opt_lto:
             self._formatted_options.append("--gen-opt-lto")
         if self.define_macro is not None:
-            if isinstance(self.define_macro, str):
-                self._formatted_options.append(f"--define-macro={self.define_macro}")
-            elif isinstance(self.define_macro, tuple):
-                assert len(self.define_macro) == 2
-                self._formatted_options.append(f"--define-macro={self.define_macro[0]}={self.define_macro[1]}")
-            elif is_nested_sequence(self.define_macro):
-                for macro in self.define_macro:
-                    if isinstance(macro, tuple):
-                        assert len(macro) == 2
-                        self._formatted_options.append(f"--define-macro={macro[0]}={macro[1]}")
-                    else:
-                        self._formatted_options.append(f"--define-macro={macro}")
-
+            _process_define_macro(self._formatted_options, self.define_macro)
         if self.undefine_macro is not None:
             if isinstance(self.undefine_macro, str):
                 self._formatted_options.append(f"--undefine-macro={self.undefine_macro}")
@@ -371,8 +384,6 @@ def close(self):
             self.handle = None
 
     __slots__ = ("__weakref__", "_mnff", "_backend", "_linker", "_options")
-    _supported_code_type = ("c++", "ptx")
-    _supported_target_type = ("ptx", "cubin", "ltoir")
 
     def __init__(self, code, code_type, options: ProgramOptions = None):
         self._mnff = Program._MembersNeededForFinalize(self, None)
@@ -380,12 +391,8 @@
         self._options = options = check_or_create_options(ProgramOptions, options, "Program options")
 
         code_type = code_type.lower()
-        if code_type not in self._supported_code_type:
-            raise NotImplementedError
-
         if code_type == "c++":
-            if not isinstance(code, str):
-                raise TypeError("c++ Program expects code argument to be a string")
+            assert_type(code, str)
             # TODO: support pre-loaded headers & include names
             # TODO: allow tuples once NVIDIA/cuda-python#72 is resolved
@@ -394,14 +401,15 @@
             self._linker = None
 
         elif code_type == "ptx":
-            if not isinstance(code, str):
-                raise TypeError("ptx Program expects code argument to be a string")
+            assert_type(code, str)
             self._linker = Linker(
                 ObjectCode._init(code.encode(), code_type), options=self._translate_program_options(options)
             )
             self._backend = self._linker.backend
         else:
-            raise NotImplementedError
+            supported_code_types = ("c++", "ptx")
+            assert code_type not in supported_code_types, f"{code_type=}"
+            raise RuntimeError(f"Unsupported {code_type=} ({supported_code_types=})")
 
     def _translate_program_options(self, options: ProgramOptions) -> LinkerOptions:
         return LinkerOptions(
@@ -453,8 +461,9 @@ def compile(self, target_type, name_expressions=(), logs=None):
             Newly created code object.
 
""" - if target_type not in self._supported_target_type: - raise ValueError(f"the target type {target_type} is not supported") + supported_target_types = ("ptx", "cubin", "ltoir") + if target_type not in supported_target_types: + raise ValueError(f'Unsupported target_type="{target_type}" ({supported_target_types=})') if self._backend == "NVRTC": if target_type == "ptx" and not self._can_load_generated_ptx(): @@ -498,7 +507,9 @@ def compile(self, target_type, name_expressions=(), logs=None): return ObjectCode._init(data, target_type, symbol_mapping=symbol_mapping) - assert self._backend in ("nvJitLink", "driver") + supported_backends = ("nvJitLink", "driver") + if self._backend not in supported_backends: + raise ValueError(f'Unsupported backend="{self._backend}" ({supported_backends=})') return self._linker.link(target_type) @property diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py index 95b59595d..237bcf92b 100644 --- a/cuda_core/cuda/core/experimental/_stream.py +++ b/cuda_core/cuda/core/experimental/_stream.py @@ -15,7 +15,14 @@ from cuda.core.experimental._device import Device from cuda.core.experimental._context import Context from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._utils import check_or_create_options, driver, get_device_from_ctx, handle_return, runtime +from cuda.core.experimental._utils.clear_error_support import assert_type +from cuda.core.experimental._utils.cuda_utils import ( + check_or_create_options, + driver, + get_device_from_ctx, + handle_return, + runtime, +) @dataclass @@ -107,20 +114,34 @@ def _init(cls, obj=None, *, options: Optional[StreamOptions] = None): if obj is not None and options is not None: raise ValueError("obj and options cannot be both specified") if obj is not None: - try: - info = obj.__cuda_stream__() - except AttributeError as e: - raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' method") from e - except TypeError: - info = obj.__cuda_stream__ + cuda_stream_attr = getattr(obj, "__cuda_stream__", None) + if cuda_stream_attr is None: + raise TypeError(f"{type(obj)} object does not have a '__cuda_stream__' attribute") + if callable(cuda_stream_attr): + info = cuda_stream_attr() + else: + info = cuda_stream_attr warnings.simplefilter("once", DeprecationWarning) warnings.warn( "Implementing __cuda_stream__ as an attribute is deprecated; it must be implemented as a method", stacklevel=3, category=DeprecationWarning, ) + try: + len_info = len(info) + except Exception as e: + raise RuntimeError( + f"obj.__cuda_stream__ must return a sequence with 2 elements, got {type(info)}" + ) from e + if len_info != 2: + raise RuntimeError( + f"obj.__cuda_stream__ must return a sequence with 2 elements, got {len_info} elements" + ) + if info[0] != 0: + raise RuntimeError( + f"The first element of the sequence returned by obj.__cuda_stream__ must be 0, got {repr(info[0])}" + ) - assert info[0] == 0 self._mnff.handle = driver.CUstream(info[1]) # TODO: check if obj is created under the current context/device self._mnff.owner = obj @@ -218,8 +239,7 @@ def record(self, event: Event = None, options: EventOptions = None) -> Event: # and CU_EVENT_RECORD_EXTERNAL, can be set in EventOptions. 
         if event is None:
             event = Event._init(options)
-        elif not isinstance(event, Event):
-            raise TypeError("record only takes an Event object")
+        assert_type(event, Event)
         handle_return(driver.cuEventRecord(event.handle, self._mnff.handle))
         return event
 
@@ -237,13 +257,16 @@ def wait(self, event_or_stream: Union[Event, Stream]):
             event = event_or_stream.handle
             discard_event = False
         else:
-            if not isinstance(event_or_stream, Stream):
+            if isinstance(event_or_stream, Stream):
+                stream = event_or_stream
+            else:
                 try:
                     stream = Stream._init(event_or_stream)
                 except Exception as e:
-                    raise ValueError("only an Event, Stream, or object supporting __cuda_stream__ can be waited") from e
-            else:
-                stream = event_or_stream
+                    raise ValueError(
+                        "only an Event, Stream, or object supporting __cuda_stream__ can be waited,"
+                        f" got {type(event_or_stream)}"
+                    ) from e
             event = handle_return(driver.cuEventCreate(driver.CUevent_flags.CU_EVENT_DISABLE_TIMING))
             handle_return(driver.cuEventRecord(event, stream.handle))
             discard_event = True
diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/experimental/_system.py
index 548701be5..a74b15ed7 100644
--- a/cuda_core/cuda/core/experimental/_system.py
+++ b/cuda_core/cuda/core/experimental/_system.py
@@ -5,7 +5,7 @@
 from typing import Tuple
 
 from cuda.core.experimental._device import Device
-from cuda.core.experimental._utils import driver, handle_return, runtime
+from cuda.core.experimental._utils.cuda_utils import driver, handle_return, runtime
 
 
 class System:
diff --git a/cuda_core/cuda/core/experimental/_utils/__init__.py b/cuda_core/cuda/core/experimental/_utils/__init__.py
new file mode 100644
index 000000000..174c85e9b
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_utils/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
diff --git a/cuda_core/cuda/core/experimental/_utils/clear_error_support.py b/cuda_core/cuda/core/experimental/_utils/clear_error_support.py
new file mode 100644
index 000000000..b430e6ccb
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_utils/clear_error_support.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
+
+
+def assert_type(obj, expected_type):
+    """Ensure obj is of expected_type, else raise TypeError with a clear message."""
+    if not isinstance(obj, expected_type):
+        raise TypeError(f"Expected type {expected_type.__name__}, but got {type(obj).__name__}")
+
+
+def assert_type_str_or_bytes(obj):
+    """Ensure obj is of type str or bytes, else raise TypeError with a clear message."""
+    if not isinstance(obj, (str, bytes)):
+        raise TypeError(f"Expected type str or bytes, but got {type(obj).__name__}")
+
+
+def raise_code_path_meant_to_be_unreachable():
+    raise RuntimeError("This code path is meant to be unreachable.")
diff --git a/cuda_core/cuda/core/experimental/_utils.py b/cuda_core/cuda/core/experimental/_utils/cuda_utils.py
similarity index 87%
rename from cuda_core/cuda/core/experimental/_utils.py
rename to cuda_core/cuda/core/experimental/_utils/cuda_utils.py
index 3538ae6c1..1bc66a24a 100644
--- a/cuda_core/cuda/core/experimental/_utils.py
+++ b/cuda_core/cuda/core/experimental/_utils/cuda_utils.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE
@@ -27,6 +27,24 @@ class NVRTCError(CUDAError):
 ComputeCapability = namedtuple("ComputeCapability", ("major", "minor"))
 
 
+def cast_to_3_tuple(label, cfg):
+    cfg_orig = cfg
+    if isinstance(cfg, int):
+        cfg = (cfg,)
+    else:
+        common = "must be an int, or a tuple with up to 3 ints"
+        if not isinstance(cfg, tuple):
+            raise ValueError(f"{label} {common} (got {type(cfg)})")
+        if len(cfg) > 3:
+            raise ValueError(f"{label} {common} (got tuple with length {len(cfg)})")
+        if any(not isinstance(val, int) for val in cfg):
+            raise ValueError(f"{label} {common} (got {cfg})")
+    if any(val < 1 for val in cfg):
+        plural_s = "" if len(cfg) == 1 else "s"
+        raise ValueError(f"{label} value{plural_s} must be >= 1 (got {cfg_orig})")
+    return cfg + (1,) * (3 - len(cfg))
+
+
 def _check_error(error, handle=None):
     if isinstance(error, driver.CUresult):
         if error == driver.CUresult.CUDA_SUCCESS:
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 934a0e852..889372417 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -17,7 +17,7 @@
 import pytest
 
 from cuda.core.experimental import Device, _device
-from cuda.core.experimental._utils import handle_return
+from cuda.core.experimental._utils.cuda_utils import handle_return
 
 
 @pytest.fixture(scope="session", autouse=True)
diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py
index 1d20adfed..60fd75b01 100644
--- a/cuda_core/tests/test_device.py
+++ b/cuda_core/tests/test_device.py
@@ -15,7 +15,7 @@
 
 import cuda.core.experimental
 from cuda.core.experimental import Device
-from cuda.core.experimental._utils import ComputeCapability, get_binding_version, handle_return
+from cuda.core.experimental._utils.cuda_utils import ComputeCapability, get_binding_version, handle_return
 
 
 def test_device_init_disabled():
diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py
index 984ef7ad0..43fc9e5fa 100644
--- a/cuda_core/tests/test_launcher.py
+++ b/cuda_core/tests/test_launcher.py
@@ -23,18 +23,6 @@ def test_launch_config_init(init_cuda):
     assert config.shmem_size == 1024
 
 
-def test_launch_config_cast_to_3_tuple():
-    config = LaunchConfig(grid=1, block=1)
-    assert config._cast_to_3_tuple(1) == (1, 1, 1)
-    assert config._cast_to_3_tuple((1, 2)) == (1, 2, 1)
-    assert config._cast_to_3_tuple((1, 2, 3)) == (1, 2, 3)
-
-    # Edge cases
-    assert config._cast_to_3_tuple(999) == (999, 1, 1)
-    assert config._cast_to_3_tuple((999, 888)) == (999, 888, 1)
-    assert config._cast_to_3_tuple((999, 888, 777)) == (999, 888, 777)
-
-
 def test_launch_config_invalid_values():
     with pytest.raises(ValueError):
         LaunchConfig(grid=0, block=1)
@@ -69,10 +57,10 @@ def test_launch_invalid_values(init_cuda):
     with pytest.raises(ValueError):
         launch(None, ker, config)
 
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         launch(stream, None, config)
 
-    with pytest.raises(ValueError):
+    with pytest.raises(TypeError):
         launch(stream, ker, None)
 
     launch(stream, config, ker)
diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py
index 613afdb28..78195c2dc 100644
--- a/cuda_core/tests/test_linker.py
+++ b/cuda_core/tests/test_linker.py
@@ -6,7 +6,7 @@
 
 from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker
 from cuda.core.experimental._module import ObjectCode
-from cuda.core.experimental._utils import CUDAError
+from cuda.core.experimental._utils.cuda_utils import CUDAError
ARCH = "sm_" + "".join(f"{i}" for i in Device().compute_capability) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index a48db69b5..1ff728c64 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -13,9 +13,11 @@ import ctypes +import pytest + from cuda.core.experimental import Device -from cuda.core.experimental._memory import Buffer, MemoryResource -from cuda.core.experimental._utils import handle_return +from cuda.core.experimental._memory import Buffer, DLDeviceType, MemoryResource +from cuda.core.experimental._utils.cuda_utils import handle_return class DummyDeviceMemoryResource(MemoryResource): @@ -116,6 +118,12 @@ def device_id(self) -> int: raise RuntimeError("the pinned memory resource is not bound to any GPU") +class NullMemoryResource(DummyHostMemoryResource): + @property + def is_host_accessible(self) -> bool: + return False + + def buffer_initialization(dummy_mr: MemoryResource): buffer = dummy_mr.allocate(size=1024) assert buffer.handle != 0 @@ -211,3 +219,46 @@ def test_buffer_close(): buffer_close(DummyHostMemoryResource()) buffer_close(DummyUnifiedMemoryResource(device)) buffer_close(DummyPinnedMemoryResource(device)) + + +def test_buffer_dunder_dlpack(): + device = Device() + device.set_current() + dummy_mr = DummyDeviceMemoryResource(device) + buffer = dummy_mr.allocate(size=1024) + capsule = buffer.__dlpack__() + assert "dltensor" in repr(capsule) + capsule = buffer.__dlpack__(max_version=(1, 0)) + assert "dltensor" in repr(capsule) + with pytest.raises(BufferError, match=r"^Sorry, not supported: dl_device other than None$"): + buffer.__dlpack__(dl_device=[]) + with pytest.raises(BufferError, match=r"^Sorry, not supported: copy=True$"): + buffer.__dlpack__(copy=True) + with pytest.raises(BufferError, match=r"^Expected max_version Tuple\[int, int\], got \[\]$"): + buffer.__dlpack__(max_version=[]) + with pytest.raises(BufferError, match=r"^Expected max_version Tuple\[int, int\], got \(9, 8, 7\)$"): + buffer.__dlpack__(max_version=(9, 8, 7)) + + +@pytest.mark.parametrize( + ("DummyMR", "expected"), + [ + (DummyDeviceMemoryResource, (DLDeviceType.kDLCUDA, 0)), + (DummyHostMemoryResource, (DLDeviceType.kDLCPU, 0)), + (DummyUnifiedMemoryResource, (DLDeviceType.kDLCUDAHost, 0)), + (DummyPinnedMemoryResource, (DLDeviceType.kDLCUDAHost, 0)), + ], +) +def test_buffer_dunder_dlpack_device_success(DummyMR, expected): + device = Device() + device.set_current() + dummy_mr = DummyMR() if DummyMR is DummyHostMemoryResource else DummyMR(device) + buffer = dummy_mr.allocate(size=1024) + assert buffer.__dlpack_device__() == expected + + +def test_buffer_dunder_dlpack_device_failure(): + dummy_mr = NullMemoryResource() + buffer = dummy_mr.allocate(size=1024) + with pytest.raises(BufferError, match=r"^buffer is neither device-accessible nor host-accessible$"): + buffer.__dlpack_device__() diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 05132a201..eaa212a13 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -87,9 +87,11 @@ def test_program_init_valid_code_type(): def test_program_init_invalid_code_type(): - code = 'extern "C" __global__ void my_kernel() {}' - with pytest.raises(NotImplementedError): - Program(code, "python") + code = "goto 100" + with pytest.raises( + RuntimeError, match=r"^Unsupported code_type='fortran' \(supported_code_types=\('c\+\+', 'ptx'\)\)$" + ): + Program(code, "FORTRAN") def test_program_init_invalid_code_format(): diff --git 
index c1ebc3d2e..11cf02fa4 100644
--- a/cuda_core/tests/test_stream.py
+++ b/cuda_core/tests/test_stream.py
@@ -11,7 +11,7 @@
 from cuda.core.experimental import Device, Stream, StreamOptions
 from cuda.core.experimental._event import Event
 from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM, default_stream
-from cuda.core.experimental._utils import driver
+from cuda.core.experimental._utils.cuda_utils import driver
 
 
 def test_stream_init_disabled():
diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py
index 7a39388ff..e8d3d8355 100644
--- a/cuda_core/tests/test_system.py
+++ b/cuda_core/tests/test_system.py
@@ -5,7 +5,7 @@
 from typing import Tuple
 
 from cuda import cudart as runtime
 from cuda.core.experimental import Device, system
-from cuda.core.experimental._utils import handle_return
+from cuda.core.experimental._utils.cuda_utils import handle_return
 
 def test_system_singleton():
diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py
index 0926a549d..eea7bd1e5 100644
--- a/cuda_core/tests/test_utils.py
+++ b/cuda_core/tests/test_utils.py
@@ -13,10 +13,39 @@
 import numpy as np
 import pytest
 
+import cuda.core.experimental
 from cuda.core.experimental import Device
 from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory
 
 
+def test_cast_to_3_tuple_success():
+    c3t = cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple
+    assert c3t("", ()) == (1, 1, 1)
+    assert c3t("", 2) == (2, 1, 1)
+    assert c3t("", (2,)) == (2, 1, 1)
+    assert c3t("", (2, 3)) == (2, 3, 1)
+    assert c3t("", (2, 3, 4)) == (2, 3, 4)
+
+
+_cast_to_3_tuple_value_error_test_cases = {
+    "not tuple": ([], r"^Lbl must be an int, or a tuple with up to 3 ints \(got .*\)$"),
+    "len 4": ((1, 2, 3, 4), r"^Lbl must be an int, or a tuple with up to 3 ints \(got tuple with length 4\)$"),
+    "not int": (("bAd",), r"^Lbl must be an int, or a tuple with up to 3 ints \(got \('bAd',\)\)$"),
+    "isolated negative": (-9, r"^Lbl value must be >= 1 \(got -9\)$"),
+    "tuple negative": ((-9,), r"^Lbl value must be >= 1 \(got \(-9,\)\)$"),
+}
+
+
+@pytest.mark.parametrize(
+    ("cfg", "expected"),
+    _cast_to_3_tuple_value_error_test_cases.values(),
+    ids=_cast_to_3_tuple_value_error_test_cases.keys(),
+)
+def test_cast_to_3_tuple_value_error(cfg, expected):
+    with pytest.raises(ValueError, match=expected):
+        cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple("Lbl", cfg)
+
+
 def convert_strides_to_counts(strides, itemsize):
     return tuple(s // itemsize for s in strides)