Clearer error messages (cuda.core) #458

Merged: 48 commits, Mar 6, 2025
Commits (48)
877e515
WIP: cuda_core/tests/test_wip.py all CUresult error codes
rwgk Feb 21, 2025
e180023
Add test_runtime_error_info
rwgk Feb 21, 2025
64f8c11
Add URL for _RUNTIME_CUDA_ERROR_T_EXPLANATIONS
rwgk Feb 21, 2025
7e7aa10
Split long literal strings. (A one-off script was used.)
rwgk Feb 21, 2025
bc46cd6
-_DRIVER_CU_RESULT_EXPLAINED -> _DRIVER_CU_RESULT_EXPLANATIONS
rwgk Feb 21, 2025
eb7c4fd
Add test_nvrtc_error_info, make _driver_error_info, _runtime_error_in…
rwgk Feb 21, 2025
ed6531c
Harden tests. Exposes issue in _DRIVER_CU_RESULT_EXPLANATIONS.
rwgk Feb 21, 2025
f051fab
Fix _DRIVER_CU_RESULT_EXPLANATIONS
rwgk Feb 21, 2025
79ebe49
Merge branch 'main' into clearer_error_messages
rwgk Feb 21, 2025
bf3c776
`assert code not in expl_dict` only if driver >= 12
rwgk Feb 22, 2025
8bdec20
Merge branch 'main' into clearer_error_messages
rwgk Feb 25, 2025
c789bf6
Run some assertions only `if _BINDING_VERSION >= (12, 0)`
rwgk Feb 25, 2025
ebafbe7
Take out condition guarding against SegFaults
rwgk Feb 25, 2025
3c4f179
Merge branch 'main' into clearer_error_messages
rwgk Feb 25, 2025
471ce0c
Reactivate filtering on error codes that trigger SegFaults.
rwgk Feb 25, 2025
6784bdd
Fix bug in cuda_bindings/cuda/bindings/driver.pyx.in (fix to be upstr…
rwgk Feb 25, 2025
354f076
SMSGD/ACTNBL in _device.py, _context.py, _event.py, _launcher.py, _li…
rwgk Feb 26, 2025
2089ac8
Merge branch 'main' into clearer_error_messages
rwgk Feb 28, 2025
431aa54
SMSGD/ACTNBL in _module.py, _program.py, _stream.py
rwgk Feb 28, 2025
f713c82
Test coverage annotations for all ACTNBL
rwgk Feb 28, 2025
8777b54
Merge branch 'main' into clearer_error_messages
rwgk Mar 1, 2025
275432e
Remove all SMSGD tags
rwgk Mar 1, 2025
989b86a
Add _clear_error_support.py, with `assert_type()` as generated by Cha…
rwgk Mar 1, 2025
4f56d9a
Resolve 16 ACTNBL, using _clear_error_support, mainly assert_type()
rwgk Mar 1, 2025
30e76af
WIP: NotImplementedError sweep
rwgk Mar 2, 2025
43d3d16
Apply __new__ approach to disabling __init__
rwgk Mar 2, 2025
ea67650
Merge branch 'main' into clearer_error_messages
rwgk Mar 3, 2025
899af08
Merge branch 'main' into clearer_error_messages
rwgk Mar 3, 2025
7a9e537
Resolve ACTNBL in _device.py
rwgk Mar 3, 2025
7b156b4
Change raise_if_driver_error back to handle_return for now.
rwgk Mar 3, 2025
82c8828
Resolve ACTNBL in _event.py
rwgk Mar 3, 2025
e391a63
Resolve ACTNBL in _module.py
rwgk Mar 3, 2025
7180405
Resolve ACTNBL in _stream.py
rwgk Mar 3, 2025
f476326
Resolve ACTNBL in _launcher.py
rwgk Mar 3, 2025
d77865a
Reimplement cast_to_3_tuple() in _utils.py, add comprehensive unit te…
rwgk Mar 3, 2025
76e9f43
Resolve 2 ACTNBL in _program.py (2 still left)
rwgk Mar 3, 2025
1dca0bd
Merge branch 'main' into clearer_error_messages
rwgk Mar 3, 2025
b1a677e
Port _launcher.py changes back in after git merge main
rwgk Mar 3, 2025
688ea22
_program.py: Refactor logic for processing define_macro and add in in…
rwgk Mar 4, 2025
29c9fb8
Resolve ACTNBL in _memory.py
rwgk Mar 4, 2025
d064ae7
Full test coverage for Buffer.__dlpack__, __dlpack_device__
rwgk Mar 5, 2025
d3df80d
Back out _DRIVER_CU_RESULT_EXPLANATIONS, _RUNTIME_CUDA_ERROR_T_EXPLAN…
rwgk Mar 5, 2025
d74b1c1
Move _utils.py -> _utils/cuda_utils.py
rwgk Mar 5, 2025
ede5605
Move _clear_error_support.py -> _utils/clear_error_support.py
rwgk Mar 5, 2025
d6bf826
Change assert_type, assert_type_str_or_bytes to raise TypeError inste…
rwgk Mar 5, 2025
5dc2331
Undo change in cuda_bindings/cuda/bindings/driver.pyx.in, it is not n…
rwgk Mar 5, 2025
07d5ff4
Resolve `# EXRCSTHS` in test_program.py
rwgk Mar 5, 2025
54d2654
Remove `# EXRCSTHS` in _memory.py: the happy paths are now covered (v…
rwgk Mar 5, 2025
5 changes: 3 additions & 2 deletions cuda_core/cuda/core/experimental/_context.py
@@ -4,7 +4,8 @@

from dataclasses import dataclass

from cuda.core.experimental._utils import driver
from cuda.core.experimental._utils.clear_error_support import assert_type
from cuda.core.experimental._utils.cuda_utils import driver


@dataclass
@@ -20,7 +21,7 @@ def __new__(self, *args, **kwargs):

@classmethod
def _from_ctx(cls, obj, dev_id):
assert isinstance(obj, driver.CUcontext)
assert_type(obj, driver.CUcontext)
ctx = super().__new__(cls)
ctx._handle = obj
ctx._id = dev_id
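
The assert_type helper used above comes from the new cuda/core/experimental/_utils/clear_error_support.py module (see commits 989b86a and d6bf826). Its implementation is not shown in this diff; the following is only a minimal sketch of what such a helper could look like, inferred from the call sites and the commit messages, and the real message wording may differ:

def assert_type(obj, expected_type):
    # Raise TypeError (rather than AssertionError) with a descriptive message, so that
    # callers such as Context._from_ctx report what was actually passed in.
    if not isinstance(obj, expected_type):
        raise TypeError(f"Expected type {expected_type.__name__}, but got {type(obj).__name__} instead")
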
34 changes: 22 additions & 12 deletions cuda_core/cuda/core/experimental/_device.py
@@ -8,7 +8,15 @@
from cuda.core.experimental._context import Context, ContextOptions
from cuda.core.experimental._memory import Buffer, MemoryResource, _DefaultAsyncMempool, _SynchronousMemoryResource
from cuda.core.experimental._stream import Stream, StreamOptions, default_stream
from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime
from cuda.core.experimental._utils.clear_error_support import assert_type
from cuda.core.experimental._utils.cuda_utils import (
ComputeCapability,
CUDAError,
driver,
handle_return,
precondition,
runtime,
)

_tls = threading.local()
_lock = threading.Lock()
@@ -949,10 +957,11 @@ def __new__(cls, device_id=None):
# important: creating a Device instance does not initialize the GPU!
if device_id is None:
device_id = handle_return(runtime.cudaGetDevice())
assert isinstance(device_id, int), f"{device_id=}"
assert_type(device_id, int)
else:
total = handle_return(runtime.cudaGetDeviceCount())
if not isinstance(device_id, int) or not (0 <= device_id < total):
assert_type(device_id, int)
if not (0 <= device_id < total):
raise ValueError(f"device_id must be within [0, {total}), got {device_id}")

# ensure Device is singleton
@@ -981,7 +990,9 @@

def _check_context_initialized(self, *args, **kwargs):
if not self._has_inited:
raise CUDAError("the device is not yet initialized, perhaps you forgot to call .set_current() first?")
raise CUDAError(
f"Device {self._id} is not yet initialized, perhaps you forgot to call .set_current() first?"
)

@property
def device_id(self) -> int:
@@ -1053,7 +1064,8 @@ def context(self) -> Context:

"""
ctx = handle_return(driver.cuCtxGetCurrent())
assert int(ctx) != 0
if int(ctx) == 0:
raise CUDAError("No context is bound to the calling CPU thread.")
return Context._from_ctx(ctx, self._id)

@property
Expand All @@ -1063,8 +1075,7 @@ def memory_resource(self) -> MemoryResource:

@memory_resource.setter
def memory_resource(self, mr):
if not isinstance(mr, MemoryResource):
raise TypeError
assert_type(mr, MemoryResource)
self._mr = mr

@property
@@ -1118,12 +1129,11 @@ def set_current(self, ctx: Context = None) -> Union[Context, None]:

"""
if ctx is not None:
if not isinstance(ctx, Context):
raise TypeError("a Context object is required")
assert_type(ctx, Context)
if ctx._id != self._id:
raise RuntimeError(
"the provided context was created on a different "
f"device {ctx._id} other than the target {self._id}"
"the provided context was created on the device with"
f" id={ctx._id}, which is different from the target id={self._id}"
)
prev_ctx = handle_return(driver.cuCtxPopCurrent())
handle_return(driver.cuCtxPushCurrent(ctx._handle))
@@ -1165,7 +1175,7 @@ def create_context(self, options: ContextOptions = None) -> Context:
Newly created context object.

"""
raise NotImplementedError("TODO")
raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/189")

@precondition(_check_context_initialized)
def create_stream(self, obj=None, options: StreamOptions = None) -> Stream:
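
To illustrate the clearer Device errors, here is a hypothetical usage sketch; it assumes a machine with at least two visible GPUs, and the expected messages are the ones introduced in the diff above:

from cuda.core.experimental import Device

dev0 = Device(0)
dev0.set_current()
ctx = dev0.context   # context bound to device 0

# Using this context with a different device now reports both device ids:
Device(1).set_current(ctx)
# RuntimeError: the provided context was created on the device with id=0,
# which is different from the target id=1

# Calling a method that requires an initialized device now names the device:
Device(1).create_stream()
# CUDAError: Device 1 is not yet initialized, perhaps you forgot to call .set_current() first?
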
11 changes: 5 additions & 6 deletions cuda_core/cuda/core/experimental/_event.py
@@ -8,7 +8,7 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional

from cuda.core.experimental._utils import CUDAError, check_or_create_options, driver, handle_return
from cuda.core.experimental._utils.cuda_utils import check_or_create_options, driver, handle_return

if TYPE_CHECKING:
import cuda.bindings
@@ -88,7 +88,7 @@ def _init(cls, options: Optional[EventOptions] = None):
flags |= driver.CUevent_flags.CU_EVENT_BLOCKING_SYNC
self._busy_waited = True
if options.support_ipc:
raise NotImplementedError("TODO")
raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103")
self._mnff.handle = handle_return(driver.cuEventCreate(flags))
return self

@@ -109,7 +109,7 @@ def is_sync_busy_waited(self) -> bool:
@property
def is_ipc_supported(self) -> bool:
"""Return True if this event can be used as an interprocess event, otherwise False."""
raise NotImplementedError("TODO")
raise NotImplementedError("WIP: https://github.com/NVIDIA/cuda-python/issues/103")

def sync(self):
"""Synchronize until the event completes.
@@ -129,10 +129,9 @@ def is_done(self) -> bool:
(result,) = driver.cuEventQuery(self._mnff.handle)
if result == driver.CUresult.CUDA_SUCCESS:
return True
elif result == driver.CUresult.CUDA_ERROR_NOT_READY:
if result == driver.CUresult.CUDA_ERROR_NOT_READY:
return False
else:
raise CUDAError(f"unexpected error: {result}")
handle_return(result)

@property
def handle(self) -> cuda.bindings.driver.CUevent:
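
A short hypothetical sketch of how the updated Event.is_done behaves (it assumes a CUDA-capable device and uses Stream.record to obtain an Event):

from cuda.core.experimental import Device

device = Device()
device.set_current()
stream = device.create_stream()
event = stream.record()

# CUDA_SUCCESS -> True, CUDA_ERROR_NOT_READY -> False; any other CUresult is now
# passed to handle_return, which raises an error carrying the driver's own error
# name and description instead of the old generic "unexpected error" message.
print(event.is_done)
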
57 changes: 24 additions & 33 deletions cuda_core/cuda/core/experimental/_launcher.py
@@ -9,7 +9,15 @@
from cuda.core.experimental._kernel_arg_handler import ParamHolder
from cuda.core.experimental._module import Kernel
from cuda.core.experimental._stream import Stream
from cuda.core.experimental._utils import CUDAError, check_or_create_options, driver, get_binding_version, handle_return
from cuda.core.experimental._utils.clear_error_support import assert_type
from cuda.core.experimental._utils.cuda_utils import (
CUDAError,
cast_to_3_tuple,
check_or_create_options,
driver,
get_binding_version,
handle_return,
)

# TODO: revisit this treatment for py313t builds
_inited = False
@@ -59,41 +67,23 @@ class LaunchConfig:

def __post_init__(self):
_lazy_init()
self.grid = self._cast_to_3_tuple(self.grid)
self.block = self._cast_to_3_tuple(self.block)
self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
# thread block clusters are supported starting H100
if self.cluster is not None:
if not _use_ex:
raise CUDAError("thread block clusters require cuda.bindings & driver 11.8+")
if Device().compute_capability < (9, 0):
raise CUDAError("thread block clusters are not supported on devices with compute capability < 9.0")
self.cluster = self._cast_to_3_tuple(self.cluster)
err, drvers = driver.cuDriverGetVersion()
drvers_fmt = f" (got driver version {drvers})" if err == driver.CUresult.CUDA_SUCCESS else ""
raise CUDAError(f"thread block clusters require cuda.bindings & driver 11.8+{drvers_fmt}")
cc = Device().compute_capability
if cc < (9, 0):
raise CUDAError(
f"thread block clusters are not supported on devices with compute capability < 9.0 (got {cc})"
)
self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
if self.shmem_size is None:
self.shmem_size = 0

def _cast_to_3_tuple(self, cfg):
if isinstance(cfg, int):
if cfg < 1:
raise ValueError
return (cfg, 1, 1)
elif isinstance(cfg, tuple):
size = len(cfg)
if size == 1:
cfg = cfg[0]
if cfg < 1:
raise ValueError
return (cfg, 1, 1)
elif size == 2:
if cfg[0] < 1 or cfg[1] < 1:
raise ValueError
return (*cfg, 1)
elif size == 3:
if cfg[0] < 1 or cfg[1] < 1 or cfg[2] < 1:
raise ValueError
return cfg
else:
raise ValueError


def launch(stream, config, kernel, *kernel_args):
"""Launches a :obj:`~_module.Kernel`
@@ -120,9 +110,10 @@ def launch(stream, config, kernel, *kernel_args):
try:
stream = Stream._init(stream)
except Exception as e:
raise ValueError("stream must either be a Stream object or support __cuda_stream__") from e
if not isinstance(kernel, Kernel):
raise ValueError
raise ValueError(
f"stream must either be a Stream object or support __cuda_stream__ (got {type(stream)})"
) from e
assert_type(kernel, Kernel)
config = check_or_create_options(LaunchConfig, config, "launch config")

# TODO: can we ensure kernel_args is valid/safe to use here?
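
The private LaunchConfig._cast_to_3_tuple removed above is superseded by a shared cast_to_3_tuple helper in _utils/cuda_utils.py (commit d77865a) that takes a label naming the field being validated. The helper itself is not part of this diff; the sketch below is only an illustration inferred from the old code and the new call sites, and the real error messages may be worded differently:

def cast_to_3_tuple(label, cfg):
    # Accept an int or a 1-, 2-, or 3-tuple of positive ints and pad it to a 3-tuple,
    # naming the offending field (e.g. "LaunchConfig.grid") in any error raised.
    if isinstance(cfg, int):
        cfg = (cfg,)
    elif not isinstance(cfg, tuple) or not 1 <= len(cfg) <= 3:
        raise ValueError(f"{label} must be an int or a tuple of 1 to 3 ints, got {cfg!r}")
    if any(not isinstance(v, int) or v < 1 for v in cfg):
        raise ValueError(f"{label} components must be positive ints, got {cfg!r}")
    return cfg + (1,) * (3 - len(cfg))

For example, cast_to_3_tuple("LaunchConfig.grid", 4) returns (4, 1, 1), while a zero or negative component raises a ValueError that names LaunchConfig.grid.
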
7 changes: 4 additions & 3 deletions cuda_core/cuda/core/experimental/_linker.py
@@ -16,7 +16,8 @@

from cuda.core.experimental._device import Device
from cuda.core.experimental._module import ObjectCode
from cuda.core.experimental._utils import check_or_create_options, driver, handle_return, is_sequence
from cuda.core.experimental._utils.clear_error_support import assert_type
from cuda.core.experimental._utils.cuda_utils import check_or_create_options, driver, handle_return, is_sequence

# TODO: revisit this treatment for py313t builds
_driver = None # populated if nvJitLink cannot be used
@@ -382,12 +383,12 @@ def __init__(self, *object_codes: ObjectCode, options: LinkerOptions = None):
self._mnff = Linker._MembersNeededForFinalize(self, handle, use_nvjitlink)

for code in object_codes:
assert isinstance(code, ObjectCode)
assert_type(code, ObjectCode)
self._add_code_object(code)

def _add_code_object(self, object_code: ObjectCode):
data = object_code._module
assert isinstance(data, bytes)
assert_type(data, bytes)
with _exception_manager(self):
if _nvjitlink:
_nvjitlink.add_data(
43 changes: 25 additions & 18 deletions cuda_core/cuda/core/experimental/_memory.py
@@ -10,7 +10,7 @@

from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
from cuda.core.experimental._stream import default_stream
from cuda.core.experimental._utils import driver, handle_return
from cuda.core.experimental._utils.cuda_utils import driver, handle_return

PyCapsule = TypeVar("PyCapsule")

@@ -100,21 +100,21 @@ def is_device_accessible(self) -> bool:
"""Return True if this buffer can be accessed by the GPU, otherwise False."""
if self._mnff.mr is not None:
return self._mnff.mr.is_device_accessible
raise NotImplementedError
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")

@property
def is_host_accessible(self) -> bool:
"""Return True if this buffer can be accessed by the CPU, otherwise False."""
if self._mnff.mr is not None:
return self._mnff.mr.is_host_accessible
raise NotImplementedError
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")

@property
def device_id(self) -> int:
"""Return the device ordinal of this buffer."""
if self._mnff.mr is not None:
return self._mnff.mr.device_id
raise NotImplementedError
raise NotImplementedError("WIP: Currently this property only supports buffers with associated MemoryResource")

def copy_to(self, dst: Buffer = None, *, stream) -> Buffer:
"""Copy from this buffer to the dst buffer asynchronously on the given stream.
@@ -136,10 +136,12 @@ def copy_to(self, dst: Buffer = None, *, stream) -> Buffer:
raise ValueError("stream must be provided")
if dst is None:
if self._mnff.mr is None:
raise ValueError("a destination buffer must be provided")
raise ValueError("a destination buffer must be provided (this buffer does not have a memory_resource)")
dst = self._mnff.mr.allocate(self._mnff.size, stream)
if dst._mnff.size != self._mnff.size:
raise ValueError("buffer sizes mismatch between src and dst")
raise ValueError(
f"buffer sizes mismatch between src and dst (sizes are: src={self._mnff.size}, dst={dst._mnff.size})"
)
handle_return(driver.cuMemcpyAsync(dst._mnff.ptr, self._mnff.ptr, self._mnff.size, stream.handle))
return dst

@@ -158,7 +160,9 @@ def copy_from(self, src: Buffer, *, stream):
if stream is None:
raise ValueError("stream must be provided")
if src._mnff.size != self._mnff.size:
raise ValueError("buffer sizes mismatch between src and dst")
raise ValueError(
f"buffer sizes mismatch between src and dst (sizes are: src={src._mnff.size}, dst={self._mnff.size})"
)
handle_return(driver.cuMemcpyAsync(self._mnff.ptr, src._mnff.ptr, self._mnff.size, stream.handle))

def __dlpack__(
@@ -171,37 +175,40 @@
) -> PyCapsule:
# Note: we ignore the stream argument entirely (as if it is -1).
# It is the user's responsibility to maintain stream order.
if dl_device is not None or copy is True:
raise BufferError
if dl_device is not None:
raise BufferError("Sorry, not supported: dl_device other than None")
if copy is True:
raise BufferError("Sorry, not supported: copy=True")
if max_version is None:
versioned = False
else:
assert len(max_version) == 2
if not isinstance(max_version, tuple) or len(max_version) != 2:
raise BufferError(f"Expected max_version Tuple[int, int], got {max_version}")
versioned = max_version >= (1, 0)
capsule = make_py_capsule(self, versioned)
return capsule

def __dlpack_device__(self) -> Tuple[int, int]:
if self.is_device_accessible and not self.is_host_accessible:
d_h = (bool(self.is_device_accessible), bool(self.is_host_accessible))
if d_h == (True, False):
return (DLDeviceType.kDLCUDA, self.device_id)
elif self.is_device_accessible and self.is_host_accessible:
if d_h == (True, True):
# TODO: this can also be kDLCUDAManaged, we need more fine-grained checks
return (DLDeviceType.kDLCUDAHost, 0)
elif not self.is_device_accessible and self.is_host_accessible:
if d_h == (False, True):
return (DLDeviceType.kDLCPU, 0)
else: # not self.is_device_accessible and not self.is_host_accessible
raise BufferError("invalid buffer")
raise BufferError("buffer is neither device-accessible nor host-accessible")

def __buffer__(self, flags: int, /) -> memoryview:
# Support for Python-level buffer protocol as per PEP 688.
# This raises a BufferError unless:
# 1. Python is 3.12+
# 2. This Buffer object is host accessible
raise NotImplementedError("TODO")
raise NotImplementedError("WIP: Buffer.__buffer__ hasn't been implemented yet.")

def __release_buffer__(self, buffer: memoryview, /):
# Supporting method paired with __buffer__.
raise NotImplementedError("TODO")
raise NotImplementedError("WIP: Buffer.__release_buffer__ hasn't been implemented yet.")


class MemoryResource(abc.ABC):
@@ -291,7 +298,7 @@ def is_host_accessible(self) -> bool:

@property
def device_id(self) -> int:
raise RuntimeError("the pinned memory resource is not bound to any GPU")
raise RuntimeError("a pinned memory resource is not bound to any GPU")


class _SynchronousMemoryResource(MemoryResource):
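
To show the effect of the new Buffer.__dlpack__ validation, here is a hypothetical interactive sketch; buf stands for any Buffer obtained from a MemoryResource, and the messages are the ones introduced in the diff above:

# Assuming `buf` is a Buffer allocated from a memory resource:
buf.__dlpack__(max_version=(1, 0))   # ok: returns a versioned DLPack capsule
buf.__dlpack__(max_version=(1,))     # BufferError: Expected max_version Tuple[int, int], got (1,)
buf.__dlpack__(copy=True)            # BufferError: Sorry, not supported: copy=True
buf.__dlpack_device__()              # e.g. (DLDeviceType.kDLCUDA, 0) for device-only memory
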
2 changes: 1 addition & 1 deletion cuda_core/cuda/core/experimental/_memoryview.pyx
@@ -11,7 +11,7 @@ from typing import Any, Optional

import numpy

from cuda.core.experimental._utils import handle_return, driver
from cuda.core.experimental._utils.cuda_utils import handle_return, driver


# TODO(leofang): support NumPy structured dtypes