Merge remote-tracking branch 'upstream/main' into dching/add-compute-sanitizer-to-ci

carterbox · carterbox · commit 3f7a79b1f5ea · 2025-04-28T11:14:49.000-05:00
diff --git a/.bandit b/.bandit
@@ -0,0 +1,2 @@
+[bandit]
+skips = B101,B311
diff --git a/.github/workflows/test-wheel-windows.yml b/.github/workflows/test-wheel-windows.yml
@@ -55,9 +55,9 @@ jobs:
 
           if ('${{ inputs.local-ctk }}' -eq '1') {
             if ($TEST_CUDA_MAJOR -eq '12') {
-              $MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink"]'
+              $MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink", "thrust"]'
             } else {
-              $MINI_CTK_DEPS = '["nvcc", "nvrtc"]'
+              $MINI_CTK_DEPS = '["nvcc", "nvrtc", "thrust"]'
             }
           }
 
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -20,6 +20,9 @@ repos:
     rev: 8ff25e07e487f143571cc305e56dd0253c60bc7b  #v1.8.3
     hooks:
       - id: bandit
+        args:
+          - --ini
+          - .bandit
 
 default_language_version:
       python: python3
diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py
@@ -652,7 +652,8 @@ def test_get_error_name_and_string():
 
 @pytest.mark.skipif(not callableBinary("nvidia-smi"), reason="Binary existance needed")
 def test_device_get_name():
-    import subprocess
+    # TODO: Refactor this test once we have nvml bindings to avoid the use of subprocess
+    import subprocess  # nosec B404
 
     (err,) = cuda.cuInit(0)
     assert err == cuda.CUresult.CUDA_SUCCESS
@@ -661,12 +662,12 @@ def test_device_get_name():
     err, ctx = cuda.cuCtxCreate(0, device)
     assert err == cuda.CUresult.CUDA_SUCCESS
 
-    p = subprocess.run(
-        ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
-    )
+    p = subprocess.check_output(
+        ["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], shell=False, stderr=subprocess.PIPE
+    )  # nosec B603, B607
 
     delimiter = b"\r\n" if platform.system() == "Windows" else b"\n"
-    expect = p.stdout.split(delimiter)
+    expect = p.split(delimiter)
     size = 64
     _, got = cuda.cuDeviceGetName(size, device)
     got = got.split(b"\x00")[0]
diff --git a/cuda_core/cuda/core/experimental/_event.py b/cuda_core/cuda/core/experimental/_event.py
@@ -8,7 +8,15 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional
 
-from cuda.core.experimental._utils.cuda_utils import CUDAError, check_or_create_options, driver, handle_return
+from cuda.core.experimental._utils.cuda_utils import (
+    CUDAError,
+    check_or_create_options,
+    driver,
+    handle_return,
+)
+from cuda.core.experimental._utils.cuda_utils import (
+    _check_driver_error as raise_if_driver_error,
+)
 
 if TYPE_CHECKING:
     import cuda.bindings
@@ -117,13 +125,31 @@ def __rsub__(self, other):
 
     def __sub__(self, other):
         # return self - other (in milliseconds)
+        err, timing = driver.cuEventElapsedTime(other.handle, self.handle)
         try:
-            timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle))
+            raise_if_driver_error(err)
+            return timing
         except CUDAError as e:
-            raise RuntimeError(
-                "Timing capability must be enabled in order to subtract two Events; timing is disabled by default."
-            ) from e
-        return timing
+            if err == driver.CUresult.CUDA_ERROR_INVALID_HANDLE:
+                if self.is_timing_disabled or other.is_timing_disabled:
+                    explanation = (
+                        "Both Events must be created with timing enabled in order to subtract them; "
+                        "use EventOptions(enable_timing=True) when creating both events."
+                    )
+                else:
+                    explanation = (
+                        "Both Events must be recorded before they can be subtracted; "
+                        "use Stream.record() to record both events to a stream."
+                    )
+            elif err == driver.CUresult.CUDA_ERROR_NOT_READY:
+                explanation = (
+                    "One or both events have not completed; "
+                    "use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete "
+                    "before subtracting them."
+                )
+            else:
+                raise e
+            raise RuntimeError(explanation) from e
 
     @property
     def is_timing_disabled(self) -> bool:
@@ -164,5 +190,11 @@ def is_done(self) -> bool:
 
     @property
     def handle(self) -> cuda.bindings.driver.CUevent:
-        """Return the underlying CUevent object."""
+        """Return the underlying CUevent object.
+
+        .. caution::
+
+            This handle is a Python object. To get the memory address of the underlying C
+            handle, call ``int(Event.handle)``.
+        """
         return self._mnff.handle
diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/experimental/_linker.py
@@ -503,6 +503,11 @@ def handle(self) -> LinkerHandleT:
         .. note::
 
            The type of the returned object depends on the backend.
+
+        .. caution::
+
+            This handle is a Python object. To get the memory address of the underlying C
+            handle, call ``int(Linker.handle)``.
         """
         return self._mnff.handle
 
diff --git a/cuda_core/cuda/core/experimental/_memory.py b/cuda_core/cuda/core/experimental/_memory.py
@@ -6,7 +6,7 @@
 
 import abc
 import weakref
-from typing import Optional, Tuple, TypeVar
+from typing import Optional, Tuple, TypeVar, Union
 
 from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
 from cuda.core.experimental._stream import default_stream
@@ -18,6 +18,9 @@
 # TODO: define a memory property mixin class and make Buffer and
 # MemoryResource both inherit from it
 
+DevicePointerT = Union[driver.CUdeviceptr, int, None]
+"""A type union of `CUdeviceptr`, `int` and `None` for hinting Buffer.handle."""
+
 
 class Buffer:
     """Represent a handle to allocated memory.
@@ -81,8 +84,14 @@ def close(self, stream=None):
         self._mnff.close(stream)
 
     @property
-    def handle(self):
-        """Return the buffer handle object."""
+    def handle(self) -> DevicePointerT:
+        """Return the buffer handle object.
+
+        .. caution::
+
+            This handle is a Python object. To get the memory address of the underlying C
+            handle, call ``int(Buffer.handle)``.
+        """
         return self._mnff.ptr
 
     @property
diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/experimental/_module.py
@@ -354,5 +354,11 @@ def code(self) -> CodeTypeT:
     @property
     @precondition(_lazy_load_module)
     def handle(self):
-        """Return the underlying handle object."""
+        """Return the underlying handle object.
+
+        .. caution::
+
+            This handle is a Python object. To get the memory address of the underlying C
+            handle, call ``int(ObjectCode.handle)``.
+        """
         return self._handle
diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/experimental/_program.py
@@ -524,5 +524,10 @@ def handle(self) -> ProgramHandleT:
         .. note::
 
            The type of the returned object depends on the backend.
+
+        .. caution::
+
+            This handle is a Python object. To get the memory address of the underlying C
+            handle, call ``int(Program.handle)``.
         """
         return self._mnff.handle
diff --git a/cuda_core/cuda/core/experimental/_stream.py b/cuda_core/cuda/core/experimental/_stream.py
@@ -189,7 +189,13 @@ def __cuda_stream__(self) -> Tuple[int, int]:
 
     @property
     def handle(self) -> cuda.bindings.driver.CUstream:
-        """Return the underlying ``CUstream`` object."""
+        """Return the underlying ``CUstream`` object.
+
+        .. caution::
+
+            This handle is a Python object. To get the memory address of the underlying C
+            handle, call ``int(Stream.handle)``.
+        """
         return self._mnff.handle
 
     @property
diff --git a/cuda_core/tests/example_tests/utils.py b/cuda_core/tests/example_tests/utils.py
@@ -33,7 +33,8 @@ def run_example(samples_path, filename, env=None):
         sys.argv = [fullpath]
         old_sys_path = sys.path.copy()
         sys.path.append(samples_path)
-        exec(script, env if env else {})
+        # TODO: Refactor the examples to give them a common callable `main()` to avoid needing to use exec here?
+        exec(script, env if env else {})  # nosec B102
     except ImportError as e:
         # for samples requiring any of optional dependencies
         for m in ("cupy",):
diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py
@@ -7,57 +7,43 @@
 # is strictly prohibited.
 
 import os
+import pathlib
 import time
 
+import numpy as np
 import pytest
 
 import cuda.core.experimental
-from cuda.core.experimental import Device, EventOptions
+from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch
+from cuda.core.experimental._memory import _DefaultPinnedMemorySource
 
 
 def test_event_init_disabled():
     with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."):
         cuda.core.experimental._event.Event()  # Ensure back door is locked.
 
 
-@pytest.mark.parametrize(
-    "enable_timing",
-    [
-        True,
-    ]
-    # The compute-sanitizer is running, and this test intentionally causes an API error.
-    + ([False, None] if os.environ.get("CUDA_PYTHON_SANITIZER_RUNNING", "0") != "1" else []),
-)
-def test_timing(init_cuda, enable_timing):
-    options = EventOptions(enable_timing=enable_timing)
+def test_timing_success(init_cuda):
+    options = EventOptions(enable_timing=True)
     stream = Device().create_stream()
     delay_seconds = 0.5
     e1 = stream.record(options=options)
     time.sleep(delay_seconds)
     e2 = stream.record(options=options)
     e2.sync()
-    for e in (e1, e2):
-        assert e.is_timing_disabled == (True if enable_timing is None else not enable_timing)
-    if enable_timing:
-        elapsed_time_ms = e2 - e1
-        assert isinstance(elapsed_time_ms, float)
-        # Using a generous tolerance, to avoid flaky tests:
-        # We only want to exercise the __sub__ method, this test is not meant
-        # to stress-test the CUDA driver or time.sleep().
-        delay_ms = delay_seconds * 1000
-        if os.name == "nt":  # noqa: SIM108
-            # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
-            generous_tolerance = 100
-        else:
-            # Most modern Linux kernels have a default timer resolution of 1 ms.
-            generous_tolerance = 20
-        assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
+    elapsed_time_ms = e2 - e1
+    assert isinstance(elapsed_time_ms, float)
+    # Using a generous tolerance, to avoid flaky tests:
+    # We only want to exercise the __sub__ method, this test is not meant
+    # to stress-test the CUDA driver or time.sleep().
+    delay_ms = delay_seconds * 1000
+    if os.name == "nt":  # noqa: SIM108
+        # For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
+        generous_tolerance = 100
     else:
-        with pytest.raises(RuntimeError) as e:
-            elapsed_time_ms = e2 - e1
-            msg = str(e)
-            assert "disabled by default" in msg
-            assert "CUDA_ERROR_INVALID_HANDLE" in msg
+        # Most modern Linux kernels have a default timer resolution of 1 ms.
+        generous_tolerance = 20
+    assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
 
 
 def test_is_sync_busy_waited(init_cuda):
@@ -87,3 +73,100 @@ def test_is_done(init_cuda):
     # Without a sync, the captured work might not have yet completed
     # Therefore this check should never raise an exception
     assert event.is_done in (True, False)
+
+
+def test_error_timing_disabled():
+    device = Device()
+    device.set_current()
+    enabled = EventOptions(enable_timing=True)
+    disabled = EventOptions(enable_timing=False)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    event2 = stream.record(options=disabled)
+    assert not event1.is_timing_disabled
+    assert event2.is_timing_disabled
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
+        event2 - event1
+
+    event1 = stream.record(options=disabled)
+    event2 = stream.record(options=disabled)
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
+        event2 - event1
+
+
+def test_error_timing_recorded():
+    device = Device()
+    device.set_current()
+    enabled = EventOptions(enable_timing=True)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    event2 = device.create_event(options=enabled)
+    event3 = device.create_event(options=enabled)
+
+    stream.sync()
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event2 - event1
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event1 - event2
+    with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
+        event3 - event2
+
+
+# TODO: improve this once path finder can find headers
+@pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need libcu++ header")
+@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
+def test_error_timing_incomplete():
+    device = Device()
+    device.set_current()
+
+    # This kernel is designed to busy loop until a signal is received
+    code = """
+#include <cuda/atomic>
+
+extern "C"
+__global__ void wait(int* val) {
+    cuda::atomic_ref<int, cuda::thread_scope_system> signal{*val};
+    while (true) {
+        if (signal.load(cuda::memory_order_relaxed)) {
+            break;
+        }
+    }
+}
+"""
+
+    arch = "".join(f"{i}" for i in device.compute_capability)
+    program_options = ProgramOptions(
+        std="c++17",
+        arch=f"sm_{arch}",
+        include_path=str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include")),
+    )
+    prog = Program(code, code_type="c++", options=program_options)
+    mod = prog.compile(target_type="cubin")
+    ker = mod.get_kernel("wait")
+
+    mr = _DefaultPinnedMemorySource()
+    b = mr.allocate(4)
+    arr = np.from_dlpack(b).view(np.int32)
+    arr[0] = 0
+
+    config = LaunchConfig(grid=1, block=1)
+    ker_args = (arr.ctypes.data,)
+
+    enabled = EventOptions(enable_timing=True)
+    stream = device.create_stream()
+
+    event1 = stream.record(options=enabled)
+    launch(stream, config, ker, *ker_args)
+    event3 = stream.record(options=enabled)
+
+    # event3 cannot complete yet because the wait() kernel queued ahead of it is still spinning
+    with pytest.raises(RuntimeError, match="^One or both events have not completed."):
+        event3 - event1
+
+    arr[0] = 1
+    event3.sync()
+    event3 - event1  # this should work
diff --git a/cuda_python/setup.py b/cuda_python/setup.py

Original file line number	Diff line number	Diff line change
`@@ -55,9 +55,9 @@ jobs:`
`55`	`55`
`56`	`56`	`if ('${{ inputs.local-ctk }}' -eq '1') {`
`57`	`57`	`if ($TEST_CUDA_MAJOR -eq '12') {`
`58`		`- $MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink"]'`
	`58`	`+ $MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink", "thrust"]'`
`59`	`59`	`} else {`
`60`		`- $MINI_CTK_DEPS = '["nvcc", "nvrtc"]'`
	`60`	`+ $MINI_CTK_DEPS = '["nvcc", "nvrtc", "thrust"]'`
`61`	`61`	`}`
`62`	`62`	`}`
`63`	`63`