Skip to content

Commit 3f7a79b

Browse files
committed
Merge remote-tracking branch 'upstream/main' into dching/add-compute-sanitizer-to-ci
2 parents 2bfd402 + 2aca306 commit 3f7a79b

File tree

13 files changed

+209
-55
lines changed

13 files changed

+209
-55
lines changed

.bandit

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
[bandit]
2+
skips = B101,B311

.github/workflows/test-wheel-windows.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,9 @@ jobs:
5555
5656
if ('${{ inputs.local-ctk }}' -eq '1') {
5757
if ($TEST_CUDA_MAJOR -eq '12') {
58-
$MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink"]'
58+
$MINI_CTK_DEPS = '["nvcc", "nvrtc", "nvjitlink", "thrust"]'
5959
} else {
60-
$MINI_CTK_DEPS = '["nvcc", "nvrtc"]'
60+
$MINI_CTK_DEPS = '["nvcc", "nvrtc", "thrust"]'
6161
}
6262
}
6363

.pre-commit-config.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,9 @@ repos:
2020
rev: 8ff25e07e487f143571cc305e56dd0253c60bc7b #v1.8.3
2121
hooks:
2222
- id: bandit
23+
args:
24+
- --ini
25+
- .bandit
2326

2427
default_language_version:
2528
python: python3

cuda_bindings/tests/test_cuda.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -652,7 +652,8 @@ def test_get_error_name_and_string():
652652

653653
@pytest.mark.skipif(not callableBinary("nvidia-smi"), reason="Binary existance needed")
654654
def test_device_get_name():
655-
import subprocess
655+
# TODO: Refactor this test once we have nvml bindings to avoid the use of subprocess
656+
import subprocess # nosec B404
656657

657658
(err,) = cuda.cuInit(0)
658659
assert err == cuda.CUresult.CUDA_SUCCESS
@@ -661,12 +662,12 @@ def test_device_get_name():
661662
err, ctx = cuda.cuCtxCreate(0, device)
662663
assert err == cuda.CUresult.CUDA_SUCCESS
663664

664-
p = subprocess.run(
665-
["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], stdout=subprocess.PIPE, stderr=subprocess.PIPE
666-
)
665+
p = subprocess.check_output(
666+
["nvidia-smi", "--query-gpu=name", "--format=csv,noheader"], shell=False, stderr=subprocess.PIPE
667+
) # nosec B603, B607
667668

668669
delimiter = b"\r\n" if platform.system() == "Windows" else b"\n"
669-
expect = p.stdout.split(delimiter)
670+
expect = p.split(delimiter)
670671
size = 64
671672
_, got = cuda.cuDeviceGetName(size, device)
672673
got = got.split(b"\x00")[0]

cuda_core/cuda/core/experimental/_event.py

Lines changed: 39 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,15 @@
88
from dataclasses import dataclass
99
from typing import TYPE_CHECKING, Optional
1010

11-
from cuda.core.experimental._utils.cuda_utils import CUDAError, check_or_create_options, driver, handle_return
11+
from cuda.core.experimental._utils.cuda_utils import (
12+
CUDAError,
13+
check_or_create_options,
14+
driver,
15+
handle_return,
16+
)
17+
from cuda.core.experimental._utils.cuda_utils import (
18+
_check_driver_error as raise_if_driver_error,
19+
)
1220

1321
if TYPE_CHECKING:
1422
import cuda.bindings
@@ -117,13 +125,31 @@ def __rsub__(self, other):
117125

118126
def __sub__(self, other):
119127
# return self - other (in milliseconds)
128+
err, timing = driver.cuEventElapsedTime(other.handle, self.handle)
120129
try:
121-
timing = handle_return(driver.cuEventElapsedTime(other.handle, self.handle))
130+
raise_if_driver_error(err)
131+
return timing
122132
except CUDAError as e:
123-
raise RuntimeError(
124-
"Timing capability must be enabled in order to subtract two Events; timing is disabled by default."
125-
) from e
126-
return timing
133+
if err == driver.CUresult.CUDA_ERROR_INVALID_HANDLE:
134+
if self.is_timing_disabled or other.is_timing_disabled:
135+
explanation = (
136+
"Both Events must be created with timing enabled in order to subtract them; "
137+
"use EventOptions(enable_timing=True) when creating both events."
138+
)
139+
else:
140+
explanation = (
141+
"Both Events must be recorded before they can be subtracted; "
142+
"use Stream.record() to record both events to a stream."
143+
)
144+
elif err == driver.CUresult.CUDA_ERROR_NOT_READY:
145+
explanation = (
146+
"One or both events have not completed; "
147+
"use Event.sync(), Stream.sync(), or Device.sync() to wait for the events to complete "
148+
"before subtracting them."
149+
)
150+
else:
151+
raise e
152+
raise RuntimeError(explanation) from e
127153

128154
@property
129155
def is_timing_disabled(self) -> bool:
@@ -164,5 +190,11 @@ def is_done(self) -> bool:
164190

165191
@property
166192
def handle(self) -> cuda.bindings.driver.CUevent:
167-
"""Return the underlying CUevent object."""
193+
"""Return the underlying CUevent object.
194+
195+
.. caution::
196+
197+
This handle is a Python object. To get the memory address of the underlying C
198+
handle, call ``int(Event.handle)``.
199+
"""
168200
return self._mnff.handle

cuda_core/cuda/core/experimental/_linker.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,11 @@ def handle(self) -> LinkerHandleT:
503503
.. note::
504504
505505
The type of the returned object depends on the backend.
506+
507+
.. caution::
508+
509+
This handle is a Python object. To get the memory address of the underlying C
510+
handle, call ``int(Linker.handle)``.
506511
"""
507512
return self._mnff.handle
508513

cuda_core/cuda/core/experimental/_memory.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
import abc
88
import weakref
9-
from typing import Optional, Tuple, TypeVar
9+
from typing import Optional, Tuple, TypeVar, Union
1010

1111
from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule
1212
from cuda.core.experimental._stream import default_stream
@@ -18,6 +18,9 @@
1818
# TODO: define a memory property mixin class and make Buffer and
1919
# MemoryResource both inherit from it
2020

21+
DevicePointerT = Union[driver.CUdeviceptr, int, None]
22+
"""A type union of `Cudeviceptr`, `int` and `None` for hinting Buffer.handle."""
23+
2124

2225
class Buffer:
2326
"""Represent a handle to allocated memory.
@@ -81,8 +84,14 @@ def close(self, stream=None):
8184
self._mnff.close(stream)
8285

8386
@property
84-
def handle(self):
85-
"""Return the buffer handle object."""
87+
def handle(self) -> DevicePointerT:
88+
"""Return the buffer handle object.
89+
90+
.. caution::
91+
92+
This handle is a Python object. To get the memory address of the underlying C
93+
handle, call ``int(Buffer.handle)``.
94+
"""
8695
return self._mnff.ptr
8796

8897
@property

cuda_core/cuda/core/experimental/_module.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -354,5 +354,11 @@ def code(self) -> CodeTypeT:
354354
@property
355355
@precondition(_lazy_load_module)
356356
def handle(self):
357-
"""Return the underlying handle object."""
357+
"""Return the underlying handle object.
358+
359+
.. caution::
360+
361+
This handle is a Python object. To get the memory address of the underlying C
362+
handle, call ``int(ObjectCode.handle)``.
363+
"""
358364
return self._handle

cuda_core/cuda/core/experimental/_program.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,5 +524,10 @@ def handle(self) -> ProgramHandleT:
524524
.. note::
525525
526526
The type of the returned object depends on the backend.
527+
528+
.. caution::
529+
530+
This handle is a Python object. To get the memory address of the underlying C
531+
handle, call ``int(Program.handle)``.
527532
"""
528533
return self._mnff.handle

cuda_core/cuda/core/experimental/_stream.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,13 @@ def __cuda_stream__(self) -> Tuple[int, int]:
189189

190190
@property
191191
def handle(self) -> cuda.bindings.driver.CUstream:
192-
"""Return the underlying ``CUstream`` object."""
192+
"""Return the underlying ``CUstream`` object.
193+
194+
.. caution::
195+
196+
This handle is a Python object. To get the memory address of the underlying C
197+
handle, call ``int(Stream.handle)``.
198+
"""
193199
return self._mnff.handle
194200

195201
@property

cuda_core/tests/example_tests/utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,8 @@ def run_example(samples_path, filename, env=None):
3333
sys.argv = [fullpath]
3434
old_sys_path = sys.path.copy()
3535
sys.path.append(samples_path)
36-
exec(script, env if env else {})
36+
# TODO: Refactor the examples to give them a common callable `main()` to avoid needing to use exec here?
37+
exec(script, env if env else {}) # nosec B102
3738
except ImportError as e:
3839
# for samples requiring any of optional dependencies
3940
for m in ("cupy",):

cuda_core/tests/test_event.py

Lines changed: 115 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7,57 +7,43 @@
77
# is strictly prohibited.
88

99
import os
10+
import pathlib
1011
import time
1112

13+
import numpy as np
1214
import pytest
1315

1416
import cuda.core.experimental
15-
from cuda.core.experimental import Device, EventOptions
17+
from cuda.core.experimental import Device, EventOptions, LaunchConfig, Program, ProgramOptions, launch
18+
from cuda.core.experimental._memory import _DefaultPinnedMemorySource
1619

1720

1821
def test_event_init_disabled():
1922
with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."):
2023
cuda.core.experimental._event.Event() # Ensure back door is locked.
2124

2225

23-
@pytest.mark.parametrize(
24-
"enable_timing",
25-
[
26-
True,
27-
]
28-
# The compute-sanitizer is running, and this test intentionally causes an API error.
29-
+ ([False, None] if os.environ.get("CUDA_PYTHON_SANITIZER_RUNNING", "0") != "1" else []),
30-
)
31-
def test_timing(init_cuda, enable_timing):
32-
options = EventOptions(enable_timing=enable_timing)
26+
def test_timing_success(init_cuda):
27+
options = EventOptions(enable_timing=True)
3328
stream = Device().create_stream()
3429
delay_seconds = 0.5
3530
e1 = stream.record(options=options)
3631
time.sleep(delay_seconds)
3732
e2 = stream.record(options=options)
3833
e2.sync()
39-
for e in (e1, e2):
40-
assert e.is_timing_disabled == (True if enable_timing is None else not enable_timing)
41-
if enable_timing:
42-
elapsed_time_ms = e2 - e1
43-
assert isinstance(elapsed_time_ms, float)
44-
# Using a generous tolerance, to avoid flaky tests:
45-
# We only want to exercise the __sub__ method, this test is not meant
46-
# to stress-test the CUDA driver or time.sleep().
47-
delay_ms = delay_seconds * 1000
48-
if os.name == "nt": # noqa: SIM108
49-
# For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
50-
generous_tolerance = 100
51-
else:
52-
# Most modern Linux kernels have a default timer resolution of 1 ms.
53-
generous_tolerance = 20
54-
assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
34+
elapsed_time_ms = e2 - e1
35+
assert isinstance(elapsed_time_ms, float)
36+
# Using a generous tolerance, to avoid flaky tests:
37+
# We only want to exercise the __sub__ method, this test is not meant
38+
# to stress-test the CUDA driver or time.sleep().
39+
delay_ms = delay_seconds * 1000
40+
if os.name == "nt": # noqa: SIM108
41+
# For Python <=3.10, the Windows timer resolution is typically limited to 15.6 ms by default.
42+
generous_tolerance = 100
5543
else:
56-
with pytest.raises(RuntimeError) as e:
57-
elapsed_time_ms = e2 - e1
58-
msg = str(e)
59-
assert "disabled by default" in msg
60-
assert "CUDA_ERROR_INVALID_HANDLE" in msg
44+
# Most modern Linux kernels have a default timer resolution of 1 ms.
45+
generous_tolerance = 20
46+
assert delay_ms - generous_tolerance <= elapsed_time_ms < delay_ms + generous_tolerance
6147

6248

6349
def test_is_sync_busy_waited(init_cuda):
@@ -87,3 +73,100 @@ def test_is_done(init_cuda):
8773
# Without a sync, the captured work might not have yet completed
8874
# Therefore this check should never raise an exception
8975
assert event.is_done in (True, False)
76+
77+
78+
def test_error_timing_disabled():
79+
device = Device()
80+
device.set_current()
81+
enabled = EventOptions(enable_timing=True)
82+
disabled = EventOptions(enable_timing=False)
83+
stream = device.create_stream()
84+
85+
event1 = stream.record(options=enabled)
86+
event2 = stream.record(options=disabled)
87+
assert not event1.is_timing_disabled
88+
assert event2.is_timing_disabled
89+
stream.sync()
90+
with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
91+
event2 - event1
92+
93+
event1 = stream.record(options=disabled)
94+
event2 = stream.record(options=disabled)
95+
stream.sync()
96+
with pytest.raises(RuntimeError, match="^Both Events must be created with timing enabled"):
97+
event2 - event1
98+
99+
100+
def test_error_timing_recorded():
101+
device = Device()
102+
device.set_current()
103+
enabled = EventOptions(enable_timing=True)
104+
stream = device.create_stream()
105+
106+
event1 = stream.record(options=enabled)
107+
event2 = device.create_event(options=enabled)
108+
event3 = device.create_event(options=enabled)
109+
110+
stream.sync()
111+
with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
112+
event2 - event1
113+
with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
114+
event1 - event2
115+
with pytest.raises(RuntimeError, match="^Both Events must be recorded"):
116+
event3 - event2
117+
118+
119+
# TODO: improve this once path finder can find headers
120+
@pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need libcu++ header")
121+
@pytest.mark.skipif(tuple(int(i) for i in np.__version__.split(".")[:2]) < (2, 1), reason="need numpy 2.1.0+")
122+
def test_error_timing_incomplete():
123+
device = Device()
124+
device.set_current()
125+
126+
# This kernel is designed to busy loop until a signal is received
127+
code = """
128+
#include <cuda/atomic>
129+
130+
extern "C"
131+
__global__ void wait(int* val) {
132+
cuda::atomic_ref<int, cuda::thread_scope_system> signal{*val};
133+
while (true) {
134+
if (signal.load(cuda::memory_order_relaxed)) {
135+
break;
136+
}
137+
}
138+
}
139+
"""
140+
141+
arch = "".join(f"{i}" for i in device.compute_capability)
142+
program_options = ProgramOptions(
143+
std="c++17",
144+
arch=f"sm_{arch}",
145+
include_path=str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include")),
146+
)
147+
prog = Program(code, code_type="c++", options=program_options)
148+
mod = prog.compile(target_type="cubin")
149+
ker = mod.get_kernel("wait")
150+
151+
mr = _DefaultPinnedMemorySource()
152+
b = mr.allocate(4)
153+
arr = np.from_dlpack(b).view(np.int32)
154+
arr[0] = 0
155+
156+
config = LaunchConfig(grid=1, block=1)
157+
ker_args = (arr.ctypes.data,)
158+
159+
enabled = EventOptions(enable_timing=True)
160+
stream = device.create_stream()
161+
162+
event1 = stream.record(options=enabled)
163+
launch(stream, config, ker, *ker_args)
164+
event3 = stream.record(options=enabled)
165+
166+
# event3 will never complete because the stream is waiting on wait() to complete
167+
with pytest.raises(RuntimeError, match="^One or both events have not completed."):
168+
event3 - event1
169+
170+
arr[0] = 1
171+
event3.sync()
172+
event3 - event1 # this should work

0 commit comments

Comments
 (0)