
Commit 29cbd7c

Merge branch 'main' into names

2 parents: c8572b2 + 430e890
File tree: 11 files changed (+290 −117 lines)

11 files changed

+290
-117
lines changed

cuda_core/cuda/core/experimental/_device.py
Lines changed: 11 additions & 0 deletions

@@ -701,6 +701,17 @@ def can_use_host_pointer_for_registered_mem(self) -> bool:
             )
         )
 
+    # TODO: A few attrs are missing here (NVIDIA/cuda-python#675)
+
+    @property
+    def cooperative_launch(self) -> bool:
+        """
+        True if device supports launching cooperative kernels, False if not.
+        """
+        return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH))
+
+    # TODO: A few attrs are missing here (NVIDIA/cuda-python#675)
+
     @property
     def max_shared_memory_per_block_optin(self) -> int:
         """

cuda_core/cuda/core/experimental/_launch_config.py
Lines changed: 11 additions & 0 deletions

@@ -58,11 +58,15 @@ class LaunchConfig:
     cluster: Union[tuple, int] = None
     block: Union[tuple, int] = None
     shmem_size: Optional[int] = None
+    cooperative_launch: Optional[bool] = False
 
     def __post_init__(self):
         _lazy_init()
         self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
         self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
+        # FIXME: Calling Device() strictly speaking is not quite right; we should instead
+        # look up the device from stream. We probably need to defer the checks related to
+        # device compute capability or attributes.
         # thread block clusters are supported starting H100
         if self.cluster is not None:
             if not _use_ex:
@@ -77,6 +81,8 @@ def __post_init__(self):
             self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
         if self.shmem_size is None:
             self.shmem_size = 0
+        if self.cooperative_launch and not Device().properties.cooperative_launch:
+            raise CUDAError("cooperative kernels are not supported on this device")
 
 
 def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
@@ -92,6 +98,11 @@ def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
         dim = attr.value.clusterDim
         dim.x, dim.y, dim.z = config.cluster
         attrs.append(attr)
+    if config.cooperative_launch:
+        attr = driver.CUlaunchAttribute()
+        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
+        attr.value.cooperative = 1
+        attrs.append(attr)
     drv_cfg.numAttrs = len(attrs)
     drv_cfg.attrs = attrs
     return drv_cfg
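A usage sketch for the new field (grid and block sizes are illustrative): the flag is validated against the current device in __post_init__ and translated into a CU_LAUNCH_ATTRIBUTE_COOPERATIVE launch attribute by _to_native_launch_config:

from cuda.core.experimental import Device, LaunchConfig

Device().set_current()
# Opt into a cooperative launch; __post_init__ raises CUDAError on
# devices that lack cooperative-launch support.
config = LaunchConfig(grid=16, block=256, shmem_size=0, cooperative_launch=True)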

cuda_core/cuda/core/experimental/_launcher.py
Lines changed: 16 additions & 0 deletions

@@ -9,6 +9,7 @@
 from cuda.core.experimental._stream import Stream
 from cuda.core.experimental._utils.clear_error_support import assert_type
 from cuda.core.experimental._utils.cuda_utils import (
+    _reduce_3_tuple,
     check_or_create_options,
     driver,
     get_binding_version,
@@ -78,6 +79,8 @@ def launch(stream, config, kernel, *kernel_args):
     if _use_ex:
         drv_cfg = _to_native_launch_config(config)
         drv_cfg.hStream = stream.handle
+        if config.cooperative_launch:
+            _check_cooperative_launch(kernel, config, stream)
         handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
     else:
         # TODO: check if config has any unsupported attrs
@@ -86,3 +89,16 @@ def launch(stream, config, kernel, *kernel_args):
                 int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
             )
         )
+
+
+def _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
+    dev = stream.device
+    num_sm = dev.properties.multiprocessor_count
+    max_grid_size = (
+        kernel.occupancy.max_active_blocks_per_multiprocessor(_reduce_3_tuple(config.block), config.shmem_size) * num_sm
+    )
+    if _reduce_3_tuple(config.grid) > max_grid_size:
+        # For now let's try not to be smart and adjust the grid size behind users' back.
+        # We explicitly ask users to adjust.
+        x, y, z = config.grid
+        raise ValueError(f"The specified grid size ({x} * {y} * {z}) exceeds the limit ({max_grid_size})")
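End to end, a hedged sketch of launching a cooperative kernel (`ker` is assumed to be a Kernel obtained from a compiled Program; the kernel source itself is elided):

from cuda.core.experimental import Device, LaunchConfig, launch

dev = Device()
dev.set_current()
stream = dev.create_stream()

# ker is assumed to come from, e.g., Program(...).compile("cubin").get_kernel(...)
config = LaunchConfig(grid=8, block=256, cooperative_launch=True)
launch(stream, config, ker)  # raises ValueError if grid exceeds the occupancy limit above
stream.sync()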

cuda_core/cuda/core/experimental/_utils/cuda_utils.py
Lines changed: 4 additions & 0 deletions

@@ -48,6 +48,10 @@ def cast_to_3_tuple(label, cfg):
     return cfg + (1,) * (3 - len(cfg))
 
 
+def _reduce_3_tuple(t: tuple):
+    return t[0] * t[1] * t[2]
+
+
 def _check_driver_error(error):
     if error == driver.CUresult.CUDA_SUCCESS:
         return
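The new helper just collapses a 3-tuple of launch extents into a total count; for illustration only (it is a private helper):

from cuda.core.experimental._utils.cuda_utils import _reduce_3_tuple

# an (8, 4, 2) grid flattens to 64 blocks in total
assert _reduce_3_tuple((8, 4, 2)) == 64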

cuda_core/docs/source/release/0.3.0-notes.rst
Lines changed: 5 additions & 3 deletions

@@ -20,11 +20,13 @@ Breaking Changes
 New features
 ------------
 
-- :class:`Kernel` adds :property:`Kernel.num_arguments` and :property:`Kernel.arguments_info` for introspection of kernel arguments. (#612)
-- Add pythonic access to kernel occupancy calculation functions via :property:`Kernel.occupancy`. (#648)
+- :class:`Kernel` adds :attr:`Kernel.num_arguments` and :attr:`Kernel.arguments_info` for introspection of kernel arguments. (#612)
+- Add pythonic access to kernel occupancy calculation functions via :attr:`Kernel.occupancy`. (#648)
+- Support launching cooperative kernels by setting :property:`LaunchConfig.cooperative_launch` to `True`.
 - A name can be assigned to :class:`ObjectCode` instances generated by both :class:`Program` and :class:`Linker` through their respective
   options.
 
+
 New examples
 ------------
 
@@ -33,4 +35,4 @@ Fixes and enhancements
 ----------------------
 
 - An :class:`Event` can now be used to look up its corresponding device and context using the ``.device`` and ``.context`` attributes respectively.
-- The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed
+- The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed.
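For the ObjectCode naming entry above, a minimal sketch (assuming the option field is ``name`` on ProgramOptions, and similarly on LinkerOptions):

from cuda.core.experimental import Program, ProgramOptions

code = 'extern "C" __global__ void noop() {}'
prog = Program(code, code_type="c++", options=ProgramOptions(name="my_module"))
mod = prog.compile("cubin")  # the resulting ObjectCode carries the given name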
New example file
Lines changed: 135 additions & 0 deletions

@@ -0,0 +1,135 @@
# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0

# ################################################################################
#
# This demo aims to illustrate two takeaways:
#
#   1. The similarity between CPU and GPU JIT-compilation with C++ sources
#   2. How to use StridedMemoryView to interface with foreign C/C++ functions
#
# To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU
# path, which can be easily installed from pip or conda following their instructions.
# We also use NumPy/CuPy as the CPU/GPU array container.
#
# ################################################################################

import importlib
import shutil
import string
import sys
import tempfile

try:
    from cffi import FFI
except ImportError:
    print("cffi is not installed, the CPU example will be skipped", file=sys.stderr)
    FFI = None
import numpy as np

from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory

# ################################################################################
#
# Usually this entire code block is in a separate file, built as a Python extension
# module that can be imported by users at run time. For illustrative purposes we
# use JIT compilation to make this demo self-contained.
#
# Here we assume an in-place operation, equivalent to the following NumPy code:
#
#   >>> arr = ...
#   >>> assert arr.dtype == np.int32
#   >>> assert arr.ndim == 1
#   >>> arr += np.arange(arr.size, dtype=arr.dtype)
#
# is implemented for both CPU and GPU at a low level, with the following C function
# signature:
func_name = "inplace_plus_arange_N"
func_sig = f"void {func_name}(int* data, size_t N)"


# Now we are prepared to run the code from the user's perspective!
#
# ################################################################################


# Below, as a user we want to perform the said in-place operation on a CPU
# or GPU, by calling the corresponding function implemented "elsewhere"
# (in the body of the run function).


# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
# of which are supported by StridedMemoryView).
@args_viewable_as_strided_memory((0,))
def my_func(arr):
    global cpu_func
    global cpu_prog
    # Create a memory view over arr (assumed to be a 1D array of int32). The stream
    # ordering is taken care of, so that arr can be safely accessed on our work
    # stream (ordered after a data stream on which arr is potentially prepared).
    view = arr.view(-1)
    assert isinstance(view, StridedMemoryView)
    assert len(view.shape) == 1
    assert view.dtype == np.int32
    assert not view.is_device_accessible

    size = view.shape[0]
    # DLPack also supports host arrays. We want to know if the array data is
    # accessible from the GPU, and dispatch to the right routine accordingly.
    cpu_func(cpu_prog.cast("int*", view.ptr), size)


def run():
    # declare the JIT-compiled artifacts as globals so that my_func can see them
    global cpu_func, cpu_prog, my_func
    if not FFI:
        return
    # Here is a concrete (very naive!) implementation on CPU:
    cpu_code = string.Template(r"""
    extern "C"
    $func_sig {
        for (size_t i = 0; i < N; i++) {
            data[i] += i;
        }
    }
    """).substitute(func_sig=func_sig)
    # This is cffi's way of JIT compiling & loading a CPU function. cffi builds an
    # extension module that has the Python binding to the underlying C function.
    # For more details, please refer to cffi's documentation.
    cpu_prog = FFI()
    cpu_prog.cdef(f"{func_sig};")
    cpu_prog.set_source(
        "_cpu_obj",
        cpu_code,
        source_extension=".cpp",
        extra_compile_args=["-std=c++11"],
    )
    temp_dir = tempfile.mkdtemp()
    saved_sys_path = sys.path.copy()
    try:
        cpu_prog.compile(tmpdir=temp_dir)

        sys.path.append(temp_dir)
        cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name)

        # Create input array on CPU
        arr_cpu = np.zeros(1024, dtype=np.int32)
        print(f"before: {arr_cpu[:10]=}")

        # Run the workload
        my_func(arr_cpu)

        # Check the result
        print(f"after: {arr_cpu[:10]=}")
        assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
    finally:
        sys.path = saved_sys_path
        # to allow the FFI module to unload, we delete references to cpu_func
        del cpu_func, my_func
        # clean up temp directory
        shutil.rmtree(temp_dir)


if __name__ == "__main__":
    run()
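The demo above asserts host-resident data; as its comments hint, a dispatching variant (hypothetical, not part of this commit) could branch on StridedMemoryView.is_device_accessible, with gpu_func standing in for a device-side counterpart prepared elsewhere:

@args_viewable_as_strided_memory((0,))
def my_dispatching_func(arr, stream_ptr=-1):
    view = arr.view(stream_ptr)
    size = view.shape[0]
    if view.is_device_accessible:
        gpu_func(view.ptr, size)  # hypothetical GPU twin of cpu_func
    else:
        cpu_func(cpu_prog.cast("int*", view.ptr), size)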
