
Commit 3bdf609: support cooperative launch
1 parent 381df9c

File tree: 6 files changed, +90 −1 lines

cuda_core/cuda/core/experimental/_device.py

Lines changed: 11 additions & 0 deletions
@@ -701,6 +701,17 @@ def can_use_host_pointer_for_registered_mem(self) -> bool:
             )
         )
 
+    # TODO: A few attrs are missing here (NVIDIA/cuda-python#675)
+
+    @property
+    def cooperative_launch(self) -> bool:
+        """
+        True if device supports launching cooperative kernels, False if not.
+        """
+        return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH))
+
+    # TODO: A few attrs are missing here (NVIDIA/cuda-python#675)
+
     @property
     def max_shared_memory_per_block_optin(self) -> int:
         """

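A minimal usage sketch (not part of the diff) of how the new property could be queried through the public API; it assumes a current device and follows the `Device().properties` access pattern used elsewhere in this commit:

    from cuda.core.experimental import Device

    dev = Device()
    dev.set_current()

    # The property wraps CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH via the cached-attribute helper.
    if dev.properties.cooperative_launch:
        print("cooperative kernels are supported on this device")
    else:
        print("cooperative kernels are NOT supported on this device")
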
cuda_core/cuda/core/experimental/_launch_config.py

Lines changed: 11 additions & 0 deletions
@@ -58,11 +58,15 @@ class LaunchConfig:
     cluster: Union[tuple, int] = None
     block: Union[tuple, int] = None
     shmem_size: Optional[int] = None
+    cooperative_launch: Optional[bool] = False
 
     def __post_init__(self):
         _lazy_init()
         self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
         self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
+        # FIXME: Calling Device() strictly speaking is not quite right; we should instead
+        # look up the device from stream. We probably need to defer the checks related to
+        # device compute capability or attributes.
         # thread block clusters are supported starting H100
         if self.cluster is not None:
             if not _use_ex:
@@ -77,6 +81,8 @@ def __post_init__(self):
             self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
         if self.shmem_size is None:
             self.shmem_size = 0
+        if self.cooperative_launch and not Device().properties.cooperative_launch:
+            raise CUDAError("cooperative kernels are not supported on this device")
 
 
 def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
@@ -92,6 +98,11 @@ def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
         dim = attr.value.clusterDim
         dim.x, dim.y, dim.z = config.cluster
         attrs.append(attr)
+    if config.cooperative_launch:
+        attr = driver.CUlaunchAttribute()
+        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
+        attr.value.cooperative = 1
+        attrs.append(attr)
     drv_cfg.numAttrs = len(attrs)
     drv_cfg.attrs = attrs
     return drv_cfg
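A short sketch (not from the commit) of the fail-fast behavior added in __post_init__; it assumes the current device lacks cooperative launch support, and imports CUDAError from the internal utils module the same way the commented-out test below does:

    from cuda.core.experimental import Device, LaunchConfig
    from cuda.core.experimental._utils.cuda_utils import CUDAError

    Device().set_current()
    try:
        LaunchConfig(grid=1, block=1, cooperative_launch=True)
    except CUDAError as exc:
        # Raised by __post_init__ when Device().properties.cooperative_launch is False.
        print(f"cooperative launch unavailable: {exc}")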

cuda_core/cuda/core/experimental/_launcher.py

Lines changed: 17 additions & 0 deletions
@@ -9,6 +9,7 @@
 from cuda.core.experimental._stream import Stream
 from cuda.core.experimental._utils.clear_error_support import assert_type
 from cuda.core.experimental._utils.cuda_utils import (
+    _reduce_tuple,
     check_or_create_options,
     driver,
     get_binding_version,
@@ -78,6 +79,8 @@ def launch(stream, config, kernel, *kernel_args):
     if _use_ex:
         drv_cfg = _to_native_launch_config(config)
         drv_cfg.hStream = stream.handle
+        if config.cooperative_launch:
+            _check_cooperative_launch(kernel, config, stream)
         handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
     else:
         # TODO: check if config has any unsupported attrs
@@ -86,3 +89,17 @@ def launch(stream, config, kernel, *kernel_args):
                 int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
             )
         )
+
+
+def _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
+    dev = stream.device
+    num_sm = dev.properties.multiprocessor_count
+    max_grid_size = (
+        kernel.occupancy.max_active_blocks_per_multiprocessor(_reduce_tuple(config.block), config.shmem_size) * num_sm
+    )
+    if _reduce_tuple(config.grid) > max_grid_size:
+        # For now let's try not to be smart and adjust the grid size behind users' back.
+        # We explicitly ask users to adjust.
+        raise ValueError(
+            "The specified grid size ({} * {} * {}) exceeds the limit ({}).".format(*config.grid, max_grid_size)
+        )
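The helper caps a cooperative grid at the number of blocks that can be co-resident, i.e. occupancy per SM times the SM count. A standalone sketch of that arithmetic with hypothetical numbers (not taken from the commit):

    def max_cooperative_grid_size(blocks_per_sm: int, num_sm: int) -> int:
        # grid.sync() requires every block to be resident at once, so the grid
        # cannot exceed occupancy-per-SM multiplied by the number of SMs.
        return blocks_per_sm * num_sm

    # Hypothetical device: the occupancy query reports 8 resident blocks per SM, and there are 58 SMs.
    assert max_cooperative_grid_size(8, 58) == 464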

cuda_core/cuda/core/experimental/_utils/cuda_utils.py

Lines changed: 4 additions & 0 deletions
@@ -48,6 +48,10 @@ def cast_to_3_tuple(label, cfg):
     return cfg + (1,) * (3 - len(cfg))
 
 
+def _reduce_tuple(t: tuple):
+    return functools.reduce(lambda x, y: x * y, t, 1)
+
+
 def _check_driver_error(error):
     if error == driver.CUresult.CUDA_SUCCESS:
         return
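For reference, the new helper simply multiplies the elements of a tuple, which is how a 3-tuple grid or block is collapsed to a scalar count in the launcher check above. A quick sketch of its behavior:

    import functools

    def _reduce_tuple(t: tuple):
        return functools.reduce(lambda x, y: x * y, t, 1)

    assert _reduce_tuple((4, 2, 1)) == 8  # total blocks in a (4, 2, 1) grid
    assert _reduce_tuple(()) == 1         # empty tuple reduces to the identity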

cuda_core/docs/source/release/0.3.0-notes.rst

Lines changed: 2 additions & 1 deletion
@@ -22,6 +22,7 @@ New features
 
 - :class:`Kernel` adds :property:`Kernel.num_arguments` and :property:`Kernel.arguments_info` for introspection of kernel arguments. (#612)
 - Add pythonic access to kernel occupancy calculation functions via :property:`Kernel.occupancy`. (#648)
+- Support launching cooperative kernels by setting :property:`LaunchConfig.cooperative_launch` to `True`.
 
 New examples
 ------------
@@ -31,4 +32,4 @@ Fixes and enhancements
 ----------------------
 
 - An :class:`Event` can now be used to look up its corresponding device and context using the ``.device`` and ``.context`` attributes respectively.
-- The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed
+- The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed.
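A minimal end-to-end sketch of the feature as announced above (not part of the release notes); it assumes an already-compiled kernel object `ker`, a stream `s`, and a device that supports cooperative launch:

    from cuda.core.experimental import LaunchConfig, launch

    config = LaunchConfig(grid=2, block=64, cooperative_launch=True)
    launch(s, config, ker)  # goes through cuLaunchKernelEx with CU_LAUNCH_ATTRIBUTE_COOPERATIVE set
    s.sync()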

cuda_core/tests/test_launcher.py

Lines changed: 45 additions & 0 deletions
@@ -152,3 +152,48 @@ def test_launch_scalar_argument(python_type, cpp_type, init_value):
 
     # Check result
     assert arr[0] == init_value, f"Expected {init_value}, got {arr[0]}"
+
+
+@pytest.mark.skipif(os.environ.get("CUDA_PATH") is None, reason="need cg header")
+def test_cooperative_launch():
+    dev = Device()
+    dev.set_current()
+    s = dev.create_stream(options={"nonblocking": True})
+
+    # CUDA kernel performing a grid-wide sync via cooperative groups
+    code = r"""
+    #include <cooperative_groups.h>
+
+    extern "C" __global__ void test_grid_sync() {
+        namespace cg = cooperative_groups;
+        auto grid = cg::this_grid();
+        grid.sync();
+    }
+    """
+
+    # Compile for the current device's architecture
+    arch = "".join(f"{i}" for i in dev.compute_capability)
+    include_path = str(pathlib.Path(os.environ["CUDA_PATH"]) / pathlib.Path("include"))
+    pro_opts = ProgramOptions(std="c++17", arch=f"sm_{arch}", include_path=include_path)
+    prog = Program(code, code_type="c++", options=pro_opts)
+    ker = prog.compile("cubin").get_kernel("test_grid_sync")
+
+    # # Launch without setting cooperative_launch
+    # # Commented out as this seems to be a sticky error...
+    # config = LaunchConfig(grid=1, block=1)
+    # launch(s, config, ker)
+    # from cuda.core.experimental._utils.cuda_utils import CUDAError
+    # with pytest.raises(CUDAError) as e:
+    #     s.sync()
+    # assert "CUDA_ERROR_LAUNCH_FAILED" in str(e)
+
+    # Crazy grid sizes would not work
+    block = 128
+    config = LaunchConfig(grid=dev.properties.max_grid_dim_x // block + 1, block=block, cooperative_launch=True)
+    with pytest.raises(ValueError):
+        launch(s, config, ker)
+
+    # This works just fine
+    config = LaunchConfig(grid=1, block=1, cooperative_launch=True)
+    launch(s, config, ker)
+    s.sync()
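One possible follow-up to the ValueError path exercised above (a sketch, not in the commit): size the grid from the same occupancy query the launcher uses, reusing `dev`, `ker`, and `s` from the test body:

    block = 128
    blocks_per_sm = ker.occupancy.max_active_blocks_per_multiprocessor(block, 0)
    grid = blocks_per_sm * dev.properties.multiprocessor_count  # largest co-resident grid

    config = LaunchConfig(grid=grid, block=block, cooperative_launch=True)
    launch(s, config, ker)
    s.sync()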
