
Commit 29cbd7c

Merge branch 'main' into names

2 parents: c8572b2 + 430e890
File tree: 11 files changed (+290 −117 lines)

11 files changed

+290
-117
lines changed

cuda_core/cuda/core/experimental/_device.py
Lines changed: 11 additions & 0 deletions

@@ -701,6 +701,17 @@ def can_use_host_pointer_for_registered_mem(self) -> bool:
             )
         )
 
+    # TODO: A few attrs are missing here (NVIDIA/cuda-python#675)
+
+    @property
+    def cooperative_launch(self) -> bool:
+        """
+        True if device supports launching cooperative kernels, False if not.
+        """
+        return bool(self._get_cached_attribute(driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH))
+
+    # TODO: A few attrs are missing here (NVIDIA/cuda-python#675)
+
     @property
     def max_shared_memory_per_block_optin(self) -> int:
         """

cuda_core/cuda/core/experimental/_launch_config.py
Lines changed: 11 additions & 0 deletions

@@ -58,11 +58,15 @@ class LaunchConfig:
     cluster: Union[tuple, int] = None
     block: Union[tuple, int] = None
     shmem_size: Optional[int] = None
+    cooperative_launch: Optional[bool] = False
 
     def __post_init__(self):
         _lazy_init()
         self.grid = cast_to_3_tuple("LaunchConfig.grid", self.grid)
         self.block = cast_to_3_tuple("LaunchConfig.block", self.block)
+        # FIXME: Calling Device() strictly speaking is not quite right; we should instead
+        # look up the device from stream. We probably need to defer the checks related to
+        # device compute capability or attributes.
         # thread block clusters are supported starting H100
         if self.cluster is not None:
             if not _use_ex:
@@ -77,6 +81,8 @@ def __post_init__(self):
             self.cluster = cast_to_3_tuple("LaunchConfig.cluster", self.cluster)
         if self.shmem_size is None:
             self.shmem_size = 0
+        if self.cooperative_launch and not Device().properties.cooperative_launch:
+            raise CUDAError("cooperative kernels are not supported on this device")
 
 
 def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
@@ -92,6 +98,11 @@ def _to_native_launch_config(config: LaunchConfig) -> driver.CUlaunchConfig:
         dim = attr.value.clusterDim
         dim.x, dim.y, dim.z = config.cluster
         attrs.append(attr)
+    if config.cooperative_launch:
+        attr = driver.CUlaunchAttribute()
+        attr.id = driver.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_COOPERATIVE
+        attr.value.cooperative = 1
+        attrs.append(attr)
     drv_cfg.numAttrs = len(attrs)
     drv_cfg.attrs = attrs
     return drv_cfg
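A usage sketch for the new field (grid and block sizes are illustrative): the flag is validated against the current device in __post_init__ and translated into a CU_LAUNCH_ATTRIBUTE_COOPERATIVE launch attribute by _to_native_launch_config:

from cuda.core.experimental import Device, LaunchConfig

Device().set_current()
# Opt into a cooperative launch; __post_init__ raises CUDAError on
# devices that lack cooperative-launch support.
config = LaunchConfig(grid=16, block=256, shmem_size=0, cooperative_launch=True)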

cuda_core/cuda/core/experimental/_launcher.py
Lines changed: 16 additions & 0 deletions

@@ -9,6 +9,7 @@
 from cuda.core.experimental._stream import Stream
 from cuda.core.experimental._utils.clear_error_support import assert_type
 from cuda.core.experimental._utils.cuda_utils import (
+    _reduce_3_tuple,
     check_or_create_options,
     driver,
     get_binding_version,
@@ -78,6 +79,8 @@ def launch(stream, config, kernel, *kernel_args):
     if _use_ex:
         drv_cfg = _to_native_launch_config(config)
         drv_cfg.hStream = stream.handle
+        if config.cooperative_launch:
+            _check_cooperative_launch(kernel, config, stream)
         handle_return(driver.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
     else:
         # TODO: check if config has any unsupported attrs
@@ -86,3 +89,16 @@ def launch(stream, config, kernel, *kernel_args):
                 int(kernel._handle), *config.grid, *config.block, config.shmem_size, stream.handle, args_ptr, 0
             )
         )
+
+
+def _check_cooperative_launch(kernel: Kernel, config: LaunchConfig, stream: Stream):
+    dev = stream.device
+    num_sm = dev.properties.multiprocessor_count
+    max_grid_size = (
+        kernel.occupancy.max_active_blocks_per_multiprocessor(_reduce_3_tuple(config.block), config.shmem_size) * num_sm
+    )
+    if _reduce_3_tuple(config.grid) > max_grid_size:
+        # For now let's try not to be smart and adjust the grid size behind users' back.
+        # We explicitly ask users to adjust.
+        x, y, z = config.grid
+        raise ValueError(f"The specified grid size ({x} * {y} * {z}) exceeds the limit ({max_grid_size})")
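End to end, a hedged sketch of launching a cooperative kernel (`ker` is assumed to be a Kernel obtained from a compiled Program; the kernel source itself is elided):

from cuda.core.experimental import Device, LaunchConfig, launch

dev = Device()
dev.set_current()
stream = dev.create_stream()

# ker is assumed to come from, e.g., Program(...).compile("cubin").get_kernel(...)
config = LaunchConfig(grid=8, block=256, cooperative_launch=True)
launch(stream, config, ker)  # raises ValueError if grid exceeds the occupancy limit above
stream.sync()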

cuda_core/cuda/core/experimental/_utils/cuda_utils.py
Lines changed: 4 additions & 0 deletions

@@ -48,6 +48,10 @@ def cast_to_3_tuple(label, cfg):
     return cfg + (1,) * (3 - len(cfg))
 
 
+def _reduce_3_tuple(t: tuple):
+    return t[0] * t[1] * t[2]
+
+
 def _check_driver_error(error):
     if error == driver.CUresult.CUDA_SUCCESS:
         return
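The new helper just collapses a 3-tuple of launch extents into a total count; for illustration only (it is a private helper):

from cuda.core.experimental._utils.cuda_utils import _reduce_3_tuple

# an (8, 4, 2) grid flattens to 64 blocks in total
assert _reduce_3_tuple((8, 4, 2)) == 64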

cuda_core/docs/source/release/0.3.0-notes.rst
Lines changed: 5 additions & 3 deletions

@@ -20,11 +20,13 @@ Breaking Changes
 New features
 ------------
 
-- :class:`Kernel` adds :property:`Kernel.num_arguments` and :property:`Kernel.arguments_info` for introspection of kernel arguments. (#612)
-- Add pythonic access to kernel occupancy calculation functions via :property:`Kernel.occupancy`. (#648)
+- :class:`Kernel` adds :attr:`Kernel.num_arguments` and :attr:`Kernel.arguments_info` for introspection of kernel arguments. (#612)
+- Add pythonic access to kernel occupancy calculation functions via :attr:`Kernel.occupancy`. (#648)
+- Support launching cooperative kernels by setting :property:`LaunchConfig.cooperative_launch` to `True`.
 - A name can be assigned to :class:`ObjectCode` instances generated by both :class:`Program` and :class:`Linker` through their respective
   options.
 
+
 New examples
 ------------
 
@@ -33,4 +35,4 @@ Fixes and enhancements
 ----------------------
 
 - An :class:`Event` can now be used to look up its corresponding device and context using the ``.device`` and ``.context`` attributes respectively.
-- The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed
+- The :func:`launch` function's handling of fp16 scalars was incorrect and is fixed.
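For the ObjectCode naming entry above, a minimal sketch (assuming the option field is ``name`` on ProgramOptions, and similarly on LinkerOptions):

from cuda.core.experimental import Program, ProgramOptions

code = 'extern "C" __global__ void noop() {}'
prog = Program(code, code_type="c++", options=ProgramOptions(name="my_module"))
mod = prog.compile("cubin")  # the resulting ObjectCode carries the given name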
New example file
Lines changed: 135 additions & 0 deletions

@@ -0,0 +1,135 @@
# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0

# ################################################################################
#
# This demo aims to illustrate two takeaways:
#
#   1. The similarity between CPU and GPU JIT-compilation with C++ sources
#   2. How to use StridedMemoryView to interface with foreign C/C++ functions
#
# To facilitate this demo, we use cffi (https://cffi.readthedocs.io/) for the CPU
# path, which can be easily installed from pip or conda following their instructions.
# We also use NumPy/CuPy as the CPU/GPU array container.
#
# ################################################################################

import importlib
import shutil
import string
import sys
import tempfile

try:
    from cffi import FFI
except ImportError:
    print("cffi is not installed, the CPU example will be skipped", file=sys.stderr)
    FFI = None
import numpy as np

from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory

# ################################################################################
#
# Usually this entire code block is in a separate file, built as a Python extension
# module that can be imported by users at run time. For illustrative purposes we
# use JIT compilation to make this demo self-contained.
#
# Here we assume an in-place operation, equivalent to the following NumPy code:
#
#   >>> arr = ...
#   >>> assert arr.dtype == np.int32
#   >>> assert arr.ndim == 1
#   >>> arr += np.arange(arr.size, dtype=arr.dtype)
#
# is implemented for both CPU and GPU at a low level, with the following C function
# signature:
func_name = "inplace_plus_arange_N"
func_sig = f"void {func_name}(int* data, size_t N)"


# Now we are prepared to run the code from the user's perspective!
#
# ################################################################################


# Below, as a user we want to perform the said in-place operation on a CPU
# or GPU, by calling the corresponding function implemented "elsewhere"
# (in the body of the run function).


# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
# of which are supported by StridedMemoryView).
@args_viewable_as_strided_memory((0,))
def my_func(arr):
    global cpu_func
    global cpu_prog
    # Create a memory view over arr (assumed to be a 1D array of int32). The stream
    # ordering is taken care of, so that arr can be safely accessed on our work
    # stream (ordered after a data stream on which arr is potentially prepared).
    view = arr.view(-1)
    assert isinstance(view, StridedMemoryView)
    assert len(view.shape) == 1
    assert view.dtype == np.int32
    assert not view.is_device_accessible

    size = view.shape[0]
    # DLPack also supports host arrays. We want to know if the array data is
    # accessible from the GPU, and dispatch to the right routine accordingly.
    cpu_func(cpu_prog.cast("int*", view.ptr), size)


def run():
    # declare the JIT-compiled artifacts as globals so that my_func can see them
    global cpu_func, cpu_prog, my_func
    if not FFI:
        return
    # Here is a concrete (very naive!) implementation on CPU:
    cpu_code = string.Template(r"""
    extern "C"
    $func_sig {
        for (size_t i = 0; i < N; i++) {
            data[i] += i;
        }
    }
    """).substitute(func_sig=func_sig)
    # This is cffi's way of JIT compiling & loading a CPU function. cffi builds an
    # extension module that has the Python binding to the underlying C function.
    # For more details, please refer to cffi's documentation.
    cpu_prog = FFI()
    cpu_prog.cdef(f"{func_sig};")
    cpu_prog.set_source(
        "_cpu_obj",
        cpu_code,
        source_extension=".cpp",
        extra_compile_args=["-std=c++11"],
    )
    temp_dir = tempfile.mkdtemp()
    saved_sys_path = sys.path.copy()
    try:
        cpu_prog.compile(tmpdir=temp_dir)

        sys.path.append(temp_dir)
        cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name)

        # Create input array on CPU
        arr_cpu = np.zeros(1024, dtype=np.int32)
        print(f"before: {arr_cpu[:10]=}")

        # Run the workload
        my_func(arr_cpu)

        # Check the result
        print(f"after: {arr_cpu[:10]=}")
        assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
    finally:
        sys.path = saved_sys_path
        # to allow the FFI module to unload, we delete references to cpu_func
        del cpu_func, my_func
        # clean up temp directory
        shutil.rmtree(temp_dir)


if __name__ == "__main__":
    run()
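The demo above asserts host-resident data; as its comments hint, a dispatching variant (hypothetical, not part of this commit) could branch on StridedMemoryView.is_device_accessible, with gpu_func standing in for a device-side counterpart prepared elsewhere:

@args_viewable_as_strided_memory((0,))
def my_dispatching_func(arr, stream_ptr=-1):
    view = arr.view(stream_ptr)
    size = view.shape[0]
    if view.is_device_accessible:
        gpu_func(view.ptr, size)  # hypothetical GPU twin of cpu_func
    else:
        cpu_func(cpu_prog.cast("int*", view.ptr), size)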
