
Commit 5d64579

Merge remote-tracking branch 'upstream/main' into nvjitlink-documentation

2 parents 297947b + dde2fe2

File tree: 20 files changed, +321 −66 lines changed

.github/actions/build/action.yml

Lines changed: 12 additions & 0 deletions

@@ -44,6 +44,12 @@ runs:
           $CHOWN -R $(whoami) ${{ env.CUDA_CORE_ARTIFACTS_DIR }}
           ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }}

+    - name: Check cuda.core wheel
+      shell: bash --noprofile --norc -xeuo pipefail {0}
+      run: |
+        pip install twine
+        twine check ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl
+
     - name: Upload cuda.core build artifacts
       uses: actions/upload-artifact@v4
       with:
@@ -82,6 +88,12 @@ runs:
           $CHOWN -R $(whoami) ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}
           ls -lahR ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}

+    # TODO: enable this after NVIDIA/cuda-python#297 is resolved
+    # - name: Check cuda.bindings wheel
+    #   shell: bash --noprofile --norc -xeuo pipefail {0}
+    #   run: |
+    #     twine check ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl
+
     - name: Upload cuda.bindings build artifacts
       uses: actions/upload-artifact@v4
       with:

.github/actions/test/action.yml

Lines changed: 7 additions & 0 deletions

@@ -14,6 +14,13 @@ runs:
       shell: bash --noprofile --norc -xeuo pipefail {0}
       run: nvidia-smi

+    # The cache action needs this
+    - name: Install zstd
+      shell: bash --noprofile --norc -xeuo pipefail {0}
+      run: |
+        apt update
+        apt install zstd
+
     - name: Download bindings build artifacts
       uses: actions/download-artifact@v4
       with:

.github/workflows/gh-build-and-test.yml

Lines changed: 7 additions & 5 deletions

@@ -76,17 +76,19 @@ jobs:
   test:
     # TODO: improve the name once a separate test matrix is defined
     name: Test (CUDA ${{ inputs.cuda-version }})
-    # TODO: enable testing once linux-aarch64 & win-64 GPU runners are up
+    # TODO: enable testing once win-64 GPU runners are up
     if: ${{ (github.repository_owner == 'nvidia') &&
-            startsWith(inputs.host-platform, 'linux-x64') }}
+            startsWith(inputs.host-platform, 'linux') }}
     permissions:
       id-token: write # This is required for configure-aws-credentials
       contents: read  # This is required for actions/checkout
-    runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') }}
-    # TODO: use a different (nvidia?) container, or just run on bare image
+    runs-on: ${{ (inputs.host-platform == 'linux-x64' && 'linux-amd64-gpu-v100-latest-1') ||
+                 (inputs.host-platform == 'linux-aarch64' && 'linux-arm64-gpu-a100-latest-1') }}
+    # Our self-hosted runners require a container
+    # TODO: use a different (nvidia?) container
     container:
       options: -u root --security-opt seccomp=unconfined --privileged --shm-size 16g
-      image: condaforge/miniforge3:latest
+      image: ubuntu:22.04
       env:
         NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
     needs:

cuda_core/DESCRIPTION.rst

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
+*******************************************************
+cuda-core: Pythonic access to CUDA core functionalities
+*******************************************************
+
+`cuda.core <https://nvidia.github.io/cuda-python/cuda-core/>`_ bridges Python's productivity
+with CUDA's performance through intuitive and pythonic APIs.
+The mission is to provide users full access to all of the core CUDA features in Python,
+such as runtime control, compiler and linker.
+
+* `Repository <https://github.com/NVIDIA/cuda-python/tree/main/cuda_core>`_
+* `Documentation <https://nvidia.github.io/cuda-python/cuda-core/>`_
+* `Examples <https://github.com/NVIDIA/cuda-python/tree/main/cuda_core/examples>`_
+* `Issue tracker <https://github.com/NVIDIA/cuda-python/issues/>`_
+
+`cuda.core` is currently under active development. Any feedbacks or suggestions are welcomed!
+
+
+Installation
+============
+
+.. code-block:: bash
+
+   pip install cuda-core[cu12]
+
+Please refer to the `installation instructions
+<https://nvidia.github.io/cuda-python/cuda-core/latest/install.html>`_ for different
+ways of installing `cuda.core`, including building from source.
cuda_core/README.md

Lines changed: 1 addition & 1 deletion

@@ -1,6 +1,6 @@
 # `cuda.core`: (experimental) pythonic CUDA module

-Currently under active developmen; see [the documentation](https://nvidia.github.io/cuda-python/cuda-core/latest/) for more details.
+Currently under active development; see [the documentation](https://nvidia.github.io/cuda-python/cuda-core/latest/) for more details.

 ## Installing

cuda_core/cuda/core/_version.py

Lines changed: 1 addition & 1 deletion

@@ -2,4 +2,4 @@
 #
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

-__version__ = "0.1.0"
+__version__ = "0.1.1"

cuda_core/cuda/core/experimental/_device.py

Lines changed: 16 additions & 16 deletions

@@ -23,11 +23,11 @@ class Device:
     and use the same GPU device.

     While acting as the entry point, many other CUDA resources can be
-    allocated such as streams and buffers. Any :obj:`Context` dependent
+    allocated such as streams and buffers. Any :obj:`~_context.Context` dependent
     resource created through this device, will continue to refer to
     this device's context.

-    Newly returend :obj:`Device` object are is a thread-local singleton
+    Newly returned :obj:`~_device.Device` objects are thread-local singletons
     for a specified device.

     Note
@@ -37,7 +37,7 @@ class Device:
     Parameters
     ----------
     device_id : int, optional
-        Device ordinal to return a :obj:`Device` object for.
+        Device ordinal to return a :obj:`~_device.Device` object for.
         Default value of `None` return the currently used device.

     """
@@ -144,7 +144,7 @@ def compute_capability(self) -> ComputeCapability:
     @property
     @precondition(_check_context_initialized)
     def context(self) -> Context:
-        """Return the current :obj:`Context` associated with this device.
+        """Return the current :obj:`~_context.Context` associated with this device.

         Note
         ----
@@ -157,7 +157,7 @@ def context(self) -> Context:

     @property
     def memory_resource(self) -> MemoryResource:
-        """Return :obj:`MemoryResource` associated with this device."""
+        """Return :obj:`~_memory.MemoryResource` associated with this device."""
         return self._mr

     @memory_resource.setter
@@ -168,7 +168,7 @@ def memory_resource(self, mr):

     @property
     def default_stream(self) -> Stream:
-        """Return default CUDA :obj:`Stream` associated with this device.
+        """Return default CUDA :obj:`~_stream.Stream` associated with this device.

         The type of default stream returned depends on if the environment
         variable CUDA_PYTHON_CUDA_PER_THREAD_DEFAULT_STREAM is set.
@@ -191,18 +191,18 @@ def set_current(self, ctx: Context = None) -> Union[Context, None]:

         Initializes CUDA and sets the calling thread to a valid CUDA
         context. By default the primary context is used, but optional `ctx`
-        parameter can be used to explicitly supply a :obj:`Context` object.
+        parameter can be used to explicitly supply a :obj:`~_context.Context` object.

         Providing a `ctx` causes the previous set context to be popped and returned.

         Parameters
         ----------
-        ctx : :obj:`Context`, optional
+        ctx : :obj:`~_context.Context`, optional
             Optional context to push onto this device's current thread stack.

         Returns
         -------
-        Union[:obj:`Context`, None], optional
+        Union[:obj:`~_context.Context`, None], optional
             Popped context.

         Examples
@@ -247,20 +247,20 @@ def set_current(self, ctx: Context = None) -> Union[Context, None]:
         self._has_inited = True

     def create_context(self, options: ContextOptions = None) -> Context:
-        """Create a new :obj:`Context` object.
+        """Create a new :obj:`~_context.Context` object.

         Note
         ----
         The newly context will not be set as current.

         Parameters
         ----------
-        options : :obj:`ContextOptions`, optional
+        options : :obj:`~_context.ContextOptions`, optional
             Customizable dataclass for context creation options.

         Returns
         -------
-        :obj:`Context`
+        :obj:`~_context.Context`
             Newly created context object.

         """
@@ -286,12 +286,12 @@ def create_stream(self, obj=None, options: StreamOptions = None) -> Stream:
         ----------
         obj : Any, optional
             Any object supporting the __cuda_stream__ protocol.
-        options : :obj:`StreamOptions`, optional
+        options : :obj:`~_stream.StreamOptions`, optional
             Customizable dataclass for stream creation options.

         Returns
         -------
-        :obj:`Stream`
+        :obj:`~_stream.Stream`
             Newly created stream object.

         """
@@ -314,13 +314,13 @@ def allocate(self, size, stream=None) -> Buffer:
         ----------
         size : int
             Number of bytes to allocate.
-        stream : :obj:`Stream`, optional
+        stream : :obj:`~_stream.Stream`, optional
             The stream establishing the stream ordering semantic.
             Default value of `None` uses default stream.

         Returns
         -------
-        :obj:`Buffer`
+        :obj:`~_memory.Buffer`
             Newly created buffer object.

         """

cuda_core/cuda/core/experimental/_event.py

Lines changed: 3 additions & 3 deletions

@@ -12,7 +12,7 @@

 @dataclass
 class EventOptions:
-    """Customizable :obj:`Event` options.
+    """Customizable :obj:`~_event.Event` options.

     Attributes
     ----------
@@ -46,8 +46,8 @@ class Event:
     of work up to event's record, and help establish dependencies
     between GPU work submissions.

-    Directly creating an :obj:`Event` is not supported due to ambiguity,
-    and they should instead be created through a :obj:`Stream` object.
+    Directly creating an :obj:`~_event.Event` is not supported due to ambiguity,
+    and they should instead be created through a :obj:`~_stream.Stream` object.

     """


cuda_core/cuda/core/experimental/_launcher.py

Lines changed: 30 additions & 10 deletions

@@ -7,6 +7,7 @@
 from typing import Optional, Union

 from cuda import cuda
+from cuda.core.experimental._device import Device
 from cuda.core.experimental._kernel_arg_handler import ParamHolder
 from cuda.core.experimental._module import Kernel
 from cuda.core.experimental._stream import Stream
@@ -38,11 +39,15 @@ class LaunchConfig:
     ----------
     grid : Union[tuple, int]
         Collection of threads that will execute a kernel function.
+    cluster : Union[tuple, int]
+        Group of blocks (Thread Block Cluster) that will execute on the same
+        GPU Processing Cluster (GPC). Blocks within a cluster have access to
+        distributed shared memory and can be explicitly synchronized.
     block : Union[tuple, int]
         Group of threads (Thread Block) that will execute on the same
-        multiprocessor. Threads within a thread blocks have access to
-        shared memory and can be explicitly synchronized.
-    stream : :obj:`Stream`
+        streaming multiprocessor (SM). Threads within a thread blocks have
+        access to shared memory and can be explicitly synchronized.
+    stream : :obj:`~_stream.Stream`
         The stream establishing the stream ordering semantic of a
         launch.
     shmem_size : int, optional
@@ -53,13 +58,22 @@ class LaunchConfig:

     # TODO: expand LaunchConfig to include other attributes
     grid: Union[tuple, int] = None
+    cluster: Union[tuple, int] = None
     block: Union[tuple, int] = None
     stream: Stream = None
     shmem_size: Optional[int] = None

     def __post_init__(self):
+        _lazy_init()
         self.grid = self._cast_to_3_tuple(self.grid)
         self.block = self._cast_to_3_tuple(self.block)
+        # thread block clusters are supported starting H100
+        if self.cluster is not None:
+            if not _use_ex:
+                raise CUDAError("thread block clusters require cuda.bindings & driver 11.8+")
+            if Device().compute_capability < (9, 0):
+                raise CUDAError("thread block clusters are not supported on devices with compute capability < 9.0")
+            self.cluster = self._cast_to_3_tuple(self.cluster)
         # we handle "stream=None" in the launch API
         if self.stream is not None and not isinstance(self.stream, Stream):
             try:
@@ -69,8 +83,6 @@ def __post_init__(self):
         if self.shmem_size is None:
             self.shmem_size = 0

-        _lazy_init()
-
     def _cast_to_3_tuple(self, cfg):
         if isinstance(cfg, int):
             if cfg < 1:
@@ -96,16 +108,16 @@ def _cast_to_3_tuple(self, cfg):


 def launch(kernel, config, *kernel_args):
-    """Launches a :obj:`~cuda.core.experimental._module.Kernel`
+    """Launches a :obj:`~_module.Kernel`
     object with launch-time configuration.

     Parameters
     ----------
-    kernel : :obj:`~cuda.core.experimental._module.Kernel`
+    kernel : :obj:`~_module.Kernel`
         Kernel to launch.
-    config : :obj:`LaunchConfig`
+    config : :obj:`~_launcher.LaunchConfig`
         Launch configurations inline with options provided by
-        :obj:`LaunchConfig` dataclass.
+        :obj:`~_launcher.LaunchConfig` dataclass.
     *kernel_args : Any
         Variable length argument list that is provided to the
         launching kernel.
@@ -133,7 +145,15 @@ def launch(kernel, config, *kernel_args):
         drv_cfg.blockDimX, drv_cfg.blockDimY, drv_cfg.blockDimZ = config.block
         drv_cfg.hStream = config.stream.handle
         drv_cfg.sharedMemBytes = config.shmem_size
-        drv_cfg.numAttrs = 0  # TODO
+        attrs = []  # TODO: support more attributes
+        if config.cluster:
+            attr = cuda.CUlaunchAttribute()
+            attr.id = cuda.CUlaunchAttributeID.CU_LAUNCH_ATTRIBUTE_CLUSTER_DIMENSION
+            dim = attr.value.clusterDim
+            dim.x, dim.y, dim.z = config.cluster
+            attrs.append(attr)
+        drv_cfg.numAttrs = len(attrs)
+        drv_cfg.attrs = attrs
         handle_return(cuda.cuLaunchKernelEx(drv_cfg, int(kernel._handle), args_ptr, 0))
     else:
         # TODO: check if config has any unsupported attrs
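The `__post_init__` above normalizes `grid`, `block`, and (when present) `cluster` through a `_cast_to_3_tuple` helper before launch. A standalone sketch of that normalization, assuming an int `n` becomes `(n, 1, 1)` and shorter tuples are padded with trailing 1s (the usual convention for CUDA launch dimensions):

```python
def cast_to_3_tuple(cfg):
    """Normalize an int or tuple launch dimension into a 3-tuple (sketch)."""
    if isinstance(cfg, int):
        cfg = (cfg,)
    if not isinstance(cfg, tuple) or not 1 <= len(cfg) <= 3:
        raise ValueError(f"expected an int or a tuple of length 1-3, got {cfg!r}")
    if any((not isinstance(d, int)) or d < 1 for d in cfg):
        raise ValueError("all dimensions must be positive integers")
    # pad missing dimensions with 1 so the driver always sees (x, y, z)
    return cfg + (1,) * (3 - len(cfg))


print(cast_to_3_tuple(256))     # (256, 1, 1)
print(cast_to_3_tuple((4, 2)))  # (4, 2, 1)
```

This mirrors why `config.grid`, `config.block`, and `config.cluster` can each be unpacked into three driver-config fields (`gridDimX/Y/Z`, `blockDimX/Y/Z`, `clusterDim.x/y/z`) in the `launch` path.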

cuda_core/cuda/core/experimental/_memory.py

Lines changed: 3 additions & 3 deletions

@@ -37,7 +37,7 @@ class Buffer:
         Allocated buffer handle object
     size : Any
         Memory size of the buffer
-    mr : :obj:`MemoryResource`, optional
+    mr : :obj:`~_memory.MemoryResource`, optional
         Memory resource associated with the buffer

     """
@@ -126,7 +126,7 @@ def copy_to(self, dst: Buffer = None, *, stream) -> Buffer:

         Parameters
         ----------
-        dst : :obj:`Buffer`
+        dst : :obj:`~_memory.Buffer`
             Source buffer to copy data from
         stream : Any
             Keyword argument specifying the stream for the
@@ -149,7 +149,7 @@ def copy_from(self, src: Buffer, *, stream):

         Parameters
         ----------
-        src : :obj:`Buffer`
+        src : :obj:`~_memory.Buffer`
             Source buffer to copy data from
         stream : Any
             Keyword argument specifying the stream for the
