
Commit 542c91f

Merge remote-tracking branch 'upstream/main' into device-properties
2 parents 56ca8ae + d6afedf commit 542c91f

File tree: 10 files changed, +48 −35 lines

.github/workflows/build-docs.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -38,7 +38,7 @@ jobs:
     # The build stage could fail but we want the CI to keep moving.
     if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
     # WAR: Building the doc currently requires a GPU (NVIDIA/cuda-python#326,327)
-    runs-on: linux-amd64-gpu-t4-latest-1-testing
+    runs-on: linux-amd64-gpu-t4-latest-1
     #runs-on: ubuntu-latest
     defaults:
       run:
```

.github/workflows/test-wheel.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -28,7 +28,7 @@ jobs:
     if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
     runs-on: ${{ (inputs.runner == 'default' && inputs.host-platform == 'linux-64' && 'linux-amd64-gpu-v100-latest-1') ||
                  (inputs.runner == 'default' && inputs.host-platform == 'linux-aarch64' && 'linux-arm64-gpu-a100-latest-1') ||
-                 (inputs.runner == 'H100' && 'linux-amd64-gpu-h100-latest-1-testing') }}
+                 (inputs.runner == 'H100' && 'linux-amd64-gpu-h100-latest-1') }}
     # Our self-hosted runners require a container
     # TODO: use a different (nvidia?) container
     container:
```
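Aside on how this `runs-on` expression works: GitHub Actions has no ternary operator, so the workflow chains `(condition && 'label') || …` clauses — each `&&` yields its label only when the condition holds, and `||` picks the first truthy branch. A Python analogue of the same selection logic (the function name is illustrative, not part of the workflow):

```python
def pick_runner(runner, host_platform):
    # Mirrors the short-circuit trick in the workflow's runs-on expression:
    # each "and" clause yields its runner label only when its guards hold,
    # and "or" selects the first truthy result. No match yields a falsy value.
    return (
        (runner == "default" and host_platform == "linux-64" and "linux-amd64-gpu-v100-latest-1")
        or (runner == "default" and host_platform == "linux-aarch64" and "linux-arm64-gpu-a100-latest-1")
        or (runner == "H100" and "linux-amd64-gpu-h100-latest-1")
    )

print(pick_runner("H100", "linux-64"))  # → linux-amd64-gpu-h100-latest-1
```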

cuda_bindings/docs/build_docs.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -23,7 +23,7 @@ if [[ -z "${SPHINX_CUDA_BINDINGS_VER}" ]]; then
 fi
 
 # build the docs (in parallel)
-SPHINXOPTS="-j 4" make html
+SPHINXOPTS="-j 4 -d build/.doctrees" make html
 
 # for debugging/developing (conf.py), please comment out the above line and
 # use the line below instead, as we must build in serial to avoid getting
```
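For context (not part of the diff): Sphinx's `-d` option relocates the cached doctree pickles, here into `build/.doctrees`, keeping them out of the rendered HTML tree. A hypothetical standalone invocation equivalent to what the Makefile ends up running (paths are illustrative):

```shell
# Build HTML docs with 4 parallel workers; cache parsed doctrees
# under build/.doctrees instead of the default location.
sphinx-build -b html -j 4 -d build/.doctrees source build/html
```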

cuda_bindings/docs/source/conf.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -32,7 +32,7 @@
 # ones.
 extensions = ["sphinx.ext.autodoc", "sphinx.ext.napoleon", "myst_nb", "enum_tools.autoenum"]
 
-jupyter_execute_notebooks = "force"
+nb_execution_mode = "off"
 numfig = True
 
 # Add any paths that contain templates here, relative to this directory.
```
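For context (not part of the diff): myst-nb deprecated the top-level `jupyter_execute_notebooks` option in favor of `nb_execution_mode`, and `"off"` disables notebook execution entirely at build time — consistent with this commit also dropping the jupytext front matter from overview.md. The modern conf.py fragment looks like:

```python
# myst-nb execution setting: "off" skips executing notebooks during the
# docs build. Other accepted values include "auto", "force", and "cache".
nb_execution_mode = "off"
```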

cuda_bindings/docs/source/overview.md

Lines changed: 12 additions & 21 deletions

````diff
@@ -1,12 +1,3 @@
----
-jupytext:
-  text_representation:
-    format_name: myst
-kernelspec:
-  display_name: Python 3
-  name: python3
----
-
 # Overview
 
 <p style="font-size: 14px; color: grey; text-align: right;">by <a
@@ -48,7 +39,7 @@ API](https://docs.nvidia.com/cuda/cuda-driver-api/index.html) and
 Python package. In this example, you copy data from the host to device. You need
 [NumPy](https://numpy.org/doc/stable/contents.html) to store data on the host.
 
-```{code-cell} python
+```python
 from cuda.bindings import driver, nvrtc
 import numpy as np
 ```
@@ -58,7 +49,7 @@ example is provided.
 In a future release, this may automatically raise exceptions using a Python
 object model.
 
-```{code-cell} python
+```python
 def _cudaGetErrorEnum(error):
     if isinstance(error, driver.CUresult):
         err, name = driver.cuGetErrorName(error)
@@ -86,7 +77,7 @@ Python that requires some understanding of CUDA C++. For more information, see
 [An Even Easier Introduction to
 CUDA](https://developer.nvidia.com/blog/even-easier-introduction-cuda/).
 
-```{code-cell} python
+```python
 saxpy = """\
 extern "C" __global__
 void saxpy(float a, float *x, float *y, float *out, size_t n)
@@ -108,7 +99,7 @@ In the following code example, the Driver API is initialized so that the NVIDIA
 and GPU are accessible. Next, the GPU is queried for their compute capability. Finally,
 the program is compiled to target our local compute capability architecture with FMAD enabled.
 
-```{code-cell} python
+```python
 # Initialize CUDA Driver API
 checkCudaErrors(driver.cuInit(0))
 
@@ -138,7 +129,7 @@ context. CUDA contexts are analogous to host processes for the device. In the
 following code example, a handle for compute device 0 is passed to
 `cuCtxCreate` to designate that GPU for context creation.
 
-```{code-cell} python
+```python
 # Create context
 context = checkCudaErrors(driver.cuCtxCreate(0, cuDevice))
 ```
@@ -148,7 +139,7 @@ module. A module is analogous to dynamically loaded libraries for the device.
 After loading into the module, extract a specific kernel with
 `cuModuleGetFunction`. It is not uncommon for multiple kernels to reside in PTX.
 
-```{code-cell} python
+```python
 # Load PTX as module data and retrieve function
 ptx = np.char.array(ptx)
 # Note: Incompatible --gpu-architecture would be detected here
@@ -161,7 +152,7 @@ application performance, you can input data on the device to eliminate data
 transfers. For completeness, this example shows how you would transfer data to
 and from the device.
 
-```{code-cell} python
+```python
 NUM_THREADS = 512   # Threads per block
 NUM_BLOCKS = 32768  # Blocks per grid
 
@@ -184,7 +175,7 @@ Python doesn’t have a natural concept of pointers, yet `cuMemcpyHtoDAsync` expects
 `void*`. Therefore, `XX.ctypes.data` retrieves the pointer value associated with
 XX.
 
-```{code-cell} python
+```python
 dXclass = checkCudaErrors(driver.cuMemAlloc(bufferSize))
 dYclass = checkCudaErrors(driver.cuMemAlloc(bufferSize))
 dOutclass = checkCudaErrors(driver.cuMemAlloc(bufferSize))
@@ -209,7 +200,7 @@ Like `cuMemcpyHtoDAsync`, `cuLaunchKernel` expects `void**` in the argument list
 the earlier code example, it creates `void**` by grabbing the `void*` value of each
 individual argument and placing them into its own contiguous memory.
 
-```{code-cell} python
+```python
 # The following code example is not intuitive
 # Subject to change in a future release
 dX = np.array([int(dXclass)], dtype=np.uint64)
@@ -222,7 +213,7 @@ args = np.array([arg.ctypes.data for arg in args], dtype=np.uint64)
 
 Now the kernel can be launched:
 
-```{code-cell} python
+```python
 checkCudaErrors(driver.cuLaunchKernel(
     kernel,
     NUM_BLOCKS,  # grid x dim
@@ -251,7 +242,7 @@ stream are serialized. After the call to transfer data back to the host is
 executed, `cuStreamSynchronize` is used to halt CPU execution until all operations
 in the designated stream are finished.
 
-```{code-cell} python
+```python
 # Assert values are same after running kernel
 hZ = a * hX + hY
 if not np.allclose(hOut, hZ):
@@ -261,7 +252,7 @@ if not np.allclose(hOut, hZ):
 Perform verification of the data to ensure correctness and finish the code with
 memory clean up.
 
-```{code-cell} python
+```python
 checkCudaErrors(driver.cuStreamDestroy(stream))
 checkCudaErrors(driver.cuMemFree(dXclass))
 checkCudaErrors(driver.cuMemFree(dYclass))
```
````
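A note for reading these hunks: every `cuda.bindings.driver` call in the overview returns a tuple whose first element is a status enum, which the `checkCudaErrors` helper unpacks before returning any payload. A simplified, driver-free sketch of that pattern (the `fake_*` functions are hypothetical stand-ins for real binding calls):

```python
def checkCudaErrors(result):
    # cuda.bindings calls return (status, *values); nonzero status is an error.
    if result[0]:
        raise RuntimeError(f"CUDA error: code {int(result[0])}")
    if len(result) == 1:
        return None
    return result[1] if len(result) == 2 else result[1:]

# Hypothetical binding-style calls returning (status, *values):
def fake_cu_init():
    return (0,)          # success, no payload

def fake_cu_device_get(ordinal):
    return (0, ordinal)  # success, returns a device handle

checkCudaErrors(fake_cu_init())
device = checkCudaErrors(fake_cu_device_get(0))
print(device)  # → 0
```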

cuda_core/cuda/core/experimental/_memoryview.pyx

Lines changed: 7 additions & 6 deletions

```diff
@@ -48,20 +48,20 @@ cdef class StridedMemoryView:
     ----------
     ptr : int
         Pointer to the tensor buffer (as a Python `int`).
-    shape: tuple
+    shape : tuple
         Shape of the tensor.
-    strides: tuple
+    strides : tuple
         Strides of the tensor (in **counts**, not bytes).
     dtype: numpy.dtype
         Data type of the tensor.
-    device_id: int
+    device_id : int
         The device ID for where the tensor is located. It is -1 for CPU tensors
         (meaning those only accessible from the host).
-    is_device_accessible: bool
+    is_device_accessible : bool
         Whether the tensor data can be accessed on the GPU.
     readonly: bool
         Whether the tensor data can be modified in place.
-    exporting_obj: Any
+    exporting_obj : Any
         A reference to the original tensor object that is being viewed.
 
     Parameters
@@ -334,7 +334,8 @@ cdef StridedMemoryView view_as_cai(obj, stream_ptr, view=None):
 
 
 def args_viewable_as_strided_memory(tuple arg_indices):
-    """Decorator to create proxy objects to :obj:`StridedMemoryView` for the
+    """
+    Decorator to create proxy objects to :obj:`StridedMemoryView` for the
     specified positional arguments.
 
     This allows array/tensor attributes to be accessed inside the function
```
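Side note on these docstring fixes: the numpydoc format wants a space on both sides of the colon in `name : type` lines so that Sphinx renders proper attribute/parameter tables (two entries, `dtype:` and `readonly:`, are left unchanged by this hunk). A tiny illustrative example with a hypothetical function:

```python
def scale(x, factor=2.0):
    """Scale a value.

    Parameters
    ----------
    x : float
        The value to scale.
    factor : float, optional
        Multiplier applied to ``x`` (default 2.0).
    """
    return x * factor

print(scale(3.0))  # → 6.0
```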

cuda_core/docs/build_docs.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -19,7 +19,7 @@ if [[ -z "${SPHINX_CUDA_CORE_VER}" ]]; then
 fi
 
 # build the docs (in parallel)
-SPHINXOPTS="-j 4" make html
+SPHINXOPTS="-j 4 -d build/.doctrees" make html
 
 # for debugging/developing (conf.py), please comment out the above line and
 # use the line below instead, as we must build in serial to avoid getting
```

cuda_core/docs/source/conf.py

Lines changed: 22 additions & 1 deletion

```diff
@@ -10,8 +10,11 @@
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 import os
+import sys
+from unittest.mock import MagicMock
+
+from cuda.core.experimental._system import System
 
-# import sys
 # sys.path.insert(0, os.path.abspath('.'))
 
@@ -102,6 +105,24 @@
 napoleon_numpy_docstring = True
 
 
+# Mock the System class and its methods
+class MockSystem:
+    def __init__(self, *args, **kwargs):
+        pass
+
+    driver_version = MagicMock()
+    driver_version.__doc__ = System.driver_version.__doc__
+    num_devices = MagicMock()
+    num_devices.__doc__ = System.num_devices.__doc__
+    devices = MagicMock()
+    devices.__doc__ = System.devices.__doc__
+
+
+sys.modules["cuda.core.experimental._system.System"] = MagicMock(System=MockSystem)
+
+# Add 'cuda.core.experimental.system' to autodoc_mock_imports
+autodoc_mock_imports = ["cuda.core.experimental.system"]
+
 section_titles = ["Returns"]
 
 
```
cuda_python/docs/build_docs.sh

Lines changed: 1 addition & 1 deletion

```diff
@@ -23,7 +23,7 @@ if [[ -z "${SPHINX_CUDA_PYTHON_VER}" ]]; then
 fi
 
 # build the docs (in parallel)
-SPHINXOPTS="-j 4" make html
+SPHINXOPTS="-j 4 -d build/.doctrees" make html
 
 # for debugging/developing (conf.py), please comment out the above line and
 # use the line below instead, as we must build in serial to avoid getting
```

cuda_python/docs/source/release/11.8.6-notes.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -4,7 +4,7 @@ Released on January 24, 2025.
 
 ## Included components
 
-- [`cuda.bindings` 11.8.6](https://nvidia.github.io/cuda-python/cuda-bindings/11.8.6/release/11.8.6-notes.html)
+- [`cuda.bindings` 11.8.6](https://nvidia.github.io/cuda-python/cuda-bindings/12.8.0/release/11.8.6-notes.html)
 
 
 ## Highlights
```

0 commit comments
