Skip to content

Commit 219e2b7

Browse files
committed
naive implementation first draft
1 parent 6820f30 commit 219e2b7

File tree

2 files changed

+39
-2
lines changed

2 files changed

+39
-2
lines changed

cuda_core/cuda/core/experimental/_device.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from cuda.core.experimental._utils import handle_return, ComputeCapability, CUDAError, \
1111
precondition
1212
from cuda.core.experimental._context import Context, ContextOptions
13-
from cuda.core.experimental._memory import _DefaultAsyncMempool, Buffer, MemoryResource
13+
from cuda.core.experimental._memory import _DefaultAsyncMempool, _AsyncMemoryResource, Buffer, MemoryResource
1414
from cuda.core.experimental._stream import default_stream, Stream, StreamOptions
1515

1616

@@ -65,7 +65,13 @@ def __new__(cls, device_id=None):
6565
for dev_id in range(total):
6666
dev = super().__new__(cls)
6767
dev._id = dev_id
68-
dev._mr = _DefaultAsyncMempool(dev_id)
68+
# If the device is in TCC mode, or does not support memory pools for some other reason, use the AsyncMemoryResource which does not use memory pools.
69+
# The DefaultAsyncMempool uses memory pools, which are not always supported.
70+
if handle_return(cudart.cudaGetDeviceProperties(dev_id))['CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED'] == 0:
71+
dev._mr = _AsyncMemoryResource(dev_id)
72+
else:
73+
dev._mr = _DefaultAsyncMempool(dev_id)
74+
6975
dev._has_inited = False
7076
_tls.devices.append(dev)
7177

cuda_core/cuda/core/experimental/_memory.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -296,3 +296,34 @@ def is_host_accessible(self) -> bool:
296296
    @property
    def device_id(self) -> int:
        # Pinned memory lives in host memory and is not tied to any single
        # GPU, so asking this resource for a device id is always an error.
        raise RuntimeError("the pinned memory resource is not bound to any GPU")
299+
300+
class _AsyncMemoryResource(MemoryResource):
    """Stream-ordered device memory resource that bypasses memory pools.

    Allocates with ``cuMemAllocAsync``/``cuMemFreeAsync`` directly instead of
    going through a mempool handle, intended for devices where pools are
    unavailable (presumably e.g. TCC mode — confirm against _device.py usage).
    """

    __slots__ = ("_dev_id",)

    def __init__(self, dev_id):
        # No pool object to manage, so there is no underlying handle to keep.
        self._handle = None
        self._dev_id = dev_id

    def allocate(self, size, stream=None) -> Buffer:
        """Asynchronously allocate ``size`` bytes, on the default stream if none is given."""
        target = default_stream() if stream is None else stream
        ptr = handle_return(cuda.cuMemAllocAsync(size, target._handle))
        return Buffer(ptr, size, self)

    def deallocate(self, ptr, size, stream=None):
        """Asynchronously release ``ptr``, on the default stream if none is given."""
        target = default_stream() if stream is None else stream
        handle_return(cuda.cuMemFreeAsync(ptr, target._handle))

    @property
    def is_device_accessible(self) -> bool:
        # Device allocation: reachable from the GPU.
        return True

    @property
    def is_host_accessible(self) -> bool:
        # Not mapped into host address space.
        return False

    @property
    def device_id(self) -> int:
        # The device this resource was constructed for.
        return self._dev_id

0 commit comments

Comments
 (0)