Commit 440eabd

Improve perf of accessing dev.compute_capability (#459)
* cache cc to speed it up
* avoid silly, redundant lock
1 parent 976246a commit 440eabd
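
As a rough illustration of the access being optimized, repeated reads of the attribute could be timed with a micro-benchmark along these lines (a hypothetical sketch, not part of this commit; it assumes a CUDA-capable GPU and an installed cuda.core):

# Hypothetical micro-benchmark for this change; not part of the commit.
# Assumes a CUDA-capable GPU and that cuda.core is installed.
import timeit
from cuda.core.experimental import Device

dev = Device()
dev.set_current()

# Before this commit every access issued two cudaDeviceGetAttribute calls;
# after it, repeated accesses are served from a Python-level cache.
elapsed = timeit.timeit(lambda: dev.compute_capability, number=100_000)
print(f"100k reads of dev.compute_capability took {elapsed:.3f}s")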

2 files changed: +34 −31 lines changed
cuda_core/cuda/core/experimental/_device.py

Lines changed: 33 additions & 29 deletions
@@ -11,7 +11,8 @@
 from cuda.core.experimental._utils import ComputeCapability, CUDAError, driver, handle_return, precondition, runtime

 _tls = threading.local()
-_tls_lock = threading.Lock()
+_lock = threading.Lock()
+_is_cuInit = False


 class DeviceProperties:
@@ -938,6 +939,12 @@ class Device:
     __slots__ = ("_id", "_mr", "_has_inited", "_properties")

     def __new__(cls, device_id=None):
+        global _is_cuInit
+        if _is_cuInit is False:
+            with _lock:
+                handle_return(driver.cuInit(0))
+                _is_cuInit = True
+
         # important: creating a Device instance does not initialize the GPU!
         if device_id is None:
             device_id = handle_return(runtime.cudaGetDevice())
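
The added block is a check-then-lock form of one-time lazy initialization: the unlocked flag read keeps the hot path cheap, and because cuInit is idempotent, two threads racing past the check merely repeat a harmless call. A minimal standalone sketch of the same idiom, with a hypothetical _do_init standing in for driver.cuInit:

import threading

_lock = threading.Lock()
_initialized = False

def _do_init():
    # Stand-in for an idempotent call like driver.cuInit(0).
    print("init ran")

def _ensure_init():
    global _initialized
    if _initialized is False:      # fast path: no lock once the flag is set
        with _lock:
            _do_init()             # safe to repeat on a rare race
            _initialized = True

_ensure_init()  # prints "init ran"
_ensure_init()  # no output, no lock acquired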
@@ -948,27 +955,26 @@ def __new__(cls, device_id=None):
             raise ValueError(f"device_id must be within [0, {total}), got {device_id}")

         # ensure Device is singleton
-        with _tls_lock:
-            if not hasattr(_tls, "devices"):
-                total = handle_return(runtime.cudaGetDeviceCount())
-                _tls.devices = []
-                for dev_id in range(total):
-                    dev = super().__new__(cls)
-                    dev._id = dev_id
-                    # If the device is in TCC mode, or does not support memory pools for some other reason,
-                    # use the SynchronousMemoryResource which does not use memory pools.
-                    if (
-                        handle_return(
-                            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
-                        )
-                    ) == 1:
-                        dev._mr = _DefaultAsyncMempool(dev_id)
-                    else:
-                        dev._mr = _SynchronousMemoryResource(dev_id)
-
-                    dev._has_inited = False
-                    dev._properties = None
-                    _tls.devices.append(dev)
+        if not hasattr(_tls, "devices"):
+            total = handle_return(runtime.cudaGetDeviceCount())
+            _tls.devices = []
+            for dev_id in range(total):
+                dev = super().__new__(cls)
+                dev._id = dev_id
+                # If the device is in TCC mode, or does not support memory pools for some other reason,
+                # use the SynchronousMemoryResource which does not use memory pools.
+                if (
+                    handle_return(
+                        runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrMemoryPoolsSupported, 0)
+                    )
+                ) == 1:
+                    dev._mr = _DefaultAsyncMempool(dev_id)
+                else:
+                    dev._mr = _SynchronousMemoryResource(dev_id)
+
+                dev._has_inited = False
+                dev._properties = None
+                _tls.devices.append(dev)

         return _tls.devices[device_id]
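
Dropping the lock around this block is the "silly, redundant lock" fix: _tls is a threading.local, so each thread builds and sees its own devices list and no other thread can observe it mid-construction. A small self-contained sketch of that isolation:

import threading

_tls = threading.local()

def build_cache():
    # The hasattr check and assignment touch only this thread's _tls slot,
    # so no lock is needed even with many threads running build_cache.
    if not hasattr(_tls, "devices"):
        _tls.devices = [f"dev-{threading.current_thread().name}"]
    return _tls.devices

threads = [threading.Thread(target=build_cache, name=str(i)) for i in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(build_cache())  # the main thread gets its own, separate list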

@@ -1029,13 +1035,11 @@ def properties(self) -> DeviceProperties:
     @property
     def compute_capability(self) -> ComputeCapability:
         """Return a named tuple with 2 fields: major and minor."""
-        major = handle_return(
-            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMajor, self._id)
-        )
-        minor = handle_return(
-            runtime.cudaDeviceGetAttribute(runtime.cudaDeviceAttr.cudaDevAttrComputeCapabilityMinor, self._id)
-        )
-        return ComputeCapability(major, minor)
+        if "compute_capability" in self.properties._cache:
+            return self.properties._cache["compute_capability"]
+        cc = ComputeCapability(self.properties.compute_capability_major, self.properties.compute_capability_minor)
+        self.properties._cache["compute_capability"] = cc
+        return cc

     @property
     @precondition(_check_context_initialized)
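
The rewritten property routes both reads through DeviceProperties and then memoizes the assembled tuple in its _cache dict, so only the first access touches the CUDA runtime. The same compute-once, store-in-a-dict pattern in isolation (a sketch with made-up names, not the library's API):

# Generic sketch of the dict-backed memoization used above; the names
# here are illustrative, not part of cuda.core.
class Props:
    def __init__(self):
        self._cache = {}

    def _expensive_query(self, name):
        print(f"querying {name}")  # stands in for cudaDeviceGetAttribute
        return 9 if name == "major" else 0

    @property
    def compute_capability(self):
        if "compute_capability" in self._cache:
            return self._cache["compute_capability"]
        cc = (self._expensive_query("major"), self._expensive_query("minor"))
        self._cache["compute_capability"] = cc
        return cc

p = Props()
p.compute_capability  # prints twice: queries major and minor
p.compute_capability  # silent: served from the cache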

cuda_core/tests/conftest.py

Lines changed: 1 addition & 2 deletions
@@ -41,8 +41,7 @@ def _device_unset_current():
         return
     handle_return(driver.cuCtxPopCurrent())
     if hasattr(_device._tls, "devices"):
-        with _device._tls_lock:
-            del _device._tls.devices
+        del _device._tls.devices


 @pytest.fixture(scope="function")
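
The same thread-local reasoning applies to this test teardown: it deletes only the calling thread's cached devices, so the lock bought nothing. For context, a hypothetical sketch of how a helper like _device_unset_current is typically wired into a function-scoped fixture (the actual fixture body is outside this diff):

import pytest

def _device_unset_current():
    ...  # pops the CUDA context and deletes this thread's _tls.devices, as above

@pytest.fixture(scope="function")
def init_cuda():
    yield
    _device_unset_current()  # teardown runs after each test function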
