[SWDEV-477447] Set _HAS_PYNVML to false if amdsmi not installed (#1535)

jataylo · web-flow · commit 470272cc371e · 2024-08-19T06:41:50.000-07:00
Fix from pytorch#132990, cherry-picked into 6.2 to be resolved for 6.2.1. """ This fixes a bug that was recently encountered in ROCm/DeepSpeed. Currently, if a library installs pynvml and runs on ROCm, PyTorch will break: _HAS_PYNVML is set to True, so it will attempt to use the amdsmi library for the device_count call, even though amdsmi is not installed. This fix sets _HAS_PYNVML to False on ROCm if amdsmi is not installed. """ This will also need to be cherry-picked to release/2.3 and rocm6.3_internal_testing.
diff --git a/torch/cuda/__init__.py b/torch/cuda/__init__.py
@@ -54,13 +54,10 @@
 _PYNVML_ERR = None
 try:
     try:
-        import pynvml  # type: ignore[import]
-
-        _HAS_PYNVML = True
-    except ModuleNotFoundError:
-        pass
-    try:
-        import amdsmi  # type: ignore[import]
+        if not torch.version.hip:
+            import pynvml  # type: ignore[import]
+        else:
+            import amdsmi  # type: ignore[import]
 
         _HAS_PYNVML = True
     except ModuleNotFoundError: