Workaround for dpnp.linalg.qr() to run on CUDA (#2265)

vlad-perevezentsev · web-flow · commit 807179a202b5 · 2025-01-20T14:40:13.000+01:00
This PR adds a workaround that waits for the host task after calling `geqrf`, to avoid a race condition caused by an issue in oneMath (uxlfoundation/oneMath#626). It also updates the tests by removing old skips and adding `test_qr_large` to `TestQr`.
diff --git a/dpnp/linalg/dpnp_utils_linalg.py b/dpnp/linalg/dpnp_utils_linalg.py
@@ -397,7 +397,14 @@ def _batched_qr(a, mode="reduced"):
         batch_size,
         depends=[copy_ev],
     )
-    _manager.add_event_pair(ht_ev, geqrf_ev)
+
+    # Workaround to avoid a race condition on CUDA during multiple runs
+    # TODO: Remove it once the oneMath issue is resolved
+    # https://github.com/uxlfoundation/oneMath/issues/626
+    if dpnp.is_cuda_backend(a_sycl_queue):
+        ht_ev.wait()
+    else:
+        _manager.add_event_pair(ht_ev, geqrf_ev)
 
     if mode in ["r", "raw"]:
         if mode == "r":
@@ -2468,7 +2475,14 @@ def dpnp_qr(a, mode="reduced"):
     ht_ev, geqrf_ev = li._geqrf(
         a_sycl_queue, a_t.get_array(), tau_h.get_array(), depends=[copy_ev]
     )
-    _manager.add_event_pair(ht_ev, geqrf_ev)
+
+    # Workaround to avoid a race condition on CUDA during multiple runs
+    # TODO: Remove it once the oneMath issue is resolved
+    # https://github.com/uxlfoundation/oneMath/issues/626
+    if dpnp.is_cuda_backend(a_sycl_queue):
+        ht_ev.wait()
+    else:
+        _manager.add_event_pair(ht_ev, geqrf_ev)
 
     if mode in ["r", "raw"]:
         if mode == "r":
diff --git a/dpnp/tests/test_linalg.py b/dpnp/tests/test_linalg.py
@@ -2380,12 +2380,6 @@ class TestQr:
     )
     @pytest.mark.parametrize("mode", ["r", "raw", "complete", "reduced"])
     def test_qr(self, dtype, shape, mode):
-        if (
-            is_cuda_device()
-            and mode in ["complete", "reduced"]
-            and shape in [(16, 16), (2, 2, 4)]
-        ):
-            pytest.skip("SAT-7589")
         a = generate_random_numpy_array(shape, dtype, seed_value=81)
         ia = dpnp.array(a)
 
@@ -2398,24 +2392,48 @@ def test_qr(self, dtype, shape, mode):
 
             # check decomposition
             if mode in ("complete", "reduced"):
-                if a.ndim == 2:
-                    assert_almost_equal(
-                        dpnp.dot(dpnp_q, dpnp_r),
-                        a,
-                        decimal=5,
-                    )
-                else:  # a.ndim > 2
-                    assert_almost_equal(
-                        dpnp.matmul(dpnp_q, dpnp_r),
-                        a,
-                        decimal=5,
-                    )
+                assert_almost_equal(
+                    dpnp.matmul(dpnp_q, dpnp_r),
+                    a,
+                    decimal=5,
+                )
             else:  # mode=="raw"
                 assert_dtype_allclose(dpnp_q, np_q)
 
         if mode in ("raw", "r"):
             assert_dtype_allclose(dpnp_r, np_r)
 
+    @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize(
+        "shape",
+        [(32, 32), (8, 16, 16)],
+        ids=[
+            "(32, 32)",
+            "(8, 16, 16)",
+        ],
+    )
+    @pytest.mark.parametrize("mode", ["r", "raw", "complete", "reduced"])
+    def test_qr_large(self, dtype, shape, mode):
+        a = generate_random_numpy_array(shape, dtype, seed_value=81)
+        ia = dpnp.array(a)
+        if mode == "r":
+            np_r = numpy.linalg.qr(a, mode)
+            dpnp_r = dpnp.linalg.qr(ia, mode)
+        else:
+            np_q, np_r = numpy.linalg.qr(a, mode)
+            dpnp_q, dpnp_r = dpnp.linalg.qr(ia, mode)
+            # check decomposition
+            if mode in ("complete", "reduced"):
+                assert_almost_equal(
+                    dpnp.matmul(dpnp_q, dpnp_r),
+                    a,
+                    decimal=5,
+                )
+            else:  # mode=="raw"
+                assert_allclose(np_q, dpnp_q, atol=1e-4)
+        if mode in ("raw", "r"):
+            assert_allclose(np_r, dpnp_r, atol=1e-4)
+
     @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True))
     @pytest.mark.parametrize(
         "shape",
diff --git a/dpnp/tests/third_party/cupy/linalg_tests/test_decomposition.py b/dpnp/tests/third_party/cupy/linalg_tests/test_decomposition.py
@@ -163,14 +163,7 @@ def test_decomposition(self, dtype):
 class TestQRDecomposition(unittest.TestCase):
 
     @testing.for_dtypes("fdFD")
-    # skip cases with 'complete' and 'reduce' modes on CUDA (SAT-7611)
     def check_mode(self, array, mode, dtype):
-        if (
-            is_cuda_device()
-            and array.size > 0
-            and mode in ["complete", "reduced"]
-        ):
-            return
         a_cpu = numpy.asarray(array, dtype=dtype)
         a_gpu = cupy.asarray(array, dtype=dtype)
         result_gpu = cupy.linalg.qr(a_gpu, mode=mode)