
Commit 178342c

Implement a workaround to gemm issue in OneMKL (#2082)
* Implement a workaround to gemm issue in OneMKL
* Fix codespell issue
* Enable w/a also for float dtype
* Add Battlemage G21 architecture to w/a
* Disable w/a for Arrow Lake
* Remove Lunar Lake architecture from the w/a
* Applied the pre-commit hooks
* Update dpnp/backend/extensions/blas/gemm.hpp
* Applied pre-commit black hook
* Add more clarification to the comment
* Remove excess semicolon
* Removed const keyword from review comment because ext_oneapi_architecture_is() isn't marked as const
* Applied review comment
* Updated the changelog

Co-authored-by: Oleksandr Pavlyk <[email protected]>
1 parent 9b37b02 commit 178342c
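The failure mode being worked around, per the commit message and the inline comment added in `dpnp_utils_linearalgebra.py` below, is an incorrect `dpnp.matmul` result on Lunar Lake and Battlemage G21 GPUs when an input is small and has a non-zero element offset into its USM buffer. A minimal sketch of that pattern (array names and shapes are illustrative, not taken from the failing eig/eigh tests):

```python
# Hypothetical repro pattern (shapes/values illustrative): a small,
# offset view used as a gemm input.
import dpnp

a = dpnp.arange(20, dtype=dpnp.float32).reshape(4, 5)
x = a[1:3, 1:3]  # 2x2 view: element offset 6, size 4 (< 16 elements)
y = dpnp.ones((2, 2), dtype=dpnp.float32)

# On the affected architectures this call could previously return wrong
# values; with this commit, x is copied to a fresh buffer first.
r = dpnp.matmul(x, y)
```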

File tree

5 files changed: +54 -0 lines


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -128,6 +128,7 @@ In addition, this release completes implementation of `dpnp.fft` module and adds
 * Fixed a crash in `dpnp.choose` caused by missing control of releasing temporary allocated device memory [#2063](https://github.com/IntelPython/dpnp/pull/2063)
 * Resolved compilation warning and error while building in debug mode [#2066](https://github.com/IntelPython/dpnp/pull/2066)
 * Fixed an issue with asynchronous execution in `dpnp.fft` module [#2067](https://github.com/IntelPython/dpnp/pull/2067)
+* Added a workaround to fix the incorrect result from `dpnp.matmul` computing on Lunar Lake or Arrow Lake Battlemage graphics [#2082](https://github.com/IntelPython/dpnp/pull/2082)

 ## [0.15.0] - 05/25/2024

dpnp/backend/extensions/blas/blas_py.cpp

Lines changed: 7 additions & 0 deletions
@@ -127,6 +127,13 @@ PYBIND11_MODULE(_blas_impl, m)
              py::arg("resultC"), py::arg("depends") = py::list());
    }

+    {
+        m.def("_is_lnl_bm_architecture", &blas_ns::_is_lnl_bm_architecture,
+              "Return ``True`` if SYCL device belongs to either Lunar Lake or "
+              "Battlemage G21 Intel GPU architecture",
+              py::arg("device"));
+    }
+
    {
        m.def("_gemm_batch", &blas_ns::gemm_batch,
              "Call `gemm_batch` from OneMKL BLAS library to compute "

dpnp/backend/extensions/blas/gemm.cpp

Lines changed: 16 additions & 0 deletions
@@ -323,6 +323,22 @@ std::tuple<sycl::event, sycl::event, bool>
     return std::make_tuple(args_ev, gemm_ev, is_row_major);
 }

+bool _is_lnl_bm_architecture(const sycl::device &dev)
+{
+#if !defined(USE_ONEMKL_CUBLAS)
+    namespace syclex = sycl::ext::oneapi::experimental;
+    const auto arch = dev.get_info<syclex::info::device::architecture>();
+    switch (arch) {
+    case syclex::architecture::intel_gpu_lnl_m: /* Lunar Lake */
+    case syclex::architecture::intel_gpu_bmg_g21: /* Battlemage G21 */
+        return true;
+    default:
+        return false;
+    }
+#endif // !defined(USE_ONEMKL_CUBLAS)
+    return false;
+}
+
 template <typename fnT, typename Tab, typename Tc>
 struct GemmContigFactory
 {

dpnp/backend/extensions/blas/gemm.hpp

Lines changed: 2 additions & 0 deletions
@@ -39,6 +39,8 @@ extern std::tuple<sycl::event, sycl::event, bool>
          const dpctl::tensor::usm_ndarray &resultC,
          const std::vector<sycl::event> &depends);

+extern bool _is_lnl_bm_architecture(const sycl::device &dev);
+
 extern std::tuple<sycl::event, sycl::event, bool>
     gemm_batch(sycl::queue &exec_q,
                const dpctl::tensor::usm_ndarray &matrixA,

dpnp/dpnp_utils/dpnp_utils_linearalgebra.py

Lines changed: 28 additions & 0 deletions
@@ -894,6 +894,34 @@ def dpnp_matmul(
             )
             _manager.add_event_pair(ht_ev, gemv_ev)
         elif call_flag == "gemm":
+            # MKLD-17976: due to known issue in OneMKL on Lunar Lake and
+            # Battlemage G21 Intel GPU architectures, it forces
+            # to implement a temporary workaround with extra copying of
+            # an input array in case when it has a small size and
+            # non-zero offset
+            # The issue was detected by failing tests for eig/eigh
+            # TODO: remove the workaround once OneMKL issue is resolved
+            if bi._is_lnl_bm_architecture(exec_q.get_sycl_device()):
+
+                def _need_to_copy(a):
+                    a_usm = dpnp.get_usm_ndarray(a)
+                    if a_usm._element_offset > 0 and a_usm.size < 16:
+                        return True
+                    return False
+
+                x1 = _copy_array(
+                    x1,
+                    copy_flag=_need_to_copy(x1),
+                    dtype=compute_dtype,
+                    order=res_order,
+                )
+                x2 = _copy_array(
+                    x2,
+                    copy_flag=_need_to_copy(x2),
+                    dtype=compute_dtype,
+                    order=res_order,
+                )
+
             result = _gemm_matmul(
                 exec_q,
                 x1,
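The `_need_to_copy` predicate keys off the underlying `usm_ndarray`'s private `_element_offset` attribute (the same one the diff itself relies on) and its total element count. A small sketch of a view that trips both conditions:

```python
import dpnp

a = dpnp.ones((4, 4), dtype=dpnp.float32)
v = a[1:3, 1:3]  # 2x2 view into a's buffer
usm = dpnp.get_usm_ndarray(v)

print(usm._element_offset)  # 5 -> non-zero offset into the USM allocation
print(usm.size)             # 4 -> under the 16-element threshold
# On Lunar Lake / Battlemage G21, such a view is copied before gemm.
```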
