
Commit 11d3a94

Update the w/a based on input from OneMKL team
1 parent 48515c8 commit 11d3a94

File tree

dpnp/backend/extensions/blas/blas_py.cpp
dpnp/backend/extensions/blas/gemm.cpp
dpnp/backend/extensions/blas/gemm.hpp
dpnp/dpnp_utils/dpnp_utils_linearalgebra.py

4 files changed: +39 -12 lines changed

dpnp/backend/extensions/blas/blas_py.cpp

Lines changed: 7 additions & 0 deletions

@@ -134,6 +134,13 @@ PYBIND11_MODULE(_blas_impl, m)
             py::arg("device"));
     }
 
+    {
+        m.def("_is_16_bytes_aligned", &blas_ns::_is_16_bytes_aligned,
+              "Return ``True`` if pointer on USM allocation has 16 bytes "
+              "alignment in memory",
+              py::arg("a"));
+    }
+
     {
         m.def("_gemm_batch", &blas_ns::gemm_batch,
               "Call `gemm_batch` from OneMKL BLAS library to compute "

dpnp/backend/extensions/blas/gemm.cpp

Lines changed: 7 additions & 0 deletions

@@ -26,6 +26,7 @@
 #include <pybind11/pybind11.h>
 
 // dpctl tensor headers
+#include "kernels/alignment.hpp"
 #include "utils/memory_overlap.hpp"
 #include "utils/output_validation.hpp"
 #include "utils/type_utils.hpp"
@@ -339,6 +340,12 @@ bool _is_lnl_bm_architecture(const sycl::device &dev)
     return false;
 }
 
+bool _is_16_bytes_aligned(const dpctl::tensor::usm_ndarray &a)
+{
+    return dpctl::tensor::kernels::alignment_utils::is_aligned<16>(
+        a.get_data());
+}
+
 template <typename fnT, typename Tab, typename Tc>
 struct GemmContigFactory
 {
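The `is_aligned<16>` helper comes from dpctl's `kernels/alignment.hpp` header. As a rough sketch of what such a predicate boils down to (an illustration, not dpctl's exact implementation): a pointer is N-byte aligned exactly when its integer address is a multiple of N, which for power-of-two N is a single bitmask test.

```cpp
#include <cstdint>
#include <cstdlib>

// Sketch of a pointer-alignment predicate: the address must be a
// multiple of N; for power-of-two N the low bits must all be zero.
template <std::size_t N>
bool is_aligned(const void *ptr)
{
    static_assert(N != 0 && (N & (N - 1)) == 0, "N must be a power of two");
    return (reinterpret_cast<std::uintptr_t>(ptr) & (N - 1)) == 0;
}

int main()
{
    alignas(16) double buf[4] = {};
    bool head_ok = is_aligned<16>(buf);     // true: buffer starts aligned
    bool tail_ok = is_aligned<16>(buf + 1); // false: +8 bytes breaks it
    return (head_ok && !tail_ok) ? EXIT_SUCCESS : EXIT_FAILURE;
}
```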

dpnp/backend/extensions/blas/gemm.hpp

Lines changed: 1 addition & 0 deletions

@@ -40,6 +40,7 @@ extern std::tuple<sycl::event, sycl::event, bool>
                   const std::vector<sycl::event> &depends);
 
 extern bool _is_lnl_bm_architecture(const sycl::device &dev);
+extern bool _is_16_bytes_aligned(const dpctl::tensor::usm_ndarray &a);
 
 extern std::tuple<sycl::event, sycl::event, bool>
     gemm_batch(sycl::queue &exec_q,

dpnp/dpnp_utils/dpnp_utils_linearalgebra.py

Lines changed: 24 additions & 12 deletions

@@ -897,27 +897,19 @@ def dpnp_matmul(
             # MKLD-17976: due to known issue in OneMKL on Lunar Lake and
             # Battlemage G21 Intel GPU architectures, it forces
             # to implement a temporary workaround with extra copying of
-            # an input array in case when it has a small size and
-            # non-zero offset
-            # The issue was detected by failing tests for eig/eigh
+            # an input array in case when it does not have 16 bytes
+            # alignment in the memory.
             # TODO: remove the workaround once OneMKL issue is resolved
             if bi._is_lnl_bm_architecture(exec_q.get_sycl_device()):
-
-                def _need_to_copy(a):
-                    a_usm = dpnp.get_usm_ndarray(a)
-                    if a_usm._element_offset > 0 and a_usm.size < 16:
-                        return True
-                    return False
-
                 x1 = _copy_array(
                     x1,
-                    copy_flag=_need_to_copy(x1),
+                    copy_flag=not bi._is_16_bytes_aligned(x1),
                     dtype=compute_dtype,
                     order=res_order,
                 )
                 x2 = _copy_array(
                     x2,
-                    copy_flag=_need_to_copy(x2),
+                    copy_flag=not bi._is_16_bytes_aligned(x2),
                     dtype=compute_dtype,
                     order=res_order,
                 )
@@ -929,6 +921,26 @@ def _need_to_copy(a):
                 result,
             )
         else:  # call_flag == "gemm_batch"
+            # MKLD-17976: due to known issue in OneMKL on Lunar Lake and
+            # Battlemage G21 Intel GPU architectures, it forces
+            # to implement a temporary workaround with extra copying of
+            # an input array in case when it does not have 16 bytes
+            # alignment in the memory.
+            # TODO: remove the workaround once OneMKL issue is resolved
+            if bi._is_lnl_bm_architecture(exec_q.get_sycl_device()):
+                x1 = _copy_array(
+                    x1,
+                    copy_flag=not bi._is_16_bytes_aligned(x1),
+                    dtype=compute_dtype,
+                    order=res_order,
+                )
+                x2 = _copy_array(
+                    x2,
+                    copy_flag=not bi._is_16_bytes_aligned(x2),
+                    dtype=compute_dtype,
+                    order=res_order,
+                )
+
             result = _gemm_batch_matmul(
                 exec_q,
                 x1,
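The Python-side workaround copies an input only when its data pointer is misaligned. The underlying mechanics: a view that starts at a non-zero element offset inside an otherwise aligned allocation can land off a 16-byte boundary, while a fresh copy starts at the beginning of a new allocation. A small illustration, under the assumption (hedged in the comments) that fresh allocations from `operator new` are at least 16-byte aligned, as on typical 64-bit targets:

```cpp
#include <cstdint>
#include <vector>

// Returns true when ptr sits on a 16-byte boundary.
static bool aligned16(const void *ptr)
{
    return reinterpret_cast<std::uintptr_t>(ptr) % 16 == 0;
}

int main()
{
    // Fresh allocation: 16-byte aligned on typical 64-bit implementations,
    // where operator new returns memory aligned to alignof(std::max_align_t).
    std::vector<double> base(8, 0.0);
    const double *view = base.data() + 1; // 1-element (8-byte) offset view

    // The offset view is misaligned, so the workaround copies it into a
    // new allocation, whose data pointer is aligned again.
    std::vector<double> copy(view, view + 4);
    return (!aligned16(view) && aligned16(copy.data())) ? 0 : 1;
}
```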
