
Backport gh-2101 #2108


Merged: 2 commits, Oct 14, 2024
3 changes: 1 addition & 2 deletions CHANGELOG.md
@@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

-## [0.16.0] - 09/DD/2024
+## [0.16.0] - 10/14/2024

This release reaches an important milestone by making offloading fully asynchronous. Calls to `dpnp` submit tasks for execution to the DPC++ runtime and return without waiting for those tasks to finish; the sequential semantics a user expects from executing a Python script are nevertheless preserved (see the sketch after this diff).
In addition, this release completes the implementation of the `dpnp.fft` module and adds several new array manipulation, indexing and elementwise routines. Moreover, it adds support for building `dpnp` for Nvidia GPUs.
@@ -120,7 +120,6 @@ In addition, this release completes implementation of `dpnp.fft` module and adds
* Fixed a crash in `dpnp.choose` caused by missing control of releasing temporary allocated device memory [#2063](https://github.com/IntelPython/dpnp/pull/2063)
* Resolved compilation warning and error while building in debug mode [#2066](https://github.com/IntelPython/dpnp/pull/2066)
* Fixed an issue with asynchronous execution in `dpnp.fft` module [#2067](https://github.com/IntelPython/dpnp/pull/2067)
-* Added a workaround to fix the incorrect result from `dpnp.matmul` computing on Lunar Lake or Arrow Lake Battlemage graphics [#2082](https://github.com/IntelPython/dpnp/pull/2082)

## [0.15.0] - 05/25/2024

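The asynchronous-offloading note in the changelog above is the headline change of 0.16.0. A minimal sketch of the user-visible behavior, assuming a `dpnp` build from this release and an available SYCL device (shapes and values here are illustrative only):

```python
import dpnp

# Each call submits its work to the DPC++ runtime and returns without
# waiting; dependencies between submitted tasks keep results correct.
a = dpnp.ones((512, 512))
b = dpnp.ones((512, 512))
c = dpnp.matmul(a, b)  # may return before the GEMM has finished

# Transferring data back to the host waits for the pending tasks,
# so the sequential semantics of the script are preserved.
print(dpnp.asnumpy(c)[0, 0])  # 512.0
```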
7 changes: 0 additions & 7 deletions dpnp/backend/extensions/blas/blas_py.cpp
@@ -127,13 +127,6 @@ PYBIND11_MODULE(_blas_impl, m)
py::arg("resultC"), py::arg("depends") = py::list());
}

{
m.def("_is_lnl_bm_architecture", &blas_ns::_is_lnl_bm_architecture,
"Return ``True`` if SYCL device belongs to either Lunar Lake or "
"Battlemage G21 Intel GPU architecture",
py::arg("device"));
}

{
m.def("_gemm_batch", &blas_ns::gemm_batch,
"Call `gemm_batch` from OneMKL BLAS library to compute "
16 changes: 0 additions & 16 deletions dpnp/backend/extensions/blas/gemm.cpp
@@ -323,22 +323,6 @@ std::tuple<sycl::event, sycl::event, bool>
    return std::make_tuple(args_ev, gemm_ev, is_row_major);
}

-bool _is_lnl_bm_architecture(const sycl::device &dev)
-{
-#if !defined(USE_ONEMKL_CUBLAS)
-    namespace syclex = sycl::ext::oneapi::experimental;
-    const auto arch = dev.get_info<syclex::info::device::architecture>();
-    switch (arch) {
-    case syclex::architecture::intel_gpu_lnl_m: /* Lunar Lake */
-    case syclex::architecture::intel_gpu_bmg_g21: /* Battlemage G21 */
-        return true;
-    default:
-        return false;
-    }
-#endif // !defined(USE_ONEMKL_CUBLAS)
-    return false;
-}
-
template <typename fnT, typename Tab, typename Tc>
struct GemmContigFactory
{
2 changes: 0 additions & 2 deletions dpnp/backend/extensions/blas/gemm.hpp
@@ -39,8 +39,6 @@ extern std::tuple<sycl::event, sycl::event, bool>
         const dpctl::tensor::usm_ndarray &resultC,
         const std::vector<sycl::event> &depends);

-extern bool _is_lnl_bm_architecture(const sycl::device &dev);
-
extern std::tuple<sycl::event, sycl::event, bool>
    gemm_batch(sycl::queue &exec_q,
               const dpctl::tensor::usm_ndarray &matrixA,
28 changes: 0 additions & 28 deletions dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -894,34 +894,6 @@ def dpnp_matmul(
            )
            _manager.add_event_pair(ht_ev, gemv_ev)
        elif call_flag == "gemm":
-            # MKLD-17976: due to known issue in OneMKL on Lunar Lake and
-            # Battlemage G21 Intel GPU architectures, it forces
-            # to implement a temporary workaround with extra copying of
-            # an input array in case when it has a small size and
-            # non-zero offset
-            # The issue was detected by failing tests for eig/eigh
-            # TODO: remove the workaround once OneMKL issue is resolved
-            if bi._is_lnl_bm_architecture(exec_q.get_sycl_device()):
-
-                def _need_to_copy(a):
-                    a_usm = dpnp.get_usm_ndarray(a)
-                    if a_usm._element_offset > 0 and a_usm.size < 16:
-                        return True
-                    return False
-
-                x1 = _copy_array(
-                    x1,
-                    copy_flag=_need_to_copy(x1),
-                    dtype=compute_dtype,
-                    order=res_order,
-                )
-                x2 = _copy_array(
-                    x2,
-                    copy_flag=_need_to_copy(x2),
-                    dtype=compute_dtype,
-                    order=res_order,
-                )
-
            result = _gemm_matmul(
                exec_q,
                x1,
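For context on the code removed above: the workaround copied a `gemm` input only when it was a small view into a larger USM allocation, i.e. when the underlying `usm_ndarray` had a non-zero element offset and fewer than 16 elements. A minimal sketch of an input that would have met both conditions (the threshold and attribute names are taken directly from the removed `_need_to_copy` helper):

```python
import dpnp

x = dpnp.arange(12, dtype="f8").reshape(4, 3)
view = x[1:]  # shares x's USM allocation, starting 3 elements in

usm = dpnp.get_usm_ndarray(view)
print(usm._element_offset)  # 3: non-zero offset into the allocation
print(usm.size)             # 9: fewer than 16 elements

# On Lunar Lake / Battlemage G21, the removed code would have copied
# this view into a fresh allocation before calling OneMKL gemm.
```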
18 changes: 18 additions & 0 deletions tests/test_mathematical.py
@@ -3824,6 +3824,24 @@ def test_matmul_alias(self):
        result2 = dpnp.linalg.matmul(a, b)
        assert_array_equal(result1, result2)
+
+    @pytest.mark.parametrize(
+        "sh1, sh2",
+        [
+            ((2, 3, 3), (2, 3, 3)),
+            ((3, 3, 3, 3), (3, 3, 3, 3)),
+        ],
+        ids=["gemm", "gemm_batch"],
+    )
+    def test_matmul_with_offsets(self, sh1, sh2):
+        size1, size2 = numpy.prod(sh1, dtype=int), numpy.prod(sh2, dtype=int)
+        a = numpy.random.randint(-5, 5, size1).reshape(sh1).astype("f8")
+        b = numpy.random.randint(-5, 5, size2).reshape(sh2).astype("f8")
+        ia, ib = dpnp.array(a), dpnp.array(b)
+
+        result = ia[1] @ ib[1]
+        expected = a[1] @ b[1]
+        assert_array_equal(result, expected)


class TestMatmulInvalidCases:
    @pytest.mark.parametrize(