Skip to content

Update a workaround to gemm issue in OneMKL #2096

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions dpnp/backend/extensions/blas/blas_py.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,13 @@ PYBIND11_MODULE(_blas_impl, m)
py::arg("device"));
}

{
m.def("_is_16_bytes_aligned", &blas_ns::_is_16_bytes_aligned,
"Return ``True`` if pointer on USM allocation has 16 bytes "
"alignment in memory",
py::arg("a"));
}

{
m.def("_gemm_batch", &blas_ns::gemm_batch,
"Call `gemm_batch` from OneMKL BLAS library to compute "
Expand Down
7 changes: 7 additions & 0 deletions dpnp/backend/extensions/blas/gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <pybind11/pybind11.h>

// dpctl tensor headers
#include "kernels/alignment.hpp"
#include "utils/memory_overlap.hpp"
#include "utils/output_validation.hpp"
#include "utils/type_utils.hpp"
Expand Down Expand Up @@ -339,6 +340,12 @@ bool _is_lnl_bm_architecture(const sycl::device &dev)
return false;
}

// Report whether the data pointer of the USM allocation backing ``a``
// starts on a 16-byte boundary in memory. Exposed to Python so the
// matmul workaround for MKLD-17976 can decide if an extra copy of an
// input array is required.
bool _is_16_bytes_aligned(const dpctl::tensor::usm_ndarray &a)
{
    using dpctl::tensor::kernels::alignment_utils::is_aligned;

    char *data_ptr = a.get_data();
    return is_aligned<16>(data_ptr);
}

template <typename fnT, typename Tab, typename Tc>
struct GemmContigFactory
{
Expand Down
1 change: 1 addition & 0 deletions dpnp/backend/extensions/blas/gemm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ extern std::tuple<sycl::event, sycl::event, bool>
const std::vector<sycl::event> &depends);

extern bool _is_lnl_bm_architecture(const sycl::device &dev);
extern bool _is_16_bytes_aligned(const dpctl::tensor::usm_ndarray &a);

extern std::tuple<sycl::event, sycl::event, bool>
gemm_batch(sycl::queue &exec_q,
Expand Down
36 changes: 24 additions & 12 deletions dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
Original file line number Diff line number Diff line change
Expand Up @@ -897,27 +897,19 @@ def dpnp_matmul(
# MKLD-17976: due to known issue in OneMKL on Lunar Lake and
# Battlemage G21 Intel GPU architectures, it forces
# to implement a temporary workaround with extra copying of
# an input array in case when it has a small size and
# non-zero offset
# The issue was detected by failing tests for eig/eigh
# an input array in case when it does not have 16 bytes
# alignment in the memory.
# TODO: remove the workaround once OneMKL issue is resolved
if bi._is_lnl_bm_architecture(exec_q.get_sycl_device()):

def _need_to_copy(a):
a_usm = dpnp.get_usm_ndarray(a)
if a_usm._element_offset > 0 and a_usm.size < 16:
return True
return False

x1 = _copy_array(
x1,
copy_flag=_need_to_copy(x1),
copy_flag=bi._is_16_bytes_aligned(x1),
dtype=compute_dtype,
order=res_order,
)
x2 = _copy_array(
x2,
copy_flag=_need_to_copy(x2),
copy_flag=bi._is_16_bytes_aligned(x2),
dtype=compute_dtype,
order=res_order,
)
Expand All @@ -929,6 +921,26 @@ def _need_to_copy(a):
result,
)
else: # call_flag == "gemm_batch"
# MKLD-17976: due to known issue in OneMKL on Lunar Lake and
# Battlemage G21 Intel GPU architectures, it forces
# to implement a temporary workaround with extra copying of
# an input array in case when it does not have 16 bytes
# alignment in the memory.
# TODO: remove the workaround once OneMKL issue is resolved
if bi._is_lnl_bm_architecture(exec_q.get_sycl_device()):
x1 = _copy_array(
x1,
copy_flag=bi._is_16_bytes_aligned(x1),
dtype=compute_dtype,
order=res_order,
)
x2 = _copy_array(
x2,
copy_flag=bi._is_16_bytes_aligned(x2),
dtype=compute_dtype,
order=res_order,
)

result = _gemm_batch_matmul(
exec_q,
x1,
Expand Down
26 changes: 26 additions & 0 deletions tests/test_mathematical.py
Original file line number Diff line number Diff line change
Expand Up @@ -3824,6 +3824,32 @@ def test_matmul_alias(self):
result2 = dpnp.linalg.matmul(a, b)
assert_array_equal(result1, result2)

@pytest.mark.parametrize(
    "sh1, sh2",
    [
        ((2, 3, 3), (3, 3)),
        ((3, 4, 4, 4), (4, 4, 4)),
    ],
    ids=["gemm", "gemm_batch"],
)
def test_matmul_with_offsets(self, sh1, sh2):
    # Indexing with ``[1]`` produces views with a non-zero offset into
    # the underlying allocation, which exercises the OneMKL alignment
    # workaround on both the gemm and gemm_batch code paths.
    size1 = numpy.prod(sh1, dtype=int)
    size2 = numpy.prod(sh2, dtype=int)
    a = numpy.random.randint(-5, 5, size1).reshape(sh1)
    b = numpy.random.randint(-5, 5, size2).reshape(sh2)
    ia = dpnp.array(a)
    ib = dpnp.array(b)

    # offset view @ full array, full array @ offset view, and
    # offset view @ offset view must all match the NumPy result
    cases = [
        (lambda: ia[1] @ ib, lambda: a[1] @ b),
        (lambda: ib @ ia[1], lambda: b @ a[1]),
        (lambda: ia[1] @ ia[1], lambda: a[1] @ a[1]),
    ]
    for get_result, get_expected in cases:
        result = get_result()
        expected = get_expected()
        assert_array_equal(result, expected)


class TestMatmulInvalidCases:
@pytest.mark.parametrize(
Expand Down
Loading