
Commit 65adc39

Merge 7311ae3 into 90d67f5 (2 parents: 90d67f5 + 7311ae3)

File tree

4 files changed: +69 −0 lines changed

  dpnp/backend/extensions/blas/blas_py.cpp
  dpnp/backend/extensions/blas/gemm.cpp
  dpnp/backend/extensions/blas/gemm.hpp
  dpnp/dpnp_utils/dpnp_utils_linearalgebra.py


dpnp/backend/extensions/blas/blas_py.cpp

Lines changed: 7 additions & 0 deletions
@@ -127,6 +127,13 @@ PYBIND11_MODULE(_blas_impl, m)
               py::arg("resultC"), py::arg("depends") = py::list());
     }
 
+    {
+        m.def("_is_lnl_arl_architecture", &blas_ns::_is_lnl_arl_architecture,
+              "Return ``True`` if SYCL device belongs to either Lunar Lake or "
+              "Arrow Lake architecture",
+              py::arg("device"));
+    }
+
     {
         m.def("_gemm_batch", &blas_ns::gemm_batch,
               "Call `gemm_batch` from OneMKL BLAS library to compute "

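The new _is_lnl_arl_architecture binding is reachable from Python through the _blas_impl extension module. A minimal sketch of a direct call, assuming a built dpnp where the module is importable from its source-tree path (dpnp.backend.extensions.blas._blas_impl, aliased bi as in the Python diff further below) and dpctl is available:

    import dpctl
    import dpnp.backend.extensions.blas._blas_impl as bi

    # Probe the default SYCL device; the binding accepts a dpctl.SyclDevice
    dev = dpctl.SyclDevice()
    print(bi._is_lnl_arl_architecture(dev))  # True only on LNL/ARL GPUs
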
dpnp/backend/extensions/blas/gemm.cpp

Lines changed: 35 additions & 0 deletions
@@ -323,6 +323,41 @@ std::tuple<sycl::event, sycl::event, bool>
     return std::make_tuple(args_ev, gemm_ev, is_row_major);
 }
 
+bool _is_lnl_arl_architecture(sycl::device &dev)
+{
+#if !defined(USE_ONEMKL_CUBLAS)
+    if (dev.ext_oneapi_architecture_is(
+            sycl::ext::oneapi::experimental::architecture::
+                intel_gpu_20_4_4)) /* Lunar Lake */
+    {
+        std::cout << "_is_lnl_arl_architecture: running on Lunar Lake" << std::endl;
+        return true;
+    }
+    else if (dev.ext_oneapi_architecture_is(
+                 sycl::ext::oneapi::experimental::architecture::
+                     intel_gpu_12_74_4)) /* Arrow Lake H */
+    {
+        std::cout << "_is_lnl_arl_architecture: running on Arrow Lake H" << std::endl;
+        return true;
+    }
+    else if (dev.ext_oneapi_architecture_is(
+                 sycl::ext::oneapi::experimental::architecture::
+                     intel_gpu_arl_u)) /* Arrow Lake U */
+    {
+        std::cout << "_is_lnl_arl_architecture: running on Arrow Lake U" << std::endl;
+        return true;
+    }
+    else if (dev.ext_oneapi_architecture_is(
+                 sycl::ext::oneapi::experimental::architecture::
+                     intel_gpu_arl_s)) /* Arrow Lake S */
+    {
+        std::cout << "_is_lnl_arl_architecture: running on Arrow Lake S" << std::endl;
+        return true;
+    }
+#endif // !defined(USE_ONEMKL_CUBLAS)
+    return false;
+}
+
 template <typename fnT, typename Tab, typename Tc>
 struct GemmContigFactory
 {

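Since the whole body is compiled out when USE_ONEMKL_CUBLAS is defined, the function always returns false on CUDA builds, so the workaround below never triggers there. A hedged sketch for checking which of the available GPUs would take the workaround path, again assuming dpctl and the built _blas_impl module:

    import dpctl
    import dpnp.backend.extensions.blas._blas_impl as bi

    # List every GPU and whether it is a Lunar Lake / Arrow Lake (H/U/S) part
    for dev in dpctl.get_devices(device_type="gpu"):
        print(dev.name, bi._is_lnl_arl_architecture(dev))
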
dpnp/backend/extensions/blas/gemm.hpp

Lines changed: 2 additions & 0 deletions
@@ -39,6 +39,8 @@ extern std::tuple<sycl::event, sycl::event, bool>
                const dpctl::tensor::usm_ndarray &resultC,
                const std::vector<sycl::event> &depends);
 
+extern bool _is_lnl_arl_architecture(sycl::device &dev);
+
 extern std::tuple<sycl::event, sycl::event, bool>
     gemm_batch(sycl::queue &exec_q,
                const dpctl::tensor::usm_ndarray &matrixA,

dpnp/dpnp_utils/dpnp_utils_linearalgebra.py

Lines changed: 25 additions & 0 deletions
@@ -894,6 +894,31 @@ def dpnp_matmul(
             )
             _manager.add_event_pair(ht_ev, gemv_ev)
         elif call_flag == "gemm":
+            # MKLD-17976: due to a known issue in OneMKL on Lunar Lake and
+            # Arrow Lake architectures, a temporary workaround is required:
+            # an input array has to be copied when it has a small size and
+            # a non-zero offset
+            # TODO: remove the workaround once the OneMKL issue is resolved
+            if bi._is_lnl_arl_architecture(exec_q.get_sycl_device()):
+                def _need_to_copy(a):
+                    a_usm = dpnp.get_usm_ndarray(a)
+                    if a_usm._element_offset > 0 and a_usm.size < 16:
+                        return True
+                    return False
+
+                x1 = _copy_array(
+                    x1,
+                    copy_flag=_need_to_copy(x1),
+                    dtype=compute_dtype,
+                    order=res_order,
+                )
+                x2 = _copy_array(
+                    x2,
+                    copy_flag=_need_to_copy(x2),
+                    dtype=compute_dtype,
+                    order=res_order,
+                )
+
             result = _gemm_matmul(
                 exec_q,
                 x1,

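The predicate above keys off two properties of the underlying dpctl usm_ndarray: a non-zero element offset into its USM allocation and a size under 16 elements. A small sketch of the same condition on a standalone sliced view, assuming dpctl.tensor is available (_element_offset is the same attribute _need_to_copy reads):

    import dpctl.tensor as dpt

    a = dpt.arange(20, dtype="f4")
    view = a[5:9]  # 4-element view starting 5 elements into the buffer

    # Mirrors _need_to_copy: a small view with a non-zero offset gets copied
    print(view._element_offset > 0 and view.size < 16)  # True
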
0 commit comments