Skip to content

[SYCL][joint matrix] add implementation for prefetch and overloads of load/store on annotated pointers #12066

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Dec 21, 2023
Merged
6 changes: 6 additions & 0 deletions sycl/include/CL/__spirv/spirv_ops.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#ifdef __SYCL_DEVICE_ONLY__

extern __DPCPP_SYCL_EXTERNAL float __spirv_RoundFToTF32INTEL(float a);

template <typename T, typename Tp, std::size_t R, std::size_t C,
__spv::MatrixUse U,
__spv::MatrixLayout L = __spv::MatrixLayout::RowMajor,
Expand Down Expand Up @@ -139,6 +140,11 @@ extern __DPCPP_SYCL_EXTERNAL __spv::__spirv_JointMatrixINTEL<T, R, C, L, S, U> *
__spirv_VectorInsertDynamic(__spv::__spirv_JointMatrixINTEL<T, R, C, L, S, U> *,
Ts val, size_t i);

template <typename T, std::size_t NumRows, std::size_t NumCols>
extern __DPCPP_SYCL_EXTERNAL void __spirv_JointMatrixPrefetchINTEL(
T *Ptr, std::size_t coordX, std::size_t coordY, unsigned int CacheLevel,
__spv::MatrixLayout Layout, std::size_t Stride);

#ifndef __SPIRV_BUILTIN_DECLARATIONS__
#error \
"SPIR-V built-ins are not available. Please set -fdeclare-spirv-builtins flag."
Expand Down
45 changes: 45 additions & 0 deletions sycl/include/sycl/ext/oneapi/matrix/matrix-intel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,51 @@ joint_matrix_store(Group,
#endif // defined(__SYCL_DEVICE_ONLY__)
}

template <
typename Group, typename T, typename Tp,
sycl::ext::oneapi::experimental::matrix::use Use, size_t NumRows,
size_t NumCols, sycl::ext::oneapi::experimental::matrix::layout Layout,
typename PropertyListT,
std::enable_if_t<Use == sycl::ext::oneapi::experimental::matrix::use::a ||
Use == sycl::ext::oneapi::experimental::matrix::use::b,
bool> = true>
inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
Group,
const sycl::ext::oneapi::experimental::matrix::joint_matrix<
Group, Tp, Use, NumRows, NumCols, Layout> &src,
ext::oneapi::experimental::annotated_ptr<T, PropertyListT> dst,
size_t stride) {
#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__)
std::ignore = src;
std::ignore = dst;
std::ignore = stride;
throw runtime_error(
"This version of the matrix extension is only currently supported on "
"intel devices",
PI_ERROR_INVALID_DEVICE);
#else
// intel's impl
T *Ptr = dst.get();
__spirv_JointMatrixStoreINTEL<T, Tp, NumRows, NumCols,
sycl::ext::oneapi::experimental::matrix::
spv_matrix_use_traits<Use>::value,
sycl::ext::oneapi::experimental::matrix::
spv_matrix_layout_traits<Layout>::value>(
Ptr, src.spvm, stride,
sycl::ext::oneapi::experimental::matrix::spv_matrix_layout_traits<
Layout>::value,
sycl::ext::oneapi::experimental::matrix::spv_scope_traits<Group>::value);
#endif // defined(__NVPTX__)
#else
std::ignore = src;
std::ignore = dst;
std::ignore = stride;
throw runtime_error("joint matrix is not supported on host device.",
PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__)
}

template <typename Group, typename T,
sycl::ext::oneapi::experimental::matrix::use Use, size_t Rows,
size_t Cols, sycl::ext::oneapi::experimental::matrix::layout Layout,
Expand Down
15 changes: 15 additions & 0 deletions sycl/include/sycl/ext/oneapi/matrix/matrix-unified-utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,21 @@ convertMatrixUseStringToEnum(const char *UseString) {
}
return std::nullopt;
}

inline __SYCL_ALWAYS_INLINE __spv::MatrixLayout joint_matrix_layout_to_spv(
sycl::ext::oneapi::experimental::matrix::layout Layout) {
switch (Layout) {
case sycl::ext::oneapi::experimental::matrix::layout::row_major:
return __spv::MatrixLayout::RowMajor;
case sycl::ext::oneapi::experimental::matrix::layout::col_major:
return __spv::MatrixLayout::ColumnMajor;
case sycl::ext::oneapi::experimental::matrix::layout::ext_intel_packed:
return __spv::MatrixLayout::Packed;
case sycl::ext::oneapi::experimental::matrix::layout::dynamic:
return __spv::MatrixLayout::Dynamic;
}
}

} // namespace detail
} // namespace _V1
} // namespace sycl
226 changes: 168 additions & 58 deletions sycl/include/sycl/ext/oneapi/matrix/matrix-unified.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
#include <sycl/exception.hpp> // for runtime_error
#include <sycl/ext/oneapi/matrix/matrix-unified-utils.hpp> // for layout, use, tf32, convertMatrixUseEnumToString
#include <sycl/ext/oneapi/matrix/query-types.hpp> // for convertTypeToMatrixTypeString
#include <sycl/marray.hpp> // for marray
#include <sycl/multi_ptr.hpp> // for multi_ptr
#include <sycl/marray.hpp> // for marray
#include <sycl/multi_ptr.hpp> // for multi_ptr

#include <cstring> // for size_t, memcpy
#include <stdint.h> // for uint32_t
Expand Down Expand Up @@ -165,34 +165,12 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
std::ignore = sg;
using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
DecorT *Ptr = sycl::detail::getDecorated<DecorT>(src);
switch (Layout) {
default:
assert(false && "Invalid Memory Layout!");
case layout::row_major:
res.spvm = __spirv_JointMatrixLoadINTEL<
DecorT, S, NumRows, NumCols,
spv_matrix_use_traits<use::accumulator>::value,
spv_matrix_layout_traits<layout::dynamic>::value>(
Ptr, stride, __spv::MatrixLayout::RowMajor,
spv_scope_traits<Group>::value);
break;
case layout::col_major:
res.spvm = __spirv_JointMatrixLoadINTEL<
DecorT, S, NumRows, NumCols,
spv_matrix_use_traits<use::accumulator>::value,
spv_matrix_layout_traits<layout::dynamic>::value>(
Ptr, stride, __spv::MatrixLayout::ColumnMajor,
spv_scope_traits<Group>::value);
break;
case layout::ext_intel_packed:
res.spvm = __spirv_JointMatrixLoadINTEL<
DecorT, S, NumRows, NumCols,
spv_matrix_use_traits<use::accumulator>::value,
spv_matrix_layout_traits<layout::dynamic>::value>(
Ptr, stride, __spv::MatrixLayout::Packed,
spv_scope_traits<Group>::value);
break;
}
res.spvm = __spirv_JointMatrixLoadINTEL<
DecorT, S, NumRows, NumCols,
spv_matrix_use_traits<use::accumulator>::value,
spv_matrix_layout_traits<layout::dynamic>::value>(
Ptr, stride, sycl::detail::joint_matrix_layout_to_spv(Layout),
spv_scope_traits<Group>::value);
#endif // defined(__NVPTX__)
#else
std::ignore = sg;
Expand Down Expand Up @@ -250,6 +228,83 @@ joint_matrix_load(Group sg,
#endif // defined(__SYCL_DEVICE_ONLY__)
}

template <typename Group, typename S, typename T, size_t NumRows,
size_t NumCols, typename PropertyListT,
std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value,
bool> = true>
inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
Group sg,
joint_matrix<Group, S, use::accumulator, NumRows, NumCols,
sycl::ext::oneapi::experimental::matrix::layout::dynamic> &res,
ext::oneapi::experimental::annotated_ptr<T, PropertyListT> src,
size_t stride, sycl::ext::oneapi::experimental::matrix::layout Layout) {
#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__)
std::ignore = sg;
throw runtime_error("Use joint_matrix_load on multi_ptr on Nvidia device.",
PI_ERROR_INVALID_DEVICE);
#elif defined(__HIP_PLATFORM_AMD_MFMA__)
throw runtime_error("Use joint_matrix_load on multi_ptr on AMD device.",
PI_ERROR_INVALID_DEVICE);
#else
std::ignore = sg;
T *Ptr = src.get();
res.spvm = __spirv_JointMatrixLoadINTEL<
T, S, NumRows, NumCols, spv_matrix_use_traits<use::accumulator>::value,
spv_matrix_layout_traits<layout::dynamic>::value>(
Ptr, stride, sycl::detail::joint_matrix_layout_to_spv(Layout),
spv_scope_traits<Group>::value);
#endif // defined(__NVPTX__)
#else
std::ignore = sg;
std::ignore = res;
std::ignore = src;
std::ignore = stride;
std::ignore = Layout;
throw runtime_error("joint matrix is not supported on host device.",
PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__)
}

template <
typename Group, typename S, typename T, use Use, size_t NumRows,
size_t NumCols, matrix::layout Layout, typename PropertyListT,
std::enable_if_t<std::is_same<S, std::remove_const_t<T>>::value ||
(std::is_same<S, precision::tf32>::value &&
std::is_same<std::remove_const_t<T>, float>::value),
bool> = true>
inline __SYCL_ALWAYS_INLINE void joint_matrix_load(
Group sg, joint_matrix<Group, S, Use, NumRows, NumCols, Layout> &res,
ext::oneapi::experimental::annotated_ptr<T, PropertyListT> src,
size_t stride) {
#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__)
std::ignore = sg;
throw runtime_error("Use joint_matrix_load on multi_ptr on Nvidia device.",
PI_ERROR_INVALID_DEVICE);
#elif defined(__HIP_PLATFORM_AMD_MFMA__)
throw runtime_error("Use joint_matrix_load on multi_ptr on AMD device.",
PI_ERROR_INVALID_DEVICE);
#else
std::ignore = sg;
T *Ptr = src.get();
res.spvm =
__spirv_JointMatrixLoadINTEL<T, S, NumRows, NumCols,
spv_matrix_use_traits<Use>::value,
spv_matrix_layout_traits<Layout>::value>(
Ptr, stride, spv_matrix_layout_traits<Layout>::value,
spv_scope_traits<Group>::value);
#endif // defined(__NVPTX__)
#else
std::ignore = sg;
std::ignore = res;
std::ignore = src;
std::ignore = stride;
throw runtime_error("joint matrix is not supported on host device.",
PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__)
}

template <typename Group, typename T, size_t NumRows, size_t NumCols,
access::address_space Space, access::decorated IsDecorated>
inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
Expand All @@ -275,34 +330,49 @@ inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
std::ignore = sg;
using DecorT = typename sycl::detail::DecoratedType<T, Space>::type;
DecorT *Ptr = sycl::detail::getDecorated<DecorT>(dst);
switch (Layout) {
default:
assert(false && "Invalid Memory Layout!");
case layout::row_major:
__spirv_JointMatrixStoreINTEL<
DecorT, T, NumRows, NumCols,
spv_matrix_use_traits<use::accumulator>::value,
spv_matrix_layout_traits<layout::dynamic>::value>(
Ptr, src.spvm, stride, __spv::MatrixLayout::RowMajor,
spv_scope_traits<Group>::value);
break;
case layout::col_major:
__spirv_JointMatrixStoreINTEL<
DecorT, T, NumRows, NumCols,
spv_matrix_use_traits<use::accumulator>::value,
spv_matrix_layout_traits<layout::dynamic>::value>(
Ptr, src.spvm, stride, __spv::MatrixLayout::ColumnMajor,
spv_scope_traits<Group>::value);
break;
case layout::ext_intel_packed:
__spirv_JointMatrixStoreINTEL<
DecorT, T, NumRows, NumCols,
spv_matrix_use_traits<use::accumulator>::value,
spv_matrix_layout_traits<layout::dynamic>::value>(
Ptr, src.spvm, stride, __spv::MatrixLayout::Packed,
spv_scope_traits<Group>::value);
break;
}
__spirv_JointMatrixStoreINTEL<
DecorT, T, NumRows, NumCols,
spv_matrix_use_traits<use::accumulator>::value,
spv_matrix_layout_traits<layout::dynamic>::value>(
Ptr, src.spvm, stride, sycl::detail::joint_matrix_layout_to_spv(Layout),
spv_scope_traits<Group>::value);
#endif // defined(__NVPTX__)
#else
std::ignore = sg;
std::ignore = src;
std::ignore = dst;
std::ignore = stride;
std::ignore = Layout;
throw runtime_error("joint matrix is not supported on host device.",
PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__)
}

template <typename Group, typename T, size_t NumRows, size_t NumCols,
typename PropertyListT>
inline __SYCL_ALWAYS_INLINE void joint_matrix_store(
Group sg,
const joint_matrix<Group, T, use::accumulator, NumRows, NumCols,
sycl::ext::oneapi::experimental::matrix::layout::dynamic>
&src,
ext::oneapi::experimental::annotated_ptr<T, PropertyListT> dst,
size_t stride, sycl::ext::oneapi::experimental::matrix::layout Layout) {
#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__)
std::ignore = sg;
throw runtime_error("Use joint_matrix_store on multi_ptr on Nvidia device.",
PI_ERROR_INVALID_DEVICE);
#elif defined(__HIP_PLATFORM_AMD_MFMA__)
throw runtime_error("Use joint_matrix_store on multi_ptr on AMD device.",
PI_ERROR_INVALID_DEVICE);
#else
std::ignore = sg;
T *Ptr = dst.get();
__spirv_JointMatrixStoreINTEL<
T, T, NumRows, NumCols, spv_matrix_use_traits<use::accumulator>::value,
spv_matrix_layout_traits<layout::dynamic>::value>(
Ptr, src.spvm, stride, sycl::detail::joint_matrix_layout_to_spv(Layout),
spv_scope_traits<Group>::value);
#endif // defined(__NVPTX__)
#else
std::ignore = sg;
Expand Down Expand Up @@ -429,6 +499,46 @@ inline __SYCL_ALWAYS_INLINE float round_to_tf32(const float &a) {
return ret;
#endif // defined(__SYCL_DEVICE_ONLY__)
}

template <size_t NumRows, size_t NumCols, typename Group, typename T,
typename Properties = ext::oneapi::experimental::empty_properties_t>
inline __SYCL_ALWAYS_INLINE void
joint_matrix_prefetch(Group sg, T *Ptr, size_t stride,
sycl::ext::oneapi::experimental::matrix::layout Layout,
Properties properties = {}) {
#if defined(__SYCL_DEVICE_ONLY__)
#if defined(__NVPTX__)
std::ignore = sg;
std::ignore = properties;
throw runtime_error(
"joint_matrix_prefetch is not supported on Nvidia device.",
PI_ERROR_INVALID_DEVICE);
#elif defined(__HIP_PLATFORM_AMD_MFMA__)
std::ignore = sg;
std::ignore = properties;
throw runtime_error("joint_matrix_prefetch is not supported on AMD device.",
PI_ERROR_INVALID_DEVICE);
#else
std::ignore = sg;
auto prop = properties.template get_property<prefetch_hint_key>();
// Will be removed once SPIRV implementation also uses offsetpointer
size_t coordX = 0;
size_t coordY = 0;
__spirv_JointMatrixPrefetchINTEL<T, NumRows, NumCols>(
Ptr, coordX, coordY, detail::PropertyMetaInfo<decltype(prop)>::value,
sycl::detail::joint_matrix_layout_to_spv(Layout), stride);
#endif // defined(__NVPTX__)
#else
std::ignore = sg;
std::ignore = Ptr;
std::ignore = stride;
std::ignore = Layout;
std::ignore = properties;
throw runtime_error("joint matrix is not supported on host device.",
PI_ERROR_INVALID_DEVICE);
#endif // defined(__SYCL_DEVICE_ONLY__)
}

} // namespace matrix
} // namespace experimental
} // namespace oneapi
Expand Down
21 changes: 21 additions & 0 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_annotated_ptr.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
//==-------- joint_matrix_annotated_ptr.cpp - DPC++ joint_matrix-----------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: matrix

// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// Currently row major B fails when annotated_ptr is used
// XFAIL: gpu

#include "../common.hpp"

#define SG_SZ 32
constexpr size_t TN = 16;

#include "../joint_matrix_annotated_ptr_impl.hpp"
17 changes: 17 additions & 0 deletions sycl/test-e2e/Matrix/SG32/joint_matrix_prefetch.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
//==-------- joint_matrix_prefetch.cpp - DPC++ joint_matrix----------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// RUN: %{build} -o %t.out
// RUN: %{run} %t.out

// XFAIL:*

#include "../common.hpp"

#define SG_SZ 32
constexpr size_t TN = 16;
#include "../joint_matrix_prefetch_impl.hpp"
Loading