
[SYCL][ESIMD] Add 8/16-bit type support to lsc_block_load/store #6757

Merged: 14 commits, Sep 22, 2022
126 changes: 97 additions & 29 deletions sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -474,8 +474,16 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
/// Supported platforms: DG2, PVC
/// VISA instruction: lsc_load.ugm
///
/// Collects elements located at specified address and returns them
/// as a single \ref simd object.
/// Accesses contiguous block of memory of `NElts * S` bytes starting from
/// given address, where S is a byte size of an "element" defined by the \c DS
/// template parameter. The maximum size of accessed block is 512 bytes for PVC
/// and 256 bytes for ACM (DG2).
/// When \c DS equals \c lsc_data_size::u64, the address must be 8-byte aligned,
/// otherwise - 4-bytes aligned. Allowed values for the data size are
/// \c lsc_data_size::u32 and \c lsc_data_size::u64. Allowed NElts values are
/// 1, 2, 3, 4, 8, 16, 32, 64.
/// Note that to access 512 bytes, DS must be \c lsc_data_size::u64 and \c NElts
/// must be 64.
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to load per address.
@@ -492,22 +500,34 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API __ESIMD_NS::simd<T, NElts>
lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) {
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed load is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(),
addrs.data());
constexpr int SmallIntFactor =
Review comment from @kbobrovs (Contributor), Sep 21, 2022:

Please extend documentation for the lsc_block_load/store and make sure checks verify the constraints:

/// Accesses contiguous block of memory of `NElts * S` bytes  starting from
/// given address, where S is a byte size of an "element" defined by the \c DS
/// template parameter. The maximum size of accessed block is 512 bytes for PVC
/// and 256 bytes for ACM (DG2).
/// When \c DS equals \c lsc_data_size::u64, the address must be 8-byte aligned,
/// otherwise - 4-bytes aligned. Allowed values for the data size are
/// \c lsc_data_size::u32 and \c lsc_data_size::u64. Allowed NElts values are
/// 1, 2, 3, 4, 8, 16, 32, 64.
/// Note that to access 512 bytes, DS must be \c lsc_data_size::u64 and \c NElts
/// must be 64.

(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
static_assert(NElts % SmallIntFactor == 0,
"Number of elements is not supported by Transposed load");

detail::check_lsc_vector_size<NElts / SmallIntFactor>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
if constexpr (SmallIntFactor == 1) {
return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset,
_DS, _VS, _Transposed, N>(pred.data(),
addrs.data());
} else {
__ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> result =
__esimd_lsc_load_stateless<uint32_t, L1H, L3H, _AddressScale,
_ImmOffset, lsc_data_size::u32, _VS,
_Transposed, N>(pred.data(), addrs.data());
return result.template bit_cast_view<T>();
}
}
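
For context, here is a minimal usage sketch of the new 8/16-bit path. This is illustrative only, not part of the patch: the queue setup, function and pointer names, and include set are assumptions, and the pointers are assumed to be USM device allocations.

#include <sycl/sycl.hpp>
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/ext/intel/experimental/esimd/memory.hpp>

namespace esimd = sycl::ext::intel::esimd;
namespace exp_esimd = sycl::ext::intel::experimental::esimd;

// Copies 32 uint16_t elements (64 bytes) with one transposed LSC load and
// one store. With _DS finalized to u16, SmallIntFactor == 2, so internally
// this becomes a 16-element u32 load whose result is bit-cast back to
// 32 x uint16_t. NElts must be divisible by SmallIntFactor, and
// NElts / SmallIntFactor must be a legal LSC vector size
// (1, 2, 3, 4, 8, 16, 32, 64).
void copy_block_u16(sycl::queue &q, const uint16_t *src, uint16_t *dst) {
  q.single_task([=]() SYCL_ESIMD_KERNEL {
     esimd::simd<uint16_t, 32> data =
         exp_esimd::lsc_block_load<uint16_t, 32>(src);
     exp_esimd::lsc_block_store<uint16_t, 32>(dst, data);
   }).wait();
}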

/// Accessor-based transposed gather with 1 channel.
@@ -516,6 +536,8 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) {
///
/// Collects elements located at surface and returns them
/// as a single \ref simd object.
/// See comments in the \ref lsc_block_load API for description and parameter
/// constraints.
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to load per address.
@@ -541,22 +563,36 @@ lsc_block_load(AccessorTy acc, uint32_t offset,
return lsc_block_load<T, NElts, DS, L1H, L3H>(
__ESIMD_DNS::accessorToPointer<T>(acc, offset), pred);
#else
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed load is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uint32_t, N> offsets = offset;
auto si = __ESIMD_NS::get_surface_index(acc);
return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(), si);
constexpr int SmallIntFactor =
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
static_assert(NElts % SmallIntFactor == 0,
"Number of elements is not supported by Transposed load");
detail::check_lsc_vector_size<NElts / SmallIntFactor>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / SmallIntFactor>();

if constexpr (SmallIntFactor == 1) {
return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(),
offsets.data(), si);
} else {

__ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> result =
__esimd_lsc_load_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), offsets.data(), si);
return result.template bit_cast_view<T>();
}
#endif
}
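
A sketch of the accessor-based form, using the namespace aliases from the earlier sketch (buffer and queue setup assumed). Note that the offset is in bytes, consistent with the accessorToPointer<T>(acc, offset) forwarding used on the stateless path above.

// Reads 64 uint8_t elements at byte offset 0 and writes them back at byte
// offset 64. SmallIntFactor == 4 here, so the BTI load/store is issued as
// 16 x u32 and bit-cast to/from 64 x uint8_t.
sycl::buffer<uint8_t, 1> buf{sycl::range<1>(128)}; // 128-byte buffer, assumed
q.submit([&](sycl::handler &cgh) {
  sycl::accessor acc{buf, cgh, sycl::read_write};
  cgh.single_task([=]() SYCL_ESIMD_KERNEL {
    esimd::simd<uint8_t, 64> v =
        exp_esimd::lsc_block_load<uint8_t, 64>(acc, 0);
    exp_esimd::lsc_block_store<uint8_t, 64>(acc, 64, v);
  });
});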

@@ -622,6 +658,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();

static_assert(
_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed prefetch is supported only for data size u32 or u64");
@@ -630,6 +667,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd_mask<N> pred = 1;

__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
__esimd_lsc_prefetch_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(),
@@ -894,6 +932,8 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
/// VISA instruction: lsc_store.ugm
///
/// Scatters elements to specific address.
/// See comments in the \ref lsc_block_load API for description and parameter
/// constraints.
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to store per address.
@@ -910,29 +950,44 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API void lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
__ESIMD_NS::simd_mask<1> pred = 1) {
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed store is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
__esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), addrs.data(),
vals.data());
constexpr int SmallIntFactor =
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
static_assert(NElts % SmallIntFactor == 0,
"Number of elements is not supported by Transposed store");
detail::check_lsc_vector_size<NElts / SmallIntFactor>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
if constexpr (SmallIntFactor == 1) {

__esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(), addrs.data(),
vals.data());
} else {
__ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> tmp =
vals.template bit_cast_view<uint32_t>();

__esimd_lsc_store_stateless<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), addrs.data(), tmp.data());
}
}
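
Illustrative only (the `enabled` flag and `dst` pointer are assumed, aliases as in the sketches above): the predicate makes the whole block store conditional, and the 8-bit path packs four bytes per u32 lane.

// Store 64 bytes only when `enabled` is true. SmallIntFactor == 4, so the
// vector is bit-cast to 16 x u32 before the transposed u32 store is issued.
esimd::simd<uint8_t, 64> bytes(0, 1); // 0, 1, 2, ..., 63
esimd::simd_mask<1> pred = enabled ? 1 : 0;
exp_esimd::lsc_block_store<uint8_t, 64>(dst, bytes, pred);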

/// Accessor-based transposed scatter with 1 channel.
/// Supported platforms: DG2, PVC
/// VISA instruction: lsc_store.ugm
///
/// Scatters elements to surface.
/// See comments in the \ref lsc_block_load API for description and parameter
/// constraints.
///
/// @tparam T is element type.
/// @tparam NElts is the number of elements to store per address.
@@ -958,23 +1013,36 @@ lsc_block_store(AccessorTy acc, uint32_t offset,
lsc_block_store<T, NElts, DS, L1H, L3H>(
__ESIMD_DNS::accessorToPointer<T>(acc, offset), vals, pred);
#else
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed store is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;

__ESIMD_NS::simd<uint32_t, N> offsets = offset;
auto si = __ESIMD_NS::get_surface_index(acc);
__esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(),
vals.data(), si);
constexpr int SmallIntFactor =
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);

detail::check_lsc_vector_size<NElts / SmallIntFactor>();
static_assert(NElts % SmallIntFactor == 0,
"Number of elements is not supported by Transposed store");
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
if constexpr (SmallIntFactor > 1) {
__ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> Tmp =
vals.template bit_cast_view<uint32_t>();
__esimd_lsc_store_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), offsets.data(), Tmp.data(), si);
} else {
__esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(),
vals.data(), si);
}
#endif
}
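
Finally, a hedged illustration of how the SmallIntFactor checks reject unsupported element counts at compile time (hypothetical values; src16 and src8 are assumed pointers, aliases as above):

// OK: 32 % 2 == 0, and 32 / 2 == 16 is a legal LSC vector size.
auto a = exp_esimd::lsc_block_load<uint16_t, 32>(src16);

// Fails to compile: 3 % 2 != 0 trips the static_assert
// "Number of elements is not supported by Transposed load".
// auto b = exp_esimd::lsc_block_load<uint16_t, 3>(src16);

// Fails to compile for 8-bit data: 6 % 4 != 0 fails the same check.
// auto c = exp_esimd::lsc_block_load<uint8_t, 6>(src8);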
