[SYCL][ESIMD] Add 8/16-bit type support to lsc_block_load/store #6757

Merged · 14 commits · Sep 22, 2022
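
This patch extends the transposed (single-channel) lsc_block_load and lsc_block_store paths, which previously accepted only the u32 and u64 data sizes, to 8- and 16-bit element types. The new code packs NElts narrow elements into NElts/2 (u16) or NElts/4 (u8) dwords, performs the LSC operation at u32 granularity, and bit-casts the result back to the requested element type.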
115 changes: 87 additions & 28 deletions sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
@@ -495,22 +495,35 @@ template <typename T, uint8_t NElts = 1,
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API __ESIMD_NS::simd<T, NElts>
lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) {
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed load is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(),
addrs.data());
if constexpr (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64) {
detail::check_lsc_vector_size<NElts>();
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset,
_DS, _VS, _Transposed, N>(pred.data(),
addrs.data());
} else if constexpr (_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8) {
constexpr int NElemsInDword = _DS == lsc_data_size::u16 ? 2 : 4;
static_assert(NElts % NElemsInDword == 0,
"Number of elements is not supported by Transposed load");
detail::check_lsc_vector_size<NElts / NElemsInDword>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / NElemsInDword>();

__ESIMD_NS::simd<uint32_t, NElts / NElemsInDword> result =
__esimd_lsc_load_stateless<uint32_t, L1H, L3H, _AddressScale,
_ImmOffset, lsc_data_size::u32, _VS,
_Transposed, N>(pred.data(), addrs.data());
return result.template bit_cast_view<T>();
}
}
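
For illustration, a hedged usage sketch of the new USM path (not part of this diff; the function name and pointers are invented, and it assumes a DPC++ build with ESIMD support and the headers shown):

```cpp
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/ext/intel/experimental/esimd/memory.hpp>
#include <cstdint>

namespace esimd = sycl::ext::intel::esimd;
namespace exp_esimd = sycl::ext::intel::experimental::esimd;

// Copies 32 16-bit elements with one transposed LSC load/store pair.
// With this patch, NElts = 32 is legal for a 16-bit T because the elements
// pack into 32 / 2 = 16 dwords, a valid LSC vector size.
SYCL_EXTERNAL void copy_block_u16(const uint16_t *src, uint16_t *dst)
    SYCL_ESIMD_FUNCTION {
  esimd::simd<uint16_t, 32> data =
      exp_esimd::lsc_block_load<uint16_t, 32>(src);
  exp_esimd::lsc_block_store<uint16_t, 32>(dst, data);
}
```

An odd NElts such as 31 would fail the new `NElts % NElemsInDword == 0` static_assert at compile time.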

/// Accessor-based transposed gather with 1 channel.
@@ -545,22 +558,38 @@ lsc_block_load(AccessorTy acc, uint32_t offset,
return lsc_block_load<T, NElts, DS, L1H, L3H>(
__ESIMD_DNS::accessorToPointer<T>(acc, offset), pred);
#else
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed load is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uint32_t, N> offsets = offset;
auto si = __ESIMD_NS::get_surface_index(acc);
return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(), si);

if constexpr (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64) {
detail::check_lsc_vector_size<NElts>();
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(),
offsets.data(), si);
} else if constexpr (_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8) {
constexpr int NElemsInDword = _DS == lsc_data_size::u16 ? 2 : 4;
static_assert(NElts % NElemsInDword == 0,
"Number of elements is not supported by Transposed load");

detail::check_lsc_vector_size<NElts / NElemsInDword>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / NElemsInDword>();

__ESIMD_NS::simd<uint32_t, NElts / NElemsInDword> result =
__esimd_lsc_load_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), offsets.data(), si);
return result.template bit_cast_view<T>();
}
#endif
}
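
The round trip through `bit_cast_view` is sound because the u32 load fetches exactly the bytes the caller asked for, just grouped as dwords. A minimal host-side sketch of that reinterpretation, using `memcpy` in place of `bit_cast_view`:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // 8 uint16_t elements occupy the same 16 bytes as 8 / 2 = 4 dwords.
  uint16_t src[8] = {1, 2, 3, 4, 5, 6, 7, 8};

  // What the hardware does: read the buffer as 4 dwords (u32 data size).
  uint32_t dwords[4];
  std::memcpy(dwords, src, sizeof(src));

  // What bit_cast_view does: reinterpret those dwords as uint16_t again.
  uint16_t roundtrip[8];
  std::memcpy(roundtrip, dwords, sizeof(dwords));

  for (int i = 0; i < 8; ++i)
    assert(roundtrip[i] == src[i]); // bytes are untouched end to end
  return 0;
}
```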

@@ -627,6 +656,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();

static_assert(
_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed prefetch is supported only for data size u32 or u64");
@@ -635,6 +665,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd_mask<N> pred = 1;

__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
__esimd_lsc_prefetch_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(),
@@ -918,22 +949,35 @@ template <typename T, uint8_t NElts = 1,
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
__ESIMD_API void lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
__ESIMD_NS::simd_mask<1> pred = 1) {
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed store is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;
__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
__esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), addrs.data(),
vals.data());
if constexpr (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64) {
detail::check_lsc_vector_size<NElts>();
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
__esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
_VS, _Transposed, N>(pred.data(), addrs.data(),
vals.data());
} else if constexpr (_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8) {
constexpr int NElemsInDword = _DS == lsc_data_size::u16 ? 2 : 4;
static_assert(NElts % NElemsInDword == 0,
"Number of elements is not supported by Transposed store");
detail::check_lsc_vector_size<NElts / NElemsInDword>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / NElemsInDword>();
__ESIMD_NS::simd<uint32_t, NElts / NElemsInDword> tmp =
vals.template bit_cast_view<uint32_t>();

__esimd_lsc_store_stateless<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), addrs.data(), tmp.data());
}
}
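
Taken together, the static_assert and `check_lsc_vector_size<NElts / NElemsInDword>()` determine which element counts the narrow types accept. A hedged compile-time sketch of that rule (the helper names are invented; the legal vector sizes 1, 2, 3, 4, 8, 16, 32 and 64 are my reading of the LSC vector-size encoding):

```cpp
#include <cstdint>

// Legal LSC vector sizes, assumed to mirror detail::check_lsc_vector_size.
constexpr bool is_lsc_vector_size(int n) {
  return n == 1 || n == 2 || n == 3 || n == 4 || n == 8 || n == 16 ||
         n == 32 || n == 64;
}

// True when NElts elements of type T are accepted by the transposed
// 8/16-bit path: they must fill whole dwords, and the packed dword count
// must itself be a legal LSC vector size.
template <typename T, int NElts> constexpr bool narrow_block_ok() {
  constexpr int elems_per_dword = 4 / sizeof(T); // 2 for u16, 4 for u8
  return NElts % elems_per_dword == 0 &&
         is_lsc_vector_size(NElts / elems_per_dword);
}

static_assert(narrow_block_ok<uint16_t, 64>()); // 32 dwords: OK
static_assert(narrow_block_ok<uint8_t, 16>());  // 4 dwords: OK
static_assert(!narrow_block_ok<uint8_t, 20>()); // 5 dwords: illegal size
static_assert(!narrow_block_ok<uint16_t, 7>()); // does not fill dwords
```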

/// Accessor-based transposed scatter with 1 channel.
@@ -967,23 +1011,38 @@ lsc_block_store(AccessorTy acc, uint32_t offset,
lsc_block_store<T, NElts, DS, L1H>(
__ESIMD_DNS::accessorToPointer<T>(acc, offset), vals, pred);
#else
detail::check_lsc_vector_size<NElts>();
detail::check_lsc_data_size<T, DS>();
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
constexpr uint16_t _AddressScale = 1;
constexpr int _ImmOffset = 0;
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
"Transposed store is supported only for data size u32 or u64");
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
constexpr detail::lsc_data_order _Transposed =
detail::lsc_data_order::transpose;
constexpr int N = 1;

__ESIMD_NS::simd<uint32_t, N> offsets = offset;
auto si = __ESIMD_NS::get_surface_index(acc);
__esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(),
vals.data(), si);

if constexpr (_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64) {
detail::check_lsc_vector_size<NElts>();
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
__esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
_Transposed, N>(pred.data(), offsets.data(),
vals.data(), si);
} else if constexpr (_DS == lsc_data_size::u16 || _DS == lsc_data_size::u8) {
constexpr int NElemsInDword = _DS == lsc_data_size::u16 ? 2 : 4;
static_assert(NElts % NElemsInDword == 0,
"Number of elements is not supported by Transposed store");
detail::check_lsc_vector_size<NElts / NElemsInDword>();
constexpr detail::lsc_vector_size _VS =
detail::to_lsc_vector_size<NElts / NElemsInDword>();
__ESIMD_NS::simd<uint32_t, NElts / NElemsInDword> tmp =
vals.template bit_cast_view<uint32_t>();

__esimd_lsc_store_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
lsc_data_size::u32, _VS, _Transposed, N>(
pred.data(), offsets.data(), tmp.data(), si);
}
#endif
}
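
Finally, a hedged end-to-end sketch of the accessor-based path with 8-bit data (illustrative only: the buffer sizes and names are invented, and the offset argument is assumed to be in bytes):

```cpp
#include <sycl/sycl.hpp>
#include <sycl/ext/intel/esimd.hpp>
#include <sycl/ext/intel/experimental/esimd/memory.hpp>
#include <cstdint>
#include <vector>

namespace esimd = sycl::ext::intel::esimd;
namespace exp_esimd = sycl::ext::intel::experimental::esimd;

int main() {
  sycl::queue q;
  std::vector<uint8_t> in(64, 7), out(64, 0);
  {
    sycl::buffer<uint8_t, 1> in_buf(in.data(), in.size());
    sycl::buffer<uint8_t, 1> out_buf(out.data(), out.size());
    q.submit([&](sycl::handler &cgh) {
      sycl::accessor in_acc{in_buf, cgh, sycl::read_only};
      sycl::accessor out_acc{out_buf, cgh, sycl::write_only};
      cgh.single_task([=]() SYCL_ESIMD_KERNEL {
        // 64 u8 elements travel as 64 / 4 = 16 dwords on the wire.
        esimd::simd<uint8_t, 64> v =
            exp_esimd::lsc_block_load<uint8_t, 64>(in_acc, 0);
        exp_esimd::lsc_block_store<uint8_t, 64>(out_acc, 0, v);
      });
    });
  } // buffer destructors copy the results back to the host vectors
  return 0;
}
```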
