Skip to content

Commit f9d8059

Browse files
authored
[SYCL][ESIMD] Add 8/16-bit type support to lsc_block_load/store (#6757)
1 parent b5023ea commit f9d8059

File tree

1 file changed

+97
-29
lines changed
  • sycl/include/sycl/ext/intel/experimental/esimd

1 file changed

+97
-29
lines changed

sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp

Lines changed: 97 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -474,8 +474,16 @@ lsc_gather(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
474474
/// Supported platforms: DG2, PVC
475475
/// VISA instruction: lsc_load.ugm
476476
///
477-
/// Collects elements located at specified address and returns them
478-
/// as a single \ref simd object.
477+
/// Accesses contiguous block of memory of `NElts * S` bytes starting from
478+
/// given address, where S is a byte size of an "element" defined by the \c DS
479+
/// template parameter. The maximum size of accessed block is 512 bytes for PVC
480+
/// and 256 bytes for ACM (DG2).
481+
/// When \c DS equals \c lsc_data_size::u64, the address must be 8-byte aligned,
482+
/// otherwise - 4-byte aligned. Allowed values for the data size are
483+
/// \c lsc_data_size::u32 and \c lsc_data_size::u64. Allowed NElts values are
484+
/// 1, 2, 3, 4, 8, 16, 32, 64.
485+
/// Note that to access 512 bytes, DS must be \c lsc_data_size::u64 and \c NElts
486+
/// must be 64.
479487
///
480488
/// @tparam T is element type.
481489
/// @tparam NElts is the number of elements to load per address.
@@ -492,22 +500,34 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
492500
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
493501
__ESIMD_API __ESIMD_NS::simd<T, NElts>
494502
lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) {
495-
detail::check_lsc_vector_size<NElts>();
496503
detail::check_lsc_data_size<T, DS>();
497504
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
498505
constexpr uint16_t _AddressScale = 1;
499506
constexpr int _ImmOffset = 0;
500507
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
501-
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
502-
"Transposed load is supported only for data size u32 or u64");
503-
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
504508
constexpr detail::lsc_data_order _Transposed =
505509
detail::lsc_data_order::transpose;
506510
constexpr int N = 1;
507511
__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
508-
return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
509-
_VS, _Transposed, N>(pred.data(),
510-
addrs.data());
512+
constexpr int SmallIntFactor =
513+
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
514+
static_assert(NElts % SmallIntFactor == 0,
515+
"Number of elements is not supported by Transposed load");
516+
517+
detail::check_lsc_vector_size<NElts / SmallIntFactor>();
518+
constexpr detail::lsc_vector_size _VS =
519+
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
520+
if constexpr (SmallIntFactor == 1) {
521+
return __esimd_lsc_load_stateless<T, L1H, L3H, _AddressScale, _ImmOffset,
522+
_DS, _VS, _Transposed, N>(pred.data(),
523+
addrs.data());
524+
} else {
525+
__ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> result =
526+
__esimd_lsc_load_stateless<uint32_t, L1H, L3H, _AddressScale,
527+
_ImmOffset, lsc_data_size::u32, _VS,
528+
_Transposed, N>(pred.data(), addrs.data());
529+
return result.template bit_cast_view<T>();
530+
}
511531
}
512532

513533
/// Accessor-based transposed gather with 1 channel.
@@ -516,6 +536,8 @@ lsc_block_load(const T *p, __ESIMD_NS::simd_mask<1> pred = 1) {
516536
///
517537
/// Collects elements located at surface and returns them
518538
/// as a single \ref simd object.
539+
/// See comments in the \ref lsc_block_load API for description and parameter
540+
/// constraints.
519541
///
520542
/// @tparam T is element type.
521543
/// @tparam NElts is the number of elements to load per address.
@@ -541,22 +563,36 @@ lsc_block_load(AccessorTy acc, uint32_t offset,
541563
return lsc_block_load<T, NElts, DS, L1H, L3H>(
542564
__ESIMD_DNS::accessorToPointer<T>(acc, offset), pred);
543565
#else
544-
detail::check_lsc_vector_size<NElts>();
545566
detail::check_lsc_data_size<T, DS>();
546567
detail::check_lsc_cache_hint<detail::lsc_action::load, L1H, L3H>();
547568
constexpr uint16_t _AddressScale = 1;
548569
constexpr int _ImmOffset = 0;
549570
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
550-
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
551-
"Transposed load is supported only for data size u32 or u64");
552-
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
553571
constexpr detail::lsc_data_order _Transposed =
554572
detail::lsc_data_order::transpose;
555573
constexpr int N = 1;
556574
__ESIMD_NS::simd<uint32_t, N> offsets = offset;
557575
auto si = __ESIMD_NS::get_surface_index(acc);
558-
return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
559-
_Transposed, N>(pred.data(), offsets.data(), si);
576+
constexpr int SmallIntFactor =
577+
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
578+
static_assert(NElts % SmallIntFactor == 0,
579+
"Number of elements is not supported by Transposed load");
580+
detail::check_lsc_vector_size<NElts / SmallIntFactor>();
581+
constexpr detail::lsc_vector_size _VS =
582+
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
583+
584+
if constexpr (SmallIntFactor == 1) {
585+
return __esimd_lsc_load_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
586+
_VS, _Transposed, N>(pred.data(),
587+
offsets.data(), si);
588+
} else {
589+
590+
__ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> result =
591+
__esimd_lsc_load_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
592+
lsc_data_size::u32, _VS, _Transposed, N>(
593+
pred.data(), offsets.data(), si);
594+
return result.template bit_cast_view<T>();
595+
}
560596
#endif
561597
}
562598

@@ -622,6 +658,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
622658
constexpr uint16_t _AddressScale = 1;
623659
constexpr int _ImmOffset = 0;
624660
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
661+
625662
static_assert(
626663
_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
627664
"Transposed prefetch is supported only for data size u32 or u64");
@@ -630,6 +667,7 @@ __ESIMD_API void lsc_prefetch(const T *p) {
630667
detail::lsc_data_order::transpose;
631668
constexpr int N = 1;
632669
__ESIMD_NS::simd_mask<N> pred = 1;
670+
633671
__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
634672
__esimd_lsc_prefetch_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
635673
_VS, _Transposed, N>(pred.data(),
@@ -894,6 +932,8 @@ lsc_scatter(AccessorTy acc, __ESIMD_NS::simd<uint32_t, N> offsets,
894932
/// VISA instruction: lsc_store.ugm
895933
///
896934
/// Scatters elements to specific address.
935+
/// See comments in the \ref lsc_block_load API for description and parameter
936+
/// constraints.
897937
///
898938
/// @tparam T is element type.
899939
/// @tparam NElts is the number of elements to store per address.
@@ -910,29 +950,44 @@ template <typename T, int NElts, lsc_data_size DS = lsc_data_size::default_size,
910950
cache_hint L1H = cache_hint::none, cache_hint L3H = cache_hint::none>
911951
__ESIMD_API void lsc_block_store(T *p, __ESIMD_NS::simd<T, NElts> vals,
912952
__ESIMD_NS::simd_mask<1> pred = 1) {
913-
detail::check_lsc_vector_size<NElts>();
914953
detail::check_lsc_data_size<T, DS>();
915954
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
916955
constexpr uint16_t _AddressScale = 1;
917956
constexpr int _ImmOffset = 0;
918957
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
919-
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
920-
"Transposed store is supported only for data size u32 or u64");
921-
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
922958
constexpr detail::lsc_data_order _Transposed =
923959
detail::lsc_data_order::transpose;
924960
constexpr int N = 1;
925961
__ESIMD_NS::simd<uintptr_t, N> addrs = reinterpret_cast<uintptr_t>(p);
926-
__esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
927-
_Transposed, N>(pred.data(), addrs.data(),
928-
vals.data());
962+
constexpr int SmallIntFactor =
963+
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
964+
static_assert(NElts % SmallIntFactor == 0,
965+
"Number of elements is not supported by Transposed store");
966+
detail::check_lsc_vector_size<NElts / SmallIntFactor>();
967+
constexpr detail::lsc_vector_size _VS =
968+
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
969+
if constexpr (SmallIntFactor == 1) {
970+
971+
__esimd_lsc_store_stateless<T, L1H, L3H, _AddressScale, _ImmOffset, _DS,
972+
_VS, _Transposed, N>(pred.data(), addrs.data(),
973+
vals.data());
974+
} else {
975+
__ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> tmp =
976+
vals.template bit_cast_view<uint32_t>();
977+
978+
__esimd_lsc_store_stateless<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
979+
lsc_data_size::u32, _VS, _Transposed, N>(
980+
pred.data(), addrs.data(), tmp.data());
981+
}
929982
}
930983

931984
/// Accessor-based transposed scatter with 1 channel.
932985
/// Supported platforms: DG2, PVC
933986
/// VISA instruction: lsc_store.ugm
934987
///
935988
/// Scatters elements to surface.
989+
/// See comments in the \ref lsc_block_load API for description and parameter
990+
/// constraints.
936991
///
937992
/// @tparam T is element type.
938993
/// @tparam NElts is the number of elements to store per address.
@@ -958,23 +1013,36 @@ lsc_block_store(AccessorTy acc, uint32_t offset,
9581013
lsc_block_store<T, NElts, DS, L1H>(
9591014
__ESIMD_DNS::accessorToPointer<T>(acc, offset), vals, pred);
9601015
#else
961-
detail::check_lsc_vector_size<NElts>();
9621016
detail::check_lsc_data_size<T, DS>();
9631017
detail::check_lsc_cache_hint<detail::lsc_action::store, L1H, L3H>();
9641018
constexpr uint16_t _AddressScale = 1;
9651019
constexpr int _ImmOffset = 0;
9661020
constexpr lsc_data_size _DS = detail::finalize_data_size<T, DS>();
967-
static_assert(_DS == lsc_data_size::u32 || _DS == lsc_data_size::u64,
968-
"Transposed store is supported only for data size u32 or u64");
969-
constexpr detail::lsc_vector_size _VS = detail::to_lsc_vector_size<NElts>();
9701021
constexpr detail::lsc_data_order _Transposed =
9711022
detail::lsc_data_order::transpose;
9721023
constexpr int N = 1;
1024+
9731025
__ESIMD_NS::simd<uint32_t, N> offsets = offset;
9741026
auto si = __ESIMD_NS::get_surface_index(acc);
975-
__esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
976-
_Transposed, N>(pred.data(), offsets.data(),
977-
vals.data(), si);
1027+
constexpr int SmallIntFactor =
1028+
(_DS == lsc_data_size::u16) ? 2 : (_DS == lsc_data_size::u8 ? 4 : 1);
1029+
1030+
detail::check_lsc_vector_size<NElts / SmallIntFactor>();
1031+
static_assert(NElts % SmallIntFactor == 0,
1032+
"Number of elements is not supported by Transposed store");
1033+
constexpr detail::lsc_vector_size _VS =
1034+
detail::to_lsc_vector_size<NElts / SmallIntFactor>();
1035+
if constexpr (SmallIntFactor > 1) {
1036+
__ESIMD_NS::simd<uint32_t, NElts / SmallIntFactor> Tmp =
1037+
vals.template bit_cast_view<uint32_t>();
1038+
__esimd_lsc_store_bti<uint32_t, L1H, L3H, _AddressScale, _ImmOffset,
1039+
lsc_data_size::u32, _VS, _Transposed, N>(
1040+
pred.data(), offsets.data(), Tmp.data(), si);
1041+
} else {
1042+
__esimd_lsc_store_bti<T, L1H, L3H, _AddressScale, _ImmOffset, _DS, _VS,
1043+
_Transposed, N>(pred.data(), offsets.data(),
1044+
vals.data(), si);
1045+
}
9781046
#endif
9791047
}
9801048

0 commit comments

Comments
 (0)