Skip to content

Commit d2bc9d1

Browse files
authored
[ESIMD] Add lsc_gather() and lsc_slm_gather() with merging semantics (#8528)
The additional function prototypes for lsc_gather() and lsc_slm_gather() have the additional operand 'OldValues', which contains the elements being copied to the returned result if the corresponding element of the mask/predicate argument is 0. Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 3ae0889 commit d2bc9d1

File tree

3 files changed

+278
-26
lines changed

3 files changed

+278
-26
lines changed

llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -518,10 +518,18 @@ class ESIMDIntrinDescTable {
518518
{"lsc.load.slm",
519519
{ai1(0), c8(lsc_subopcode::load), t8(1), t8(2), t16(3), t32(4), t8(5),
520520
t8(6), t8(7), c8(0), a(1), c32(0)}}},
521+
{"lsc_load_merge_slm",
522+
{"lsc.load.merge.slm",
523+
{ai1(0), c8(lsc_subopcode::load), t8(1), t8(2), t16(3), t32(4), t8(5),
524+
t8(6), t8(7), c8(0), a(1), c32(0), a(2)}}},
521525
{"lsc_load_bti",
522526
{"lsc.load.bti",
523527
{ai1(0), c8(lsc_subopcode::load), t8(1), t8(2), t16(3), t32(4), t8(5),
524528
t8(6), t8(7), c8(0), a(1), aSI(2)}}},
529+
{"lsc_load_merge_bti",
530+
{"lsc.load.merge.bti",
531+
{ai1(0), c8(lsc_subopcode::load), t8(1), t8(2), t16(3), t32(4), t8(5),
532+
t8(6), t8(7), c8(0), a(1), aSI(2), a(2)}}},
525533
{"lsc_load_stateless",
526534
{"lsc.load.stateless",
527535
{ai1(0), c8(lsc_subopcode::load), t8(1), t8(2), t16(3), t32(4), t8(5),

sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp

Lines changed: 73 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -341,16 +341,17 @@ constexpr unsigned loadstoreAlignMask() {
341341
template <typename Ty, uint16_t AddressScale, int ImmOffset,
342342
__ESIMD_ENS::lsc_data_size DS, __ESIMD_EDNS::lsc_vector_size VS,
343343
__ESIMD_EDNS::lsc_data_order _Transposed, int N, uint32_t MASK>
344-
auto __esimd_emu_lsc_offset_read(
344+
auto __esimd_emu_lsc_offset_read_merge(
345345
__ESIMD_DNS::simd_mask_storage_t<N> Pred,
346346
__ESIMD_DNS::vector_type_t<uint32_t, N> Offsets, char *ReadBase,
347+
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> OldValues,
347348
int BufByteWidth = INT_MAX) {
348349
// TODO : Support AddressScale, ImmOffset
349350
static_assert(AddressScale == 1);
350351
static_assert(ImmOffset == 0);
351352
static_assert(DS != __ESIMD_ENS::lsc_data_size::u16u32h);
352353

353-
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> Output = 0;
354+
auto Output = OldValues;
354355

355356
constexpr int ChanlCount = __ESIMD_EDNS::to_int<VS>();
356357

@@ -372,6 +373,10 @@ auto __esimd_emu_lsc_offset_read(
372373

373374
if ((ByteDistance >= 0) && (ByteDistance < BufByteWidth)) {
374375
Output[VecIdx] = *((Ty *)(ReadBase + ByteDistance));
376+
if constexpr (DS == __ESIMD_ENS::lsc_data_size::u8u32)
377+
Output[VecIdx] &= 0xff;
378+
else if constexpr (DS == __ESIMD_ENS::lsc_data_size::u16u32)
379+
Output[VecIdx] &= 0xffff;
375380
}
376381
}
377382
}
@@ -788,25 +793,49 @@ auto __esimd_emu_lsc_xatomic_offset_access_2(
788793
/// @tparam N is the SIMD size of operation (the number of addresses to access)
789794
/// @param pred is predicates.
790795
/// @param offsets is the zero-based offsets for SLM buffer in bytes.
796+
/// @param OldValues contains the vector which elements are copied
797+
/// to the returned result when the corresponding element of \p pred is 0.
791798
/// @return is a vector of type T and size N * to_int<VS>()
792799
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
793800
uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
794801
__ESIMD_EDNS::lsc_vector_size VS,
795802
__ESIMD_EDNS::lsc_data_order _Transposed, int N>
796803
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()>
797-
__esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t<N> pred,
798-
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets)
804+
__esimd_lsc_load_merge_slm(
805+
__ESIMD_DNS::simd_mask_storage_t<N> pred,
806+
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets,
807+
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> OldValues =
808+
0)
799809
#ifdef __SYCL_DEVICE_ONLY__
800810
;
801811
#else // __SYCL_DEVICE_ONLY__
802812
{
803813
sycl::detail::ESIMDDeviceInterface *I =
804814
sycl::detail::getESIMDDeviceInterface();
805815

806-
return __esimd_emu_lsc_offset_read<Ty, AddressScale, ImmOffset, DS, VS,
807-
_Transposed, N,
808-
loadstoreAlignMask<Ty, VS, DS, N>()>(
809-
pred, offsets, I->__cm_emu_get_slm_ptr());
816+
return __esimd_emu_lsc_offset_read_merge<Ty, AddressScale, ImmOffset, DS, VS,
817+
_Transposed, N,
818+
loadstoreAlignMask<Ty, VS, DS, N>()>(
819+
pred, offsets, I->__cm_emu_get_slm_ptr(), OldValues);
820+
}
821+
#endif // __SYCL_DEVICE_ONLY__
822+
823+
/// Similar to __esimd_lsc_load_merge_slm(), but the argument OldValues is not
824+
/// explicitly specified, which results into random values in those elements of
825+
/// the returned result for which the corresponding element in \p pred is 0.
826+
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
827+
uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
828+
__ESIMD_EDNS::lsc_vector_size VS,
829+
__ESIMD_EDNS::lsc_data_order _Transposed, int N>
830+
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()>
831+
__esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t<N> pred,
832+
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets)
833+
#ifdef __SYCL_DEVICE_ONLY__
834+
;
835+
#else // __SYCL_DEVICE_ONLY__
836+
{
837+
return __esimd_lsc_load_merge_slm<Ty, L1H, L3H, AddressScale, ImmOffset, DS,
838+
VS, _Transposed, N>(pred, offsets);
810839
}
811840
#endif // __SYCL_DEVICE_ONLY__
812841

@@ -829,16 +858,20 @@ __esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t<N> pred,
829858
/// @param pred is predicates.
830859
/// @param offsets is the zero-based offsets in bytes.
831860
/// @param surf_ind is the surface index.
861+
/// @param OldValues contains the vector which elements are copied
862+
/// to the returned result when the corresponding element of \p pred is 0.
832863
/// @return is a vector of type T and N * to_int<VS>()
833864
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
834865
uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
835866
__ESIMD_EDNS::lsc_vector_size VS,
836867
__ESIMD_EDNS::lsc_data_order _Transposed, int N,
837868
typename SurfIndAliasTy>
838869
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()>
839-
__esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t<N> pred,
840-
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets,
841-
SurfIndAliasTy surf_ind)
870+
__esimd_lsc_load_merge_bti(
871+
__ESIMD_DNS::simd_mask_storage_t<N> pred,
872+
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets, SurfIndAliasTy surf_ind,
873+
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> OldValues =
874+
0)
842875
#ifdef __SYCL_DEVICE_ONLY__
843876
;
844877
#else // __SYCL_DEVICE_ONLY__
@@ -854,10 +887,32 @@ __esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t<N> pred,
854887

855888
std::lock_guard<std::mutex> lock(*mutexLock);
856889

857-
return __esimd_emu_lsc_offset_read<Ty, AddressScale, ImmOffset, DS, VS,
858-
_Transposed, N,
859-
loadstoreAlignMask<Ty, VS, DS, N>()>(
860-
pred, offsets, readBase, width);
890+
return __esimd_emu_lsc_offset_read_merge<Ty, AddressScale, ImmOffset, DS, VS,
891+
_Transposed, N,
892+
loadstoreAlignMask<Ty, VS, DS, N>()>(
893+
pred, offsets, readBase, OldValues, width);
894+
}
895+
#endif // __SYCL_DEVICE_ONLY__
896+
897+
/// Similar to __esimd_lsc_load_merge_bti(), but the argument OldValues is not
898+
/// explicitly specified, which results into random values in those elements of
899+
/// the returned result for which the corresponding element in \p pred is 0.
900+
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
901+
uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
902+
__ESIMD_EDNS::lsc_vector_size VS,
903+
__ESIMD_EDNS::lsc_data_order _Transposed, int N,
904+
typename SurfIndAliasTy>
905+
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()>
906+
__esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t<N> pred,
907+
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets,
908+
SurfIndAliasTy surf_ind)
909+
#ifdef __SYCL_DEVICE_ONLY__
910+
;
911+
#else // __SYCL_DEVICE_ONLY__
912+
{
913+
return __esimd_lsc_load_merge_bti<Ty, L1H, L3H, AddressScale, ImmOffset, DS,
914+
VS, _Transposed, N, SurfIndAliasTy>(
915+
pred, offsets, surf_ind);
861916
}
862917
#endif // __SYCL_DEVICE_ONLY__
863918

@@ -889,7 +944,8 @@ __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()>
889944
__esimd_lsc_load_merge_stateless(
890945
__ESIMD_DNS::simd_mask_storage_t<N> pred,
891946
__ESIMD_DNS::vector_type_t<uintptr_t, N> addrs,
892-
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> old_values)
947+
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> old_values =
948+
0)
893949
#ifdef __SYCL_DEVICE_ONLY__
894950
;
895951
#else // __SYCL_DEVICE_ONLY__
@@ -957,10 +1013,8 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t<N> pred,
9571013
;
9581014
#else // __SYCL_DEVICE_ONLY__
9591015
{
960-
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> OldValues = 0;
9611016
return __esimd_lsc_load_merge_stateless<Ty, L1H, L3H, AddressScale, ImmOffset,
962-
DS, VS, _Transposed, N>(pred, addrs,
963-
OldValues);
1017+
DS, VS, _Transposed, N>(pred, addrs);
9641018
}
9651019
#endif // __SYCL_DEVICE_ONLY__
9661020

0 commit comments

Comments
 (0)