Skip to content

[ESIMD] Add lsc_gather() and lsc_slm_gather() with merging semantics #8528

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -518,10 +518,18 @@ class ESIMDIntrinDescTable {
{"lsc.load.slm",
{ai1(0), c8(lsc_subopcode::load), t8(1), t8(2), t16(3), t32(4), t8(5),
t8(6), t8(7), c8(0), a(1), c32(0)}}},
{"lsc_load_merge_slm",
{"lsc.load.merge.slm",
{ai1(0), c8(lsc_subopcode::load), t8(1), t8(2), t16(3), t32(4), t8(5),
t8(6), t8(7), c8(0), a(1), c32(0), a(2)}}},
{"lsc_load_bti",
{"lsc.load.bti",
{ai1(0), c8(lsc_subopcode::load), t8(1), t8(2), t16(3), t32(4), t8(5),
t8(6), t8(7), c8(0), a(1), aSI(2)}}},
{"lsc_load_merge_bti",
{"lsc.load.merge.bti",
{ai1(0), c8(lsc_subopcode::load), t8(1), t8(2), t16(3), t32(4), t8(5),
t8(6), t8(7), c8(0), a(1), aSI(2), a(2)}}},
{"lsc_load_stateless",
{"lsc.load.stateless",
{ai1(0), c8(lsc_subopcode::load), t8(1), t8(2), t16(3), t32(4), t8(5),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -341,16 +341,17 @@ constexpr unsigned loadstoreAlignMask() {
template <typename Ty, uint16_t AddressScale, int ImmOffset,
__ESIMD_ENS::lsc_data_size DS, __ESIMD_EDNS::lsc_vector_size VS,
__ESIMD_EDNS::lsc_data_order _Transposed, int N, uint32_t MASK>
auto __esimd_emu_lsc_offset_read(
auto __esimd_emu_lsc_offset_read_merge(
__ESIMD_DNS::simd_mask_storage_t<N> Pred,
__ESIMD_DNS::vector_type_t<uint32_t, N> Offsets, char *ReadBase,
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> OldValues,
int BufByteWidth = INT_MAX) {
// TODO : Support AddressScale, ImmOffset
static_assert(AddressScale == 1);
static_assert(ImmOffset == 0);
static_assert(DS != __ESIMD_ENS::lsc_data_size::u16u32h);

__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> Output = 0;
auto Output = OldValues;

constexpr int ChanlCount = __ESIMD_EDNS::to_int<VS>();

Expand All @@ -372,6 +373,10 @@ auto __esimd_emu_lsc_offset_read(

if ((ByteDistance >= 0) && (ByteDistance < BufByteWidth)) {
Output[VecIdx] = *((Ty *)(ReadBase + ByteDistance));
if constexpr (DS == __ESIMD_ENS::lsc_data_size::u8u32)
Output[VecIdx] &= 0xff;
else if constexpr (DS == __ESIMD_ENS::lsc_data_size::u16u32)
Output[VecIdx] &= 0xffff;
}
}
}
Expand Down Expand Up @@ -788,25 +793,49 @@ auto __esimd_emu_lsc_xatomic_offset_access_2(
/// @tparam N is the SIMD size of operation (the number of addresses to access)
/// @param pred is predicates.
/// @param offsets is the zero-based offsets for SLM buffer in bytes.
/// @param OldValues contains the vector which elements are copied
/// to the returned result when the corresponding element of \p pred is 0.
/// @return is a vector of type T and size N * to_int<VS>()
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
__ESIMD_EDNS::lsc_vector_size VS,
__ESIMD_EDNS::lsc_data_order _Transposed, int N>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()>
__esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t<N> pred,
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets)
__esimd_lsc_load_merge_slm(
__ESIMD_DNS::simd_mask_storage_t<N> pred,
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets,
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> OldValues =
0)
#ifdef __SYCL_DEVICE_ONLY__
;
#else // __SYCL_DEVICE_ONLY__
{
sycl::detail::ESIMDDeviceInterface *I =
sycl::detail::getESIMDDeviceInterface();

return __esimd_emu_lsc_offset_read<Ty, AddressScale, ImmOffset, DS, VS,
_Transposed, N,
loadstoreAlignMask<Ty, VS, DS, N>()>(
pred, offsets, I->__cm_emu_get_slm_ptr());
return __esimd_emu_lsc_offset_read_merge<Ty, AddressScale, ImmOffset, DS, VS,
_Transposed, N,
loadstoreAlignMask<Ty, VS, DS, N>()>(
pred, offsets, I->__cm_emu_get_slm_ptr(), OldValues);
}
#endif // __SYCL_DEVICE_ONLY__

/// Similar to __esimd_lsc_load_merge_slm(), but the argument OldValues is not
/// explicitly specified, which results into random values in those elements of
/// the returned result for which the corresponding element in \p pred is 0.
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
__ESIMD_EDNS::lsc_vector_size VS,
__ESIMD_EDNS::lsc_data_order _Transposed, int N>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()>
__esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t<N> pred,
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets)
#ifdef __SYCL_DEVICE_ONLY__
;
#else // __SYCL_DEVICE_ONLY__
{
return __esimd_lsc_load_merge_slm<Ty, L1H, L3H, AddressScale, ImmOffset, DS,
VS, _Transposed, N>(pred, offsets);
}
#endif // __SYCL_DEVICE_ONLY__

Expand All @@ -829,16 +858,20 @@ __esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t<N> pred,
/// @param pred is predicates.
/// @param offsets is the zero-based offsets in bytes.
/// @param surf_ind is the surface index.
/// @param OldValues contains the vector which elements are copied
/// to the returned result when the corresponding element of \p pred is 0.
/// @return is a vector of type T and N * to_int<VS>()
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
__ESIMD_EDNS::lsc_vector_size VS,
__ESIMD_EDNS::lsc_data_order _Transposed, int N,
typename SurfIndAliasTy>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()>
__esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t<N> pred,
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets,
SurfIndAliasTy surf_ind)
__esimd_lsc_load_merge_bti(
__ESIMD_DNS::simd_mask_storage_t<N> pred,
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets, SurfIndAliasTy surf_ind,
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> OldValues =
0)
#ifdef __SYCL_DEVICE_ONLY__
;
#else // __SYCL_DEVICE_ONLY__
Expand All @@ -854,10 +887,32 @@ __esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t<N> pred,

std::lock_guard<std::mutex> lock(*mutexLock);

return __esimd_emu_lsc_offset_read<Ty, AddressScale, ImmOffset, DS, VS,
_Transposed, N,
loadstoreAlignMask<Ty, VS, DS, N>()>(
pred, offsets, readBase, width);
return __esimd_emu_lsc_offset_read_merge<Ty, AddressScale, ImmOffset, DS, VS,
_Transposed, N,
loadstoreAlignMask<Ty, VS, DS, N>()>(
pred, offsets, readBase, OldValues, width);
}
#endif // __SYCL_DEVICE_ONLY__

/// Similar to __esimd_lsc_load_merge_bti(), but the argument OldValues is not
/// explicitly specified, which results into random values in those elements of
/// the returned result for which the corresponding element in \p pred is 0.
template <typename Ty, __ESIMD_ENS::cache_hint L1H, __ESIMD_ENS::cache_hint L3H,
uint16_t AddressScale, int ImmOffset, __ESIMD_ENS::lsc_data_size DS,
__ESIMD_EDNS::lsc_vector_size VS,
__ESIMD_EDNS::lsc_data_order _Transposed, int N,
typename SurfIndAliasTy>
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()>
__esimd_lsc_load_bti(__ESIMD_DNS::simd_mask_storage_t<N> pred,
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets,
SurfIndAliasTy surf_ind)
#ifdef __SYCL_DEVICE_ONLY__
;
#else // __SYCL_DEVICE_ONLY__
{
return __esimd_lsc_load_merge_bti<Ty, L1H, L3H, AddressScale, ImmOffset, DS,
VS, _Transposed, N, SurfIndAliasTy>(
pred, offsets, surf_ind);
}
#endif // __SYCL_DEVICE_ONLY__

Expand Down Expand Up @@ -889,7 +944,8 @@ __ESIMD_INTRIN __ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()>
__esimd_lsc_load_merge_stateless(
__ESIMD_DNS::simd_mask_storage_t<N> pred,
__ESIMD_DNS::vector_type_t<uintptr_t, N> addrs,
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> old_values)
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> old_values =
0)
#ifdef __SYCL_DEVICE_ONLY__
;
#else // __SYCL_DEVICE_ONLY__
Expand Down Expand Up @@ -957,10 +1013,8 @@ __esimd_lsc_load_stateless(__ESIMD_DNS::simd_mask_storage_t<N> pred,
;
#else // __SYCL_DEVICE_ONLY__
{
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_EDNS::to_int<VS>()> OldValues = 0;
return __esimd_lsc_load_merge_stateless<Ty, L1H, L3H, AddressScale, ImmOffset,
DS, VS, _Transposed, N>(pred, addrs,
OldValues);
DS, VS, _Transposed, N>(pred, addrs);
}
#endif // __SYCL_DEVICE_ONLY__

Expand Down
Loading