Skip to content

Commit 701e480

Browse files
authored
[ESIMD] Fix slm_gather/scatter. (#4771)
[ESIMD] Fix slm_gather/scatter. - remove limitations on element size being 4 bytes only - enable 1,8,16,32 elements as allowed by hardware - share implementation with surface-based gather/scatter Signed-off-by: Konstantin S Bobrovsky <[email protected]>
1 parent 7a9335d commit 701e480

File tree

6 files changed

+235
-138
lines changed

6 files changed

+235
-138
lines changed

llvm/lib/SYCLLowerIR/LowerESIMD.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,15 @@ class ESIMDIntrinDescTable {
367367
{"gather_scaled",
368368
{"gather.scaled", {ai1(0), t(3), t(4), aSI(1), a(2), a(3), u(-1)}}},
369369

370+
// arg0: i32 log2 num blocks, CONSTANT (0/1/2 for num blocks 1/2/4)
371+
// arg1: i16 scale, CONSTANT
372+
// arg2: i32 surface index
373+
// arg3: i32 global offset in bytes
374+
// arg4: vXi32 element offset in bytes (overloaded)
375+
// arg5: vXi1 predicate (overloaded)
376+
{"gather_masked_scaled2",
377+
{"gather.masked.scaled2", {t(3), t(4), aSI(0), a(1), a(2), ai1(3)}}},
378+
370379
// arg0: vXi1 predicate (overloaded)
371380
// arg1: i32 log2 num blocks, CONSTANT (0/1/2 for num blocks 1/2/4)
372381
// arg2: i16 scale, CONSTANT

sycl/include/sycl/ext/intel/experimental/esimd/detail/memory_intrin.hpp

Lines changed: 48 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ class AccessorPrivateProxy {
4747
};
4848

4949
template <int ElemsPerAddr,
50-
typename = sycl::detail::enable_if_t<
51-
(ElemsPerAddr == 1 || ElemsPerAddr == 2 || ElemsPerAddr == 4)>>
50+
typename = std::enable_if_t<(ElemsPerAddr == 1 || ElemsPerAddr == 2 ||
51+
ElemsPerAddr == 4)>>
5252
constexpr unsigned int ElemsPerAddrEncoding() {
5353
// encoding requires log2 of ElemsPerAddr
5454
if constexpr (ElemsPerAddr == 1)
@@ -342,7 +342,7 @@ __esimd_gather_scaled2(SurfIndAliasTy surf_ind, uint32_t global_offset,
342342
;
343343
#else
344344
{
345-
static_assert(N == 1 || N == 8 || N == 16);
345+
static_assert(N == 1 || N == 8 || N == 16 || N == 32);
346346
static_assert(TySizeLog2 <= 2 && Scale == 0);
347347
static_assert(std::is_integral<Ty>::value || TySizeLog2 == 2);
348348
throw cl::sycl::feature_not_supported();
@@ -392,9 +392,6 @@ __esimd_scatter_scaled(__SEIEED::simd_mask_storage_t<N> pred,
392392
}
393393
#endif // __SYCL_DEVICE_ONLY__
394394

395-
// TODO bring the parameter order of __esimd* intrinsics in accordance with the
396-
// correponsing BE intrinsicics parameter order.
397-
398395
// flat_atomic: flat-address atomic
399396
template <__SEIEE::atomic_op Op, typename Ty, int N,
400397
__SEIEE::CacheHint L1H = __SEIEE::CacheHint::None,
@@ -473,7 +470,7 @@ __ESIMD_INTRIN void __esimd_fence(uint8_t cntl)
473470

474471
// Scaled gather from a surface.
475472
template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
476-
int16_t SCALE = 0>
473+
int16_t Scale = 0>
477474
__ESIMD_INTRIN __SEIEED::vector_type_t<Ty, N>
478475
__esimd_gather_scaled(__SEIEED::simd_mask_storage_t<N> pred,
479476
SurfIndAliasTy surf_ind, uint32_t global_offset,
@@ -486,6 +483,40 @@ __esimd_gather_scaled(__SEIEED::simd_mask_storage_t<N> pred,
486483
}
487484
#endif // __SYCL_DEVICE_ONLY__
488485

486+
/// Predicated (masked) scaled gather from a surface.
487+
///
488+
/// Template (compile-time constant) parameters:
489+
/// @tparam Ty - element type
490+
/// @tparam N - the number of elements to read
491+
/// @tparam SurfIndAliasTy - "surface index alias" type - internal type in the
492+
/// accessor used to denote the surface
493+
/// @tparam TySizeLog2 - Log2 of the number of bytes written per element:
494+
/// 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes
495+
/// @tparam Scale - offset scale; only 0 is supported for now
496+
///
497+
/// Formal parameters:
498+
/// @param surf_ind - the surface index, taken from the SYCL memory object
499+
/// @param global_offset - offset added to each individual element's offset to
500+
/// compute actual memory access offset for that element
501+
/// @param offsets - per-element offsets
502+
/// @param pred - per-element predicates; elements with zero corresponding
503+
/// predicates are not written
504+
/// @return - elements read ("gathered") from memory
505+
506+
template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
507+
int16_t Scale = 0>
508+
__ESIMD_INTRIN __SEIEED::vector_type_t<Ty, N>
509+
__esimd_gather_masked_scaled2(SurfIndAliasTy surf_ind, uint32_t global_offset,
510+
__SEIEED::vector_type_t<uint32_t, N> offsets,
511+
__SEIEED::simd_mask_storage_t<N> pred)
512+
#ifdef __SYCL_DEVICE_ONLY__
513+
;
514+
#else
515+
{
516+
throw cl::sycl::feature_not_supported();
517+
}
518+
#endif // __SYCL_DEVICE_ONLY__
519+
489520
// Reads a block of data from given surface at given offset, offset must be
490521
// 16-byte-aligned.
491522
template <typename Ty, int N, typename SurfIndAliasTy, int32_t IsModified = 0>
@@ -705,7 +736,6 @@ __ESIMD_INTRIN void __esimd_media_st(TACC handle, unsigned x, unsigned y,
705736
}
706737
#endif // __SYCL_DEVICE_ONLY__
707738

708-
#ifdef __SYCL_DEVICE_ONLY__
709739
/// \brief Converts given value to a surface index.
710740
/// The input must always be a result of
711741
/// detail::AccessorPrivateProxy::getNativeImageObj(acc)
@@ -724,15 +754,17 @@ __ESIMD_INTRIN void __esimd_media_st(TACC handle, unsigned x, unsigned y,
724754
/// pointer, where we can do ptr to uint32_t conversion.
725755
/// This intrinsic can be called only from the device code, as
726756
/// accessor => memory handle translation for host is different.
727-
///
728-
/// @param SYCL accessor's native memory object extracted from it via
757+
/// @param acc the SYCL accessor.
729758
/// getNativeImageObj.
730-
///
731-
/// Returns the surface index (binding table index) value 'sid' corresponds to.
732-
///
733-
template <typename SurfIndAliasTy>
734-
__ESIMD_INTRIN __SEIEE::SurfaceIndex
735-
__esimd_get_surface_index(SurfIndAliasTy sid);
759+
/// Returns the binding table index value.
760+
template <typename MemObjTy>
761+
__ESIMD_INTRIN __SEIEE::SurfaceIndex __esimd_get_surface_index(MemObjTy obj)
762+
#ifdef __SYCL_DEVICE_ONLY__
763+
;
764+
#else
765+
{
766+
throw cl::sycl::feature_not_supported();
767+
}
736768
#endif // __SYCL_DEVICE_ONLY__
737769

738770
/// \brief Raw sends load.

sycl/include/sycl/ext/intel/experimental/esimd/detail/simd_obj_impl.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,7 @@ template <typename Ty, int N, class Derived, class SFINAE> class simd_obj_impl {
506506
/// elements in this object. Source memory location is represented via a
507507
/// global accessor and offset.
508508
/// @param acc accessor to copy from.
509-
/// @param offset offset to copy from.
509+
/// @param offset offset to copy from (in bytes).
510510
template <typename AccessorT>
511511
ESIMD_INLINE EnableIfAccessor<AccessorT, accessor_mode_cap::can_read,
512512
sycl::access::target::global_buffer, void>

0 commit comments

Comments
 (0)