Skip to content

Commit 3eca2d4

Browse files
authored
[ESIMD] Implement property-based gather(usm, ...) (#12316)
This patch also supports gather and masked-gather of any length N if it does not use L1/L2 hints of VS>1. Additionally for gathers without L1/L2 vs VS>1 this patch replaces the calls of GenX SVM gather calls with LLVM IR if the macro __ESIMD_GATHER_SCATTER_LLVM_IR is defined by user. If it not defined, then using masked gathers with pass_thru operand requires DG2/PVC. --------- Signed-off-by: Vyacheslav N Klochkov <[email protected]> Signed-off-by: Klochkov, Vyacheslav N <[email protected]>
1 parent e7910c3 commit 3eca2d4

File tree

11 files changed

+1094
-136
lines changed

11 files changed

+1094
-136
lines changed

llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include "llvm/ADT/DenseSet.h"
2323
#include "llvm/ADT/SmallVector.h"
2424
#include "llvm/ADT/StringSwitch.h"
25+
#include "llvm/CodeGen/ValueTypes.h"
2526
#include "llvm/Demangle/Demangle.h"
2627
#include "llvm/Demangle/ItaniumDemangle.h"
2728
#include "llvm/GenXIntrinsics/GenXIntrinsics.h"
@@ -970,6 +971,38 @@ static void translateBlockStore(CallInst &CI, bool IsSLM) {
970971
SI->setDebugLoc(CI.getDebugLoc());
971972
}
972973

974+
static void translateGatherLoad(CallInst &CI, bool IsSLM) {
975+
IRBuilder<> Builder(&CI);
976+
constexpr int AlignmentTemplateArgIdx = 2;
977+
APInt Val = parseTemplateArg(CI, AlignmentTemplateArgIdx,
978+
ESIMDIntrinDesc::GenXArgConversion::TO_I64);
979+
Align AlignValue(Val.getZExtValue());
980+
981+
auto OffsetsOp = CI.getArgOperand(0);
982+
auto MaskOp = CI.getArgOperand(1);
983+
auto PassThroughOp = CI.getArgOperand(2);
984+
auto DataType = CI.getType();
985+
986+
// Convert the mask from <N x i16> to <N x i1>.
987+
Value *Zero = ConstantInt::get(MaskOp->getType(), 0);
988+
MaskOp = Builder.CreateICmp(ICmpInst::ICMP_NE, MaskOp, Zero);
989+
990+
// The address space may be 3-SLM, 1-global or private.
991+
// At the moment of calling 'gather()' operation the pointer passed to it
992+
// is already 4-generic. Thus, simply use 4-generic for global and private
993+
// and let GPU BE deduce the actual address space from the use-def graph.
994+
unsigned AS = IsSLM ? 3 : 4;
995+
auto ElemType = DataType->getScalarType();
996+
auto NumElems = (cast<VectorType>(DataType))->getElementCount();
997+
auto VPtrType = VectorType::get(PointerType::get(ElemType, AS), NumElems);
998+
auto VPtrOp = Builder.CreateIntToPtr(OffsetsOp, VPtrType);
999+
1000+
auto LI = Builder.CreateMaskedGather(DataType, VPtrOp, AlignValue, MaskOp,
1001+
PassThroughOp);
1002+
LI->setDebugLoc(CI.getDebugLoc());
1003+
CI.replaceAllUsesWith(LI);
1004+
}
1005+
9731006
// TODO Specify document behavior for slm_init and nbarrier_init when:
9741007
// 1) they are called not from kernels
9751008
// 2) there are multiple such calls reachable from a kernel
@@ -1910,6 +1943,13 @@ size_t SYCLLowerESIMDPass::runOnFunction(Function &F,
19101943
ToErase.push_back(CI);
19111944
continue;
19121945
}
1946+
if (Name.startswith("__esimd_gather_ld") ||
1947+
Name.startswith("__esimd_slm_gather_ld")) {
1948+
translateGatherLoad(*CI, Name.startswith("__esimd_slm_gather_ld"));
1949+
ToErase.push_back(CI);
1950+
continue;
1951+
}
1952+
19131953
if (Name.startswith("__esimd_nbarrier_init")) {
19141954
translateNbarrierInit(*CI);
19151955
ToErase.push_back(CI);

sycl/include/sycl/ext/intel/esimd/detail/intrin.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@
6464
//
6565
template <typename T, int N, int M, int VStride, int Width, int Stride,
6666
int ParentWidth = 0>
67-
__ESIMD_INTRIN __ESIMD_INTRIN std::enable_if_t<(Width > 0) && M % Width == 0,
68-
__ESIMD_DNS::vector_type_t<T, M>>
67+
__ESIMD_INTRIN std::enable_if_t<(Width > 0) && M % Width == 0,
68+
__ESIMD_DNS::vector_type_t<T, M>>
6969
__esimd_rdregion(__ESIMD_DNS::vector_type_t<T, N> Input, uint16_t Offset);
7070

7171
template <typename T, int N, int M, int ParentWidth = 0>
@@ -263,8 +263,8 @@ __ESIMD_INTRIN uint16_t __esimd_all(__ESIMD_DNS::vector_type_t<T, N> src)
263263
// Implementations of ESIMD intrinsics for the SYCL host device
264264
template <typename T, int N, int M, int VStride, int Width, int Stride,
265265
int ParentWidth>
266-
__ESIMD_INTRIN __ESIMD_INTRIN std::enable_if_t<(Width > 0) && M % Width == 0,
267-
__ESIMD_DNS::vector_type_t<T, M>>
266+
__ESIMD_INTRIN std::enable_if_t<(Width > 0) && M % Width == 0,
267+
__ESIMD_DNS::vector_type_t<T, M>>
268268
__esimd_rdregion(__ESIMD_DNS::vector_type_t<T, N> Input, uint16_t Offset) {
269269
uint16_t EltOffset = Offset / sizeof(T);
270270
assert(Offset % sizeof(T) == 0);

sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,20 @@ __esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t<N> pred,
199199
__ESIMD_DNS::vector_type_t<uint32_t, N> offsets)
200200
__ESIMD_INTRIN_END;
201201

202+
// Gather data from the given global or private addresses.
203+
template <typename T, int N, size_t Align>
204+
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, N> __esimd_gather_ld(
205+
__ESIMD_DNS::vector_type_t<uint64_t, N> vptr,
206+
__ESIMD_DNS::simd_mask_storage_t<N> pred,
207+
__ESIMD_DNS::vector_type_t<T, N> pass_thru) __ESIMD_INTRIN_END;
208+
209+
// Gather data from the given SLM addresses.
210+
template <typename T, int N, size_t Align>
211+
__ESIMD_INTRIN __ESIMD_DNS::vector_type_t<T, N> __esimd_slm_gather_ld(
212+
__ESIMD_DNS::vector_type_t<uint32_t, N> vptr,
213+
__ESIMD_DNS::simd_mask_storage_t<N> pred,
214+
__ESIMD_DNS::vector_type_t<T, N> pass_thru) __ESIMD_INTRIN_END;
215+
202216
/// Surface-based gather.
203217
/// Supported platforms: DG2, PVC
204218
///
@@ -212,8 +226,10 @@ __esimd_lsc_load_slm(__ESIMD_DNS::simd_mask_storage_t<N> pred,
212226
/// @tparam ImmOffset is the immediate offset added to each address.
213227
/// @tparam DS is the data size.
214228
/// @tparam VS is the number of elements to load per address.
215-
/// @tparam Transposed indicates if the data is transposed during the transfer.
216-
/// @tparam N is the SIMD size of operation (the number of addresses to access)
229+
/// @tparam Transposed indicates if the data is transposed during the
230+
/// transfer.
231+
/// @tparam N is the SIMD size of operation (the number of addresses to
232+
/// access)
217233
/// @tparam SurfIndAliasT is the \ref sycl::accessor type.
218234
/// @param pred is predicates.
219235
/// @param offsets is the zero-based offsets in bytes.

0 commit comments

Comments
 (0)