Skip to content

Commit 8208427

Browse files
author
Gang Chen
authored
[SYCL][ESIMD] add indirect read and write methods to simd class (#3039)
Signed-off-by: Gang Y Chen <[email protected]>
1 parent cdb2ebf commit 8208427

File tree

5 files changed

+108
-1
lines changed

5 files changed

+108
-1
lines changed

llvm/lib/SYCLLowerIR/LowerESIMD.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -237,10 +237,16 @@ class ESIMDIntrinDescTable {
237237
// for integer, "f" - for floating point
238238
{"rdregion",
239239
{"rdregion", {a(0), t(3), t(4), t(5), a(1), t(6)}, nk(-1)}},
240+
{"rdindirect",
241+
{"rdregion", {a(0), c32(0), t(2), c32(0), a(1), t(3)}, nk(-1)}},
240242
{{"wrregion"},
241243
{{"wrregion"},
242244
{a(0), a(1), t(3), t(4), t(5), a(2), t(6), ai1(3)},
243245
nk(-1)}},
246+
{{"wrindirect"},
247+
{{"wrregion"},
248+
{a(0), a(1), c32(0), t(2), c32(0), a(2), t(3), ai1(3)},
249+
nk(-1)}},
244250
{"vload", {"vload", {l(0)}}},
245251
{"vstore", {"vstore", {a(1), a(0)}}},
246252

llvm/test/SYCLLowerIR/esimd_lower_intrins.ll

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -294,6 +294,28 @@ define dso_local spir_func void @FUNC_42() !sycl_explicit_simd !1 {
294294
ret void
295295
}
296296

297+
define dso_local spir_func <8 x i32> @FUNC_43() !sycl_explicit_simd !1 {
298+
%a_1 = alloca <16 x i32>
299+
%1 = load <16 x i32>, <16 x i32>* %a_1
300+
%a_2 = alloca <8 x i16>
301+
%2 = load <8 x i16>, <8 x i16>* %a_2
302+
%ret_val = call spir_func <8 x i32> @_Z18__esimd_rdindirectIiLi16ELi8ELi0EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT1_EE4typeENS4_IS5_XT0_EE4typeENS4_ItXT1_EE4typeE(<16 x i32> %1, <8 x i16> %2)
303+
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.v8i16(<16 x i32> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 0, <8 x i16> %{{[0-9a-zA-Z_.]+}}, i32 0)
304+
ret <8 x i32> %ret_val
305+
}
306+
307+
define dso_local spir_func <16 x i32> @FUNC_44() !sycl_explicit_simd !1 {
308+
%a_1 = alloca <16 x i32>
309+
%1 = load <16 x i32>, <16 x i32>* %a_1
310+
%a_2 = alloca <8 x i32>
311+
%2 = load <8 x i32>, <8 x i32>* %a_2
312+
%a_3 = alloca <8 x i16>
313+
%3 = load <8 x i16>, <8 x i16>* %a_3
314+
%ret_val = call spir_func <16 x i32> @_Z18__esimd_wrindirectIiLi16ELi8ELi0EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeES7_NS4_IS5_XT1_EE4typeENS4_ItXT1_EE4typeESB_(<16 x i32> %1, <8 x i32> %2, <8 x i16> %3, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
315+
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.v8i16.v8i1(<16 x i32> %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 0, <8 x i16> %{{[0-9a-zA-Z_.]+}}, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
316+
ret <16 x i32> %ret_val
317+
}
318+
297319
declare dso_local spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
298320
declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic0ILN2cm3gen14CmAtomicOpTypeE2EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeENS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i16> %1)
299321
declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic1ILN2cm3gen14CmAtomicOpTypeE0EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i16> %2)
@@ -335,6 +357,8 @@ declare dso_local spir_func <16 x i32> @_Z18__esimd_sudp4a_satIijjjLi16EEN2cl4sy
335357
declare dso_local spir_func <16 x i32> @_Z18__esimd_ssdp4a_satIiiiiLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
336358
declare dso_local spir_func <8 x i32> @_Z22__esimd_slm_block_readIiLi8EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeEj(i32 %0)
337359
declare dso_local spir_func void @_Z16__esimd_sbarrierN2cl4sycl5INTEL3gpu17EsimdSbarrierTypeE(i8 %0)
360+
declare dso_local spir_func <8 x i32> @_Z18__esimd_rdindirectIiLi16ELi8ELi0EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT1_EE4typeENS4_IS5_XT0_EE4typeENS4_ItXT1_EE4typeE(<16 x i32>, <8 x i16>)
361+
declare dso_local spir_func <16 x i32> @_Z18__esimd_wrindirectIiLi16ELi8ELi0EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeES7_NS4_IS5_XT1_EE4typeENS4_ItXT1_EE4typeESB_(<16 x i32>, <8 x i32>, <8 x i16>, <8 x i16>)
338362

339363
attributes #0 = { "genx_byte_offset"="192" "genx_volatile" }
340364

sycl/include/CL/sycl/INTEL/esimd/detail/esimd_intrin.hpp

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,11 @@ template <typename T, int N, int M, int VStride, int Width, int Stride,
6666
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, M>
6767
__esimd_rdregion(sycl::INTEL::gpu::vector_type_t<T, N> Input, uint16_t Offset);
6868

69+
template <typename T, int N, int M, int ParentWidth = 0>
70+
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, M>
71+
__esimd_rdindirect(sycl::INTEL::gpu::vector_type_t<T, N> Input,
72+
sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset);
73+
6974
// __esimd_wrregion returns the updated vector with the region updated.
7075
//
7176
// @param T the element data type, one of i8, i16, i32, i64, half, float,
@@ -120,6 +125,13 @@ __esimd_wrregion(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
120125
sycl::INTEL::gpu::vector_type_t<T, M> NewVal, uint16_t Offset,
121126
sycl::INTEL::gpu::mask_type_t<M> Mask = 1);
122127

128+
template <typename T, int N, int M, int ParentWidth = 0>
129+
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
130+
__esimd_wrindirect(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
131+
sycl::INTEL::gpu::vector_type_t<T, M> NewVal,
132+
sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset,
133+
sycl::INTEL::gpu::mask_type_t<M> Mask = 1);
134+
123135
__SYCL_INLINE_NAMESPACE(cl) {
124136
namespace sycl {
125137
namespace INTEL {
@@ -261,6 +273,20 @@ __esimd_rdregion(sycl::INTEL::gpu::vector_type_t<T, N> Input, uint16_t Offset) {
261273
return Result;
262274
}
263275

276+
template <typename T, int N, int M, int ParentWidth>
277+
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, M>
278+
__esimd_rdindirect(sycl::INTEL::gpu::vector_type_t<T, N> Input,
279+
sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset) {
280+
sycl::INTEL::gpu::vector_type_t<T, M> Result;
281+
for (int i = 0; i < M; ++i) {
282+
uint16_t EltOffset = Offset[i] / sizeof(T);
283+
assert(Offset[i] % sizeof(T) == 0);
284+
assert(EltOffset < N);
285+
Result[i] = Input[EltOffset];
286+
}
287+
return Result;
288+
}
289+
264290
template <typename T, int N, int M, int VStride, int Width, int Stride,
265291
int ParentWidth>
266292
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
@@ -285,4 +311,22 @@ __esimd_wrregion(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
285311
return Result;
286312
}
287313

314+
template <typename T, int N, int M, int ParentWidth>
315+
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
316+
__esimd_wrindirect(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
317+
sycl::INTEL::gpu::vector_type_t<T, M> NewVal,
318+
sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset,
319+
sycl::INTEL::gpu::mask_type_t<M> Mask) {
320+
sycl::INTEL::gpu::vector_type_t<T, N> Result = OldVal;
321+
for (int i = 0; i < M; ++i) {
322+
if (Mask[i]) {
323+
uint16_t EltOffset = Offset[i] / sizeof(T);
324+
assert(Offset[i] % sizeof(T) == 0);
325+
assert(EltOffset < N);
326+
Result[EltOffset] = NewVal[i];
327+
}
328+
}
329+
return Result;
330+
}
331+
288332
#endif // __SYCL_DEVICE_ONLY__

sycl/include/CL/sycl/INTEL/esimd/esimd.hpp

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,9 +179,32 @@ template <typename Ty, int N> class simd {
179179
// This would allow you to use the subscript operator to write to an
180180
// element.
181181
// {/quote}
182-
/// Read a single element, by value only.
182+
/// Read single element, return value only (not reference).
183183
Ty operator[](int i) const { return data()[i]; }
184184

185+
// TODO ESIMD_EXPERIMENTAL
186+
/// Read multiple elements by their indices in vector
187+
template <int Size>
188+
simd<Ty, Size> iselect(const simd<uint16_t, Size> &Indices) {
189+
vector_type_t<uint16_t, Size> Offsets = Indices.data() * sizeof(Ty);
190+
return __esimd_rdindirect<Ty, N, Size>(data(), Offsets);
191+
}
192+
// TODO ESIMD_EXPERIMENTAL
193+
/// update single element
194+
void iupdate(ushort Index, Ty V) {
195+
auto Val = data();
196+
Val[Index] = V;
197+
set(Val);
198+
}
199+
// TODO ESIMD_EXPERIMENTAL
200+
/// update multiple elements by their indices in vector
201+
template <int Size>
202+
void iupdate(const simd<uint16_t, Size> &Indices, const simd<Ty, Size> &Val,
203+
mask_type_t<Size> Mask) {
204+
vector_type_t<uint16_t, Size> Offsets = Indices.data() * sizeof(Ty);
205+
set(__esimd_wrindirect<Ty, N, Size>(data(), Val.data(), Offsets, Mask));
206+
}
207+
185208
// TODO
186209
// @rolandschulz
187210
// {quote}

sycl/test/esimd/simd.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,3 +230,13 @@ bool test_replicate3() __attribute__((sycl_device)) {
230230

231231
return v0_rep[0] == v0[1] && v0_rep[1] == v0[3] && v0_rep[2] == v0[5];
232232
}
233+
234+
bool test_simd_iselect() __attribute__((sycl_device)) {
235+
simd<int, 16> v(0, 1);
236+
simd<ushort, 8> a(0, 2);
237+
auto data = v.iselect(a);
238+
data += 16;
239+
v.iupdate(a, data, 1);
240+
auto ref = v.select<8, 2>(0);
241+
return ref[0] == 16 && ref[14] == 32;
242+
}

0 commit comments

Comments
 (0)