
Commit 0aac708

[SYCL][ESIMD] Implement accessor-based gather/scatter and scalar mem access. (#2700)
Signed-off-by: Konstantin S Bobrovsky <[email protected]>
1 parent 8fbf4bb commit 0aac708

4 files changed: +270, -0 lines changed

llvm/lib/SYCLLowerIR/LowerESIMD.cpp

Lines changed: 9 additions & 0 deletions
@@ -252,6 +252,15 @@ class ESIMDIntrinDescTable {
       {"flat_write", {"svm.scatter", {ai1(3), a(2), a(0), a(1)}}},
       {"flat_write4",
        {"svm.scatter4.scaled", {ai1(2), t(2), c16(0), c64(0), a(0), a(1)}}},
+
+      // surface index-based gather/scatter:
+      // num blocks, scale, surface index, global offset, elem offsets
+      {"surf_read", {"gather.scaled2", {t(3), c16(0), aSI(1), a(2), a(3)}}},
+      // pred, num blocks, scale, surface index, global offset, elem offsets,
+      // data to write
+      {"surf_write",
+       {"scatter.scaled", {ai1(0), t(3), c16(0), aSI(2), a(3), a(4), a(5)}}},
+
       // intrinsics to query thread's coordinates:
       {"group_id_x", {"group.id.x", {}}},
       {"group_id_y", {"group.id.y", {}}},

sycl/include/CL/sycl/INTEL/esimd/detail/esimd_memory_intrin.hpp

Lines changed: 86 additions & 0 deletions
@@ -81,6 +81,92 @@ SYCL_EXTERNAL void __esimd_flat_write4(
     sycl::INTEL::gpu::vector_type_t<Ty, N * NumChannels(Mask)> vals,
     sycl::INTEL::gpu::vector_type_t<uint16_t, N> pred = 1);
 
+// Low-level surface-based gather. Collects elements located at given offsets in
+// a surface and returns them as a single \ref simd object. Element can be
+// 1, 2 or 4-byte value, but is always returned as a 4-byte value within the
+// resulting simd object, with upper 2 or 3 bytes undefined.
+// Template (compile-time constant) parameters:
+// @tparam Ty - element type; can only be a 4-byte integer or \c float,
+// @tparam N - the number of elements
+// @tparam SurfIndAliasTy - "surface index alias" type - internal type in the
+//   accessor used to denote the surface
+// @tparam TySizeLog2 - Log2 of the number of bytes read per element:
+//   0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes
+// @tparam L1H - L1 cache hint
+// @tparam L3H - L3 cache hint
+//
+// Formal parameters:
+// @param scale - the scale; must be 0
+// @param surf_ind - the surface index, taken from the SYCL memory object
+// @param global_offset - offset added to each individual element's offset to
+//   compute actual memory access offset for that element
+// @param elem_offsets - per-element offsets
+//
+template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
+          sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None,
+          sycl::INTEL::gpu::CacheHint L3H = sycl::INTEL::gpu::CacheHint::None>
+SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<Ty, N>
+__esimd_surf_read(int16_t scale, SurfIndAliasTy surf_ind,
+                  uint32_t global_offset,
+                  sycl::INTEL::gpu::vector_type_t<uint32_t, N> elem_offsets)
+#ifdef __SYCL_DEVICE_ONLY__
+    ;
+#else
+{
+  static_assert(N == 1 || N == 8 || N == 16);
+  static_assert(TySizeLog2 <= 2);
+  static_assert(std::is_integral<Ty>::value || TySizeLog2 == 2);
+  throw cl::sycl::feature_not_supported();
+}
+#endif // __SYCL_DEVICE_ONLY__
+
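For illustration, a minimal sketch of how this raw gather could be driven for 2-byte elements, widening through a 4-byte type exactly as the comment above requires; the include path, ESIMD_INLINE usage and function name are assumptions, and the accessor-based gather() added in esimd_memory.hpp is the intended wrapper around this intrinsic.

#include <CL/sycl/INTEL/esimd.hpp> // assumed umbrella header of the extension
#include <cstdint>

// Sketch: gather 16 uint16_t elements through the raw intrinsic. Each element
// comes back widened to 4 bytes with undefined upper bytes, so it is read via
// an unsigned 4-byte type and truncated afterwards.
template <typename SurfIndAliasTy>
ESIMD_INLINE sycl::INTEL::gpu::vector_type_t<uint16_t, 16>
load_u16_sketch(SurfIndAliasTy surf_ind, uint32_t glob_off,
                sycl::INTEL::gpu::vector_type_t<uint32_t, 16> offsets) {
  // TySizeLog2 == 1 -> 2 bytes per element; the scale operand must be 0.
  auto raw = __esimd_surf_read<uint32_t, 16, SurfIndAliasTy, 1>(
      /*scale=*/0, surf_ind, glob_off, offsets);
  sycl::INTEL::gpu::vector_type_t<uint16_t, 16> res;
  for (int i = 0; i < 16; ++i)
    res[i] = static_cast<uint16_t>(raw[i]); // drop the undefined upper bytes
  return res;
}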
+// Low-level surface-based scatter. Writes elements of a \ref simd object into a
+// surface at given offsets. Element can be a 1, 2 or 4-byte value, but it is
+// always represented as a 4-byte value within the input simd object,
+// unused (not written) upper bytes are ignored.
+// Template (compile-time constant) parameters:
+// @tparam Ty - element type; can only be a 4-byte integer or \c float,
+// @tparam N - the number of elements to write
+// @tparam SurfIndAliasTy - "surface index alias" type - internal type in the
+//   accessor used to denote the surface
+// @tparam TySizeLog2 - Log2 of the number of bytes written per element:
+//   0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes
+// @tparam L1H - L1 cache hint
+// @tparam L3H - L3 cache hint
+//
+// Formal parameters:
+// @param pred - per-element predicates; elements with zero corresponding
+//   predicates are not written
+// @param scale - the scale; must be 0
+// @param surf_ind - the surface index, taken from the SYCL memory object
+// @param global_offset - offset added to each individual element's offset to
+//   compute actual memory access offset for that element
+// @param elem_offsets - per-element offsets
+// @param vals - values to write
+//
+template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
+          sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None,
+          sycl::INTEL::gpu::CacheHint L3H = sycl::INTEL::gpu::CacheHint::None>
+SYCL_EXTERNAL void
+__esimd_surf_write(sycl::INTEL::gpu::vector_type_t<uint16_t, N> pred,
+                   int16_t scale, SurfIndAliasTy surf_ind,
+                   uint32_t global_offset,
+                   sycl::INTEL::gpu::vector_type_t<uint32_t, N> elem_offsets,
+                   sycl::INTEL::gpu::vector_type_t<Ty, N> vals)
+#ifdef __SYCL_DEVICE_ONLY__
+    ;
+#else
+{
+  static_assert(N == 1 || N == 8 || N == 16);
+  static_assert(TySizeLog2 <= 2);
+  static_assert(std::is_integral<Ty>::value || TySizeLog2 == 2);
+  throw cl::sycl::feature_not_supported();
+}
+#endif // __SYCL_DEVICE_ONLY__
+
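Similarly, a minimal sketch of a predicated store through this intrinsic for 4-byte elements, under the same assumptions and with hypothetical names; lanes whose predicate is 0 are left untouched in memory.

#include <CL/sycl/INTEL/esimd.hpp> // assumed umbrella header of the extension
#include <cstdint>

// Sketch: write 8 ints, but only the even lanes (predicate = 1).
template <typename SurfIndAliasTy>
ESIMD_INLINE void
store_even_lanes_sketch(SurfIndAliasTy surf_ind, uint32_t glob_off,
                        sycl::INTEL::gpu::vector_type_t<uint32_t, 8> offsets,
                        sycl::INTEL::gpu::vector_type_t<int, 8> vals) {
  sycl::INTEL::gpu::vector_type_t<uint16_t, 8> pred = {1, 0, 1, 0, 1, 0, 1, 0};
  // TySizeLog2 == 2 -> 4 bytes per element; the scale operand must be 0.
  __esimd_surf_write<int, 8, SurfIndAliasTy, 2>(pred, /*scale=*/0, surf_ind,
                                                glob_off, offsets, vals);
}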
+// TODO bring the parameter order of __esimd* intrinsics in accordance with the
+// corresponding BE intrinsics parameter order.
+
 // flat_atomic: flat-address atomic
 template <sycl::INTEL::gpu::EsimdAtomicOpType Op, typename Ty, int N,
           sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None,

sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp

Lines changed: 148 additions & 0 deletions
@@ -116,6 +116,12 @@ ESIMD_INLINE ESIMD_NODEBUG
       addrs.data(), ElemsPerAddrEncoding<ElemsPerAddr>(), pred.data());
 }
 
+// TODO bring this SVM-based scatter/gather interface in accordance with
+// accessor-based ones - remove the ElemsPerAddr template parameter as it is
+// redundant: the only allowed block size in the underlying BE intrinsics is 1
+// byte with max number of blocks being 4. This means T template parameter alone
+// can model all supported cases.
+
 /// flat-address scatter
 template <typename T, int n, int ElemsPerAddr = 1,
           CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
@@ -238,6 +244,148 @@ ESIMD_INLINE ESIMD_NODEBUG void block_store(AccessorTy acc, uint32_t offset,
 #endif // __SYCL_DEVICE_ONLY__ && __SYCL_EXPLICIT_SIMD__
 }
 
+/// Accessor-based gather. Collects elements located at given offsets in
+/// an accessor and returns them as a single \ref simd object. An element can be
+/// 1, 2 or 4-byte value.
+/// Template (compile-time constant) parameters:
+/// @tparam T - element type; can only be a 1,2,4-byte integer or \c float,
+/// @tparam N - the number of elements
+/// @tparam AccessorTy - \ref sycl::accessor type
+/// @tparam L1H - L1 cache hint
+/// @tparam L3H - L3 cache hint
+///
+/// Formal parameters:
+/// @param acc - the accessor to gather from
+/// @param offsets - per-element offsets
+/// @param glob_offset - offset added to each individual element's offset to
+///   compute actual memory access offset for that element
+///
+template <typename T, int N, typename AccessorTy,
+          CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
+ESIMD_INLINE ESIMD_NODEBUG
+    typename std::enable_if<(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16) &&
+                                !std::is_pointer<AccessorTy>::value,
+                            simd<T, N>>::type
+    gather(AccessorTy acc, simd<uint32_t, N> offsets,
+           uint32_t glob_offset = 0) {
+
+  constexpr int TypeSizeLog2 =
+      sycl::INTEL::gpu::ElemsPerAddrEncoding<sizeof(T)>();
+  // TODO (performance) use hardware-supported scale once BE supports it
+  constexpr uint32_t scale = 0;
+  constexpr uint32_t t_scale = sizeof(T);
+  if constexpr (t_scale > 1) {
+    glob_offset *= t_scale;
+    offsets *= t_scale;
+  }
+
+  if constexpr (sizeof(T) < 4) {
+    static_assert(std::is_integral<T>::value,
+                  "only integral 1- & 2-byte types are supported");
+    using PromoT = typename std::conditional<std::is_signed<T>::value, int32_t,
+                                             uint32_t>::type;
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
+    const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
+    const simd<PromoT, N> promo_vals =
+        __esimd_surf_read<PromoT, N, decltype(surf_ind), TypeSizeLog2, L1H,
+                          L3H>(scale, surf_ind, glob_offset, offsets);
+#else
+    const simd<PromoT, N> promo_vals =
+        __esimd_surf_read<PromoT, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
+            scale, acc, glob_offset, offsets);
+#endif
+    return sycl::INTEL::gpu::convert<T>(promo_vals);
+  } else {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
+    const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
+    return __esimd_surf_read<T, N, decltype(surf_ind), TypeSizeLog2, L1H, L3H>(
+        scale, surf_ind, glob_offset, offsets);
+#else
+    return __esimd_surf_read<T, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
+        scale, acc, glob_offset, offsets);
+#endif
+  }
+}
+
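A minimal usage sketch for the new accessor-based gather, modeled on the calls in the test added below; the include path, namespace and SYCL_ESIMD_FUNCTION marker follow the conventions visible in this extension's headers and tests, and the function name is hypothetical.

#include <CL/sycl/INTEL/esimd.hpp> // assumed umbrella header of the extension
#include <cstdint>

using namespace sycl::INTEL::gpu;

// Gather 8 consecutive ints starting at element 16 of the buffer behind acc.
// gather() scales both offsets and glob_offset by sizeof(T) internally, so the
// values below are element-granularity offsets.
template <typename AccessorTy>
SYCL_ESIMD_FUNCTION simd<int, 8> read8_sketch(AccessorTy acc) {
  simd<uint32_t, 8> offsets{0, 1, 2, 3, 4, 5, 6, 7};
  return gather<int, 8>(acc, offsets, /*glob_offset=*/16);
}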
+/// Accessor-based scatter. Writes elements of a \ref simd object into an
+/// accessor at given offsets. An element can be 1, 2 or 4-byte value.
+/// Template (compile-time constant) parameters:
+/// @tparam T - element type; can only be a 1,2,4-byte integer or \c float,
+/// @tparam N - the number of elements
+/// @tparam AccessorTy - \ref sycl::accessor type
+/// @tparam L1H - L1 cache hint
+/// @tparam L3H - L3 cache hint
+///
+/// Formal parameters:
+/// @param acc - the accessor to scatter to
+/// @param vals - values to write
+/// @param offsets - per-element offsets
+/// @param glob_offset - offset added to each individual element's offset to
+///   compute actual memory access offset for that element
+/// @param pred - per-element predicates; elements with zero corresponding
+///   predicates are not written
+///
+template <typename T, int N, typename AccessorTy,
+          CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
+ESIMD_INLINE ESIMD_NODEBUG
+    typename std::enable_if<(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16) &&
+                                !std::is_pointer<AccessorTy>::value,
+                            void>::type
+    scatter(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
+            uint32_t glob_offset = 0, simd<uint16_t, N> pred = 1) {
+
+  constexpr int TypeSizeLog2 =
+      sycl::INTEL::gpu::ElemsPerAddrEncoding<sizeof(T)>();
+  // TODO (performance) use hardware-supported scale once BE supports it
+  constexpr uint32_t scale = 0;
+  constexpr uint32_t t_scale = sizeof(T);
+  if constexpr (t_scale > 1) {
+    glob_offset *= t_scale;
+    offsets *= t_scale;
+  }
+
+  if constexpr (sizeof(T) < 4) {
+    static_assert(std::is_integral<T>::value,
+                  "only integral 1- & 2-byte types are supported");
+    using PromoT = typename std::conditional<std::is_signed<T>::value, int32_t,
+                                             uint32_t>::type;
+    const simd<PromoT, N> promo_vals = sycl::INTEL::gpu::convert<PromoT>(vals);
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
+    const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
+    __esimd_surf_write<PromoT, N, decltype(surf_ind), TypeSizeLog2, L1H, L3H>(
+        pred, scale, surf_ind, glob_offset, offsets, promo_vals);
+#else
+    __esimd_surf_write<PromoT, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
+        pred, scale, acc, glob_offset, offsets, promo_vals);
+#endif
+  } else {
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
+    const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
+    __esimd_surf_write<T, N, decltype(surf_ind), TypeSizeLog2, L1H, L3H>(
+        pred, scale, surf_ind, glob_offset, offsets, vals);
+#else
+    __esimd_surf_write<T, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
+        pred, scale, acc, glob_offset, offsets, vals);
+#endif
+  }
+}
+
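A matching sketch for the predicated accessor-based scatter, under the same assumptions (hypothetical function name):

#include <CL/sycl/INTEL/esimd.hpp> // assumed umbrella header of the extension
#include <cstdint>

using namespace sycl::INTEL::gpu;

// Write 8 ints to elements 0..7 of the buffer behind acc, but only the odd
// lanes: lanes whose predicate is 0 are skipped, per the documentation above.
template <typename AccessorTy>
SYCL_ESIMD_FUNCTION void write_odd_lanes_sketch(AccessorTy acc,
                                                simd<int, 8> vals) {
  simd<uint32_t, 8> offsets{0, 1, 2, 3, 4, 5, 6, 7};
  simd<uint16_t, 8> pred{0, 1, 0, 1, 0, 1, 0, 1};
  scatter<int, 8>(acc, vals, offsets, /*glob_offset=*/0, pred);
}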
+/// Load a scalar value from an accessor.
+template <typename T, typename AccessorTy, CacheHint L1H = CacheHint::None,
+          CacheHint L3H = CacheHint::None>
+ESIMD_INLINE ESIMD_NODEBUG T scalar_load(AccessorTy acc, uint32_t offset) {
+  const simd<T, 1> Res = gather<T>(acc, simd<uint32_t, 1>{offset});
+  return Res[0];
+}
+
+/// Store a scalar value into an accessor.
+template <typename T, typename AccessorTy, CacheHint L1H = CacheHint::None,
+          CacheHint L3H = CacheHint::None>
+ESIMD_INLINE ESIMD_NODEBUG void scalar_store(AccessorTy acc, uint32_t offset,
+                                             T val) {
+  scatter<T>(acc, simd<T, 1>{val}, simd<uint32_t, 1>{offset});
+}
+
 // TODO @jasonsewall-intel
 // Don't use '4' in the name - instead either make it a parameter or
 // (if it must be constant) - try to deduce from other arguments.
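And a small sketch of the new scalar helpers, which are thin wrappers over the gather/scatter above; same assumptions as the previous sketches, hypothetical function name.

#include <CL/sycl/INTEL/esimd.hpp> // assumed umbrella header of the extension

using namespace sycl::INTEL::gpu;

// Swap two elements of the buffer behind acc. The offsets follow the same
// element-granularity convention as gather/scatter, on which scalar_load and
// scalar_store are built.
template <typename AccessorTy>
SYCL_ESIMD_FUNCTION void swap_elems_sketch(AccessorTy acc) {
  int a = scalar_load<int>(acc, 0);
  int b = scalar_load<int>(acc, 1);
  scalar_store<int>(acc, 0, b);
  scalar_store<int>(acc, 1, a);
}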

sycl/test/esimd/intrins_trans.cpp

Lines changed: 27 additions & 0 deletions
@@ -113,5 +113,32 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<float, 16> foo() {
   __esimd_vstore<int, 32>(&vc, va.data());
   // CHECK: store <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> addrspace(4)* {{.*}}
 
+  {
+    sycl::accessor<int, 1, sycl::access::mode::read_write,
+                   sycl::access::target::global_buffer>
+        acc;
+    simd<uint32_t, 8> offsets = 1;
+    simd<uint16_t, 8> pred{1, 0, 1, 0, 1, 0, 1, 0};
+
+    // 4-byte element gather
+    simd<int, 8> v = gather<int, 8>(acc, offsets, 100);
+    // CHECK: %[[SI3:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image1d_buffer_rw_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
+    // CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.gather.scaled2.v8i32.v8i32(i32 2, i16 0, i32 %[[SI3]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}})
+
+    // 4-byte element scatter
+    scatter<int, 8>(acc, v, offsets, 100, pred);
+    // CHECK: %[[SI4:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image1d_buffer_rw_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
+    // CHECK: call void @llvm.genx.scatter.scaled.v8i1.v8i32.v8i32(<8 x i1> %{{[0-9a-zA-Z_.]+}}, i32 2, i16 0, i32 %[[SI4]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}})
+
+    // 1-byte element gather
+    simd<unsigned char, 8> v1 = gather<unsigned char, 8>(acc, offsets, 100);
+    // CHECK: %[[SI5:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image1d_buffer_rw_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
+    // CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.gather.scaled2.v8i32.v8i32(i32 0, i16 0, i32 %[[SI5]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}})
+
+    // 1-byte element scatter
+    scatter<unsigned char, 8>(acc, v1, offsets, 100, pred);
+    // CHECK: %[[SI6:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image1d_buffer_rw_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
+    // CHECK: call void @llvm.genx.scatter.scaled.v8i1.v8i32.v8i32(<8 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, i16 0, i32 %[[SI6]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}})
+  }
   return d;
 }
