@@ -116,6 +116,12 @@ ESIMD_INLINE ESIMD_NODEBUG
116
116
addrs.data (), ElemsPerAddrEncoding<ElemsPerAddr>(), pred.data ());
117
117
}
118
118
119
// TODO bring this SVM-based scatter/gather interface in accordance with
// accessor-based ones - remove the ElemsPerAddr template parameter as it is
// redundant: the only allowed block size in the underlying BE intrinsics is 1
// byte with max number of blocks being 4. This means the T template parameter
// alone can model all supported cases.
119
125
// / flat-address scatter
120
126
template <typename T, int n, int ElemsPerAddr = 1 ,
121
127
CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
@@ -238,6 +244,148 @@ ESIMD_INLINE ESIMD_NODEBUG void block_store(AccessorTy acc, uint32_t offset,
238
244
#endif // __SYCL_DEVICE_ONLY__ && __SYCL_EXPLICIT_SIMD__
239
245
}
240
246
247
/// Accessor-based gather. Collects elements located at given offsets in
/// an accessor and returns them as a single \ref simd object. An element can be
/// a 1, 2 or 4-byte value.
///
/// Template (compile-time constant) parameters:
/// @tparam T - element type; can only be a 1-, 2- or 4-byte integer or \c float
/// @tparam N - the number of elements; must be 1, 8 or 16
/// @tparam AccessorTy - \ref sycl::accessor type (pointer types are rejected
///   by the enable_if below)
/// @tparam L1H - L1 cache hint
/// @tparam L3H - L3 cache hint
///
/// Formal parameters:
/// @param acc - the accessor to gather from
/// @param offsets - per-element offsets, expressed in elements (converted to
///   byte offsets inside this function)
/// @param glob_offset - offset added to each individual element's offset to
///   compute the actual memory access offset for that element
/// @return the gathered values as a \ref simd object
///
template <typename T, int N, typename AccessorTy,
          CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
ESIMD_INLINE ESIMD_NODEBUG
    typename std::enable_if<(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16) &&
                                !std::is_pointer<AccessorTy>::value,
                            simd<T, N>>::type
    gather(AccessorTy acc, simd<uint32_t, N> offsets,
           uint32_t glob_offset = 0) {

  constexpr int TypeSizeLog2 =
      sycl::INTEL::gpu::ElemsPerAddrEncoding<sizeof(T)>();
  // TODO (performance) use hardware-supported scale once BE supports it
  constexpr uint32_t scale = 0;
  // Offsets arrive in elements; since the hardware scale is forced to 0
  // above, scale them to byte offsets here.
  constexpr uint32_t t_scale = sizeof(T);
  if constexpr (t_scale > 1) {
    glob_offset *= t_scale;
    offsets *= t_scale;
  }

  if constexpr (sizeof(T) < 4) {
    // 1- and 2-byte elements are read as 32-bit values (signed or unsigned
    // depending on T's signedness) and converted back down to T afterwards.
    static_assert(std::is_integral<T>::value,
                  "only integral 1- & 2-byte types are supported");
    using PromoT = typename std::conditional<std::is_signed<T>::value, int32_t,
                                             uint32_t>::type;
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
    // On device, the intrinsic takes the native surface index extracted
    // from the accessor rather than the accessor itself.
    const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
    const simd<PromoT, N> promo_vals =
        __esimd_surf_read<PromoT, N, decltype(surf_ind), TypeSizeLog2, L1H,
                          L3H>(scale, surf_ind, glob_offset, offsets);
#else
    const simd<PromoT, N> promo_vals =
        __esimd_surf_read<PromoT, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
            scale, acc, glob_offset, offsets);
#endif
    return sycl::INTEL::gpu::convert<T>(promo_vals);
  } else {
    // 4-byte elements are read directly, no promotion needed.
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
    const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
    return __esimd_surf_read<T, N, decltype(surf_ind), TypeSizeLog2, L1H, L3H>(
        scale, surf_ind, glob_offset, offsets);
#else
    return __esimd_surf_read<T, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
        scale, acc, glob_offset, offsets);
#endif
  }
}
309
+
310
/// Accessor-based scatter. Writes elements of a \ref simd object into an
/// accessor at given offsets. An element can be a 1, 2 or 4-byte value.
///
/// Template (compile-time constant) parameters:
/// @tparam T - element type; can only be a 1-, 2- or 4-byte integer or \c float
/// @tparam N - the number of elements; must be 1, 8 or 16
/// @tparam AccessorTy - \ref sycl::accessor type (pointer types are rejected
///   by the enable_if below)
/// @tparam L1H - L1 cache hint
/// @tparam L3H - L3 cache hint
///
/// Formal parameters:
/// @param acc - the accessor to scatter to
/// @param vals - values to write
/// @param offsets - per-element offsets, expressed in elements (converted to
///   byte offsets inside this function)
/// @param glob_offset - offset added to each individual element's offset to
///   compute the actual memory access offset for that element
/// @param pred - per-element predicates; elements with zero corresponding
///   predicates are not written
///
template <typename T, int N, typename AccessorTy,
          CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
ESIMD_INLINE ESIMD_NODEBUG
    typename std::enable_if<(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16) &&
                                !std::is_pointer<AccessorTy>::value,
                            void>::type
    scatter(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
            uint32_t glob_offset = 0, simd<uint16_t, N> pred = 1) {

  constexpr int TypeSizeLog2 =
      sycl::INTEL::gpu::ElemsPerAddrEncoding<sizeof(T)>();
  // TODO (performance) use hardware-supported scale once BE supports it
  constexpr uint32_t scale = 0;
  // Offsets arrive in elements; since the hardware scale is forced to 0
  // above, scale them to byte offsets here.
  constexpr uint32_t t_scale = sizeof(T);
  if constexpr (t_scale > 1) {
    glob_offset *= t_scale;
    offsets *= t_scale;
  }

  if constexpr (sizeof(T) < 4) {
    // 1- and 2-byte elements are widened to 32-bit values (signed or
    // unsigned depending on T's signedness) before being written.
    static_assert(std::is_integral<T>::value,
                  "only integral 1- & 2-byte types are supported");
    using PromoT = typename std::conditional<std::is_signed<T>::value, int32_t,
                                             uint32_t>::type;
    const simd<PromoT, N> promo_vals = sycl::INTEL::gpu::convert<PromoT>(vals);
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
    // On device, the intrinsic takes the native surface index extracted
    // from the accessor rather than the accessor itself.
    const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
    __esimd_surf_write<PromoT, N, decltype(surf_ind), TypeSizeLog2, L1H, L3H>(
        pred, scale, surf_ind, glob_offset, offsets, promo_vals);
#else
    __esimd_surf_write<PromoT, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
        pred, scale, acc, glob_offset, offsets, promo_vals);
#endif
  } else {
    // 4-byte elements are written directly, no promotion needed.
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
    const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
    __esimd_surf_write<T, N, decltype(surf_ind), TypeSizeLog2, L1H, L3H>(
        pred, scale, surf_ind, glob_offset, offsets, vals);
#else
    __esimd_surf_write<T, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
        pred, scale, acc, glob_offset, offsets, vals);
#endif
  }
}
372
+
373
+ // / Load a scalar value from an accessor.
374
+ template <typename T, typename AccessorTy, CacheHint L1H = CacheHint::None,
375
+ CacheHint L3H = CacheHint::None>
376
+ ESIMD_INLINE ESIMD_NODEBUG T scalar_load (AccessorTy acc, uint32_t offset) {
377
+ const simd<T, 1 > Res = gather<T>(acc, simd<uint32_t , 1 >{offset});
378
+ return Res[0 ];
379
+ }
380
+
381
+ // / Store a scalar value into an accessor.
382
+ template <typename T, typename AccessorTy, CacheHint L1H = CacheHint::None,
383
+ CacheHint L3H = CacheHint::None>
384
+ ESIMD_INLINE ESIMD_NODEBUG void scalar_store (AccessorTy acc, uint32_t offset,
385
+ T val) {
386
+ scatter<T>(acc, simd<T, 1 >{val}, simd<uint32_t , 1 >{offset});
387
+ }
388
+
241
389
// TODO @jasonsewall-intel
242
390
// Don't use '4' in the name - instead either make it a parameter or
243
391
// (if it must be constant) - try to deduce from other arguments.
0 commit comments