[SYCL][ESIMD] Implement accessor-based gather/scatter and scalar I/O #2700

Merged · 1 commit · Nov 11, 2020
9 changes: 9 additions & 0 deletions llvm/lib/SYCLLowerIR/LowerESIMD.cpp
@@ -252,6 +252,15 @@ class ESIMDIntrinDescTable {
{"flat_write", {"svm.scatter", {ai1(3), a(2), a(0), a(1)}}},
{"flat_write4",
{"svm.scatter4.scaled", {ai1(2), t(2), c16(0), c64(0), a(0), a(1)}}},

// surface index-based gather/scatter:
// num blocks, scale, surface index, global offset, elem offsets
{"surf_read", {"gather.scaled2", {t(3), c16(0), aSI(1), a(2), a(3)}}},
Review comment (Contributor):
Why not map to gather.scaled? It may need general predicate support.

Reply (Contributor, PR author):
Good question. I'm actually following advice from the BE team (@aus-intel), who recommended using this one for now. When predication is needed, we can add it without breaking user code.

// pred, num blocks, scale, surface index, global offset, elem offsets,
// data to write
{"surf_write",
{"scatter.scaled", {ai1(0), t(3), c16(0), aSI(2), a(3), a(4), a(5)}}},

// intrinsics to query thread's coordinates:
{"group_id_x", {"group.id.x", {}}},
{"group_id_y", {"group.id.y", {}}},
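Note (illustrative, not part of the patch): each descriptor entry above maps the ESIMD intrinsic's arguments onto GenX intrinsic operands. Assuming the usual meaning of the helpers in this table (t(i) - i-th template argument, a(i) - i-th call argument, aSI(i) - argument lowered to a surface index, ai1(i) - argument converted to an <N x i1> predicate, c16/c64 - 16/64-bit integer constants), the surf_read entry reads as:

// Sketch of the lowering encoded by
//   {"surf_read", {"gather.scaled2", {t(3), c16(0), aSI(1), a(2), a(3)}}}:
// a device call
//   __esimd_surf_read<Ty, N, SurfIndAliasTy, TySizeLog2>(scale, si, go, offs)
// becomes
//   llvm.genx.gather.scaled2(
//       TySizeLog2, // t(3)   - 4th template argument (num blocks)
//       i16 0,      // c16(0) - constant scale
//       SI,         // aSI(1) - si lowered to a surface index
//       go,         // a(2)   - global offset
//       offs)       // a(3)   - element offsets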
86 changes: 86 additions & 0 deletions sycl/include/CL/sycl/INTEL/esimd/detail/esimd_memory_intrin.hpp
@@ -81,6 +81,92 @@ SYCL_EXTERNAL void __esimd_flat_write4(
sycl::INTEL::gpu::vector_type_t<Ty, N * NumChannels(Mask)> vals,
sycl::INTEL::gpu::vector_type_t<uint16_t, N> pred = 1);

// Low-level surface-based gather. Collects elements located at given offsets
// in a surface and returns them as a single \ref simd object. An element can
// be a 1-, 2- or 4-byte value, but it is always returned as a 4-byte value
// within the resulting simd object, with the upper 2 or 3 bytes undefined.
// Template (compile-time constant) parameters:
// @tparam Ty - element type; can only be a 4-byte integer or \c float.
// @tparam N - the number of elements
// @tparam SurfIndAliasTy - "surface index alias" type - internal type in the
// accessor used to denote the surface
// @tparam TySizeLog2 - Log2 of the number of bytes read per element:
// 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes
// @tparam L1H - L1 cache hint
// @tparam L3H - L3 cache hint
//
// Formal parameters:
// @param scale - the scale; must be 0
// @param surf_ind - the surface index, taken from the SYCL memory object
// @param global_offset - offset added to each individual element's offset to
// compute actual memory access offset for that element
// @param elem_offsets - per-element offsets
//
template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None,
sycl::INTEL::gpu::CacheHint L3H = sycl::INTEL::gpu::CacheHint::None>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<Ty, N>
__esimd_surf_read(int16_t scale, SurfIndAliasTy surf_ind,
uint32_t global_offset,
sycl::INTEL::gpu::vector_type_t<uint32_t, N> elem_offsets)
#ifdef __SYCL_DEVICE_ONLY__
;
#else
{
static_assert(N == 1 || N == 8 || N == 16);
static_assert(TySizeLog2 <= 2);
static_assert(std::is_integral<Ty>::value || TySizeLog2 == 2);
throw cl::sycl::feature_not_supported();
}
#endif // __SYCL_DEVICE_ONLY__
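
For illustration, a minimal sketch (hypothetical helper, not part of the patch) of how a 2-byte element read is expected to go through this intrinsic: the caller passes the 4-byte promoted type and TySizeLog2 = 1, then truncates each lane, as the public gather() wrapper does.

// Hypothetical sketch: read 8 2-byte elements, widened to 32 bits per lane.
template <typename SurfIndTy>
sycl::INTEL::gpu::vector_type_t<uint32_t, 8>
read_ushorts_sketch(SurfIndTy si,
                    sycl::INTEL::gpu::vector_type_t<uint32_t, 8> offs) {
  // TySizeLog2 == 1: 2 bytes read per element offset; the upper 2 bytes of
  // each returned lane are undefined until the caller converts them away.
  return __esimd_surf_read<uint32_t, 8, SurfIndTy, /*TySizeLog2=*/1>(
      /*scale=*/0, si, /*global_offset=*/0, offs);
}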

// Low-level surface-based scatter. Writes elements of a \ref simd object into
// a surface at given offsets. An element can be a 1-, 2- or 4-byte value, but
// it is always represented as a 4-byte value within the input simd object; the
// unused (not written) upper bytes are ignored.
// Template (compile-time constant) parameters:
// @tparam Ty - element type; can only be a 4-byte integer or \c float.
// @tparam N - the number of elements to write
// @tparam SurfIndAliasTy - "surface index alias" type - internal type in the
// accessor used to denote the surface
// @tparam TySizeLog2 - Log2 of the number of bytes written per element:
// 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes
// @tparam L1H - L1 cache hint
// @tparam L3H - L3 cache hint
//
// Formal parameters:
// @param pred - per-element predicates; elements with zero corresponding
// predicates are not written
// @param scale - the scale; must be 0
// @param surf_ind - the surface index, taken from the SYCL memory object
// @param global_offset - offset added to each individual element's offset to
// compute actual memory access offset for that element
// @param elem_offsets - per-element offsets
// @param vals - values to write
//
template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None,
sycl::INTEL::gpu::CacheHint L3H = sycl::INTEL::gpu::CacheHint::None>
SYCL_EXTERNAL void
__esimd_surf_write(sycl::INTEL::gpu::vector_type_t<uint16_t, N> pred,
int16_t scale, SurfIndAliasTy surf_ind,
uint32_t global_offset,
sycl::INTEL::gpu::vector_type_t<uint32_t, N> elem_offsets,
sycl::INTEL::gpu::vector_type_t<Ty, N> vals)
#ifdef __SYCL_DEVICE_ONLY__
;
#else
{
static_assert(N == 1 || N == 8 || N == 16);
static_assert(TySizeLog2 <= 2);
static_assert(std::is_integral<Ty>::value || TySizeLog2 == 2);
throw cl::sycl::feature_not_supported();
}
#endif // __SYCL_DEVICE_ONLY__
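
The symmetric write-side sketch (again hypothetical): lanes whose predicate is zero are left untouched in memory.

// Hypothetical sketch: predicated write of 8 2-byte elements held as 32-bit
// lanes; only the low 2 bytes of each enabled lane are stored.
template <typename SurfIndTy>
void write_ushorts_sketch(SurfIndTy si,
                          sycl::INTEL::gpu::vector_type_t<uint32_t, 8> offs,
                          sycl::INTEL::gpu::vector_type_t<uint32_t, 8> vals,
                          sycl::INTEL::gpu::vector_type_t<uint16_t, 8> pred) {
  __esimd_surf_write<uint32_t, 8, SurfIndTy, /*TySizeLog2=*/1>(
      pred, /*scale=*/0, si, /*global_offset=*/0, offs, vals);
}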

// TODO bring the parameter order of the __esimd* intrinsics into accordance
// with the corresponding BE intrinsics' parameter order.

// flat_atomic: flat-address atomic
template <sycl::INTEL::gpu::EsimdAtomicOpType Op, typename Ty, int N,
sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None,
148 changes: 148 additions & 0 deletions sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp
@@ -116,6 +116,12 @@ ESIMD_INLINE ESIMD_NODEBUG
addrs.data(), ElemsPerAddrEncoding<ElemsPerAddr>(), pred.data());
}

// TODO bring this SVM-based scatter/gather interface into accordance with the
// accessor-based ones - remove the ElemsPerAddr template parameter, as it is
// redundant: the only allowed block size in the underlying BE intrinsics is 1
// byte, with the max number of blocks being 4. This means the T template
// parameter alone can model all supported cases.

/// flat-address scatter
template <typename T, int n, int ElemsPerAddr = 1,
CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
@@ -238,6 +244,148 @@ ESIMD_INLINE ESIMD_NODEBUG void block_store(AccessorTy acc, uint32_t offset,
#endif // __SYCL_DEVICE_ONLY__ && __SYCL_EXPLICIT_SIMD__
}

/// Accessor-based gather. Collects elements located at given offsets in an
/// accessor and returns them as a single \ref simd object. An element can be
/// a 1-, 2- or 4-byte value.
/// Template (compile-time constant) parameters:
/// @tparam T - element type; can only be a 1-, 2- or 4-byte integer or \c float.
/// @tparam N - the number of elements
/// @tparam AccessorTy - \ref sycl::accessor type
/// @tparam L1H - L1 cache hint
/// @tparam L3H - L3 cache hint
///
/// Formal parameters:
/// @param acc - the accessor to gather from
/// @param offsets - per-element offsets
/// @param glob_offset - offset added to each individual element's offset to
/// compute actual memory access offset for that element
///
template <typename T, int N, typename AccessorTy,
CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
ESIMD_INLINE ESIMD_NODEBUG
typename std::enable_if<(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16) &&
!std::is_pointer<AccessorTy>::value,
simd<T, N>>::type
gather(AccessorTy acc, simd<uint32_t, N> offsets,
uint32_t glob_offset = 0) {

constexpr int TypeSizeLog2 =
sycl::INTEL::gpu::ElemsPerAddrEncoding<sizeof(T)>();
// TODO (performance) use hardware-supported scale once BE supports it
constexpr uint32_t scale = 0;
constexpr uint32_t t_scale = sizeof(T);
if constexpr (t_scale > 1) {
glob_offset *= t_scale;
offsets *= t_scale;
}

if constexpr (sizeof(T) < 4) {
static_assert(std::is_integral<T>::value,
"only integral 1- & 2-byte types are supported");
using PromoT = typename std::conditional<std::is_signed<T>::value, int32_t,
uint32_t>::type;
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
const simd<PromoT, N> promo_vals =
__esimd_surf_read<PromoT, N, decltype(surf_ind), TypeSizeLog2, L1H,
L3H>(scale, surf_ind, glob_offset, offsets);
#else
const simd<PromoT, N> promo_vals =
__esimd_surf_read<PromoT, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
scale, acc, glob_offset, offsets);
#endif
return sycl::INTEL::gpu::convert<T>(promo_vals);
} else {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
return __esimd_surf_read<T, N, decltype(surf_ind), TypeSizeLog2, L1H, L3H>(
scale, surf_ind, glob_offset, offsets);
#else
return __esimd_surf_read<T, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
scale, acc, glob_offset, offsets);
#endif
}
}
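
A minimal usage sketch (hypothetical function, written in the style of the test at the bottom of this PR; the accessor type alias is an assumption):

// Hypothetical usage of the accessor-based gather above.
using AccT = sycl::accessor<int, 1, sycl::access::mode::read_write,
                            sycl::access::target::global_buffer>;

SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<int, 8> gather_sketch(AccT acc) {
  simd<uint32_t, 8> offsets(0, 1); // lane offsets 0..7 (base 0, step 1)
  // Reads elements 16..23 of the buffer: both offsets and glob_offset are
  // scaled by sizeof(int) inside gather().
  return gather<int, 8>(acc, offsets, /*glob_offset=*/16);
}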

/// Accessor-based scatter. Writes elements of a \ref simd object into an
/// accessor at given offsets. An element can be a 1-, 2- or 4-byte value.
/// Template (compile-time constant) parameters:
/// @tparam T - element type; can only be a 1-, 2- or 4-byte integer or \c float.
/// @tparam N - the number of elements
/// @tparam AccessorTy - \ref sycl::accessor type
/// @tparam L1H - L1 cache hint
/// @tparam L3H - L3 cache hint
///
/// Formal parameters:
/// @param acc - the accessor to scatter to
/// @param vals - values to write
/// @param offsets - per-element offsets
/// @param glob_offset - offset added to each individual element's offset to
/// compute actual memory access offset for that element
/// @param pred - per-element predicates; elements with zero corresponding
/// predicates are not written
///
template <typename T, int N, typename AccessorTy,
CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
ESIMD_INLINE ESIMD_NODEBUG
typename std::enable_if<(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16) &&
!std::is_pointer<AccessorTy>::value,
void>::type
scatter(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
uint32_t glob_offset = 0, simd<uint16_t, N> pred = 1) {

constexpr int TypeSizeLog2 =
sycl::INTEL::gpu::ElemsPerAddrEncoding<sizeof(T)>();
// TODO (performance) use hardware-supported scale once BE supports it
constexpr uint32_t scale = 0;
constexpr uint32_t t_scale = sizeof(T);
if constexpr (t_scale > 1) {
glob_offset *= t_scale;
offsets *= t_scale;
}

if constexpr (sizeof(T) < 4) {
static_assert(std::is_integral<T>::value,
"only integral 1- & 2-byte types are supported");
using PromoT = typename std::conditional<std::is_signed<T>::value, int32_t,
uint32_t>::type;
const simd<PromoT, N> promo_vals = sycl::INTEL::gpu::convert<PromoT>(vals);
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
__esimd_surf_write<PromoT, N, decltype(surf_ind), TypeSizeLog2, L1H, L3H>(
pred, scale, surf_ind, glob_offset, offsets, promo_vals);
#else
__esimd_surf_write<PromoT, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
pred, scale, acc, glob_offset, offsets, promo_vals);
#endif
} else {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
__esimd_surf_write<T, N, decltype(surf_ind), TypeSizeLog2, L1H, L3H>(
pred, scale, surf_ind, glob_offset, offsets, vals);
#else
__esimd_surf_write<T, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
pred, scale, acc, glob_offset, offsets, vals);
#endif
}
}
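
A matching predicated-scatter sketch (same assumptions as the gather sketch above):

// Hypothetical usage of the accessor-based scatter above; AccT as in the
// gather sketch.
SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void scatter_sketch(AccT acc,
                                                      simd<int, 8> vals) {
  simd<uint32_t, 8> offsets(0, 1);
  simd<uint16_t, 8> pred{1, 0, 1, 0, 1, 0, 1, 0}; // write even lanes only
  scatter<int, 8>(acc, vals, offsets, /*glob_offset=*/0, pred);
}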

/// Load a scalar value from an accessor.
template <typename T, typename AccessorTy, CacheHint L1H = CacheHint::None,
CacheHint L3H = CacheHint::None>
ESIMD_INLINE ESIMD_NODEBUG T scalar_load(AccessorTy acc, uint32_t offset) {
const simd<T, 1> Res = gather<T>(acc, simd<uint32_t, 1>{offset});
return Res[0];
}

/// Store a scalar value into an accessor.
template <typename T, typename AccessorTy, CacheHint L1H = CacheHint::None,
CacheHint L3H = CacheHint::None>
ESIMD_INLINE ESIMD_NODEBUG void scalar_store(AccessorTy acc, uint32_t offset,
T val) {
scatter<T>(acc, simd<T, 1>{val}, simd<uint32_t, 1>{offset});
}
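
These two compose into a simple round trip (hypothetical sketch; note the offset is in elements here, since the underlying gather/scatter scale offsets by sizeof(T)):

// Hypothetical round trip through scalar_store/scalar_load; AccT as above.
SYCL_ESIMD_FUNCTION SYCL_EXTERNAL int scalar_roundtrip_sketch(AccT acc) {
  scalar_store<int>(acc, /*offset=*/5, 42);   // write 42 at element 5
  return scalar_load<int>(acc, /*offset=*/5); // read it back
}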

// TODO @jasonsewall-intel
// Don't use '4' in the name - instead either make it a parameter or
// (if it must be constant) - try to deduce from other arguments.
27 changes: 27 additions & 0 deletions sycl/test/esimd/intrins_trans.cpp
@@ -113,5 +113,32 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<float, 16> foo() {
__esimd_vstore<int, 32>(&vc, va.data());
// CHECK: store <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> addrspace(4)* {{.*}}

{
sycl::accessor<int, 1, sycl::access::mode::read_write,
sycl::access::target::global_buffer>
acc;
simd<uint32_t, 8> offsets = 1;
simd<uint16_t, 8> pred{1, 0, 1, 0, 1, 0, 1, 0};

// 4-byte element gather
simd<int, 8> v = gather<int, 8>(acc, offsets, 100);
// CHECK: %[[SI3:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image1d_buffer_rw_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.gather.scaled2.v8i32.v8i32(i32 2, i16 0, i32 %[[SI3]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}})

// 4-byte element scatter
scatter<int, 8>(acc, v, offsets, 100, pred);
// CHECK: %[[SI4:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image1d_buffer_rw_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
// CHECK: call void @llvm.genx.scatter.scaled.v8i1.v8i32.v8i32(<8 x i1> %{{[0-9a-zA-Z_.]+}}, i32 2, i16 0, i32 %[[SI4]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}})

// 1-byte element gather
simd<unsigned char, 8> v1 = gather<unsigned char, 8>(acc, offsets, 100);
// CHECK: %[[SI5:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image1d_buffer_rw_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.gather.scaled2.v8i32.v8i32(i32 0, i16 0, i32 %[[SI5]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}})

// 1-byte element scatter
scatter<unsigned char, 8>(acc, v1, offsets, 100, pred);
// CHECK: %[[SI6:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image1d_buffer_rw_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
// CHECK: call void @llvm.genx.scatter.scaled.v8i1.v8i32.v8i32(<8 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, i16 0, i32 %[[SI6]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}})
}
return d;
}