[SYCL][ESIMD] add indirect read and write methods to simd class (#3039)

Gang Chen · web-flow · commit 82084270b589 · 2021-01-20T10:46:32.000-08:00
Signed-off-by: Gang Y Chen &lt;gang.y.chen@intel.com&gt;
diff --git a/llvm/lib/SYCLLowerIR/LowerESIMD.cpp b/llvm/lib/SYCLLowerIR/LowerESIMD.cpp
@@ -237,10 +237,16 @@ class ESIMDIntrinDescTable {
         //          for integer, "f" - for floating point
         {"rdregion",
          {"rdregion", {a(0), t(3), t(4), t(5), a(1), t(6)}, nk(-1)}},
+        {"rdindirect",
+         {"rdregion", {a(0), c32(0), t(2), c32(0), a(1), t(3)}, nk(-1)}},
         {{"wrregion"},
          {{"wrregion"},
           {a(0), a(1), t(3), t(4), t(5), a(2), t(6), ai1(3)},
           nk(-1)}},
+        {{"wrindirect"},
+         {{"wrregion"},
+          {a(0), a(1), c32(0), t(2), c32(0), a(2), t(3), ai1(3)},
+          nk(-1)}},
         {"vload", {"vload", {l(0)}}},
         {"vstore", {"vstore", {a(1), a(0)}}},
 
diff --git a/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll b/llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
@@ -294,6 +294,28 @@ define dso_local spir_func void  @FUNC_42() !sycl_explicit_simd !1 {
   ret void
 }
 
+define dso_local spir_func <8 x i32>  @FUNC_43() !sycl_explicit_simd !1 {
+  %a_1 = alloca <16 x i32>
+  %1 = load <16 x i32>, <16 x i32>* %a_1
+  %a_2 = alloca <8 x i16>
+  %2 = load <8 x i16>, <8 x i16>* %a_2
+  %ret_val = call spir_func <8 x i32> @_Z18__esimd_rdindirectIiLi16ELi8ELi0EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT1_EE4typeENS4_IS5_XT0_EE4typeENS4_ItXT1_EE4typeE(<16 x i32> %1, <8 x i16> %2)
+; CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.v8i16(<16 x i32> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 0, <8 x i16> %{{[0-9a-zA-Z_.]+}}, i32 0)
+  ret <8 x i32>  %ret_val
+}
+
+define dso_local spir_func <16 x i32>  @FUNC_44() !sycl_explicit_simd !1 {
+  %a_1 = alloca <16 x i32>
+  %1 = load <16 x i32>, <16 x i32>* %a_1
+  %a_2 = alloca  <8 x i32>
+  %2 = load  <8 x i32>,  <8 x i32>* %a_2
+  %a_3 = alloca  <8 x i16>
+  %3 = load  <8 x i16>,  <8 x i16>* %a_3
+  %ret_val = call spir_func <16 x i32> @_Z18__esimd_wrindirectIiLi16ELi8ELi0EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeES7_NS4_IS5_XT1_EE4typeENS4_ItXT1_EE4typeESB_(<16 x i32> %1, <8 x i32> %2, <8 x i16> %3, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
+; CHECK: %{{[0-9a-zA-Z_.]+}} =  call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.v8i16.v8i1(<16 x i32> %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 0, <8 x i16> %{{[0-9a-zA-Z_.]+}}, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  ret <16 x i32>  %ret_val
+}
+
 declare dso_local spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
 declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic0ILN2cm3gen14CmAtomicOpTypeE2EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeENS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i16> %1)
 declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic1ILN2cm3gen14CmAtomicOpTypeE0EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i16> %2)
@@ -335,6 +357,8 @@ declare dso_local spir_func <16 x i32> @_Z18__esimd_sudp4a_satIijjjLi16EEN2cl4sy
 declare dso_local spir_func <16 x i32> @_Z18__esimd_ssdp4a_satIiiiiLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
 declare dso_local spir_func <8 x i32> @_Z22__esimd_slm_block_readIiLi8EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeEj(i32 %0)
 declare dso_local spir_func void @_Z16__esimd_sbarrierN2cl4sycl5INTEL3gpu17EsimdSbarrierTypeE(i8 %0)
+declare dso_local spir_func <8 x i32> @_Z18__esimd_rdindirectIiLi16ELi8ELi0EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT1_EE4typeENS4_IS5_XT0_EE4typeENS4_ItXT1_EE4typeE(<16 x i32>, <8 x i16>)
+declare dso_local spir_func <16 x i32> @_Z18__esimd_wrindirectIiLi16ELi8ELi0EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeES7_NS4_IS5_XT1_EE4typeENS4_ItXT1_EE4typeESB_(<16 x i32>, <8 x i32>, <8 x i16>, <8 x i16>)
 
 attributes #0 = { "genx_byte_offset"="192" "genx_volatile" }
 
diff --git a/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_intrin.hpp b/sycl/include/CL/sycl/INTEL/esimd/detail/esimd_intrin.hpp
@@ -66,6 +66,11 @@ template <typename T, int N, int M, int VStride, int Width, int Stride,
 SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, M>
 __esimd_rdregion(sycl::INTEL::gpu::vector_type_t<T, N> Input, uint16_t Offset);
 
+template <typename T, int N, int M, int ParentWidth = 0>
+SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, M>
+__esimd_rdindirect(sycl::INTEL::gpu::vector_type_t<T, N> Input,
+                   sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset);
+
 // __esimd_wrregion returns the updated vector with the region updated.
 //
 // @param T the element data type, one of i8, i16, i32, i64, half, float,
@@ -120,6 +125,13 @@ __esimd_wrregion(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
                  sycl::INTEL::gpu::vector_type_t<T, M> NewVal, uint16_t Offset,
                  sycl::INTEL::gpu::mask_type_t<M> Mask = 1);
 
+template <typename T, int N, int M, int ParentWidth = 0>
+SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
+__esimd_wrindirect(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
+                   sycl::INTEL::gpu::vector_type_t<T, M> NewVal,
+                   sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset,
+                   sycl::INTEL::gpu::mask_type_t<M> Mask = 1);
+
 __SYCL_INLINE_NAMESPACE(cl) {
 namespace sycl {
 namespace INTEL {
@@ -261,6 +273,20 @@ __esimd_rdregion(sycl::INTEL::gpu::vector_type_t<T, N> Input, uint16_t Offset) {
   return Result;
 }
 
+template <typename T, int N, int M, int ParentWidth>
+SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, M>
+__esimd_rdindirect(sycl::INTEL::gpu::vector_type_t<T, N> Input,
+                   sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset) {
+  sycl::INTEL::gpu::vector_type_t<T, M> Result;
+  for (int i = 0; i < M; ++i) {
+    uint16_t EltOffset = Offset[i] / sizeof(T);
+    assert(Offset[i] % sizeof(T) == 0);
+    assert(EltOffset < N);
+    Result[i] = Input[EltOffset];
+  }
+  return Result;
+}
+
 template <typename T, int N, int M, int VStride, int Width, int Stride,
           int ParentWidth>
 SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
@@ -285,4 +311,22 @@ __esimd_wrregion(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
   return Result;
 }
 
+template <typename T, int N, int M, int ParentWidth>
+SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
+__esimd_wrindirect(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
+                   sycl::INTEL::gpu::vector_type_t<T, M> NewVal,
+                   sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset,
+                   sycl::INTEL::gpu::mask_type_t<M> Mask) {
+  sycl::INTEL::gpu::vector_type_t<T, N> Result = OldVal;
+  for (int i = 0; i < M; ++i) {
+    if (Mask[i]) {
+      uint16_t EltOffset = Offset[i] / sizeof(T);
+      assert(Offset[i] % sizeof(T) == 0);
+      assert(EltOffset < N);
+      Result[EltOffset] = NewVal[i];
+    }
+  }
+  return Result;
+}
+
 #endif // __SYCL_DEVICE_ONLY__
diff --git a/sycl/include/CL/sycl/INTEL/esimd/esimd.hpp b/sycl/include/CL/sycl/INTEL/esimd/esimd.hpp
@@ -179,9 +179,32 @@ template <typename Ty, int N> class simd {
   //   This would allow you to use the subscript operator to write to an
   //   element.
   // {/quote}
-  /// Read a single element, by value only.
+  /// Read single element, return value only (not reference).
   Ty operator[](int i) const { return data()[i]; }
 
+  // TODO ESIMD_EXPERIMENTAL
+  /// Read multiple elements by their indices in vector
+  template <int Size>
+  simd<Ty, Size> iselect(const simd<uint16_t, Size> &Indices) {
+    vector_type_t<uint16_t, Size> Offsets = Indices.data() * sizeof(Ty);
+    return __esimd_rdindirect<Ty, N, Size>(data(), Offsets);
+  }
+  // TODO ESIMD_EXPERIMENTAL
+  /// update single element
+  void iupdate(ushort Index, Ty V) {
+    auto Val = data();
+    Val[Index] = V;
+    set(Val);
+  }
+  // TODO ESIMD_EXPERIMENTAL
+  /// update multiple elements by their indices in vector
+  template <int Size>
+  void iupdate(const simd<uint16_t, Size> &Indices, const simd<Ty, Size> &Val,
+               mask_type_t<Size> Mask) {
+    vector_type_t<uint16_t, Size> Offsets = Indices.data() * sizeof(Ty);
+    set(__esimd_wrindirect<Ty, N, Size>(data(), Val.data(), Offsets, Mask));
+  }
+
   // TODO
   // @rolandschulz
   // {quote}
diff --git a/sycl/test/esimd/simd.cpp b/sycl/test/esimd/simd.cpp
@@ -230,3 +230,13 @@ bool test_replicate3() __attribute__((sycl_device)) {
 
   return v0_rep[0] == v0[1] && v0_rep[1] == v0[3] && v0_rep[2] == v0[5];
 }
+
+bool test_simd_iselect() __attribute__((sycl_device)) {
+  simd<int, 16> v(0, 1);
+  simd<ushort, 8> a(0, 2);
+  auto data = v.iselect(a);
+  data += 16;
+  v.iupdate(a, data, 1);
+  auto ref = v.select<8, 2>(0);
+  return ref[0] == 16 && ref[14] == 32;
+}