//==------------ esimd_intrin.hpp - DPC++ Explicit SIMD API --------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Declares Explicit SIMD intrinsics used to implement working with
// the SIMD classes objects.
//===----------------------------------------------------------------------===//

#pragma once

#include <CL/sycl/intel/esimd/detail/esimd_types.hpp>
#include <CL/sycl/intel/esimd/esimd_enum.hpp>
#include <cstdint>
// \brief __esimd_rdregion: region access intrinsic.
//
// @param T the element data type, one of i8, i16, i32, i64, half, float,
// double. In particular bool (i1) and pointer types are not allowed.
//
// @param N the input vector size.
//
// @param M the return vector size.
//
// @param VStride the vertical stride in elements between rows.
//
// @param Width the size of each row, non-zero and even divides `M`.
//
// @param Stride horizontal stride in elements within each row.
//
// @param ParentWidth the width of the input vector when viewed as a 2D
// matrix. Ignored if offset is a constant.
//
// @param Input the input vector
//
// @param Offset the starting offset in bytes.
//
// @return the region extracted.
//
// This intrinsic computes a vector Result:
//
// \code{.cpp}
// uint16_t EltOffset = Offset / sizeof(T);
// assert(Offset % sizeof(T) == 0);
//
// int NumRows = M / Width;
// assert(M % Width == 0);
//
// int Index = 0;
// for (int i = 0; i < NumRows; ++i) {
//   for (int j = 0; j < Width; ++j) {
//     Result[Index++] = Input[i * VStride + j * Stride +
//     EltOffset];
//   }
// }
// \endcode
//
template <typename T, int N, int M, int VStride, int Width, int Stride,
          int ParentWidth = 0>
SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, M>
__esimd_rdregion(sycl::intel::gpu::vector_type_t<T, N> Input, uint16_t Offset);
| 64 | + |
| 65 | +// __esimd_wrregion returns the updated vector with the region updated. |
| 66 | +// |
| 67 | +// @param T the element data type, one of i8, i16, i32, i64, half, float, |
| 68 | +// double. In particular bool (i1) and pointer types are not allowed. |
| 69 | +// |
| 70 | +// @param N the return vector size. |
| 71 | +// |
| 72 | +// @param M the vector size to write. |
| 73 | +// |
| 74 | +// @param VStride the vertical stride in elements between rows. |
| 75 | +// |
| 76 | +// @param Width the size or each row, non-zero and even divides `M`. |
| 77 | +// |
| 78 | +// @param Stride horizontal stride in elements within each row. |
| 79 | +// |
| 80 | +// @param ParentWidth the width of the input vector when viewed as a 2D |
| 81 | +// matrix. Ignored if offset is a constant. |
| 82 | +// |
| 83 | +// @param OldVal the vector to write region into. |
| 84 | +// |
| 85 | +// @param NewVal the vector to write. |
| 86 | +// |
| 87 | +// @param Offset the starting offset in bytes. |
| 88 | +// |
| 89 | +// @return the updated vector with the region modifided. |
| 90 | +// |
| 91 | +// This intrinsic computes a vector Result: |
| 92 | +// |
| 93 | +// \code{.cpp} |
| 94 | +// uint16_t EltOffset = Offset / sizeof(T); |
| 95 | +// assert(Offset % sizeof(T) == 0); |
| 96 | +// |
| 97 | +// int NumRows = M / Width; |
| 98 | +// assert(M % Width == 0); |
| 99 | +// |
| 100 | +// Result = OldValue; |
| 101 | +// int Index = 0; |
| 102 | +// for (int i = 0; i < NumRows; ++i) { |
| 103 | +// for (int j = 0; j < Width; ++j) { |
| 104 | +// if (Mask[Index]) |
| 105 | +// Result[i * VStride + j * Stride + EltOffset] = |
| 106 | +// NewVal[Index]; |
| 107 | +// ++Index; |
| 108 | +// } |
| 109 | +// } |
| 110 | +// \endcode |
| 111 | +// |
| 112 | +template <typename T, int N, int M, int VStride, int Width, int Stride, |
| 113 | + int ParentWidth = 0> |
| 114 | +SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, N> |
| 115 | +__esimd_wrregion(sycl::intel::gpu::vector_type_t<T, N> OldVal, |
| 116 | + sycl::intel::gpu::vector_type_t<T, M> NewVal, uint16_t Offset, |
| 117 | + sycl::intel::gpu::mask_type_t<M> Mask = 1); |
| 118 | + |
| 119 | +__SYCL_INLINE_NAMESPACE(cl) { |
| 120 | +namespace sycl { |
| 121 | +namespace intel { |
| 122 | +namespace gpu { |
// TODO dependencies on the std SYCL concepts like images
// should be refactored in a separate header
//
// Proxy that forwards to non-public members of SYCL accessor objects.
// NOTE(review): presumably the accessor classes declare this proxy as a
// friend — that cannot be confirmed from this header alone.
class AccessorPrivateProxy {
public:
#ifdef __SYCL_DEVICE_ONLY__
  // Device compilation: returns the accessor's native image object.
  template <typename AccessorTy>
  static auto getNativeImageObj(const AccessorTy &Acc) {
    return Acc.getNativeImageObj();
  }
#else
  // Host compilation: returns the accessor's access range.
  template <typename AccessorTy>
  static auto getImageRange(const AccessorTy &Acc) {
    return Acc.getAccessRange();
  }
  // Host compilation: returns the element size reported by the accessor base.
  static auto getElemSize(const sycl::detail::AccessorBaseHost &Acc) {
    return Acc.getElemSize();
  }
#endif
};
| 142 | + |
/// Decodes the elements-per-address count from its log2 encoding.
///
/// @param ElemsPerAddrEncoded log2 of the element count.
/// @return 2^ElemsPerAddrEncoded.
constexpr unsigned int ElemsPerAddrDecoding(unsigned int ElemsPerAddrEncoded) {
  // encoding requires 2^ElemsPerAddrEncoded; use an unsigned literal so the
  // shift cannot overflow into the sign bit of a signed int (UB for a
  // 31-bit-or-larger encoding with a plain `1`).
  return (1u << ElemsPerAddrEncoded);
}
| 147 | + |
| 148 | +/// read from a basic region of a vector, return a vector |
| 149 | +template <typename BT, int BN, typename RTy> |
| 150 | +vector_type_t<typename RTy::element_type, RTy::length> |
| 151 | + ESIMD_INLINE readRegion(const vector_type_t<BT, BN> &Base, RTy Region) { |
| 152 | + using ElemTy = typename RTy::element_type; |
| 153 | + auto Base1 = bitcast<ElemTy, BT, BN>(Base); |
| 154 | + constexpr int Bytes = BN * sizeof(BT); |
| 155 | + if constexpr (Bytes == RTy::Size_in_bytes) |
| 156 | + // This is a no-op format. |
| 157 | + return Base1; |
| 158 | + else { |
| 159 | + static_assert(!RTy::Is_2D); |
| 160 | + constexpr int N = Bytes / sizeof(ElemTy); |
| 161 | + // Access the region information. |
| 162 | + constexpr int M = RTy::Size_x; |
| 163 | + constexpr int Stride = RTy::Stride_x; |
| 164 | + int16_t Offset = static_cast<int16_t>(Region.M_offset_x * sizeof(ElemTy)); |
| 165 | + // read-region |
| 166 | + return __esimd_rdregion<ElemTy, N, M, /*VS*/ 0, M, Stride>(Base1, Offset); |
| 167 | + } |
| 168 | +} |
| 169 | + |
/// read from a nested region of a vector, return a vector
///
/// Region.first is the innermost (most recently taken) region; Region.second
/// describes the base region it was taken from and may itself be a nested
/// pair, which is handled by the recursive readRegion call below.
template <typename BT, int BN, typename T, typename U>
ESIMD_INLINE vector_type_t<typename T::element_type, T::length>
readRegion(const vector_type_t<BT, BN> &Base, std::pair<T, U> Region) {
  // parent-region type
  using PaTy = typename shape_type<U>::type;
  constexpr int BN1 = PaTy::length;
  using BT1 = typename PaTy::element_type;
  using ElemTy = typename T::element_type;
  // Recursively read the base
  auto Base1 = readRegion<BT, BN>(Base, Region.second);
  if constexpr (!T::Is_2D || BN1 * sizeof(BT1) == T::Size_in_bytes)
    // 1-D region or format
    return readRegion<BT1, BN1>(Base1, Region.first);
  else {
    static_assert(T::Is_2D);
    static_assert(std::is_same<ElemTy, BT1>::value);
    // To read a 2D region, we need the parent region
    // Read full rows with non-trivial vertical and horizontal stride = 1.
    constexpr int M = T::Size_y * PaTy::Size_x;
    constexpr int VS = T::Stride_y * PaTy::Size_x;
    constexpr int W = PaTy::Size_x;
    constexpr int HS = 1;
    constexpr int ParentWidth = PaTy::Size_x;
    // Byte offset of the first selected row within the parent matrix.
    uint16_t Offset = static_cast<uint16_t>(Region.first.M_offset_y *
                                            PaTy::Size_x * sizeof(ElemTy));

    auto R =
        __esimd_rdregion<ElemTy, BN1, M, VS, W, HS, ParentWidth>(Base1, Offset);

    // Read columns with non-trivial horizontal stride.
    constexpr int N1 = M;
    constexpr int M1 = T::length;
    constexpr int VS1 = PaTy::Size_x;
    constexpr int W1 = T::Size_x;
    constexpr int HS1 = T::Stride_x;
    // Byte offset of the first selected column within a row.
    uint16_t Offset1 =
        static_cast<uint16_t>(Region.first.M_offset_x * sizeof(ElemTy));

    return __esimd_rdregion<ElemTy, N1, M1, VS1, W1, HS1, ParentWidth>(R,
                                                                       Offset1);
  }
}
| 213 | + |
| 214 | +} // namespace gpu |
| 215 | +} // namespace intel |
| 216 | +} // namespace sycl |
| 217 | +} // __SYCL_INLINE_NAMESPACE(cl) |
| 218 | + |
// vload
//
// map to the backend vload intrinsic, used by compiler to control
// optimization on simd object
//
template <typename T, int N>
SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, N>
__esimd_vload(const sycl::intel::gpu::vector_type_t<T, N> *ptr);

// vstore
//
// map to the backend vstore intrinsic, used by compiler to control
// optimization on simd object
template <typename T, int N>
SYCL_EXTERNAL void __esimd_vstore(sycl::intel::gpu::vector_type_t<T, N> *ptr,
                                  sycl::intel::gpu::vector_type_t<T, N> vals);

// Horizontal "any" reduction over src.
// NOTE(review): semantics inferred from the name (non-zero result iff any
// lane of src is non-zero) — backend declaration only; confirm against the
// backend intrinsic definition.
template <typename T, int N>
SYCL_EXTERNAL uint16_t __esimd_any(sycl::intel::gpu::vector_type_t<T, N> src);

// Horizontal "all" reduction over src.
// NOTE(review): semantics inferred from the name (non-zero result iff every
// lane of src is non-zero) — backend declaration only; confirm against the
// backend intrinsic definition.
template <typename T, int N>
SYCL_EXTERNAL uint16_t __esimd_all(sycl::intel::gpu::vector_type_t<T, N> src);
| 241 | + |
| 242 | +#ifndef __SYCL_DEVICE_ONLY__ |
| 243 | + |
| 244 | +// Implementations of ESIMD intrinsics for the SYCL host device |
| 245 | +template <typename T, int N, int M, int VStride, int Width, int Stride, |
| 246 | + int ParentWidth> |
| 247 | +SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, M> |
| 248 | +__esimd_rdregion(sycl::intel::gpu::vector_type_t<T, N> Input, uint16_t Offset) { |
| 249 | + uint16_t EltOffset = Offset / sizeof(T); |
| 250 | + assert(Offset % sizeof(T) == 0); |
| 251 | + |
| 252 | + int NumRows = M / Width; |
| 253 | + assert(M % Width == 0); |
| 254 | + |
| 255 | + sycl::intel::gpu::vector_type_t<T, M> Result; |
| 256 | + int Index = 0; |
| 257 | + for (int i = 0; i < NumRows; ++i) { |
| 258 | + for (int j = 0; j < Width; ++j) { |
| 259 | + Result[Index++] = Input[i * VStride + j * Stride + EltOffset]; |
| 260 | + } |
| 261 | + } |
| 262 | + return Result; |
| 263 | +} |
| 264 | + |
| 265 | +template <typename T, int N, int M, int VStride, int Width, int Stride, |
| 266 | + int ParentWidth> |
| 267 | +SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, N> |
| 268 | +__esimd_wrregion(sycl::intel::gpu::vector_type_t<T, N> OldVal, |
| 269 | + sycl::intel::gpu::vector_type_t<T, M> NewVal, uint16_t Offset, |
| 270 | + sycl::intel::gpu::mask_type_t<M> Mask) { |
| 271 | + uint16_t EltOffset = Offset / sizeof(T); |
| 272 | + assert(Offset % sizeof(T) == 0); |
| 273 | + |
| 274 | + int NumRows = M / Width; |
| 275 | + assert(M % Width == 0); |
| 276 | + |
| 277 | + sycl::intel::gpu::vector_type_t<T, N> Result = OldVal; |
| 278 | + int Index = 0; |
| 279 | + for (int i = 0; i < NumRows; ++i) { |
| 280 | + for (int j = 0; j < Width; ++j) { |
| 281 | + if (Mask[Index]) |
| 282 | + Result[i * VStride + j * Stride + EltOffset] = NewVal[Index]; |
| 283 | + ++Index; |
| 284 | + } |
| 285 | + } |
| 286 | + return Result; |
| 287 | +} |
| 288 | + |
| 289 | +#endif // __SYCL_DEVICE_ONLY__ |
0 commit comments