Skip to content

Commit 91a25a0

Browse files
[SYCL][ESIMD] Add simd and simd_view classes (#1854)
This patch adds simd and simd_view class implementation used to implement Explicit SIMD vector APIs. Author: Wei Pan Co-authored-by: Chen, Gang Y <[email protected]> Co-authored-by: Bobrovsky, Konstantin S <[email protected]> Co-authored-by: Ashar, Pratik J <[email protected]> Signed-off-by: Denis Bakhvalov <[email protected]>
1 parent 7ca1729 commit 91a25a0

File tree

8 files changed

+1457
-0
lines changed

8 files changed

+1457
-0
lines changed

sycl/include/CL/sycl/intel/esimd.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010

1111
#pragma once
1212

13+
#include <CL/sycl/intel/esimd/esimd.hpp>
14+
#include <CL/sycl/intel/esimd/esimd_view.hpp>
15+
1316
#ifdef __SYCL_DEVICE_ONLY__
1417
#define SYCL_ESIMD_KERNEL __attribute__((sycl_explicit_simd))
1518
#define SYCL_ESIMD_FUNCTION __attribute__((sycl_explicit_simd))
Lines changed: 289 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,289 @@
1+
//==------------ esimd_intrin.hpp - DPC++ Explicit SIMD API --------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// Declares Explicit SIMD intrinsics used to implement working with
9+
// the SIMD classes objects.
10+
//===----------------------------------------------------------------------===//
11+
12+
#pragma once
13+
14+
#include <CL/sycl/intel/esimd/detail/esimd_types.hpp>
15+
#include <CL/sycl/intel/esimd/esimd_enum.hpp>
16+
#include <cstdint>
17+
18+
// \brief __esimd_rdregion: region access intrinsic.
19+
//
20+
// @param T the element data type, one of i8, i16, i32, i64, half, float,
21+
// double. In particular bool (i1) and pointer types are not allowed.
22+
//
23+
// @param N the input vector size.
24+
//
25+
// @param M the return vector size.
26+
//
27+
// @param VStride the vertical stride in elements between rows.
28+
//
29+
// @param Width the size or each row, non-zero and even divides `M`.
30+
//
31+
// @param Stride horizontal stride in elements within each row.
32+
//
33+
// @param ParentWidth the width of the input vector when viewed as a 2D
34+
// matrix. Ignored if offset is a constant.
35+
//
36+
// @param Input the input vector
37+
//
38+
// @param Offset the starting offset in bytes.
39+
//
40+
// @return the region extracted.
41+
//
42+
// This intrinsic computes a vector Result:
43+
//
44+
// \code{.cpp}
45+
// uint16_t EltOffset = Offset / sizeof(T);
46+
// assert(Offset % sizeof(T) == 0);
47+
//
48+
// int NumRows = M / Width;
49+
// assert(M % Width == 0);
50+
//
51+
// int Index = 0;
52+
// for (int i = 0; i < NumRows; ++i) {
53+
// for (int j = 0; j < Width; ++j) {
54+
// Result[Index++] = Input[i * VStride + j * Stride +
55+
// EltOffset];
56+
// }
57+
// }
58+
// \endcode
59+
//
60+
template <typename T, int N, int M, int VStride, int Width, int Stride,
61+
int ParentWidth = 0>
62+
SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, M>
63+
__esimd_rdregion(sycl::intel::gpu::vector_type_t<T, N> Input, uint16_t Offset);
64+
65+
// __esimd_wrregion returns the updated vector with the region updated.
66+
//
67+
// @param T the element data type, one of i8, i16, i32, i64, half, float,
68+
// double. In particular bool (i1) and pointer types are not allowed.
69+
//
70+
// @param N the return vector size.
71+
//
72+
// @param M the vector size to write.
73+
//
74+
// @param VStride the vertical stride in elements between rows.
75+
//
76+
// @param Width the size or each row, non-zero and even divides `M`.
77+
//
78+
// @param Stride horizontal stride in elements within each row.
79+
//
80+
// @param ParentWidth the width of the input vector when viewed as a 2D
81+
// matrix. Ignored if offset is a constant.
82+
//
83+
// @param OldVal the vector to write region into.
84+
//
85+
// @param NewVal the vector to write.
86+
//
87+
// @param Offset the starting offset in bytes.
88+
//
89+
// @return the updated vector with the region modifided.
90+
//
91+
// This intrinsic computes a vector Result:
92+
//
93+
// \code{.cpp}
94+
// uint16_t EltOffset = Offset / sizeof(T);
95+
// assert(Offset % sizeof(T) == 0);
96+
//
97+
// int NumRows = M / Width;
98+
// assert(M % Width == 0);
99+
//
100+
// Result = OldValue;
101+
// int Index = 0;
102+
// for (int i = 0; i < NumRows; ++i) {
103+
// for (int j = 0; j < Width; ++j) {
104+
// if (Mask[Index])
105+
// Result[i * VStride + j * Stride + EltOffset] =
106+
// NewVal[Index];
107+
// ++Index;
108+
// }
109+
// }
110+
// \endcode
111+
//
112+
template <typename T, int N, int M, int VStride, int Width, int Stride,
113+
int ParentWidth = 0>
114+
SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, N>
115+
__esimd_wrregion(sycl::intel::gpu::vector_type_t<T, N> OldVal,
116+
sycl::intel::gpu::vector_type_t<T, M> NewVal, uint16_t Offset,
117+
sycl::intel::gpu::mask_type_t<M> Mask = 1);
118+
119+
__SYCL_INLINE_NAMESPACE(cl) {
120+
namespace sycl {
121+
namespace intel {
122+
namespace gpu {
123+
// TODO dependencies on the std SYCL concepts like images
124+
// should be refactored in a separate header
125+
class AccessorPrivateProxy {
126+
public:
127+
#ifdef __SYCL_DEVICE_ONLY__
128+
template <typename AccessorTy>
129+
static auto getNativeImageObj(const AccessorTy &Acc) {
130+
return Acc.getNativeImageObj();
131+
}
132+
#else
133+
template <typename AccessorTy>
134+
static auto getImageRange(const AccessorTy &Acc) {
135+
return Acc.getAccessRange();
136+
}
137+
static auto getElemSize(const sycl::detail::AccessorBaseHost &Acc) {
138+
return Acc.getElemSize();
139+
}
140+
#endif
141+
};
142+
143+
constexpr unsigned int ElemsPerAddrDecoding(unsigned int ElemsPerAddrEncoded) {
144+
// encoding requires 2^ElemsPerAddrEncoded
145+
return (1 << ElemsPerAddrEncoded);
146+
}
147+
148+
/// read from a basic region of a vector, return a vector
149+
template <typename BT, int BN, typename RTy>
150+
vector_type_t<typename RTy::element_type, RTy::length>
151+
ESIMD_INLINE readRegion(const vector_type_t<BT, BN> &Base, RTy Region) {
152+
using ElemTy = typename RTy::element_type;
153+
auto Base1 = bitcast<ElemTy, BT, BN>(Base);
154+
constexpr int Bytes = BN * sizeof(BT);
155+
if constexpr (Bytes == RTy::Size_in_bytes)
156+
// This is a no-op format.
157+
return Base1;
158+
else {
159+
static_assert(!RTy::Is_2D);
160+
constexpr int N = Bytes / sizeof(ElemTy);
161+
// Access the region information.
162+
constexpr int M = RTy::Size_x;
163+
constexpr int Stride = RTy::Stride_x;
164+
int16_t Offset = static_cast<int16_t>(Region.M_offset_x * sizeof(ElemTy));
165+
// read-region
166+
return __esimd_rdregion<ElemTy, N, M, /*VS*/ 0, M, Stride>(Base1, Offset);
167+
}
168+
}
169+
170+
/// read from a nested region of a vector, return a vector
171+
template <typename BT, int BN, typename T, typename U>
172+
ESIMD_INLINE vector_type_t<typename T::element_type, T::length>
173+
readRegion(const vector_type_t<BT, BN> &Base, std::pair<T, U> Region) {
174+
// parent-region type
175+
using PaTy = typename shape_type<U>::type;
176+
constexpr int BN1 = PaTy::length;
177+
using BT1 = typename PaTy::element_type;
178+
using ElemTy = typename T::element_type;
179+
// Recursively read the base
180+
auto Base1 = readRegion<BT, BN>(Base, Region.second);
181+
if constexpr (!T::Is_2D || BN1 * sizeof(BT1) == T::Size_in_bytes)
182+
// 1-D region or format
183+
return readRegion<BT1, BN1>(Base1, Region.first);
184+
else {
185+
static_assert(T::Is_2D);
186+
static_assert(std::is_same<ElemTy, BT1>::value);
187+
// To read a 2D region, we need the parent region
188+
// Read full rows with non-trivial vertical and horizontal stride = 1.
189+
constexpr int M = T::Size_y * PaTy::Size_x;
190+
constexpr int VS = T::Stride_y * PaTy::Size_x;
191+
constexpr int W = PaTy::Size_x;
192+
constexpr int HS = 1;
193+
constexpr int ParentWidth = PaTy::Size_x;
194+
uint16_t Offset = static_cast<uint16_t>(Region.first.M_offset_y *
195+
PaTy::Size_x * sizeof(ElemTy));
196+
197+
auto R =
198+
__esimd_rdregion<ElemTy, BN1, M, VS, W, HS, ParentWidth>(Base1, Offset);
199+
200+
// Read columns with non-trivial horizontal stride.
201+
constexpr int N1 = M;
202+
constexpr int M1 = T::length;
203+
constexpr int VS1 = PaTy::Size_x;
204+
constexpr int W1 = T::Size_x;
205+
constexpr int HS1 = T::Stride_x;
206+
uint16_t Offset1 =
207+
static_cast<uint16_t>(Region.first.M_offset_x * sizeof(ElemTy));
208+
209+
return __esimd_rdregion<ElemTy, N1, M1, VS1, W1, HS1, ParentWidth>(R,
210+
Offset1);
211+
}
212+
}
213+
214+
} // namespace gpu
215+
} // namespace intel
216+
} // namespace sycl
217+
} // __SYCL_INLINE_NAMESPACE(cl)
218+
219+
// vload
220+
//
221+
// map to the backend vload intrinsic, used by compiler to control
222+
// optimization on simd object
223+
//
224+
template <typename T, int N>
225+
SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, N>
226+
__esimd_vload(const sycl::intel::gpu::vector_type_t<T, N> *ptr);
227+
228+
// vstore
229+
//
230+
// map to the backend vstore intrinsic, used by compiler to control
231+
// optimization on simd object
232+
template <typename T, int N>
233+
SYCL_EXTERNAL void __esimd_vstore(sycl::intel::gpu::vector_type_t<T, N> *ptr,
234+
sycl::intel::gpu::vector_type_t<T, N> vals);
235+
236+
template <typename T, int N>
237+
SYCL_EXTERNAL uint16_t __esimd_any(sycl::intel::gpu::vector_type_t<T, N> src);
238+
239+
template <typename T, int N>
240+
SYCL_EXTERNAL uint16_t __esimd_all(sycl::intel::gpu::vector_type_t<T, N> src);
241+
242+
#ifndef __SYCL_DEVICE_ONLY__
243+
244+
// Implementations of ESIMD intrinsics for the SYCL host device
245+
template <typename T, int N, int M, int VStride, int Width, int Stride,
246+
int ParentWidth>
247+
SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, M>
248+
__esimd_rdregion(sycl::intel::gpu::vector_type_t<T, N> Input, uint16_t Offset) {
249+
uint16_t EltOffset = Offset / sizeof(T);
250+
assert(Offset % sizeof(T) == 0);
251+
252+
int NumRows = M / Width;
253+
assert(M % Width == 0);
254+
255+
sycl::intel::gpu::vector_type_t<T, M> Result;
256+
int Index = 0;
257+
for (int i = 0; i < NumRows; ++i) {
258+
for (int j = 0; j < Width; ++j) {
259+
Result[Index++] = Input[i * VStride + j * Stride + EltOffset];
260+
}
261+
}
262+
return Result;
263+
}
264+
265+
template <typename T, int N, int M, int VStride, int Width, int Stride,
266+
int ParentWidth>
267+
SYCL_EXTERNAL sycl::intel::gpu::vector_type_t<T, N>
268+
__esimd_wrregion(sycl::intel::gpu::vector_type_t<T, N> OldVal,
269+
sycl::intel::gpu::vector_type_t<T, M> NewVal, uint16_t Offset,
270+
sycl::intel::gpu::mask_type_t<M> Mask) {
271+
uint16_t EltOffset = Offset / sizeof(T);
272+
assert(Offset % sizeof(T) == 0);
273+
274+
int NumRows = M / Width;
275+
assert(M % Width == 0);
276+
277+
sycl::intel::gpu::vector_type_t<T, N> Result = OldVal;
278+
int Index = 0;
279+
for (int i = 0; i < NumRows; ++i) {
280+
for (int j = 0; j < Width; ++j) {
281+
if (Mask[Index])
282+
Result[i * VStride + j * Stride + EltOffset] = NewVal[Index];
283+
++Index;
284+
}
285+
}
286+
return Result;
287+
}
288+
289+
#endif // __SYCL_DEVICE_ONLY__

sycl/include/CL/sycl/intel/esimd/detail/esimd_types.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
#pragma once
1212

13+
#include <CL/sycl/detail/defines.hpp>
1314
#include <CL/sycl/detail/stl_type_traits.hpp> // to define C++14,17 extensions
1415
#include <CL/sycl/half_type.hpp>
1516
#include <CL/sycl/intel/esimd/detail/esimd_region.hpp>

0 commit comments

Comments
 (0)