Skip to content

[SYCL][ESIMD] add indirect read and write methods #3039

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions llvm/lib/SYCLLowerIR/LowerESIMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,10 +237,16 @@ class ESIMDIntrinDescTable {
// for integer, "f" - for floating point
{"rdregion",
{"rdregion", {a(0), t(3), t(4), t(5), a(1), t(6)}, nk(-1)}},
{"rdindirect",
{"rdregion", {a(0), c32(0), t(2), c32(0), a(1), t(3)}, nk(-1)}},
{{"wrregion"},
{{"wrregion"},
{a(0), a(1), t(3), t(4), t(5), a(2), t(6), ai1(3)},
nk(-1)}},
{{"wrindirect"},
{{"wrregion"},
{a(0), a(1), c32(0), t(2), c32(0), a(2), t(3), ai1(3)},
nk(-1)}},
{"vload", {"vload", {l(0)}}},
{"vstore", {"vstore", {a(1), a(0)}}},

Expand Down
24 changes: 24 additions & 0 deletions llvm/test/SYCLLowerIR/esimd_lower_intrins.ll
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,28 @@ define dso_local spir_func void @FUNC_42() !sycl_explicit_simd !1 {
ret void
}

define dso_local spir_func <8 x i32> @FUNC_43() !sycl_explicit_simd !1 {
%a_1 = alloca <16 x i32>
%1 = load <16 x i32>, <16 x i32>* %a_1
%a_2 = alloca <8 x i16>
%2 = load <8 x i16>, <8 x i16>* %a_2
%ret_val = call spir_func <8 x i32> @_Z18__esimd_rdindirectIiLi16ELi8ELi0EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT1_EE4typeENS4_IS5_XT0_EE4typeENS4_ItXT1_EE4typeE(<16 x i32> %1, <8 x i16> %2)
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.rdregioni.v8i32.v16i32.v8i16(<16 x i32> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 0, <8 x i16> %{{[0-9a-zA-Z_.]+}}, i32 0)
ret <8 x i32> %ret_val
}

define dso_local spir_func <16 x i32> @FUNC_44() !sycl_explicit_simd !1 {
%a_1 = alloca <16 x i32>
%1 = load <16 x i32>, <16 x i32>* %a_1
%a_2 = alloca <8 x i32>
%2 = load <8 x i32>, <8 x i32>* %a_2
%a_3 = alloca <8 x i16>
%3 = load <8 x i16>, <8 x i16>* %a_3
%ret_val = call spir_func <16 x i32> @_Z18__esimd_wrindirectIiLi16ELi8ELi0EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeES7_NS4_IS5_XT1_EE4typeENS4_ItXT1_EE4typeESB_(<16 x i32> %1, <8 x i32> %2, <8 x i16> %3, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.wrregioni.v16i32.v8i32.v8i16.v8i1(<16 x i32> %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}, i32 0, i32 8, i32 0, <8 x i16> %{{[0-9a-zA-Z_.]+}}, i32 0, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
ret <16 x i32> %ret_val
}

declare dso_local spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic0ILN2cm3gen14CmAtomicOpTypeE2EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeENS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i16> %1)
declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic1ILN2cm3gen14CmAtomicOpTypeE0EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i16> %2)
Expand Down Expand Up @@ -335,6 +357,8 @@ declare dso_local spir_func <16 x i32> @_Z18__esimd_sudp4a_satIijjjLi16EEN2cl4sy
declare dso_local spir_func <16 x i32> @_Z18__esimd_ssdp4a_satIiiiiLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
declare dso_local spir_func <8 x i32> @_Z22__esimd_slm_block_readIiLi8EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeEj(i32 %0)
declare dso_local spir_func void @_Z16__esimd_sbarrierN2cl4sycl5INTEL3gpu17EsimdSbarrierTypeE(i8 %0)
declare dso_local spir_func <8 x i32> @_Z18__esimd_rdindirectIiLi16ELi8ELi0EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT1_EE4typeENS4_IS5_XT0_EE4typeENS4_ItXT1_EE4typeE(<16 x i32>, <8 x i16>)
declare dso_local spir_func <16 x i32> @_Z18__esimd_wrindirectIiLi16ELi8ELi0EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeES7_NS4_IS5_XT1_EE4typeENS4_ItXT1_EE4typeESB_(<16 x i32>, <8 x i32>, <8 x i16>, <8 x i16>)

attributes #0 = { "genx_byte_offset"="192" "genx_volatile" }

Expand Down
44 changes: 44 additions & 0 deletions sycl/include/CL/sycl/INTEL/esimd/detail/esimd_intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,11 @@ template <typename T, int N, int M, int VStride, int Width, int Stride,
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, M>
__esimd_rdregion(sycl::INTEL::gpu::vector_type_t<T, N> Input, uint16_t Offset);

template <typename T, int N, int M, int ParentWidth = 0>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, M>
__esimd_rdindirect(sycl::INTEL::gpu::vector_type_t<T, N> Input,
sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset);

// __esimd_wrregion returns the updated vector with the region updated.
//
// @param T the element data type, one of i8, i16, i32, i64, half, float,
Expand Down Expand Up @@ -120,6 +125,13 @@ __esimd_wrregion(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
sycl::INTEL::gpu::vector_type_t<T, M> NewVal, uint16_t Offset,
sycl::INTEL::gpu::mask_type_t<M> Mask = 1);

template <typename T, int N, int M, int ParentWidth = 0>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
__esimd_wrindirect(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
sycl::INTEL::gpu::vector_type_t<T, M> NewVal,
sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset,
sycl::INTEL::gpu::mask_type_t<M> Mask = 1);

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace INTEL {
Expand Down Expand Up @@ -261,6 +273,20 @@ __esimd_rdregion(sycl::INTEL::gpu::vector_type_t<T, N> Input, uint16_t Offset) {
return Result;
}

template <typename T, int N, int M, int ParentWidth>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, M>
__esimd_rdindirect(sycl::INTEL::gpu::vector_type_t<T, N> Input,
sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset) {
sycl::INTEL::gpu::vector_type_t<T, M> Result;
for (int i = 0; i < M; ++i) {
uint16_t EltOffset = Offset[i] / sizeof(T);
assert(Offset[i] % sizeof(T) == 0);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could add safety check here and report error on out-of-bound access.

assert(EltOffset < N);
Result[i] = Input[EltOffset];
}
return Result;
}

template <typename T, int N, int M, int VStride, int Width, int Stride,
int ParentWidth>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
Expand All @@ -285,4 +311,22 @@ __esimd_wrregion(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
return Result;
}

template <typename T, int N, int M, int ParentWidth>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<T, N>
__esimd_wrindirect(sycl::INTEL::gpu::vector_type_t<T, N> OldVal,
sycl::INTEL::gpu::vector_type_t<T, M> NewVal,
sycl::INTEL::gpu::vector_type_t<uint16_t, M> Offset,
sycl::INTEL::gpu::mask_type_t<M> Mask) {
sycl::INTEL::gpu::vector_type_t<T, N> Result = OldVal;
for (int i = 0; i < M; ++i) {
if (Mask[i]) {
uint16_t EltOffset = Offset[i] / sizeof(T);
assert(Offset[i] % sizeof(T) == 0);
assert(EltOffset < N);
Result[EltOffset] = NewVal[i];
}
}
return Result;
}

#endif // __SYCL_DEVICE_ONLY__
25 changes: 24 additions & 1 deletion sycl/include/CL/sycl/INTEL/esimd/esimd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,32 @@ template <typename Ty, int N> class simd {
// This would allow you to use the subscript operator to write to an
// element.
// {/quote}
/// Read a single element, by value only.
/// Read single element, return value only (not reference).
Ty operator[](int i) const { return data()[i]; }

// TODO ESIMD_EXPERIMENTAL
/// Read multiple elements by their indices in vector
template <int Size>
simd<Ty, Size> iselect(const simd<uint16_t, Size> &Indices) {
vector_type_t<uint16_t, Size> Offsets = Indices.data() * sizeof(Ty);
return __esimd_rdindirect<Ty, N, Size>(data(), Offsets);
}
// TODO ESIMD_EXPERIMENTAL
/// update single element
void iupdate(ushort Index, Ty V) {
auto Val = data();
Val[Index] = V;
set(Val);
}
// TODO ESIMD_EXPERIMENTAL
/// update multiple elements by their indices in vector
template <int Size>
void iupdate(const simd<uint16_t, Size> &Indices, const simd<Ty, Size> &Val,
mask_type_t<Size> Mask) {
vector_type_t<uint16_t, Size> Offsets = Indices.data() * sizeof(Ty);
set(__esimd_wrindirect<Ty, N, Size>(data(), Val.data(), Offsets, Mask));
}

// TODO
// @rolandschulz
// {quote}
Expand Down
10 changes: 10 additions & 0 deletions sycl/test/esimd/simd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -230,3 +230,13 @@ bool test_replicate3() __attribute__((sycl_device)) {

return v0_rep[0] == v0[1] && v0_rep[1] == v0[3] && v0_rep[2] == v0[5];
}

bool test_simd_iselect() __attribute__((sycl_device)) {
simd<int, 16> v(0, 1);
simd<ushort, 8> a(0, 2);
auto data = v.iselect(a);
data += 16;
v.iupdate(a, data, 1);
auto ref = v.select<8, 2>(0);
return ref[0] == 16 && ref[14] == 32;
}