Skip to content

[SYCL][ESIMD] Change arguments of some APIs to be template parameters #5961

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Apr 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions llvm/lib/SYCLLowerIR/ESIMD/LowerESIMD.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -336,10 +336,10 @@ class ESIMDIntrinDescTable {
{"svm_block_ld_unaligned", {"svm.block.ld.unaligned", {l(0)}}},
{"svm_block_ld", {"svm.block.ld", {l(0)}}},
{"svm_block_st", {"svm.block.st", {l(1)}}},
{"svm_gather", {"svm.gather", {ai1(2), a(1), a(0), u(-1)}}},
{"svm_gather", {"svm.gather", {ai1(1), t(3), a(0), u(-1)}}},
{"svm_gather4_scaled",
{"svm.gather4.scaled", {ai1(1), t(2), c16(0), c64(0), a(0), u(-1)}}},
{"svm_scatter", {"svm.scatter", {ai1(3), a(2), a(0), a(1)}}},
{"svm_scatter", {"svm.scatter", {ai1(2), t(3), a(0), a(1)}}},
{"svm_scatter4_scaled",
{"svm.scatter4.scaled", {ai1(2), t(2), c16(0), c64(0), a(0), a(1)}}},

Expand Down
23 changes: 11 additions & 12 deletions sycl/include/sycl/ext/intel/esimd/detail/memory_intrin.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,12 +81,11 @@ constexpr unsigned int ElemsPerAddrDecoding(unsigned int ElemsPerAddrEncoded) {
} // __SYCL_INLINE_NAMESPACE(cl)

// flat_read does flat-address gather
template <typename Ty, int N, int NumBlk = 0>
template <typename Ty, int N, int NumBlk = 0, int ElemsPerAddr = 0>
__ESIMD_INTRIN
__ESIMD_DNS::vector_type_t<Ty,
N * __ESIMD_DNS::ElemsPerAddrDecoding(NumBlk)>
__esimd_svm_gather(__ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
int ElemsPerAddr = NumBlk,
__ESIMD_DNS::simd_mask_storage_t<N> pred = 1)
#ifdef __SYCL_DEVICE_ONLY__
;
Expand All @@ -95,18 +94,18 @@ __ESIMD_INTRIN
auto NumBlkDecoded = __ESIMD_DNS::ElemsPerAddrDecoding(NumBlk);
__ESIMD_DNS::vector_type_t<Ty, N * __ESIMD_DNS::ElemsPerAddrDecoding(NumBlk)>
V = 0;
ElemsPerAddr = __ESIMD_DNS::ElemsPerAddrDecoding(ElemsPerAddr);
auto ElemsPerAddrDecoded = __ESIMD_DNS::ElemsPerAddrDecoding(ElemsPerAddr);
if (sizeof(Ty) == 2)
ElemsPerAddr = ElemsPerAddr / 2;
ElemsPerAddrDecoded = ElemsPerAddrDecoded / 2;

for (int I = 0; I < N; I++) {
if (pred[I]) {
Ty *Addr = reinterpret_cast<Ty *>(addrs[I]);
if (sizeof(Ty) <= 2) {
for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++)
for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddrDecoded; J++)
V[I * NumBlkDecoded + J] = *(Addr + J);
} else {
for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++)
for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddrDecoded; J++)
V[J * N + I] = *(Addr + J);
}
}
Expand All @@ -116,30 +115,30 @@ __ESIMD_INTRIN
#endif // __SYCL_DEVICE_ONLY__

// flat_write does flat-address scatter
template <typename Ty, int N, int NumBlk = 0>
template <typename Ty, int N, int NumBlk = 0, int ElemsPerAddr = 0>
__ESIMD_INTRIN void __esimd_svm_scatter(
__ESIMD_DNS::vector_type_t<uint64_t, N> addrs,
__ESIMD_DNS::vector_type_t<Ty,
N * __ESIMD_DNS::ElemsPerAddrDecoding(NumBlk)>
vals,
int ElemsPerAddr = NumBlk, __ESIMD_DNS::simd_mask_storage_t<N> pred = 1)
__ESIMD_DNS::simd_mask_storage_t<N> pred = 1)
#ifdef __SYCL_DEVICE_ONLY__
;
#else
{
auto NumBlkDecoded = __ESIMD_DNS::ElemsPerAddrDecoding(NumBlk);
ElemsPerAddr = __ESIMD_DNS::ElemsPerAddrDecoding(ElemsPerAddr);
auto ElemsPerAddrDecoded = __ESIMD_DNS::ElemsPerAddrDecoding(ElemsPerAddr);
if (sizeof(Ty) == 2)
ElemsPerAddr = ElemsPerAddr / 2;
ElemsPerAddrDecoded = ElemsPerAddrDecoded / 2;

for (int I = 0; I < N; I++) {
if (pred[I]) {
Ty *Addr = reinterpret_cast<Ty *>(addrs[I]);
if (sizeof(Ty) <= 2) {
for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++)
for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddrDecoded; J++)
*(Addr + J) = vals[I * NumBlkDecoded + J];
} else {
for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddr; J++)
for (int J = 0; J < NumBlkDecoded && J < ElemsPerAddrDecoded; J++)
*(Addr + J) = vals[J * N + I];
}
}
Expand Down
35 changes: 22 additions & 13 deletions sycl/include/sycl/ext/intel/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,16 +134,19 @@ gather(const Tx *p, simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
addrs = addrs + offsets_i;

if constexpr (sizeof(T) == 1) {
auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<4>()>(
addrs.data(), detail::ElemsPerAddrEncoding<1>(), mask.data());
auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<4>(),
detail::ElemsPerAddrEncoding<1>()>(
addrs.data(), mask.data());
return __esimd_rdregion<T, N * 4, N, /*VS*/ 0, N, 4>(Ret, 0);
} else if constexpr (sizeof(T) == 2) {
auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<2>()>(
addrs.data(), detail::ElemsPerAddrEncoding<2>(), mask.data());
auto Ret = __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<2>(),
detail::ElemsPerAddrEncoding<2>()>(
addrs.data(), mask.data());
return __esimd_rdregion<T, N * 2, N, /*VS*/ 0, N, 2>(Ret, 0);
} else
return __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<1>()>(
addrs.data(), detail::ElemsPerAddrEncoding<1>(), mask.data());
return __esimd_svm_gather<T, N, detail::ElemsPerAddrEncoding<1>(),
detail::ElemsPerAddrEncoding<1>()>(addrs.data(),
mask.data());
}

/// Writes ("scatters") elements of the input vector to different memory
Expand All @@ -169,17 +172,19 @@ scatter(Tx *p, simd<uint32_t, N> offsets, simd<Tx, N> vals,
if constexpr (sizeof(T) == 1) {
simd<T, N * 4> D;
D = __esimd_wrregion<T, N * 4, N, /*VS*/ 0, N, 4>(D.data(), vals.data(), 0);
__esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<4>()>(
addrs.data(), D.data(), detail::ElemsPerAddrEncoding<1>(), mask.data());
__esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<4>(),
detail::ElemsPerAddrEncoding<1>()>(
addrs.data(), D.data(), mask.data());
} else if constexpr (sizeof(T) == 2) {
simd<T, N * 2> D;
D = __esimd_wrregion<T, N * 2, N, /*VS*/ 0, N, 2>(D.data(), vals.data(), 0);
__esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<2>()>(
addrs.data(), D.data(), detail::ElemsPerAddrEncoding<2>(), mask.data());
__esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<2>(),
detail::ElemsPerAddrEncoding<2>()>(
addrs.data(), D.data(), mask.data());
} else
__esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<1>()>(
addrs.data(), vals.data(), detail::ElemsPerAddrEncoding<1>(),
mask.data());
__esimd_svm_scatter<T, N, detail::ElemsPerAddrEncoding<1>(),
detail::ElemsPerAddrEncoding<1>()>(
addrs.data(), vals.data(), mask.data());
}

/// Loads a contiguous block of memory from given memory address and returns
Expand Down Expand Up @@ -769,6 +774,9 @@ enum fence_mask : uint8_t {
/// esimd::fence sets the memory read/write order.
/// @tparam cntl A bitmask composed from \c fence_mask bits.
///
template <uint8_t cntl> __ESIMD_API void fence() { __esimd_fence(cntl); }

__SYCL_DEPRECATED("use fence<fence_mask>()")
__ESIMD_API void fence(fence_mask cntl) { __esimd_fence(cntl); }

/// Generic work-group barrier.
Expand All @@ -790,6 +798,7 @@ __ESIMD_API void barrier() {
/// @{

/// Declare per-work-group slm size.
/// @param size Shared Local Memory (SLM) size
__ESIMD_API void slm_init(uint32_t size) { __esimd_slm_init(size); }

/// Gather operation over the Shared Local Memory.
Expand Down
8 changes: 7 additions & 1 deletion sycl/include/sycl/ext/intel/experimental/esimd/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@ namespace __ESIMD_ENS {
/// @addtogroup sycl_esimd_memory
/// @{

/// Generic work-group split barrier
/// Generic work-group split barrier.
/// @tparam flag - split barrier action.
template <split_barrier_action flag> __ESIMD_API void split_barrier() {
__esimd_sbarrier(flag);
}

__SYCL_DEPRECATED("use split_barrier<split_barrier_action>()")
__ESIMD_API void split_barrier(split_barrier_action flag) {
__esimd_sbarrier(flag);
}
Expand Down
20 changes: 20 additions & 0 deletions sycl/test/esimd/deprecated.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// RUN: %clangxx -fsycl -fsycl-device-only -fsyntax-only -Xclang -verify %s

#include <sycl/ext/intel/esimd.hpp>

using namespace sycl::ext::intel::esimd;
using namespace sycl::ext::intel::experimental::esimd;

void test_slm_init() SYCL_ESIMD_FUNCTION { slm_init(1024); }

void test_fence() SYCL_ESIMD_FUNCTION {
fence<fence_mask::global_coherent_fence | fence_mask::local_barrier>();
// expected-warning@+2 {{deprecated}}
// expected-note@sycl/ext/intel/esimd/memory.hpp:* {{has been explicitly marked deprecated here}}
fence(static_cast<fence_mask>(fence_mask::global_coherent_fence |
fence_mask::local_barrier));
}

void test_split_barrier() SYCL_ESIMD_FUNCTION {
split_barrier<split_barrier_action::signal>();
}
8 changes: 4 additions & 4 deletions sycl/test/esimd/intrins_trans.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<float, 16> foo() {
// CHECK: call void @llvm.genx.svm.block.st.i64.v32i32(i64 %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})

simd<uint32_t, VL> v01 =
__esimd_svm_gather<uint32_t, VL>(v_addr.data(), 0, pred.data());
__esimd_svm_gather<uint32_t, VL>(v_addr.data(), pred.data());
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <32 x i32> @llvm.genx.svm.gather.v32i32.v32i1.v32i64(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> undef)

__esimd_svm_scatter<uint32_t, VL>(v_addr.data(), v01.data(), 0, pred.data());
__esimd_svm_scatter<uint32_t, VL>(v_addr.data(), v01.data(), pred.data());
// CHECK: call void @llvm.genx.svm.scatter.v32i1.v32i64.v32i32(<32 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, <32 x i64> %{{[0-9a-zA-Z_.]+}}, <32 x i32> %{{[0-9a-zA-Z_.]+}})

simd<short, 16> mina(0, 1);
Expand Down Expand Up @@ -228,12 +228,12 @@ test_mem_intrins(uint64_t addr, const vec<float, 8> &xf,
// CHECK-LABEL: call void @llvm.genx.svm.block.st.i64.v8i32(i64 %{{[a-zA-Z0-9.]+}}, <8 x i32> %{{[a-zA-Z0-9.]+}})
}
{
auto x = __esimd_svm_gather<unsigned char, 8>(get8ui64(), 0, get8ui16());
auto x = __esimd_svm_gather<unsigned char, 8>(get8ui64(), get8ui16());
// CHECK-LABEL: %{{[a-zA-Z0-9.]+}} = call <8 x i8> @llvm.genx.svm.gather.v8i8.v8i1.v8i64(<8 x i1> %{{[a-zA-Z0-9.]+}}, i32 0, <8 x i64> %{{[a-zA-Z0-9.]+}}, <8 x i8> undef)
use(x);
}
{
__esimd_svm_scatter<unsigned char, 8>(get8ui64(), get8ui8(), 0, get8ui16());
__esimd_svm_scatter<unsigned char, 8>(get8ui64(), get8ui8(), get8ui16());
// CHECK-LABEL: call void @llvm.genx.svm.scatter.v8i1.v8i64.v8i8(<8 x i1> %{{[a-zA-Z0-9.]+}}, i32 0, <8 x i64> %{{[a-zA-Z0-9.]+}}, <8 x i8> %{{[a-zA-Z0-9.]+}})
}
{
Expand Down
4 changes: 2 additions & 2 deletions sycl/test/esimd/slm_gather_scatter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,13 @@ void kernel() __attribute__((sycl_device)) {

auto v0 = slm_gather<int, 32>(offsets);

auto fm =
constexpr auto fm =
fence_mask::global_coherent_fence | fence_mask::l3_flush_instructions |
fence_mask::l3_flush_texture_data | fence_mask::l3_flush_constant_data |
fence_mask::l3_flush_rw_data | fence_mask::local_barrier |
fence_mask::l1_flush_ro_data | fence_mask::sw_barrier;

esimd::fence(static_cast<fence_mask>(fm));
esimd::fence<fm>();
esimd::barrier();

v0 = v0 + v1;
Expand Down