[SYCL][ESIMD] Implement accessor-based gather/scatter and scalar I/O #2700

Merged · 1 commit · Nov 11, 2020
9 changes: 9 additions & 0 deletions llvm/lib/SYCLLowerIR/LowerESIMD.cpp
@@ -252,6 +252,15 @@ class ESIMDIntrinDescTable {
{"flat_write", {"svm.scatter", {ai1(3), a(2), a(0), a(1)}}},
{"flat_write4",
{"svm.scatter4.scaled", {ai1(2), t(2), c16(0), c64(0), a(0), a(1)}}},

// surface index-based gather/scatter:
// num blocks, scale, surface index, global offset, elem offsets
{"surf_read", {"gather.scaled2", {t(3), c16(0), aSI(1), a(2), a(3)}}},
Review comment (Contributor):
Why not map to gather.scaled? It may need general predicate support.

Reply (Contributor, PR author):
Good question. I'm actually following advice from the BE team (@aus-intel), who recommended using this one for now. When predication is needed, we can add it without breaking user code.

// pred, num blocks, scale, surface index, global offset, elem offsets,
// data to write
{"surf_write",
{"scatter.scaled", {ai1(0), t(3), c16(0), aSI(2), a(3), a(4), a(5)}}},

// intrinsics to query thread's coordinates:
{"group_id_x", {"group.id.x", {}}},
{"group_id_y", {"group.id.y", {}}},
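Note (illustrative, not part of the patch): each descriptor entry above maps the ESIMD intrinsic's arguments onto GenX intrinsic operands. Assuming the usual meaning of the helpers in this table (t(i) - i-th template argument, a(i) - i-th call argument, aSI(i) - argument lowered to a surface index, ai1(i) - argument converted to an <N x i1> predicate, c16/c64 - 16/64-bit integer constants), the surf_read entry reads as:

// Sketch of the lowering encoded by
//   {"surf_read", {"gather.scaled2", {t(3), c16(0), aSI(1), a(2), a(3)}}}:
// a device call
//   __esimd_surf_read<Ty, N, SurfIndAliasTy, TySizeLog2>(scale, si, go, offs)
// becomes
//   llvm.genx.gather.scaled2(
//       TySizeLog2, // t(3)   - 4th template argument (num blocks)
//       i16 0,      // c16(0) - constant scale
//       SI,         // aSI(1) - si lowered to a surface index
//       go,         // a(2)   - global offset
//       offs)       // a(3)   - element offsets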
86 changes: 86 additions & 0 deletions sycl/include/CL/sycl/INTEL/esimd/detail/esimd_memory_intrin.hpp
@@ -81,6 +81,92 @@ SYCL_EXTERNAL void __esimd_flat_write4(
sycl::INTEL::gpu::vector_type_t<Ty, N * NumChannels(Mask)> vals,
sycl::INTEL::gpu::vector_type_t<uint16_t, N> pred = 1);

// Low-level surface-based gather. Collects elements located at given offsets
// in a surface and returns them as a single \ref simd object. An element can
// be a 1-, 2- or 4-byte value, but it is always returned as a 4-byte value
// within the resulting simd object, with the upper 2 or 3 bytes undefined.
// Template (compile-time constant) parameters:
// @tparam Ty - element type; can only be a 4-byte integer or \c float.
// @tparam N - the number of elements
// @tparam SurfIndAliasTy - "surface index alias" type - internal type in the
// accessor used to denote the surface
// @tparam TySizeLog2 - Log2 of the number of bytes read per element:
// 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes
// @tparam L1H - L1 cache hint
// @tparam L3H - L3 cache hint
//
// Formal parameters:
// @param scale - the scale; must be 0
// @param surf_ind - the surface index, taken from the SYCL memory object
// @param global_offset - offset added to each individual element's offset to
// compute actual memory access offset for that element
// @param elem_offsets - per-element offsets
//
template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None,
sycl::INTEL::gpu::CacheHint L3H = sycl::INTEL::gpu::CacheHint::None>
SYCL_EXTERNAL sycl::INTEL::gpu::vector_type_t<Ty, N>
__esimd_surf_read(int16_t scale, SurfIndAliasTy surf_ind,
uint32_t global_offset,
sycl::INTEL::gpu::vector_type_t<uint32_t, N> elem_offsets)
#ifdef __SYCL_DEVICE_ONLY__
;
#else
{
static_assert(N == 1 || N == 8 || N == 16);
static_assert(TySizeLog2 <= 2);
static_assert(std::is_integral<Ty>::value || TySizeLog2 == 2);
throw cl::sycl::feature_not_supported();
}
#endif // __SYCL_DEVICE_ONLY__
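
For illustration, a minimal sketch (hypothetical helper, not part of the patch) of how a 2-byte element read is expected to go through this intrinsic: the caller passes the 4-byte promoted type and TySizeLog2 = 1, then truncates each lane, as the public gather() wrapper does.

// Hypothetical sketch: read 8 2-byte elements, widened to 32 bits per lane.
template <typename SurfIndTy>
sycl::INTEL::gpu::vector_type_t<uint32_t, 8>
read_ushorts_sketch(SurfIndTy si,
                    sycl::INTEL::gpu::vector_type_t<uint32_t, 8> offs) {
  // TySizeLog2 == 1: 2 bytes read per element offset; the upper 2 bytes of
  // each returned lane are undefined until the caller converts them away.
  return __esimd_surf_read<uint32_t, 8, SurfIndTy, /*TySizeLog2=*/1>(
      /*scale=*/0, si, /*global_offset=*/0, offs);
}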

// Low-level surface-based scatter. Writes elements of a \ref simd object into
// a surface at given offsets. An element can be a 1-, 2- or 4-byte value, but
// it is always represented as a 4-byte value within the input simd object; the
// unused (not written) upper bytes are ignored.
// Template (compile-time constant) parameters:
// @tparam Ty - element type; can only be a 4-byte integer or \c float.
// @tparam N - the number of elements to write
// @tparam SurfIndAliasTy - "surface index alias" type - internal type in the
// accessor used to denote the surface
// @tparam TySizeLog2 - Log2 of the number of bytes written per element:
// 0 - 1 byte, 1 - 2 bytes, 2 - 4 bytes
// @tparam L1H - L1 cache hint
// @tparam L3H - L3 cache hint
//
// Formal parameters:
// @param pred - per-element predicates; elements with zero corresponding
// predicates are not written
// @param scale - the scale; must be 0
// @param surf_ind - the surface index, taken from the SYCL memory object
// @param global_offset - offset added to each individual element's offset to
// compute actual memory access offset for that element
// @param elem_offsets - per-element offsets
// @param vals - values to write
//
template <typename Ty, int N, typename SurfIndAliasTy, int TySizeLog2,
sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None,
sycl::INTEL::gpu::CacheHint L3H = sycl::INTEL::gpu::CacheHint::None>
SYCL_EXTERNAL void
__esimd_surf_write(sycl::INTEL::gpu::vector_type_t<uint16_t, N> pred,
int16_t scale, SurfIndAliasTy surf_ind,
uint32_t global_offset,
sycl::INTEL::gpu::vector_type_t<uint32_t, N> elem_offsets,
sycl::INTEL::gpu::vector_type_t<Ty, N> vals)
#ifdef __SYCL_DEVICE_ONLY__
;
#else
{
static_assert(N == 1 || N == 8 || N == 16);
static_assert(TySizeLog2 <= 2);
static_assert(std::is_integral<Ty>::value || TySizeLog2 == 2);
throw cl::sycl::feature_not_supported();
}
#endif // __SYCL_DEVICE_ONLY__
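
The symmetric write-side sketch (again hypothetical): lanes whose predicate is zero are left untouched in memory.

// Hypothetical sketch: predicated write of 8 2-byte elements held as 32-bit
// lanes; only the low 2 bytes of each enabled lane are stored.
template <typename SurfIndTy>
void write_ushorts_sketch(SurfIndTy si,
                          sycl::INTEL::gpu::vector_type_t<uint32_t, 8> offs,
                          sycl::INTEL::gpu::vector_type_t<uint32_t, 8> vals,
                          sycl::INTEL::gpu::vector_type_t<uint16_t, 8> pred) {
  __esimd_surf_write<uint32_t, 8, SurfIndTy, /*TySizeLog2=*/1>(
      pred, /*scale=*/0, si, /*global_offset=*/0, offs, vals);
}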

// TODO bring the parameter order of the __esimd* intrinsics into accordance
// with the corresponding BE intrinsics' parameter order.

// flat_atomic: flat-address atomic
template <sycl::INTEL::gpu::EsimdAtomicOpType Op, typename Ty, int N,
sycl::INTEL::gpu::CacheHint L1H = sycl::INTEL::gpu::CacheHint::None,
148 changes: 148 additions & 0 deletions sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp
@@ -116,6 +116,12 @@ ESIMD_INLINE ESIMD_NODEBUG
addrs.data(), ElemsPerAddrEncoding<ElemsPerAddr>(), pred.data());
}

// TODO bring this SVM-based scatter/gather interface into accordance with the
// accessor-based ones - remove the ElemsPerAddr template parameter, as it is
// redundant: the only allowed block size in the underlying BE intrinsics is 1
// byte, with the max number of blocks being 4. This means the T template
// parameter alone can model all supported cases.

/// flat-address scatter
template <typename T, int n, int ElemsPerAddr = 1,
CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
@@ -238,6 +244,148 @@ ESIMD_INLINE ESIMD_NODEBUG void block_store(AccessorTy acc, uint32_t offset,
#endif // __SYCL_DEVICE_ONLY__ && __SYCL_EXPLICIT_SIMD__
}

/// Accessor-based gather. Collects elements located at given offsets in an
/// accessor and returns them as a single \ref simd object. An element can be
/// a 1-, 2- or 4-byte value.
/// Template (compile-time constant) parameters:
/// @tparam T - element type; can only be a 1-, 2- or 4-byte integer or \c float.
/// @tparam N - the number of elements
/// @tparam AccessorTy - \ref sycl::accessor type
/// @tparam L1H - L1 cache hint
/// @tparam L3H - L3 cache hint
///
/// Formal parameters:
/// @param acc - the accessor to gather from
/// @param offsets - per-element offsets
/// @param glob_offset - offset added to each individual element's offset to
/// compute actual memory access offset for that element
///
template <typename T, int N, typename AccessorTy,
CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
ESIMD_INLINE ESIMD_NODEBUG
typename std::enable_if<(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16) &&
!std::is_pointer<AccessorTy>::value,
simd<T, N>>::type
gather(AccessorTy acc, simd<uint32_t, N> offsets,
uint32_t glob_offset = 0) {

constexpr int TypeSizeLog2 =
sycl::INTEL::gpu::ElemsPerAddrEncoding<sizeof(T)>();
// TODO (performance) use hardware-supported scale once BE supports it
constexpr uint32_t scale = 0;
constexpr uint32_t t_scale = sizeof(T);
if constexpr (t_scale > 1) {
glob_offset *= t_scale;
offsets *= t_scale;
}

if constexpr (sizeof(T) < 4) {
static_assert(std::is_integral<T>::value,
"only integral 1- & 2-byte types are supported");
using PromoT = typename std::conditional<std::is_signed<T>::value, int32_t,
uint32_t>::type;
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
const simd<PromoT, N> promo_vals =
__esimd_surf_read<PromoT, N, decltype(surf_ind), TypeSizeLog2, L1H,
L3H>(scale, surf_ind, glob_offset, offsets);
#else
const simd<PromoT, N> promo_vals =
__esimd_surf_read<PromoT, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
scale, acc, glob_offset, offsets);
#endif
return sycl::INTEL::gpu::convert<T>(promo_vals);
} else {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
return __esimd_surf_read<T, N, decltype(surf_ind), TypeSizeLog2, L1H, L3H>(
scale, surf_ind, glob_offset, offsets);
#else
return __esimd_surf_read<T, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
scale, acc, glob_offset, offsets);
#endif
}
}
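
A minimal usage sketch (hypothetical function, written in the style of the test at the bottom of this PR; the accessor type alias is an assumption):

// Hypothetical usage of the accessor-based gather above.
using AccT = sycl::accessor<int, 1, sycl::access::mode::read_write,
                            sycl::access::target::global_buffer>;

SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<int, 8> gather_sketch(AccT acc) {
  simd<uint32_t, 8> offsets(0, 1); // lane offsets 0..7 (base 0, step 1)
  // Reads elements 16..23 of the buffer: both offsets and glob_offset are
  // scaled by sizeof(int) inside gather().
  return gather<int, 8>(acc, offsets, /*glob_offset=*/16);
}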

/// Accessor-based scatter. Writes elements of a \ref simd object into an
/// accessor at given offsets. An element can be a 1-, 2- or 4-byte value.
/// Template (compile-time constant) parameters:
/// @tparam T - element type; can only be a 1-, 2- or 4-byte integer or \c float.
/// @tparam N - the number of elements
/// @tparam AccessorTy - \ref sycl::accessor type
/// @tparam L1H - L1 cache hint
/// @tparam L3H - L3 cache hint
///
/// Formal parameters:
/// @param acc - the accessor to scatter to
/// @param vals - values to write
/// @param offsets - per-element offsets
/// @param glob_offset - offset added to each individual element's offset to
/// compute actual memory access offset for that element
/// @param pred - per-element predicates; elements with zero corresponding
/// predicates are not written
///
template <typename T, int N, typename AccessorTy,
CacheHint L1H = CacheHint::None, CacheHint L3H = CacheHint::None>
ESIMD_INLINE ESIMD_NODEBUG
typename std::enable_if<(sizeof(T) <= 4) && (N == 1 || N == 8 || N == 16) &&
!std::is_pointer<AccessorTy>::value,
void>::type
scatter(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
uint32_t glob_offset = 0, simd<uint16_t, N> pred = 1) {

constexpr int TypeSizeLog2 =
sycl::INTEL::gpu::ElemsPerAddrEncoding<sizeof(T)>();
// TODO (performance) use hardware-supported scale once BE supports it
constexpr uint32_t scale = 0;
constexpr uint32_t t_scale = sizeof(T);
if constexpr (t_scale > 1) {
glob_offset *= t_scale;
offsets *= t_scale;
}

if constexpr (sizeof(T) < 4) {
static_assert(std::is_integral<T>::value,
"only integral 1- & 2-byte types are supported");
using PromoT = typename std::conditional<std::is_signed<T>::value, int32_t,
uint32_t>::type;
const simd<PromoT, N> promo_vals = sycl::INTEL::gpu::convert<PromoT>(vals);
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
__esimd_surf_write<PromoT, N, decltype(surf_ind), TypeSizeLog2, L1H, L3H>(
pred, scale, surf_ind, glob_offset, offsets, promo_vals);
#else
__esimd_surf_write<PromoT, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
pred, scale, acc, glob_offset, offsets, promo_vals);
#endif
} else {
#if defined(__SYCL_DEVICE_ONLY__) && defined(__SYCL_EXPLICIT_SIMD__)
const auto surf_ind = AccessorPrivateProxy::getNativeImageObj(acc);
__esimd_surf_write<T, N, decltype(surf_ind), TypeSizeLog2, L1H, L3H>(
pred, scale, surf_ind, glob_offset, offsets, vals);
#else
__esimd_surf_write<T, N, AccessorTy, TypeSizeLog2, L1H, L3H>(
pred, scale, acc, glob_offset, offsets, vals);
#endif
}
}
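
A matching predicated-scatter sketch (same assumptions as the gather sketch above):

// Hypothetical usage of the accessor-based scatter above; AccT as in the
// gather sketch.
SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void scatter_sketch(AccT acc,
                                                      simd<int, 8> vals) {
  simd<uint32_t, 8> offsets(0, 1);
  simd<uint16_t, 8> pred{1, 0, 1, 0, 1, 0, 1, 0}; // write even lanes only
  scatter<int, 8>(acc, vals, offsets, /*glob_offset=*/0, pred);
}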

/// Load a scalar value from an accessor.
template <typename T, typename AccessorTy, CacheHint L1H = CacheHint::None,
CacheHint L3H = CacheHint::None>
ESIMD_INLINE ESIMD_NODEBUG T scalar_load(AccessorTy acc, uint32_t offset) {
const simd<T, 1> Res = gather<T>(acc, simd<uint32_t, 1>{offset});
return Res[0];
}

/// Store a scalar value into an accessor.
template <typename T, typename AccessorTy, CacheHint L1H = CacheHint::None,
CacheHint L3H = CacheHint::None>
ESIMD_INLINE ESIMD_NODEBUG void scalar_store(AccessorTy acc, uint32_t offset,
T val) {
scatter<T>(acc, simd<T, 1>{val}, simd<uint32_t, 1>{offset});
}
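
These two compose into a simple round trip (hypothetical sketch; note the offset is in elements here, since the underlying gather/scatter scale offsets by sizeof(T)):

// Hypothetical round trip through scalar_store/scalar_load; AccT as above.
SYCL_ESIMD_FUNCTION SYCL_EXTERNAL int scalar_roundtrip_sketch(AccT acc) {
  scalar_store<int>(acc, /*offset=*/5, 42);   // write 42 at element 5
  return scalar_load<int>(acc, /*offset=*/5); // read it back
}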

// TODO @jasonsewall-intel
// Don't use '4' in the name - instead either make it a parameter or
// (if it must be constant) - try to deduce from other arguments.
27 changes: 27 additions & 0 deletions sycl/test/esimd/intrins_trans.cpp
@@ -113,5 +113,32 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL simd<float, 16> foo() {
__esimd_vstore<int, 32>(&vc, va.data());
// CHECK: store <32 x i32> %{{[0-9a-zA-Z_.]+}}, <32 x i32> addrspace(4)* {{.*}}

{
sycl::accessor<int, 1, sycl::access::mode::read_write,
sycl::access::target::global_buffer>
acc;
simd<uint32_t, 8> offsets = 1;
simd<uint16_t, 8> pred{1, 0, 1, 0, 1, 0, 1, 0};

// 4-byte element gather
simd<int, 8> v = gather<int, 8>(acc, offsets, 100);
// CHECK: %[[SI3:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image1d_buffer_rw_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.gather.scaled2.v8i32.v8i32(i32 2, i16 0, i32 %[[SI3]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}})

// 4-byte element scatter
scatter<int, 8>(acc, v, offsets, 100, pred);
// CHECK: %[[SI4:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image1d_buffer_rw_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
// CHECK: call void @llvm.genx.scatter.scaled.v8i1.v8i32.v8i32(<8 x i1> %{{[0-9a-zA-Z_.]+}}, i32 2, i16 0, i32 %[[SI4]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}})

// 1-byte element gather
simd<unsigned char, 8> v1 = gather<unsigned char, 8>(acc, offsets, 100);
// CHECK: %[[SI5:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image1d_buffer_rw_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
// CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.gather.scaled2.v8i32.v8i32(i32 0, i16 0, i32 %[[SI5]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}})

// 1-byte element scatter
scatter<unsigned char, 8>(acc, v1, offsets, 100, pred);
// CHECK: %[[SI6:[0-9a-zA-Z_.]+]] = ptrtoint %opencl.image1d_buffer_rw_t addrspace(1)* %{{[0-9a-zA-Z_.]+}} to i32
// CHECK: call void @llvm.genx.scatter.scaled.v8i1.v8i32.v8i32(<8 x i1> %{{[0-9a-zA-Z_.]+}}, i32 0, i16 0, i32 %[[SI6]], i32 %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}}, <8 x i32> %{{[0-9a-zA-Z_.]+}})
}
return d;
}