[SYCL][ESIMD] Add emulation for atomic load/store operations #8045

Merged · 3 commits · Jan 20, 2023
Changes from all commits
150 changes: 79 additions & 71 deletions sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -865,6 +865,52 @@ constexpr void check_atomic() {
/// @addtogroup sycl_esimd_memory_atomics
/// @{

/// @anchor usm_atomic_update1
/// @brief Single-argument variant of the atomic update operation.
///
/// Atomically updates \c N memory locations represented by a USM pointer and
/// a vector of offsets relative to the pointer, and returns a vector of old
/// values found at the memory locations before update. The update operation
/// has 1 additional argument.
///
/// @tparam Op The atomic operation - can be one of the following:
/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::store.
/// @tparam Tx The vector element type.
/// @tparam N The number of memory locations to update.
/// @param p The USM pointer.
/// @param offset The vector of 32-bit or 64-bit offsets in bytes.
/// @param src0 The additional argument.
/// @param mask Operation mask: only the locations whose corresponding mask
/// elements are non-zero are updated.
/// @return A vector of the old values at the memory locations before the
/// update.
///
template <atomic_op Op, typename Tx, int N, typename Toffset>
__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
simd<Tx, N> src0, simd_mask<N> mask) {
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
detail::check_atomic<Op, Tx, N, 1>();
if constexpr ((Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
(Op == atomic_op::fadd) || (Op == atomic_op::fsub)) {
// Auto-convert FP atomics to LSC version. Warning is given - see enum.
return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
mask);
} else if constexpr (Op == atomic_op::store) {
return atomic_update<atomic_op::xchg, Tx, N>(p, offset, src0, mask);
} else {
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
vAddr += offset_i1;

using T = typename detail::__raw_t<Tx>;
return __esimd_svm_atomic1<Op, T, N>(vAddr.data(), src0.data(),
mask.data());
}
}
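
For illustration, a minimal usage sketch of the new emulation (not part of the patch; `buf`, the lane count, and the values are assumptions, and the snippet is presumed to run inside an ESIMD kernel):

// Store 42 to 8 consecutive dwords of a USM buffer, atomically per lane.
// atomic_op::store is emulated via atomic_op::xchg, so the old values are
// returned just like for any other atomic update.
uint32_t *buf = /* assumed USM allocation */ nullptr;
simd<uint32_t, 8> byte_offsets(0, sizeof(uint32_t)); // 0, 4, 8, ..., 28
simd<uint32_t, 8> new_vals = 42;
simd_mask<8> mask = 1; // all 8 lanes enabled
simd<uint32_t, 8> old_vals =
    atomic_update<atomic_op::store>(buf, byte_offsets, new_vals, mask);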

/// @anchor usm_atomic_update0
/// @brief No-argument variant of the atomic update operation.
///
@@ -874,7 +920,7 @@ constexpr void check_atomic() {
/// has no arguments in addition to the value at the memory location.
///
/// @tparam Op The atomic operation - can be \c atomic_op::inc,
/// \c atomic_op::dec, or \c atomic_op::load.
/// @tparam Tx The vector element type.
/// @tparam N The number of memory locations to update.
/// @param p The USM pointer.
@@ -889,18 +935,23 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
simd_mask<N> mask) {
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
detail::check_atomic<Op, Tx, N, 0>();
if constexpr (Op == atomic_op::load) {
return atomic_update<atomic_op::bit_or, Tx, N>(p, offset, simd<Tx, N>(0),
mask);
} else {
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
vAddr += offset_i1;
using T = typename detail::__raw_t<Tx>;
return __esimd_svm_atomic0<Op, T, N>(vAddr.data(), mask.data());
}
}
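
A matching sketch for the load emulation (illustrative; `flags` is an assumed USM pointer inside an ESIMD kernel):

// atomic_op::load is emulated as an atomic bit_or with zero: it atomically
// reads each masked location without modifying it.
uint32_t *flags = /* assumed USM allocation */ nullptr;
simd<uint32_t, 16> byte_offsets(0, sizeof(uint32_t));
simd_mask<16> mask = 1;
simd<uint32_t, 16> current =
    atomic_update<atomic_op::load>(flags, byte_offsets, mask);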

/// A variation of \c atomic_update API with \c offsets represented as
/// \c simd_view object.
///
/// @tparam Op The atomic operation - can be \c atomic_op::inc,
/// \c atomic_op::dec, or \c atomic_op::load.
/// @tparam Tx The vector element type.
/// @tparam N The number of memory locations to update.
/// @param p The USM pointer.
@@ -919,58 +970,15 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p,
return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), mask);
}
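
A brief sketch of this overload (hypothetical snippet; `counters` is an assumed USM pointer):

// A simd_view slice of a wider offset vector can be passed directly,
// without materializing a separate simd object first.
uint32_t *counters = /* assumed USM allocation */ nullptr;
simd<uint32_t, 32> all_offsets(0, sizeof(uint32_t));
auto lower = all_offsets.select<16, 1>(0); // simd_view over lanes 0..15
simd<uint32_t, 16> old_vals =
    atomic_update<atomic_op::inc, uint32_t, 16>(counters, lower, 1);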

/// A variation of \c atomic_update API with \c offsets represented as
/// \c simd_view object.
///
/// @tparam Op The atomic operation - can be one of the following:
/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::store.
/// @tparam Tx The vector element type.
/// @tparam N The number of memory locations to update.
/// @param p The USM pointer.
@@ -1014,12 +1022,12 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
simd<Tx, N> src0, simd<Tx, N> src1,
simd_mask<N> mask) {
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
detail::check_atomic<Op, Tx, N, 2>();
if constexpr (Op == atomic_op::fcmpwr) {
// Auto-convert FP atomics to LSC version. Warning is given - see enum.
return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
src1, mask);
} else {
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
vAddr += offset_i1;
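
A two-argument usage sketch (illustrative; the roles of \c src0 and \c src1 for \c cmpxchg are taken from the ESIMD @param docs, which are folded out of this diff, so treat the operand order below as an assumption to verify against the full header):

// Per-lane compare-and-exchange: each location is compared against
// `expected` and, on a match, replaced with `desired`.
uint32_t *locks = /* assumed USM allocation */ nullptr;
simd<uint32_t, 8> byte_offsets(0, sizeof(uint32_t));
simd<uint32_t, 8> desired = 1;  // src0: value written on a successful compare
simd<uint32_t, 8> expected = 0; // src1: value compared against
simd_mask<8> mask = 1;
simd<uint32_t, 8> witnessed = atomic_update<atomic_op::cmpxchg>(
    locks, byte_offsets, desired, expected, mask);
// Lane i succeeded iff witnessed[i] == expected[i].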
@@ -1093,10 +1101,10 @@ __ESIMD_API void fence(fence_mask cntl) { __esimd_fence(cntl); }
/// Generic work-group barrier.
/// Performs barrier synchronization for all threads within the same thread
/// group. The barrier instruction causes the executing thread to wait until
/// all threads in the same thread group have executed the barrier
/// instruction. Memory ordering is also guaranteed by this instruction. The
/// behavior is undefined if this instruction is executed in divergent control
/// flow.
///
__ESIMD_API void barrier() {
__esimd_fence(fence_mask::global_coherent_fence | fence_mask::local_barrier);
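
A typical pattern (sketch; `local_id` and the SLM slot layout are assumptions) is to publish data to SLM, synchronize, then read what other work-items wrote:

uint32_t local_id = /* assumed work-item index within the group */ 0;
uint32_t my_slot = 4 * local_id;    // assumed per-work-item dword slot in SLM
slm_scalar_store<int>(my_slot, 42); // produce
barrier();                          // all threads arrive; memory is ordered
int peer = slm_scalar_load<int>(my_slot >= 4 ? my_slot - 4 : 0); // consume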
@@ -1144,8 +1152,8 @@ template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {

/// Scatter operation over the Shared Local Memory.
/// This API has almost the same interface as the @ref accessor_scatter
/// "accessor-based scatter", except that it does not have the accessor and the
/// global offset parameters.
/// "accessor-based scatter", except that it does not have the accessor and
/// the global offset parameters.
///
template <typename T, int N>
__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
@@ -1165,9 +1173,9 @@ __ESIMD_API void slm_scalar_store(uint32_t offset, T val) {
slm_scatter<T, 1>(simd<uint32_t, 1>(offset), simd<T, 1>(val), 1);
}
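
For comparison with the scalar convenience wrapper above, a sketch of the vector form (illustrative offsets and values):

// Scatter 8 dword values into SLM; here the byte offsets happen to be
// consecutive (0, 4, ..., 28), but they may be arbitrary.
simd<uint32_t, 8> slm_offsets(0, sizeof(int));
simd<int, 8> vals(100, 1); // 100, 101, ..., 107
slm_scatter<int, 8>(slm_offsets, vals, 1); // mask = all lanes enabled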

/// Gather data from the Shared Local Memory at specified \c offsets and
/// return it as a simd vector. See @ref usm_gather_rgba for information about
/// the operation semantics and parameter restrictions/interdependencies.
/// @tparam T The element type of the returned vector.
/// @tparam N The number of elements to access.
/// @tparam RGBAMask Pixel's channel mask.
@@ -1185,9 +1193,9 @@ slm_gather_rgba(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
SI, 0 /*global_offset*/, offsets.data(), mask.data());
}
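
A usage sketch (illustrative; assumes 16 four-channel pixels of 32-bit data stored contiguously in SLM):

// Byte offsets of 16 pixel records, each 4 channels * 4 bytes = 16 bytes.
simd<uint32_t, 16> px_offsets(0, 4 * sizeof(float));
// Gather only the R and G channels of every pixel; the result is grouped
// channel-by-channel, so it holds 16 * 2 elements.
auto rg = slm_gather_rgba<float, 16, rgba_channel_mask::GR>(px_offsets);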

/// Scatter data to the Shared Local Memory at specified \c offsets. See
/// @ref usm_scatter_rgba for information about the operation semantics and
/// parameter restrictions/interdependencies.
/// @tparam T The element type of the vector to store.
/// @tparam N The number of elements to access.
/// @tparam Mask Pixel's channel mask.
@@ -1565,9 +1573,9 @@ void simd_obj_impl<T, N, T1, SFINAE>::copy_to(
if constexpr (RemN == 1) {
Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
} else if constexpr (RemN == 8 || RemN == 16) {
// TODO: GPU runtime may handle the scatter of 16 byte-sized elements
// incorrectly. The code below is a workaround which must be deleted
// once the GPU runtime is fixed.
if constexpr (sizeof(T) == 1 && RemN == 16) {
if constexpr (Align % OperandSize::DWORD > 0) {
ForHelper<RemN>::unroll([Addr, &Tmp](unsigned Index) {