Skip to content

Commit 86dc3b9

Browse files
authored
[ESIMD] Add emulation for dword atomic load/store operations (#8045)
1 parent cc94da2 commit 86dc3b9

File tree

1 file changed

+79
-71
lines changed

1 file changed

+79
-71
lines changed

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 79 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -865,6 +865,52 @@ constexpr void check_atomic() {
865865
/// @addtogroup sycl_esimd_memory_atomics
866866
/// @{
867867

868+
/// @anchor usm_atomic_update1
869+
/// @brief Single-argument variant of the atomic update operation.
870+
///
871+
/// Atomically updates \c N memory locations represented by a USM pointer and
872+
/// a vector of offsets relative to the pointer, and returns a vector of old
873+
/// values found at the memory locations before update. The update operation
874+
/// has 1 additional argument.
875+
///
876+
/// @tparam Op The atomic operation - can be one of the following:
877+
/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
878+
/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
879+
/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
880+
/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::store.
881+
/// @tparam Tx The vector element type.
882+
/// @tparam N The number of memory locations to update.
883+
/// @param p The USM pointer.
884+
/// @param offset The vector of 32-bit or 64-bit offsets in bytes.
885+
/// @param src0 The additional argument.
886+
/// @param mask Operation mask, only locations with non-zero in the
887+
/// corresponding mask element are updated.
888+
/// @return A vector of the old values at the memory locations before the
889+
/// update.
890+
///
891+
template <atomic_op Op, typename Tx, int N, typename Toffset>
892+
__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
893+
simd<Tx, N> src0, simd_mask<N> mask) {
894+
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
895+
detail::check_atomic<Op, Tx, N, 1>();
896+
if constexpr ((Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
897+
(Op == atomic_op::fadd) || (Op == atomic_op::fsub)) {
898+
// Auto-convert FP atomics to LSC version. Warning is given - see enum.
899+
return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
900+
mask);
901+
} else if constexpr (Op == atomic_op::store) {
902+
return atomic_update<atomic_op::xchg, Tx, N>(p, offset, src0, mask);
903+
} else {
904+
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
905+
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
906+
vAddr += offset_i1;
907+
908+
using T = typename detail::__raw_t<Tx>;
909+
return __esimd_svm_atomic1<Op, T, N>(vAddr.data(), src0.data(),
910+
mask.data());
911+
}
912+
}
913+
868914
/// @anchor usm_atomic_update0
869915
/// @brief No-argument variant of the atomic update operation.
870916
///
@@ -874,7 +920,7 @@ constexpr void check_atomic() {
874920
/// has no arguments in addition to the value at the memory location.
875921
///
876922
/// @tparam Op The atomic operation - can be \c atomic_op::inc or
877-
/// atomic_op::dec.
923+
/// \c atomic_op::dec, \c atomic_op::load.
878924
/// @tparam Tx The vector element type.
879925
/// @tparam N The number of memory locations to update.
880926
/// @param p The USM pointer.
@@ -889,18 +935,23 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
889935
simd_mask<N> mask) {
890936
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
891937
detail::check_atomic<Op, Tx, N, 0>();
892-
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
893-
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
894-
vAddr += offset_i1;
895-
using T = typename detail::__raw_t<Tx>;
896-
return __esimd_svm_atomic0<Op, T, N>(vAddr.data(), mask.data());
938+
if constexpr (Op == atomic_op::load) {
939+
return atomic_update<atomic_op::bit_or, Tx, N>(p, offset, simd<Tx, N>(0),
940+
mask);
941+
} else {
942+
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
943+
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
944+
vAddr += offset_i1;
945+
using T = typename detail::__raw_t<Tx>;
946+
return __esimd_svm_atomic0<Op, T, N>(vAddr.data(), mask.data());
947+
}
897948
}
898949

899950
/// A variation of \c atomic_update API with \c offsets represented as
900951
/// \c simd_view object.
901952
///
902953
/// @tparam Op The atomic operation - can be \c atomic_op::inc or
903-
/// atomic_op::dec.
954+
/// \c atomic_op::dec, \c atomic_op::load.
904955
/// @tparam Tx The vector element type.
905956
/// @tparam N The number of memory locations to update.
906957
/// @param p The USM pointer.
@@ -919,58 +970,15 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p,
919970
return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), mask);
920971
}
921972

922-
/// @anchor usm_atomic_update1
923-
/// @brief Single-argument variant of the atomic update operation.
924-
///
925-
/// Atomically updates \c N memory locations represented by a USM pointer and
926-
/// a vector of offsets relative to the pointer, and returns a vector of old
927-
/// values found at the memory locations before update. The update operation
928-
/// has 1 additional argument.
929-
///
930-
/// @tparam Op The atomic operation - can be one of the following:
931-
/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
932-
/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
933-
/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
934-
/// \c atomic_op::fmax, \c atomic_op::fmin.
935-
/// @tparam Tx The vector element type.
936-
/// @tparam N The number of memory locations to update.
937-
/// @param p The USM pointer.
938-
/// @param offset The vector of 32-bit or 64-bit offsets in bytes.
939-
/// @param src0 The additional argument.
940-
/// @param mask Operation mask, only locations with non-zero in the
941-
/// corresponding mask element are updated.
942-
/// @return A vector of the old values at the memory locations before the
943-
/// update.
944-
///
945-
template <atomic_op Op, typename Tx, int N, typename Toffset>
946-
__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
947-
simd<Tx, N> src0, simd_mask<N> mask) {
948-
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
949-
if constexpr ((Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
950-
(Op == atomic_op::fadd) || (Op == atomic_op::fsub)) {
951-
// Auto-convert FP atomics to LSC version. Warning is given - see enum.
952-
return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
953-
mask);
954-
} else {
955-
detail::check_atomic<Op, Tx, N, 1>();
956-
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
957-
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
958-
vAddr += offset_i1;
959-
960-
using T = typename detail::__raw_t<Tx>;
961-
return __esimd_svm_atomic1<Op, T, N>(vAddr.data(), src0.data(),
962-
mask.data());
963-
}
964-
}
965-
966973
/// A variation of \c atomic_update API with \c offsets represented as
967974
/// \c simd_view object.
968975
///
969976
/// @tparam Op The atomic operation - can be one of the following:
970-
/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
971-
/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
972-
/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
973-
/// \c atomic_op::fmax, \c atomic_op::fmin.
977+
/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
978+
/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
979+
/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
980+
/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
981+
/// atomic_op::store.
974982
/// @tparam Tx The vector element type.
975983
/// @tparam N The number of memory locations to update.
976984
/// @param p The USM pointer.
@@ -1014,12 +1022,12 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
10141022
simd<Tx, N> src0, simd<Tx, N> src1,
10151023
simd_mask<N> mask) {
10161024
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
1025+
detail::check_atomic<Op, Tx, N, 2>();
10171026
if constexpr (Op == atomic_op::fcmpwr) {
10181027
// Auto-convert FP atomics to LSC version. Warning is given - see enum.
10191028
return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
10201029
src1, mask);
10211030
} else {
1022-
detail::check_atomic<Op, Tx, N, 2>();
10231031
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
10241032
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
10251033
vAddr += offset_i1;
@@ -1093,10 +1101,10 @@ __ESIMD_API void fence(fence_mask cntl) { __esimd_fence(cntl); }
10931101
/// Generic work-group barrier.
10941102
/// Performs barrier synchronization for all threads within the same thread
10951103
/// group. The barrier instruction causes the executing thread to wait until
1096-
/// all threads in the same thread group have executed the barrier instruction.
1097-
/// Memory ordering is also guaranteed by this instruction.
1098-
/// The behavior is undefined if this instruction is executed in divergent
1099-
/// control flow.
1104+
/// all threads in the same thread group have executed the barrier
1105+
/// instruction. Memory ordering is also guaranteed by this instruction. The
1106+
/// behavior is undefined if this instruction is executed in divergent control
1107+
/// flow.
11001108
///
11011109
__ESIMD_API void barrier() {
11021110
__esimd_fence(fence_mask::global_coherent_fence | fence_mask::local_barrier);
@@ -1144,8 +1152,8 @@ template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {
11441152

11451153
/// Scatter operation over the Shared Local Memory.
11461154
/// This API has almost the same interface as the @ref accessor_scatter
1147-
/// "accessor-based scatter", except that it does not have the accessor and the
1148-
/// global offset parameters.
1155+
/// "accessor-based scatter", except that it does not have the accessor and
1156+
/// the global offset parameters.
11491157
///
11501158
template <typename T, int N>
11511159
__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
@@ -1165,9 +1173,9 @@ __ESIMD_API void slm_scalar_store(uint32_t offset, T val) {
11651173
slm_scatter<T, 1>(simd<uint32_t, 1>(offset), simd<T, 1>(val), 1);
11661174
}
11671175

1168-
/// Gather data from the Shared Local Memory at specified \c offsets and return
1169-
/// it as simd vector. See @ref usm_gather_rgba for information about the
1170-
/// operation semantics and parameter restrictions/interdependencies.
1176+
/// Gather data from the Shared Local Memory at specified \c offsets and
1177+
/// return it as simd vector. See @ref usm_gather_rgba for information about
1178+
/// the operation semantics and parameter restrictions/interdependencies.
11711179
/// @tparam T The element type of the returned vector.
11721180
/// @tparam N The number of elements to access.
11731181
/// @tparam RGBAMask Pixel's channel mask.
@@ -1185,9 +1193,9 @@ slm_gather_rgba(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
11851193
SI, 0 /*global_offset*/, offsets.data(), mask.data());
11861194
}
11871195

1188-
/// Gather data from the Shared Local Memory at specified \c offsets and return
1189-
/// it as simd vector. See @ref usm_scatter_rgba for information about the
1190-
/// operation semantics and parameter restrictions/interdependencies.
1196+
/// Scatter data to the Shared Local Memory at specified \c offsets. See
1197+
/// @ref usm_scatter_rgba for information about the operation semantics
1198+
/// and parameter restrictions/interdependencies.
11911199
/// @tparam T The element type of the returned vector.
11921200
/// @tparam N The number of elements to access.
11931201
/// @tparam Mask Pixel's channel mask.
@@ -1565,9 +1573,9 @@ void simd_obj_impl<T, N, T1, SFINAE>::copy_to(
15651573
if constexpr (RemN == 1) {
15661574
Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
15671575
} else if constexpr (RemN == 8 || RemN == 16) {
1568-
// TODO: GPU runtime may handle scatter of 16 byte elements incorrectly.
1569-
// The code below is a workaround which must be deleted once GPU runtime
1570-
// is fixed.
1576+
// TODO: GPU runtime may handle scatter of 16 byte elements
1577+
// incorrectly. The code below is a workaround which must be deleted
1578+
// once GPU runtime is fixed.
15711579
if constexpr (sizeof(T) == 1 && RemN == 16) {
15721580
if constexpr (Align % OperandSize::DWORD > 0) {
15731581
ForHelper<RemN>::unroll([Addr, &Tmp](unsigned Index) {

0 commit comments

Comments
 (0)