@@ -865,6 +865,52 @@ constexpr void check_atomic() {
/// @addtogroup sycl_esimd_memory_atomics
/// @{

+/// @anchor usm_atomic_update1
+/// @brief Single-argument variant of the atomic update operation.
+///
+/// Atomically updates \c N memory locations represented by a USM pointer and
+/// a vector of offsets relative to the pointer, and returns a vector of old
+/// values found at the memory locations before update. The update operation
+/// has 1 additional argument.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
+/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
+/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
+/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::store.
+/// @tparam Tx The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @param p The USM pointer.
+/// @param offset The vector of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The additional argument.
+/// @param mask Operation mask, only locations with non-zero in the
+/// corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+/// update.
+///
+template <atomic_op Op, typename Tx, int N, typename Toffset>
+__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
+                                      simd<Tx, N> src0, simd_mask<N> mask) {
+  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
+  detail::check_atomic<Op, Tx, N, 1>();
+  if constexpr ((Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
+                (Op == atomic_op::fadd) || (Op == atomic_op::fsub)) {
+    // Auto-convert FP atomics to LSC version. Warning is given - see enum.
+    return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset,
+                                                                src0, mask);
+  } else if constexpr (Op == atomic_op::store) {
+    return atomic_update<atomic_op::xchg, Tx, N>(p, offset, src0, mask);
+  } else {
+    simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
+    simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
+    vAddr += offset_i1;
+
+    using T = typename detail::__raw_t<Tx>;
+    return __esimd_svm_atomic1<Op, T, N>(vAddr.data(), src0.data(),
+                                         mask.data());
+  }
+}
+
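// --- Editor's illustrative sketch, not part of the diff ---
// Minimal usage of the single-argument overload above: atomically add 1 to
// eight consecutive ints. Assumes an ESIMD kernel context; `data` is a
// hypothetical pointer to USM memory (e.g. from sycl::malloc_shared).
inline simd<int, 8> atomic_add_example(int *data) {
  simd<uint32_t, 8> byte_offsets(0, sizeof(int)); // 0, 4, 8, ..., 28
  // All lanes enabled; returns the values held at data[0..7] before the add.
  return atomic_update<atomic_op::add, int, 8>(
      data, byte_offsets, simd<int, 8>(1), simd_mask<8>(1));
}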
/// @anchor usm_atomic_update0
/// @brief No-argument variant of the atomic update operation.
///
@@ -874,7 +920,7 @@ constexpr void check_atomic() {
/// has no arguments in addition to the value at the memory location.
///
/// @tparam Op The atomic operation - can be \c atomic_op::inc or
-/// atomic_op::dec.
+/// \c atomic_op::dec, or \c atomic_op::load.
/// @tparam Tx The vector element type.
/// @tparam N The number of memory locations to update.
/// @param p The USM pointer.
@@ -889,18 +935,23 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
                                      simd_mask<N> mask) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
  detail::check_atomic<Op, Tx, N, 0>();
-  simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
-  simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
-  vAddr += offset_i1;
-  using T = typename detail::__raw_t<Tx>;
-  return __esimd_svm_atomic0<Op, T, N>(vAddr.data(), mask.data());
+  if constexpr (Op == atomic_op::load) {
+    return atomic_update<atomic_op::bit_or, Tx, N>(p, offset, simd<Tx, N>(0),
+                                                   mask);
+  } else {
+    simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
+    simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
+    vAddr += offset_i1;
+    using T = typename detail::__raw_t<Tx>;
+    return __esimd_svm_atomic0<Op, T, N>(vAddr.data(), mask.data());
+  }
}
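// --- Editor's illustrative sketch, not part of the diff ---
// The new atomic_op::load case above emulates an atomic read as bit_or with
// zero, so a vector of atomic loads can be spelled with the no-argument
// overload; `flags` is a hypothetical USM pointer.
inline simd<int, 16> atomic_load_example(int *flags) {
  simd<uint32_t, 16> byte_offsets(0, sizeof(int));
  // Reads flags[0..15] atomically without modifying them.
  return atomic_update<atomic_op::load, int, 16>(flags, byte_offsets,
                                                 simd_mask<16>(1));
}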

/// A variation of \c atomic_update API with \c offsets represented as
/// \c simd_view object.
///
/// @tparam Op The atomic operation - can be \c atomic_op::inc or
-/// atomic_op::dec.
+/// \c atomic_op::dec, or \c atomic_op::load.
/// @tparam Tx The vector element type.
/// @tparam N The number of memory locations to update.
/// @param p The USM pointer.
@@ -919,58 +970,15 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p,
  return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), mask);
}

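// --- Editor's illustrative sketch, not part of the diff ---
// The simd_view overload accepts offsets straight from a region select, so
// no temporary simd object is needed; `counters` is a hypothetical USM
// pointer.
inline simd<int, 8> atomic_inc_view_example(int *counters) {
  simd<uint32_t, 16> all_offsets(0, sizeof(int));
  // Increment only the first 8 counters, passing their offsets as a view.
  return atomic_update<atomic_op::inc, int, 8>(
      counters, all_offsets.select<8, 1>(0), simd_mask<8>(1));
}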
-/// @anchor usm_atomic_update1
-/// @brief Single-argument variant of the atomic update operation.
-///
-/// Atomically updates \c N memory locations represented by a USM pointer and
-/// a vector of offsets relative to the pointer, and returns a vector of old
-/// values found at the memory locations before update. The update operation
-/// has 1 additional argument.
-///
-/// @tparam Op The atomic operation - can be one of the following:
-/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
-/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
-/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
-/// \c atomic_op::fmax, \c atomic_op::fmin.
-/// @tparam Tx The vector element type.
-/// @tparam N The number of memory locations to update.
-/// @param p The USM pointer.
-/// @param offset The vector of 32-bit or 64-bit offsets in bytes.
-/// @param src0 The additional argument.
-/// @param mask Operation mask, only locations with non-zero in the
-/// corresponding mask element are updated.
-/// @return A vector of the old values at the memory locations before the
-/// update.
-///
-template <atomic_op Op, typename Tx, int N, typename Toffset>
-__ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
-                                      simd<Tx, N> src0, simd_mask<N> mask) {
-  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
-  if constexpr ((Op == atomic_op::fmin) || (Op == atomic_op::fmax) ||
-                (Op == atomic_op::fadd) || (Op == atomic_op::fsub)) {
-    // Auto-convert FP atomics to LSC version. Warning is given - see enum.
-    return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset,
-                                                                src0, mask);
-  } else {
-    detail::check_atomic<Op, Tx, N, 1>();
-    simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
-    simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
-    vAddr += offset_i1;
-
-    using T = typename detail::__raw_t<Tx>;
-    return __esimd_svm_atomic1<Op, T, N>(vAddr.data(), src0.data(),
-                                         mask.data());
-  }
-}
-

/// A variation of \c atomic_update API with \c offsets represented as
/// \c simd_view object.
///
/// @tparam Op The atomic operation - can be one of the following:
-/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
-/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
-/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
-/// \c atomic_op::fmax, \c atomic_op::fmin.
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
+/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
+/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
+/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
+/// atomic_op::store.
/// @tparam Tx The vector element type.
/// @tparam N The number of memory locations to update.
/// @param p The USM pointer.
@@ -1014,12 +1022,12 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
                                      simd<Tx, N> src0, simd<Tx, N> src1,
                                      simd_mask<N> mask) {
  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
+  detail::check_atomic<Op, Tx, N, 2>();
  if constexpr (Op == atomic_op::fcmpwr) {
    // Auto-convert FP atomics to LSC version. Warning is given - see enum.
    return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(p, offset, src0,
                                                                src1, mask);
  } else {
-    detail::check_atomic<Op, Tx, N, 2>();
    simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
    simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(offset);
    vAddr += offset_i1;
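// --- Editor's illustrative sketch, not part of the diff ---
// Two-argument usage: a vectorized compare-and-swap with atomic_op::cmpxchg.
// As this overload documents its arguments, src0 is the new value and src1
// the expected value - the reverse of std::atomic's order; `locks` is a
// hypothetical USM pointer.
inline simd<int, 8> try_lock_example(int *locks) {
  simd<uint32_t, 8> byte_offsets(0, sizeof(int));
  simd<int, 8> locked(1), unlocked(0);
  // A lane whose returned old value is 0 has acquired its lock.
  return atomic_update<atomic_op::cmpxchg, int, 8>(
      locks, byte_offsets, locked, unlocked, simd_mask<8>(1));
}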
@@ -1093,10 +1101,10 @@ __ESIMD_API void fence(fence_mask cntl) { __esimd_fence(cntl); }
/// Generic work-group barrier.
/// Performs barrier synchronization for all threads within the same thread
/// group. The barrier instruction causes the executing thread to wait until
-/// all threads in the same thread group have executed the barrier instruction.
-/// Memory ordering is also guaranteed by this instruction.
-/// The behavior is undefined if this instruction is executed in divergent
-/// control flow.
+/// all threads in the same thread group have executed the barrier
+/// instruction. Memory ordering is also guaranteed by this instruction. The
+/// behavior is undefined if this instruction is executed in divergent control
+/// flow.
///
__ESIMD_API void barrier() {
  __esimd_fence(fence_mask::global_coherent_fence | fence_mask::local_barrier);
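// --- Editor's illustrative sketch, not part of the diff ---
// Typical barrier use: each thread publishes its slice of SLM, then the
// whole group synchronizes before anyone reads a neighbor's slice. Assumes
// the kernel has reserved enough SLM (e.g. via slm_init) and a group of 4
// threads; the geometry is hypothetical.
inline void barrier_example(uint32_t local_id) {
  slm_block_store<int, 8>(local_id * 8 * sizeof(int), simd<int, 8>(local_id));
  barrier(); // wait until every thread in the group has stored its slice
  simd<int, 8> neighbor =
      slm_block_load<int, 8>(((local_id + 1) % 4) * 8 * sizeof(int));
  (void)neighbor; // consume the neighbor's data here
}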
@@ -1144,8 +1152,8 @@ template <typename T> __ESIMD_API T slm_scalar_load(uint32_t offset) {

/// Scatter operation over the Shared Local Memory.
/// This API has almost the same interface as the @ref accessor_scatter
-/// "accessor-based scatter", except that it does not have the accessor and the
-/// global offset parameters.
+/// "accessor-based scatter", except that it does not have the accessor and
+/// the global offset parameters.
///
template <typename T, int N>
__ESIMD_API std::enable_if_t<(N == 1 || N == 8 || N == 16 || N == 32) &&
@@ -1165,9 +1173,9 @@ __ESIMD_API void slm_scalar_store(uint32_t offset, T val) {
  slm_scatter<T, 1>(simd<uint32_t, 1>(offset), simd<T, 1>(val), 1);
}
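// --- Editor's illustrative sketch, not part of the diff ---
// The SLM scatter and its scalar wrapper above, side by side. Assumes the
// kernel has reserved enough SLM (e.g. via slm_init); offsets are in bytes.
inline void slm_store_example() {
  simd<uint32_t, 8> byte_offsets(0, sizeof(float));          // 0, 4, ..., 28
  slm_scatter<float, 8>(byte_offsets, simd<float, 8>(1.0f)); // 8 at once
  slm_scalar_store<float>(64, 3.14f); // one element at byte offset 64
}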

-/// Gather data from the Shared Local Memory at specified \c offsets and return
-/// it as simd vector. See @ref usm_gather_rgba for information about the
-/// operation semantics and parameter restrictions/interdependencies.
+/// Gather data from the Shared Local Memory at specified \c offsets and
+/// return it as simd vector. See @ref usm_gather_rgba for information about
+/// the operation semantics and parameter restrictions/interdependencies.
/// @tparam T The element type of the returned vector.
/// @tparam N The number of elements to access.
/// @tparam RGBAMask Pixel's channel mask.
@@ -1185,9 +1193,9 @@ slm_gather_rgba(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
      SI, 0 /*global_offset*/, offsets.data(), mask.data());
}

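// --- Editor's illustrative sketch, not part of the diff ---
// slm_gather_rgba reads the channels selected by the compile-time mask from
// each pixel and returns them grouped by channel. The ABGR mask (all four
// channels) and the offsets are illustrative.
inline simd<uint32_t, 32> slm_rgba_example() {
  // One offset per pixel; each pixel occupies 4 dwords (R, G, B, A).
  simd<uint32_t, 8> pixel_offsets(0, 4 * sizeof(uint32_t));
  // 8 pixels x 4 enabled channels -> 32 returned elements.
  return slm_gather_rgba<uint32_t, 8, rgba_channel_mask::ABGR>(pixel_offsets);
}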
-/// Gather data from the Shared Local Memory at specified \c offsets and return
-/// it as simd vector. See @ref usm_scatter_rgba for information about the
-/// operation semantics and parameter restrictions/interdependencies.
+/// Scatter data to the Shared Local Memory at specified \c offsets. See
+/// @ref usm_scatter_rgba for information about the operation semantics and
+/// parameter restrictions/interdependencies.
/// @tparam T The element type of the returned vector.
/// @tparam N The number of elements to access.
/// @tparam Mask Pixel's channel mask.
@@ -1565,9 +1573,9 @@ void simd_obj_impl<T, N, T1, SFINAE>::copy_to(
    if constexpr (RemN == 1) {
      Addr[NumChunks * ChunkSize] = Tmp[NumChunks * ChunkSize];
    } else if constexpr (RemN == 8 || RemN == 16) {
-      // TODO: GPU runtime may handle scatter of 16 byte elements incorrectly.
-      // The code below is a workaround which must be deleted once GPU runtime
-      // is fixed.
+      // TODO: GPU runtime may handle scatter of 16 byte elements
+      // incorrectly. The code below is a workaround which must be deleted
+      // once GPU runtime is fixed.
      if constexpr (sizeof(T) == 1 && RemN == 16) {
        if constexpr (Align % OperandSize::DWORD > 0) {
          ForHelper<RemN>::unroll([Addr, &Tmp](unsigned Index) {
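// --- Editor's illustrative sketch, not part of the diff ---
// A caller that can reach the RemN == 16, sizeof(T) == 1 remainder path
// patched above: copying a byte vector whose tail is handled element-wise.
// `dst` is a hypothetical USM pointer.
inline void copy_to_example(uint8_t *dst) {
  simd<uint8_t, 16> bytes(0, 1); // 0, 1, ..., 15
  bytes.copy_to(dst);            // may take the scatter workaround path
}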