@@ -175,6 +175,24 @@ __ESIMD_API simd<Tx, N> gather(const Tx *p,
175
175
return gather<Tx, N>(p, simd<Ty, N>(offsets), mask);
176
176
}
177
177
178
+ // / A variation of \c gather API with \c offset represented as a scalar.
179
+ // /
180
+ // / @tparam Tx Element type, must be of size 4 or less.
181
+ // / @tparam N Number of elements to read; can be 1, 2, 4, 8, 16 or 32.
182
+ // / @tparam Toffset Offset type; this overload is enabled only for integral types.
183
+ // / @param p The base address.
184
+ // / @param offset The scalar 32-bit or 64-bit offset in bytes; it is converted to
185
+ // / \c simd<Toffset, N>. ((byte*)p + offset) must be element size aligned.
186
+ // / @param mask The access mask, defaults to all 1s.
187
+ // / @return A vector of elements read. Elements in masked out lanes are
188
+ // / undefined.
189
+ // /
190
+ template <typename Tx, int N, typename Toffset>
191
+ __ESIMD_API std::enable_if_t <std::is_integral_v<Toffset>, simd<Tx, N>>
192
+ gather (const Tx *p, Toffset offset, simd_mask<N> mask = 1 ) {
193
+ return gather<Tx, N>(p, simd<Toffset, N>(offset), mask);
194
+ }
195
+
178
196
// / Writes ("scatters") elements of the input vector to different memory
179
197
// / locations. Each memory location is base address plus an offset - a
180
198
// / value of the corresponding element in the input offset vector. Access to
@@ -236,6 +254,23 @@ __ESIMD_API void scatter(Tx *p, simd_view<Toffset, RegionTy> offsets,
236
254
scatter<Tx, N>(p, simd<Ty, N>(offsets), vals, mask);
237
255
}
238
256
257
+ // / A variation of \c scatter API with \c offset represented as a scalar.
258
+ // /
259
+ // / @tparam Tx Element type, must be of size 4 or less.
260
+ // / @tparam N Number of elements to write; must be \c 1 for this overload.
261
+ // / @tparam Toffset Offset type; this overload is enabled only for integral types.
262
+ // / @param p The base address.
263
+ // / @param offset The scalar 32-bit or 64-bit offset in bytes; it is converted to
264
+ // / \c simd<Toffset, N>. ((byte*)p + offset) must be element size aligned.
265
+ // / @param vals The vector to scatter.
266
+ // / @param mask The access mask, defaults to all 1s.
267
+ // /
268
+ template <typename Tx, int N, typename Toffset>
269
+ __ESIMD_API std::enable_if_t <std::is_integral_v<Toffset> && N == 1 >
270
+ scatter (Tx *p, Toffset offset, simd<Tx, N> vals, simd_mask<N> mask = 1 ) {
271
+ scatter<Tx, N>(p, simd<Toffset, N>(offset), vals, mask);
272
+ }
273
+
239
274
// / Loads a contiguous block of memory from given memory address and returns
240
275
// / the loaded data as a vector. Actual code generated depends on the
241
276
// / alignment parameter.
@@ -635,6 +670,29 @@ gather_rgba(const T *p, simd_view<Toffset, RegionTy> offsets,
635
670
return gather_rgba<RGBAMask, T, N>(p, simd<Ty, N>(offsets), mask);
636
671
}
637
672
673
+ // / A variation of \c gather_rgba API with \c offset represented as a
674
+ // / scalar.
675
+ // /
676
+ // / @tparam T Element type of the returned vector. Must be 4 bytes in size.
677
+ // / @tparam N Number of pixels to access. Must be 8, 16 or 32.
678
+ // / @tparam RGBAMask A pixel's channel mask.
679
+ // / @tparam Toffset Offset type; this overload is enabled only for integral types.
680
+ // / @param p The USM base pointer representing memory address of the access.
681
+ // / @param offset Scalar byte offset of the pixels relative to the base
682
+ // / pointer; it is converted to \c simd<Toffset, N>.
683
+ // / @param mask Memory access mask. Pixels with zero corresponding mask's
684
+ // / predicate are not accessed. Their values in the resulting vector are
685
+ // / undefined.
686
+ // / @return Read data - up to N*4 values of type \c T.
687
+ // /
688
+ template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
689
+ int N, typename Toffset>
690
+ __ESIMD_API std::enable_if_t <std::is_integral_v<Toffset>,
691
+ simd<T, N * get_num_channels_enabled (RGBAMask)>>
692
+ gather_rgba(const T *p, Toffset offset, simd_mask<N> mask = 1 ) {
693
+ return gather_rgba<RGBAMask, T, N>(p, simd<Toffset, N>(offset), mask);
694
+ }
695
+
638
696
template <typename T, int N, rgba_channel_mask RGBAMask>
639
697
__SYCL_DEPRECATED (" use gather_rgba<rgba_channel_mask>()" )
640
698
__ESIMD_API std::enable_if_t <
@@ -719,6 +777,30 @@ scatter_rgba(T *p, simd_view<Toffset, RegionTy> offsets,
719
777
scatter_rgba<RGBAMask, T, N>(p, simd<Ty, N>(offsets), vals, mask);
720
778
}
721
779
780
+ // / A variation of \c scatter_rgba API with \c offset represented as a
781
+ // / scalar.
782
+ // /
783
+ // / @tparam T Element type of the vector to write. Must be 4 bytes in size.
784
+ // / @tparam N Number of pixels to access; this overload is enabled only for
785
+ // / \c N == 1.
786
+ // / @tparam RGBAMask A pixel's channel mask.
787
+ // / @tparam Toffset Offset type; this overload is enabled only for integral types.
788
+ // / @param p The USM base pointer representing memory address of the access.
789
+ // / @param offset Scalar byte offset of the pixels relative to the base
790
+ // / pointer; it is converted to \c simd<Toffset, N>.
791
+ // / @param vals Values to be written.
792
+ // / @param mask Memory access mask. Pixels with zero corresponding mask's
793
+ // / predicate are not accessed.
794
+ // /
795
+ template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
796
+ int N, typename Toffset>
797
+ __ESIMD_API std::enable_if_t <std::is_integral_v<Toffset> && N == 1 >
798
+ scatter_rgba (T *p, Toffset offset,
799
+ simd<T, N * get_num_channels_enabled (RGBAMask)> vals,
800
+ simd_mask<N> mask = 1) {
801
+ scatter_rgba<RGBAMask, T, N>(p, simd<Toffset, N>(offset), vals, mask);
802
+ }
803
+
722
804
template <typename T, int N, rgba_channel_mask RGBAMask>
723
805
__SYCL_DEPRECATED (" use scatter_rgba<rgba_channel_mask>()" )
724
806
__ESIMD_API std::
@@ -911,6 +993,61 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
911
993
}
912
994
}
913
995
996
+ // / A variation of \c atomic_update API with \c offsets represented as
997
+ // / a \c simd_view object.
998
+ // /
999
+ // / @tparam Op The atomic operation - can be one of the following:
1000
+ // / \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
1001
+ // / \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
1002
+ // / \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
1003
+ // / \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::store.
1004
+ // / @tparam Tx The vector element type.
1005
+ // / @tparam N The number of memory locations to update.
1006
+ // / @param p The USM pointer.
1007
+ // / @param offsets The simd_view of 32-bit or 64-bit offsets in bytes; it is
1008
+ // / converted to a \c simd vector of the view's element type before forwarding.
1009
+ // / @param src0 The additional argument.
1010
+ // / @param mask Operation mask, only locations with non-zero in the
1011
+ // / corresponding mask element are updated.
1012
+ // / @return A vector of the old values at the memory locations before the
1013
+ // / update.
1014
+ // /
1015
+ template <atomic_op Op, typename Tx, int N, typename Toffset,
1016
+ typename RegionTy = region1d_t <Toffset, N, 1 >>
1017
+ __ESIMD_API simd<Tx, N> atomic_update (Tx *p,
1018
+ simd_view<Toffset, RegionTy> offsets,
1019
+ simd<Tx, N> src0, simd_mask<N> mask) {
1020
+ using Ty = typename simd_view<Toffset, RegionTy>::element_type;
1021
+ return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), src0, mask);
1022
+ }
1023
+
1024
+ // / A variation of \c atomic_update API with \c offset represented as a scalar.
1025
+ // / Enabled only for an integral \c Toffset; \c atomic_op::store and
1026
+ // / \c atomic_op::xchg are additionally accepted only when \c N == 1.
1027
+ // / @tparam Op The atomic operation - can be one of the following:
1028
+ // / \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
1029
+ // / \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
1030
+ // / \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
1031
+ // / \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::store.
1032
+ // / @tparam Tx The vector element type.
1033
+ // / @tparam N The number of memory locations to update.
1034
+ // / @param p The USM pointer.
1035
+ // / @param offset The scalar 32-bit or 64-bit offset in bytes.
1036
+ // / @param src0 The additional argument.
1037
+ // / @param mask Operation mask, only locations with non-zero in the
1038
+ // / corresponding mask element are updated.
1039
+ // / @return A vector of the old values at the memory locations before the
1040
+ // / update.
1041
+ // /
1042
+ template <atomic_op Op, typename Tx, int N, typename Toffset>
1043
+ __ESIMD_API std::enable_if_t <
1044
+ std::is_integral_v<Toffset> &&
1045
+ ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
1046
+ simd<Tx, N>>
1047
+ atomic_update(Tx *p, Toffset offset, simd<Tx, N> src0, simd_mask<N> mask) {
1048
+ return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(offset), src0, mask);
1049
+ }
1050
+
914
1051
// / @anchor usm_atomic_update0
915
1052
// / @brief No-argument variant of the atomic update operation.
916
1053
// /
@@ -970,32 +1107,24 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p,
970
1107
return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), mask);
971
1108
}
972
1109
973
- // / A variation of \c atomic_update API with \c offsets represented as
974
- // / \c simd_view object .
1110
+ // / A variation of \c atomic_update API with \c offset represented as a
1111
+ // / scalar.
975
1112
// /
976
- // / @tparam Op The atomic operation - can be one of the following:
977
- // / \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
978
- // / atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
979
- // / atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
980
- // / atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
981
- // / atomic_op::save.
1113
+ // / @tparam Op The atomic operation - can be \c atomic_op::inc,
1114
+ // / \c atomic_op::dec or \c atomic_op::load.
982
1115
// / @tparam Tx The vector element type.
983
1116
// / @tparam N The number of memory locations to update.
984
1117
// / @param p The USM pointer.
985
- // / @param offset The simd_view of 32-bit or 64-bit offsets in bytes.
986
- // / @param src0 The additional argument.
1118
+ // / @param offset The scalar 32-bit or 64-bit offset in bytes; converted to \c simd<Toffset, N>.
987
1119
// / @param mask Operation mask, only locations with non-zero in the
988
1120
// / corresponding mask element are updated.
989
1121
// / @return A vector of the old values at the memory locations before the
990
1122
// / update.
991
1123
// /
992
- template <atomic_op Op, typename Tx, int N, typename Toffset,
993
- typename RegionTy = region1d_t <Toffset, N, 1 >>
994
- __ESIMD_API simd<Tx, N> atomic_update (Tx *p,
995
- simd_view<Toffset, RegionTy> offsets,
996
- simd<Tx, N> src0, simd_mask<N> mask) {
997
- using Ty = typename simd_view<Toffset, RegionTy>::element_type;
998
- return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), src0, mask);
1124
+ template <atomic_op Op, typename Tx, int N, typename Toffset>
1125
+ __ESIMD_API std::enable_if_t <std::is_integral_v<Toffset>, simd<Tx, N>>
1126
+ atomic_update (Tx *p, Toffset offset, simd_mask<N> mask = 1 ) {
1127
+ return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(offset), mask);
999
1128
}
1000
1129
1001
1130
// / @anchor usm_atomic_update2
@@ -1062,6 +1191,30 @@ atomic_update(Tx *p, simd_view<Toffset, RegionTy> offsets, simd<Tx, N> src0,
1062
1191
return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), src0, src1, mask);
1063
1192
}
1064
1193
1194
+ // / A variation of \c atomic_update API with \c offset represented as a
1195
+ // / scalar.
1196
+ // /
1197
+ // / @tparam Op Either \c atomic_op::cmpxchg or \c atomic_op::fcmpwr.
1198
+ // / @tparam Tx The vector element type.
1199
+ // / @tparam N The number of memory locations to update.
1200
+ // / @tparam Toffset Offset type; this overload is enabled only for integral types.
1201
+ // / @param p The USM pointer.
1202
+ // / @param offset The scalar 32-bit or 64-bit offset in bytes.
1203
+ // / @param src0 The first additional argument (new value).
1204
+ // / @param src1 The second additional argument (expected value).
1205
+ // / @param mask Operation mask, only locations with non-zero in the
1206
+ // / corresponding mask element are updated.
1207
+ // / @return A vector of the old values at the memory locations before the
1208
+ // / update.
1209
+ // /
1210
+ template <atomic_op Op, typename Tx, int N, typename Toffset>
1211
+ __ESIMD_API std::enable_if_t <std::is_integral_v<Toffset>, simd<Tx, N>>
1212
+ atomic_update (Tx *p, Toffset offset, simd<Tx, N> src0, simd<Tx, N> src1,
1213
+ simd_mask<N> mask) {
1214
+ return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(offset), src0, src1,
1215
+ mask);
1216
+ }
1217
+
1065
1218
// / @} sycl_esimd_memory_atomics
1066
1219
1067
1220
// / @addtogroup sycl_esimd_memory
@@ -1187,7 +1340,6 @@ template <typename T, int N, rgba_channel_mask RGBAMask>
1187
1340
__ESIMD_API std::enable_if_t <(N == 8 || N == 16 || N == 32 ) && (sizeof (T) == 4 ),
1188
1341
simd<T, N * get_num_channels_enabled (RGBAMask)>>
1189
1342
slm_gather_rgba(simd<uint32_t , N> offsets, simd_mask<N> mask = 1 ) {
1190
-
1191
1343
const auto SI = __ESIMD_NS::get_surface_index (detail::LocalAccessorMarker ());
1192
1344
return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
1193
1345
SI, 0 /* global_offset*/ , offsets.data (), mask.data ());
@@ -1395,7 +1547,6 @@ __ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y,
1395
1547
// / @cond EXCLUDE
1396
1548
1397
1549
namespace detail {
1398
-
1399
1550
// ----- Outlined implementations of simd_obj_impl class memory access APIs.
1400
1551
1401
1552
template <typename T, int N, class T1 , class SFINAE >
0 commit comments