Skip to content

Commit 035ef2b

Browse files
authored
[SYCL][ESIMD] Fix an error when a scalar offset is provided as a parameter to the API (#8075)
1 parent c3c5e92 commit 035ef2b

File tree

2 files changed

+266
-19
lines changed

2 files changed

+266
-19
lines changed

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 170 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,24 @@ __ESIMD_API simd<Tx, N> gather(const Tx *p,
175175
return gather<Tx, N>(p, simd<Ty, N>(offsets), mask);
176176
}
177177

178+
/// A variation of \c gather API with \c offset represented as scalar.
179+
///
180+
/// @tparam Tx Element type, must be of size 4 or less.
181+
/// @tparam N Number of elements to read; can be \c 1, \c 2, \c 4, \c 8, \c 16
182+
/// or \c 32.
183+
/// @param p The base address.
184+
/// @param offset the scalar 32-bit or 64-bit offset in bytes.
185+
/// ((byte*)p + offset) must be element size aligned.
186+
/// @param mask The access mask, defaults to all 1s.
187+
/// @return A vector of elements read. Elements in masked out lanes are
188+
/// undefined.
189+
///
190+
template <typename Tx, int N, typename Toffset>
191+
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
192+
gather(const Tx *p, Toffset offset, simd_mask<N> mask = 1) {
193+
return gather<Tx, N>(p, simd<Toffset, N>(offset), mask);
194+
}
195+
178196
/// Writes ("scatters") elements of the input vector to different memory
179197
/// locations. Each memory location is base address plus an offset - a
180198
/// value of the corresponding element in the input offset vector. Access to
@@ -236,6 +254,23 @@ __ESIMD_API void scatter(Tx *p, simd_view<Toffset, RegionTy> offsets,
236254
scatter<Tx, N>(p, simd<Ty, N>(offsets), vals, mask);
237255
}
238256

257+
/// A variation of \c scatter API with \c offset represented as scalar.
258+
///
259+
/// @tparam Tx Element type, must be of size 4 or less.
260+
/// @tparam N Number of elements to write. This scalar-offset overload is
261+
/// enabled only for \c N == 1.
262+
/// @param p The base address.
263+
/// @param offset the scalar 32-bit or 64-bit offset in bytes.
264+
/// ((byte*)p + offset) must be element size aligned.
265+
/// @param vals The vector to scatter.
266+
/// @param mask The access mask, defaults to all 1s.
267+
///
268+
template <typename Tx, int N, typename Toffset>
269+
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
270+
scatter(Tx *p, Toffset offset, simd<Tx, N> vals, simd_mask<N> mask = 1) {
271+
scatter<Tx, N>(p, simd<Toffset, N>(offset), vals, mask);
272+
}
273+
239274
/// Loads a contiguous block of memory from given memory address and returns
240275
/// the loaded data as a vector. Actual code generated depends on the
241276
/// alignment parameter.
@@ -635,6 +670,29 @@ gather_rgba(const T *p, simd_view<Toffset, RegionTy> offsets,
635670
return gather_rgba<RGBAMask, T, N>(p, simd<Ty, N>(offsets), mask);
636671
}
637672

673+
/// A variation of \c gather_rgba API with \c offset represented as
674+
/// scalar.
675+
///
676+
/// @tparam T Element type of the returned vector. Must be 4 bytes in size.
677+
/// @tparam N Number of pixels to access (matches the size of the \c offsets
678+
/// vector). Must be 8, 16 or 32.
679+
/// @tparam RGBAMask A pixel's channel mask.
680+
/// @param p The USM base pointer representing memory address of the access.
681+
/// @param offset The scalar byte offset of the pixels relative to the base
682+
/// pointer.
683+
/// @param mask Memory access mask. Pixels with zero corresponding mask's
684+
/// predicate are not accessed. Their values in the resulting vector are
685+
/// undefined.
686+
/// @return Read data - up to N*4 values of type \c T.
687+
///
688+
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
689+
int N, typename Toffset>
690+
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>,
691+
simd<T, N * get_num_channels_enabled(RGBAMask)>>
692+
gather_rgba(const T *p, Toffset offset, simd_mask<N> mask = 1) {
693+
return gather_rgba<RGBAMask, T, N>(p, simd<Toffset, N>(offset), mask);
694+
}
695+
638696
template <typename T, int N, rgba_channel_mask RGBAMask>
639697
__SYCL_DEPRECATED("use gather_rgba<rgba_channel_mask>()")
640698
__ESIMD_API std::enable_if_t<
@@ -719,6 +777,30 @@ scatter_rgba(T *p, simd_view<Toffset, RegionTy> offsets,
719777
scatter_rgba<RGBAMask, T, N>(p, simd<Ty, N>(offsets), vals, mask);
720778
}
721779

780+
/// A variation of \c scatter_rgba API with \c offset represented as
781+
/// scalar.
782+
///
783+
/// @tparam T Element type of the vector to write. Must be 4 bytes in size.
784+
/// @tparam N Number of pixels to access. This scalar-offset overload is
785+
/// enabled only for \c N == 1.
786+
/// @tparam RGBAMask A pixel's channel mask.
787+
/// @param p The USM base pointer representing memory address of the access.
788+
/// @param vals values to be written.
789+
/// @param offset scalar byte offset of the pixels relative to the base
790+
/// pointer.
791+
/// @param mask Memory access mask. Pixels with zero corresponding mask's
792+
/// predicate are not accessed, so the corresponding memory is not
793+
/// written.
794+
///
795+
template <rgba_channel_mask RGBAMask = rgba_channel_mask::ABGR, typename T,
796+
int N, typename Toffset>
797+
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset> && N == 1>
798+
scatter_rgba(T *p, Toffset offset,
799+
simd<T, N * get_num_channels_enabled(RGBAMask)> vals,
800+
simd_mask<N> mask = 1) {
801+
scatter_rgba<RGBAMask, T, N>(p, simd<Toffset, N>(offset), vals, mask);
802+
}
803+
722804
template <typename T, int N, rgba_channel_mask RGBAMask>
723805
__SYCL_DEPRECATED("use scatter_rgba<rgba_channel_mask>()")
724806
__ESIMD_API std::
@@ -911,6 +993,61 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p, simd<Toffset, N> offset,
911993
}
912994
}
913995

996+
/// A variation of \c atomic_update API with \c offsets represented as
997+
/// \c simd_view object.
998+
///
999+
/// @tparam Op The atomic operation - can be one of the following:
1000+
/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
1001+
/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
1002+
/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
1003+
/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
1004+
/// atomic_op::store.
1005+
/// @tparam Tx The vector element type.
1006+
/// @tparam N The number of memory locations to update.
1007+
/// @param p The USM pointer.
1008+
/// @param offset The simd_view of 32-bit or 64-bit offsets in bytes.
1009+
/// @param src0 The additional argument.
1010+
/// @param mask Operation mask, only locations with non-zero in the
1011+
/// corresponding mask element are updated.
1012+
/// @return A vector of the old values at the memory locations before the
1013+
/// update.
1014+
///
1015+
template <atomic_op Op, typename Tx, int N, typename Toffset,
1016+
typename RegionTy = region1d_t<Toffset, N, 1>>
1017+
__ESIMD_API simd<Tx, N> atomic_update(Tx *p,
1018+
simd_view<Toffset, RegionTy> offsets,
1019+
simd<Tx, N> src0, simd_mask<N> mask) {
1020+
using Ty = typename simd_view<Toffset, RegionTy>::element_type;
1021+
return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), src0, mask);
1022+
}
1023+
1024+
/// A variation of \c atomic_update API with \c offset represented as
1025+
/// scalar object.
1026+
///
1027+
/// @tparam Op The atomic operation - can be one of the following:
1028+
/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
1029+
/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
1030+
/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
1031+
/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::store.
1032+
/// @tparam Tx The vector element type.
1033+
/// @tparam N The number of memory locations to update.
1034+
/// @param p The USM pointer.
1035+
/// @param offset The scalar 32-bit or 64-bit offset in bytes.
1036+
/// @param src0 The additional argument.
1037+
/// @param mask Operation mask, only locations with non-zero in the
1038+
/// corresponding mask element are updated.
1039+
/// @return A vector of the old values at the memory locations before the
1040+
/// update.
1041+
///
1042+
template <atomic_op Op, typename Tx, int N, typename Toffset>
1043+
__ESIMD_API std::enable_if_t<
1044+
std::is_integral_v<Toffset> &&
1045+
((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
1046+
simd<Tx, N>>
1047+
atomic_update(Tx *p, Toffset offset, simd<Tx, N> src0, simd_mask<N> mask) {
1048+
return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(offset), src0, mask);
1049+
}
1050+
9141051
/// @anchor usm_atomic_update0
9151052
/// @brief No-argument variant of the atomic update operation.
9161053
///
@@ -970,32 +1107,24 @@ __ESIMD_API simd<Tx, N> atomic_update(Tx *p,
9701107
return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), mask);
9711108
}
9721109

973-
/// A variation of \c atomic_update API with \c offsets represented as
974-
/// \c simd_view object.
1110+
/// A variation of \c atomic_update API with \c offset represented as
1111+
/// scalar.
9751112
///
976-
/// @tparam Op The atomic operation - can be one of the following:
977-
/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
978-
/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
979-
/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
980-
/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
981-
/// atomic_op::save.
1113+
/// @tparam Op The atomic operation - can be \c atomic_op::inc,
1114+
/// \c atomic_op::dec or \c atomic_op::load.
9821115
/// @tparam Tx The vector element type.
9831116
/// @tparam N The number of memory locations to update.
9841117
/// @param p The USM pointer.
985-
/// @param offset The simd_view of 32-bit or 64-bit offsets in bytes.
986-
/// @param src0 The additional argument.
1118+
/// @param offset The scalar 32-bit or 64-bit offset in bytes.
9871119
/// @param mask Operation mask, only locations with non-zero in the
9881120
/// corresponding mask element are updated.
9891121
/// @return A vector of the old values at the memory locations before the
9901122
/// update.
9911123
///
992-
template <atomic_op Op, typename Tx, int N, typename Toffset,
993-
typename RegionTy = region1d_t<Toffset, N, 1>>
994-
__ESIMD_API simd<Tx, N> atomic_update(Tx *p,
995-
simd_view<Toffset, RegionTy> offsets,
996-
simd<Tx, N> src0, simd_mask<N> mask) {
997-
using Ty = typename simd_view<Toffset, RegionTy>::element_type;
998-
return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), src0, mask);
1124+
template <atomic_op Op, typename Tx, int N, typename Toffset>
1125+
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
1126+
atomic_update(Tx *p, Toffset offset, simd_mask<N> mask = 1) {
1127+
return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(offset), mask);
9991128
}
10001129

10011130
/// @anchor usm_atomic_update2
@@ -1062,6 +1191,30 @@ atomic_update(Tx *p, simd_view<Toffset, RegionTy> offsets, simd<Tx, N> src0,
10621191
return atomic_update<Op, Tx, N>(p, simd<Ty, N>(offsets), src0, src1, mask);
10631192
}
10641193

1194+
/// A variation of \c atomic_update API with \c offset represented as
1195+
/// scalar.
1196+
///
1197+
/// @tparam Op The atomic operation - can be one of the following:
1198+
/// \c atomic_op::cmpxchg, \c atomic_op::fcmpwr.
1199+
/// @tparam Tx The vector element type.
1200+
/// @tparam N The number of memory locations to update.
1201+
/// @param p The USM pointer.
1202+
/// @param offset The scalar 32-bit or 64-bit offset in bytes.
1203+
/// @param src0 The first additional argument (new value).
1204+
/// @param src1 The second additional argument (expected value).
1205+
/// @param mask Operation mask, only locations with non-zero in the
1206+
/// corresponding mask element are updated.
1207+
/// @return A vector of the old values at the memory locations before the
1208+
/// update.
1209+
///
1210+
template <atomic_op Op, typename Tx, int N, typename Toffset>
1211+
__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
1212+
atomic_update(Tx *p, Toffset offset, simd<Tx, N> src0, simd<Tx, N> src1,
1213+
simd_mask<N> mask) {
1214+
return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(offset), src0, src1,
1215+
mask);
1216+
}
1217+
10651218
/// @} sycl_esimd_memory_atomics
10661219

10671220
/// @addtogroup sycl_esimd_memory
@@ -1187,7 +1340,6 @@ template <typename T, int N, rgba_channel_mask RGBAMask>
11871340
__ESIMD_API std::enable_if_t<(N == 8 || N == 16 || N == 32) && (sizeof(T) == 4),
11881341
simd<T, N * get_num_channels_enabled(RGBAMask)>>
11891342
slm_gather_rgba(simd<uint32_t, N> offsets, simd_mask<N> mask = 1) {
1190-
11911343
const auto SI = __ESIMD_NS::get_surface_index(detail::LocalAccessorMarker());
11921344
return __esimd_gather4_masked_scaled2<T, N, RGBAMask>(
11931345
SI, 0 /*global_offset*/, offsets.data(), mask.data());
@@ -1395,7 +1547,6 @@ __ESIMD_API void media_block_store(AccessorTy acc, unsigned x, unsigned y,
13951547
/// @cond EXCLUDE
13961548

13971549
namespace detail {
1398-
13991550
// ----- Outlined implementations of simd_obj_impl class memory access APIs.
14001551

14011552
template <typename T, int N, class T1, class SFINAE>

0 commit comments

Comments
 (0)