Skip to content

Commit 19cd614

Browse files
authored
[SYCL][ESIMD] Implement unified memory API - atomic_update(acc,...) with two operands (#12055)
This implements atomic_update with two operands and accessors. We only need four overloads to cover all cases because we can rely on the simd_view to simd implicit conversion operator. The behavior of every possible combination is locked down in a compile time test, and I manually verified the all the combinations are correct and nothing is missed. I also manually tested on PVC. --------- Signed-off-by: Sarnie, Nick <[email protected]>
1 parent 5dc902b commit 19cd614

File tree

5 files changed

+456
-128
lines changed

5 files changed

+456
-128
lines changed

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 241 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -3928,7 +3928,7 @@ atomic_update_impl(AccessorTy acc, simd<Toffset, N> byte_offsets,
39283928
/// @tparam L2H is L2 cache hint.
39293929
/// @tparam AccessorTy is the \ref sycl::accessor type.
39303930
/// @param acc is the SYCL accessor.
3931-
/// @param offsets is the zero-based offsets.
3931+
/// @param byte_offset is the zero-based offsets.
39323932
/// @param src0 is the first atomic operand.
39333933
/// @param pred is predicates.
39343934
///
@@ -3971,6 +3971,63 @@ atomic_update_impl(AccessorTy acc, simd<Toffset, N> byte_offset,
39713971
return lsc_format_ret<T>(Tmp);
39723972
#endif
39733973
}
3974+
3975+
/// Accessor-based atomic.
3976+
/// Supported platforms: DG2, PVC
3977+
/// VISA instruction: lsc_atomic_<OP>.ugm
3978+
///
3979+
/// @tparam Op is operation type.
3980+
/// @tparam T is element type.
3981+
/// @tparam N is the number of channels (platform dependent).
3982+
/// @tparam DS is the data size.
3983+
/// @tparam L1H is L1 cache hint.
3984+
/// @tparam L2H is L2 cache hint.
3985+
/// @tparam AccessorTy is the \ref sycl::accessor type.
3986+
/// @param acc is the SYCL accessor.
3987+
/// @param byte_offset is the zero-based offsets.
3988+
/// @param src0 is the first atomic operand (expected value).
3989+
/// @param src1 is the second atomic operand (new value).
3990+
/// @param pred is predicates.
3991+
///
3992+
/// @return A vector of the old values at the memory locations before the
3993+
/// update.
3994+
template <atomic_op Op, typename T, int N, lsc_data_size DS, cache_hint L1H,
3995+
cache_hint L2H, typename AccessorTy, typename Toffset>
3996+
__ESIMD_API std::enable_if_t<
3997+
get_num_args<Op>() == 2 &&
3998+
sycl::detail::acc_properties::is_accessor_v<AccessorTy> &&
3999+
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>,
4000+
simd<T, N>>
4001+
atomic_update_impl(AccessorTy acc, simd<Toffset, N> byte_offset,
4002+
simd<T, N> src0, simd<T, N> src1, simd_mask<N> pred) {
4003+
#ifdef __ESIMD_FORCE_STATELESS_MEM
4004+
return atomic_update_impl<Op, T, N, DS, L1H, L2H>(
4005+
__ESIMD_DNS::accessorToPointer<T>(acc), byte_offset, src0, src1, pred);
4006+
#else
4007+
static_assert(std::is_integral_v<Toffset> && sizeof(Toffset) == 4,
4008+
"Unsupported offset type");
4009+
check_lsc_vector_size<1>();
4010+
check_lsc_data_size<T, DS>();
4011+
check_atomic<Op, T, N, 2, /*IsLSC*/ true>();
4012+
check_cache_hint<cache_action::atomic, L1H, L2H>();
4013+
constexpr uint16_t AddressScale = 1;
4014+
constexpr int ImmOffset = 0;
4015+
constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
4016+
constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
4017+
constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
4018+
using MsgT = typename lsc_expand_type<T>::type;
4019+
constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
4020+
simd<MsgT, N> Msg_data0 = lsc_format_input<MsgT>(src0);
4021+
simd<MsgT, N> Msg_data1 = lsc_format_input<MsgT>(src1);
4022+
auto si = get_surface_index(acc);
4023+
simd<MsgT, N> Tmp =
4024+
__esimd_lsc_xatomic_bti_2<MsgT, IOp, L1H, L2H, AddressScale, ImmOffset,
4025+
EDS, VS, Transposed, N>(
4026+
pred.data(), byte_offset.data(), Msg_data0.data(), Msg_data1.data(),
4027+
si);
4028+
return lsc_format_ret<T>(Tmp);
4029+
#endif
4030+
}
39744031
} // namespace detail
39754032

39764033
/// @addtogroup sycl_esimd_memory_atomics
@@ -6143,58 +6200,220 @@ atomic_update(AccessorTy acc, Toffset offset, simd<Tx, N> src0,
61436200
}
61446201

61456202
/// @anchor accessor_atomic_update2
6203+
/// @brief Two-argument variant of the atomic update operation.
6204+
///
6205+
/// simd<T, N>
6206+
/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
6207+
/// simd<T, N> src0, simd<T, N> src1,
6208+
// simd_mask<N> mask,props = {}); // (acc-au2-1)
6209+
///
6210+
/// simd<T, N>
6211+
/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
6212+
/// simd<T, N> src0, simd<T, N> src1,
6213+
/// props = {}); // (acc-au2-2)
6214+
/// simd<T, N>
6215+
/// atomic_update(AccessorTy acc, simd_view<OffsetObjT, OffsetRegionTy>
6216+
/// byte_offset, simd<T, N> src0, simd<T, N> src1,
6217+
/// simd_mask<N> mask, props = {}); // (acc-au2-3)
6218+
///
6219+
/// simd<T, N>
6220+
/// atomic_update(AccessorTy acc,
6221+
/// simd_view<OffsetObjT, OffsetRegionTy>, byte_offset,
6222+
/// simd<T, N> src0, simd<T, N> src1, props = {}); // (acc-au2-4)
6223+
///
6224+
/// simd<T, N>
6225+
/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
6226+
/// simd<T, N> src0, simd<T, N> src1,
6227+
// simd_mask<N> mask,props = {}); // (acc-au2-1)
6228+
///
61466229
/// Atomically updates \c N memory locations represented by an accessor and
61476230
/// a vector of offsets and returns a vector of old
61486231
/// values found at the memory locations before update. The update operation
61496232
/// has 2 additional arguments.
61506233
///
61516234
/// @tparam Op The atomic operation - can be one of the following:
61526235
/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
6153-
/// @tparam Tx The vector element type.
6236+
/// @tparam T The vector element type.
61546237
/// @tparam N The number of memory locations to update.
61556238
/// @tparam AccessorTy type of the SYCL accessor.
61566239
/// @param acc The SYCL accessor.
6157-
/// @param offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
6240+
/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
61586241
/// offsets are supported only when stateless memory accesses are enforced,
61596242
/// i.e. accessor based accesses are automatically converted to stateless
61606243
/// accesses.
61616244
/// @param src0 The first additional argument (new value).
61626245
/// @param src1 The second additional argument (expected value).
61636246
/// @param mask Operation mask, only locations with non-zero in the
61646247
/// corresponding mask element are updated.
6248+
/// @param props The parameter 'props' specifies the optional compile-time
6249+
/// properties list. Only L1/L2 properties are used.
6250+
// Other properties are ignored.
61656251
/// @return A vector of the old values at the memory locations before the
61666252
/// update.
61676253
///
6168-
template <atomic_op Op, typename Tx, int N, typename Toffset,
6169-
typename AccessorTy>
6254+
template <atomic_op Op, typename T, int N, typename Toffset,
6255+
typename AccessorTy,
6256+
typename PropertyListT =
6257+
ext::oneapi::experimental::detail::empty_properties_t>
61706258
__ESIMD_API std::enable_if_t<
6171-
std::is_integral_v<Toffset> &&
6172-
sycl::detail::acc_properties::is_accessor_v<AccessorTy> &&
6173-
!sycl::detail::acc_properties::is_local_accessor_v<AccessorTy>,
6174-
simd<Tx, N>>
6175-
atomic_update(AccessorTy acc, simd<Toffset, N> offset, simd<Tx, N> src0,
6176-
simd<Tx, N> src1, simd_mask<N> mask) {
6259+
__ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
6260+
__ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
6261+
ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6262+
simd<T, N>>
6263+
atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
6264+
simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
61776265
#ifdef __ESIMD_FORCE_STATELESS_MEM
6178-
return atomic_update<Op, Tx, N>(__ESIMD_DNS::accessorToPointer<Tx>(acc),
6179-
offset, src0, src1, mask);
6266+
return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
6267+
byte_offset, src0, src1, mask, props);
61806268
#else
6269+
constexpr auto L1Hint =
6270+
detail::getPropertyValue<PropertyListT, cache_hint_L1_key>(
6271+
cache_hint::none);
6272+
6273+
constexpr auto L2Hint =
6274+
detail::getPropertyValue<PropertyListT, cache_hint_L2_key>(
6275+
cache_hint::none);
6276+
6277+
static_assert(!PropertyListT::template has_property<cache_hint_L3_key>(),
6278+
"L3 cache hint is reserved. The old/experimental L3 LSC cache "
6279+
"hint is cache_level::L2 now.");
61816280
static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
61826281
static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
6183-
if constexpr (Op == atomic_op::fcmpxchg) {
6184-
// Auto-convert FP atomics to LSC version.
6185-
return atomic_update<detail::to_lsc_atomic_op<Op>(), Tx, N>(
6186-
acc, offset, src0, src1, mask);
6282+
// Use LSC atomic when cache hints are present, FP atomics is used,
6283+
// non-power of two length is used, or operation width greater than 32.
6284+
if constexpr (L1Hint != cache_hint::none || L2Hint != cache_hint::none ||
6285+
Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32)) {
6286+
// 2-argument lsc_atomic_update arguments order matches the standard one -
6287+
// expected value first, then new value. But atomic_update uses reverse
6288+
// order, hence the src1/src0 swap.
6289+
return detail::atomic_update_impl<
6290+
Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint>(
6291+
acc, byte_offset, src1, src0, mask);
61876292
} else {
6188-
detail::check_atomic<Op, Tx, N, 2>();
6189-
static_assert(sizeof(Tx) == 4, "Only 32 bit data is supported");
6293+
detail::check_atomic<Op, T, N, 2>();
6294+
static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
61906295
const auto si = __ESIMD_NS::get_surface_index(acc);
6191-
using T = typename detail::__raw_t<Tx>;
6192-
return __esimd_dword_atomic2<Op, T, N>(mask.data(), si, offset.data(),
6193-
src0.data(), src1.data());
6296+
using Tx = typename detail::__raw_t<T>;
6297+
return __esimd_dword_atomic2<Op, Tx, N>(
6298+
mask.data(), si, byte_offset.data(),
6299+
sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src0.data()),
6300+
sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src1.data()));
61946301
}
61956302
#endif
61966303
}
61976304

6305+
/// simd<T, N>
6306+
/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
6307+
/// simd<T, N> src0, simd<T, N> src1,
6308+
/// props = {}); // (acc-au2-2)
6309+
///
6310+
/// A variation of \c atomic_update API with no mask operand.
6311+
///
6312+
/// @tparam Op The atomic operation - can be one of the following:
6313+
/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
6314+
/// @tparam T The vector element type.
6315+
/// @tparam N The number of memory locations to update.
6316+
/// @param acc The SYCL accessor.
6317+
/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
6318+
/// @param src0 The first additional argument (new value).
6319+
/// @param src1 The second additional argument (expected value).
6320+
/// @param props The parameter 'props' specifies the optional compile-time
6321+
/// properties list. Only L1/L2 properties are used.
6322+
// Other properties are ignored.
6323+
/// @return A vector of the old values at the memory locations before the
6324+
/// update.
6325+
///
6326+
template <atomic_op Op, typename T, int N, typename Toffset,
6327+
typename AccessorTy,
6328+
typename PropertyListT =
6329+
ext::oneapi::experimental::detail::empty_properties_t>
6330+
__ESIMD_API std::enable_if_t<
6331+
__ESIMD_DNS::get_num_args<Op>() == 2 &&
6332+
__ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
6333+
ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6334+
simd<T, N>>
6335+
atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
6336+
simd<T, N> src1, PropertyListT props = {}) {
6337+
simd_mask<N> mask = 1;
6338+
return atomic_update<Op, T, N>(acc, byte_offset, src0, src1, mask, props);
6339+
}
6340+
6341+
/// simd<T, N>
6342+
/// atomic_update(AccessorTy acc, simd_view<OffsetObjT, OffsetRegionTy>
6343+
/// byte_offset, simd<T, N> src0, simd<T, N> src1,
6344+
/// simd_mask<N> mask, props = {}); // (acc-au2-3)
6345+
///
6346+
/// A variation of \c atomic_update API with \c byte_offset represented as
6347+
/// a \c simd_view object.
6348+
///
6349+
/// @tparam Op The atomic operation - can be one of the following:
6350+
/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
6351+
/// @tparam T The vector element type.
6352+
/// @tparam N The number of memory locations to update.
6353+
/// @param acc The SYCL accessor.
6354+
/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
6355+
/// @param src0 The first additional argument (new value).
6356+
/// @param src1 The second additional argument (expected value).
6357+
/// @param mask Operation mask, only locations with non-zero in the
6358+
/// corresponding mask element are updated.
6359+
/// @param props The parameter 'props' specifies the optional compile-time
6360+
/// properties list. Only L1/L2 properties are used.
6361+
// Other properties are ignored.
6362+
/// @return A vector of the old values at the memory locations before the
6363+
/// update.
6364+
template <atomic_op Op, typename T, int N, typename OffsetObjT,
6365+
typename AccessorTy, typename OffsetRegionTy,
6366+
typename PropertyListT =
6367+
ext::oneapi::experimental::detail::empty_properties_t>
6368+
__ESIMD_API std::enable_if_t<
6369+
__ESIMD_DNS::get_num_args<Op>() == 2 &&
6370+
__ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
6371+
ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6372+
simd<T, N>>
6373+
atomic_update(AccessorTy acc, simd_view<OffsetObjT, OffsetRegionTy> byte_offset,
6374+
simd<T, N> src0, simd<T, N> src1, simd_mask<N> mask,
6375+
PropertyListT props = {}) {
6376+
return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask,
6377+
props);
6378+
}
6379+
6380+
/// simd<T, N>
6381+
/// atomic_update(AccessorTy acc,
6382+
/// simd_view<OffsetObjT, OffsetRegionTy>, byte_offset,
6383+
/// simd<T, N> src0, simd<T, N> src1, props = {}); // (acc-au2-4)
6384+
///
6385+
/// A variation of \c atomic_update API with \c byte_offset represented as
6386+
/// a \c simd_view object and no mask operand.
6387+
///
6388+
/// @tparam Op The atomic operation - can be one of the following:
6389+
/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
6390+
/// @tparam T The vector element type.
6391+
/// @tparam N The number of memory locations to update.
6392+
/// @param acc The SYCL accessor.
6393+
/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
6394+
/// @param src0 The first additional argument (new value).
6395+
/// @param src1 The second additional argument (expected value).
6396+
/// @param props The parameter 'props' specifies the optional compile-time
6397+
/// properties list. Only L1/L2 properties are used.
6398+
// Other properties are ignored.
6399+
/// @return A vector of the old values at the memory locations before the
6400+
/// update.
6401+
template <atomic_op Op, typename T, int N, typename OffsetObjT,
6402+
typename AccessorTy, typename OffsetRegionTy,
6403+
typename PropertyListT =
6404+
ext::oneapi::experimental::detail::empty_properties_t>
6405+
__ESIMD_API std::enable_if_t<
6406+
__ESIMD_DNS::get_num_args<Op>() == 2 &&
6407+
__ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
6408+
ext::oneapi::experimental::is_property_list_v<PropertyListT>,
6409+
simd<T, N>>
6410+
atomic_update(AccessorTy acc, simd_view<OffsetObjT, OffsetRegionTy> byte_offset,
6411+
simd<T, N> src0, simd<T, N> src1, PropertyListT props = {}) {
6412+
simd_mask<N> mask = 1;
6413+
return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask,
6414+
props);
6415+
}
6416+
61986417
/// Variant of \c atomic_update that uses \c local_accessor as a parameter.
61996418
/// Atomically updates \c N memory locations represented by an accessor and
62006419
/// a vector of offsets and returns a vector of old
@@ -6232,35 +6451,6 @@ atomic_update(AccessorTy acc, simd<uint32_t, N> offset, simd<Tx, N> src0,
62326451
}
62336452
}
62346453

6235-
/// A variation of \c atomic_update API with \c offsets represented as
6236-
/// \c simd_view object.
6237-
///
6238-
/// @tparam Op The atomic operation - can be one of the following:
6239-
/// \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
6240-
/// @tparam Tx The vector element type.
6241-
/// @tparam N The number of memory locations to update.
6242-
/// @tparam AccessorTy type of the SYCL accessor.
6243-
/// @param acc The SYCL accessor.
6244-
/// @param offset The simd_view of 32-bit or 64-bit offsets in bytes. 64-bit
6245-
/// offsets are supported only when stateless memory accesses are enforced,
6246-
/// i.e. accessor based accesses are automatically converted to stateless
6247-
/// accesses.
6248-
/// @param src0 The first additional argument (new value).
6249-
/// @param src1 The second additional argument (expected value).
6250-
/// @param mask Operation mask, only locations with non-zero in the
6251-
/// corresponding mask element are updated.
6252-
/// @return A vector of the old values at the memory locations before the
6253-
/// update.
6254-
///
6255-
template <atomic_op Op, typename Tx, int N, typename Toffset,
6256-
typename AccessorTy, typename RegionTy = region1d_t<Toffset, N, 1>>
6257-
__ESIMD_API std::enable_if_t<
6258-
std::is_integral_v<Toffset> && !std::is_pointer_v<AccessorTy>, simd<Tx, N>>
6259-
atomic_update(AccessorTy acc, simd_view<Toffset, RegionTy> offsets,
6260-
simd<Tx, N> src0, simd<Tx, N> src1, simd_mask<N> mask) {
6261-
return atomic_update<Op, Tx, N>(acc, offsets.read(), src0, src1, mask);
6262-
}
6263-
62646454
/// A variation of \c atomic_update API with \c offsets represented as
62656455
/// scalar.
62666456
///

0 commit comments

Comments
 (0)