@@ -5919,7 +5919,6 @@ __ESIMD_API
 /// atomic_update(T *p, simd<Toffset, N> byte_offset,
 ///                props = {});                                /// (usm-au0-2)
 /// simd<T, N>
-///
 /// atomic_update(T *p, simd_view<OffsetObjT, RegionTy> byte_offset,
 ///                simd_mask<N> mask, props = {});             /// (usm-au0-3)
 /// simd<T, N>
@@ -5979,27 +5978,45 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd_mask<N> mask,
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint, Toffset>(
         p, byte_offset, mask);
-  } else {
-    if constexpr (Op == atomic_op::load) {
-      if constexpr (std::is_integral_v<T>) {
-        return atomic_update<atomic_op::bit_or, T, N>(
-            p, byte_offset, simd<T, N>(0), mask, props);
-      } else {
-        using Tint = detail::uint_type_t<sizeof(T)>;
-        simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
-            reinterpret_cast<Tint *>(p), byte_offset, simd<Tint, N>(0), mask,
-            props);
-        return Res.template bit_cast_view<T>();
-      }
-    } else {
-      detail::check_atomic<Op, T, N, 0>();
+  } else if constexpr (N == 16 || N == 32) {
+    // TODO: The GPU backend supports legalization for any N, even
+    // non-power-of-2, but its implementation is currently buggy. For example,
+    // N=17 is emulated as 2 calls (N=16 and N=1), while it must be 3 calls:
+    // (N=8, N=8, N=1). That is, the Gen12 atomic instruction supports only N
+    // up to 8, but the GPU backend currently assumes up to 16.
+    // Thus we emulate N=16 with 2 calls of N=8 each,
+    // and N=32 with 4 calls of N=8 each.
+    // Task1: Remove the special-case emulation for N=16 and N=32 below when
+    // the GPU driver fixes the error.
+    // Task2: Remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
+    // and let svm.atomic handle any N.
 
-      simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
-      simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
-      vAddr += offset_i1;
-      using Tx = typename detail::__raw_t<T>;
-      return __esimd_svm_atomic0<Op, Tx, N>(vAddr.data(), mask.data());
+    simd<T, N> Res;
+    for (int I = 0; I < N; I += 8) {
+      simd_mask<8> Mask8 = mask.template select<8, 1>(I);
+      simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
+      Res.template select<8, 1>(I) =
+          atomic_update<Op, T, 8>(p, ByteOffset8, Mask8, props);
     }
+    return Res;
+  } else if constexpr (Op == atomic_op::load) {
+    if constexpr (std::is_integral_v<T>) {
+      return atomic_update<atomic_op::bit_or, T, N>(p, byte_offset,
+                                                    simd<T, N>(0), mask, props);
+    } else {
+      using Tint = detail::uint_type_t<sizeof(T)>;
+      simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
+          reinterpret_cast<Tint *>(p), byte_offset, simd<Tint, N>(0), mask,
+          props);
+      return Res.template bit_cast_view<T>();
+    }
+  } else {
+    detail::check_atomic<Op, T, N, 0>();
+    simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
+    simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
+    vAddr += offset_i1;
+    using Tx = typename detail::__raw_t<T>;
+    return __esimd_svm_atomic0<Op, Tx, N>(vAddr.data(), mask.data());
   }
 }
 
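For reference, a minimal kernel-side sketch (not part of the patch; the function and variable names are illustrative assumptions) of what this hunk enables for the no-source overload: a 16-lane masked atomic increment, which the new branch legalizes as two 8-lane atomic_update<Op, T, 8> calls before the GPU backend ever sees it.

#include <sycl/ext/intel/esimd.hpp>
using namespace sycl::ext::intel::esimd;

// Hypothetical usage: 16-lane atomic increment over USM memory.
SYCL_EXTERNAL void inc16(int *Ptr) SYCL_ESIMD_FUNCTION {
  simd<uint32_t, 16> ByteOffsets(0, sizeof(int)); // byte offsets 0, 4, 8, ...
  simd_mask<16> Mask = 1;                         // all 16 lanes enabled
  // With this patch, N == 16 no longer hits the broken GPU-BE legalization:
  // the call is split here into two N == 8 calls.
  simd<int, 16> Old =
      atomic_update<atomic_op::inc, int, 16>(Ptr, ByteOffsets, Mask);
  (void)Old; // per-lane previous values
}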
@@ -6197,28 +6214,47 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint, Toffset>(
         p, byte_offset, src0, mask);
-  } else {
-    if constexpr (Op == atomic_op::store) {
-      if constexpr (std::is_integral_v<T>) {
-        return atomic_update<atomic_op::xchg, T, N>(p, byte_offset, src0, mask,
-                                                    props);
-      } else {
-        using Tint = detail::uint_type_t<sizeof(T)>;
-        simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
-            reinterpret_cast<Tint *>(p), byte_offset,
-            src0.template bit_cast_view<Tint>(), mask, props);
-        return Res.template bit_cast_view<T>();
-      }
+  } else if constexpr (N == 16 || N == 32) {
+    // TODO: The GPU backend supports legalization for any N, even
+    // non-power-of-2, but its implementation is currently buggy. For example,
+    // N=17 is emulated as 2 calls (N=16 and N=1), while it must be 3 calls:
+    // (N=8, N=8, N=1). That is, the Gen12 atomic instruction supports only N
+    // up to 8, but the GPU backend currently assumes up to 16.
+    // Thus we emulate N=16 with 2 calls of N=8 each,
+    // and N=32 with 4 calls of N=8 each.
+    // Task1: Remove the special-case emulation for N=16 and N=32 below when
+    // the GPU driver fixes the error.
+    // Task2: Remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
+    // and let svm.atomic handle any N.
+    simd<T, N> Res;
+    for (int I = 0; I < N; I += 8) {
+      simd_mask<8> Mask8 = mask.template select<8, 1>(I);
+      simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
+      simd<T, 8> Src08 = src0.template select<8, 1>(I);
+      Res.template select<8, 1>(I) =
+          atomic_update<Op, T, 8>(p, ByteOffset8, Src08, Mask8, props);
+    }
+    return Res;
+  } else if constexpr (Op == atomic_op::store) {
+    if constexpr (std::is_integral_v<T>) {
+      return atomic_update<atomic_op::xchg, T, N>(p, byte_offset, src0, mask,
+                                                  props);
     } else {
-      detail::check_atomic<Op, T, N, 1>();
-      simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
-      simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
-      vAddr += offset_i1;
-
-      using Tx = typename detail::__raw_t<T>;
-      return __esimd_svm_atomic1<Op, Tx, N>(vAddr.data(), src0.data(),
-                                            mask.data());
+      using Tint = detail::uint_type_t<sizeof(T)>;
+      simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
+          reinterpret_cast<Tint *>(p), byte_offset,
+          src0.template bit_cast_view<Tint>(), mask, props);
+      return Res.template bit_cast_view<T>();
     }
+  } else {
+    detail::check_atomic<Op, T, N, 1>();
+    simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
+    simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
+    vAddr += offset_i1;
+
+    using Tx = typename detail::__raw_t<T>;
+    return __esimd_svm_atomic1<Op, Tx, N>(vAddr.data(), src0.data(),
+                                          mask.data());
   }
 }
 
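Similarly, a hedged sketch for the one-source overload (names are illustrative, not from the patch): a 32-lane atomic add, which the new branch emulates as four 8-lane calls, each operating on an 8-lane slice of the offsets, src0, and mask.

#include <sycl/ext/intel/esimd.hpp>
using namespace sycl::ext::intel::esimd;

// Hypothetical usage: 32-lane atomic add; src0 holds the addends.
SYCL_EXTERNAL void add32(int *Ptr) SYCL_ESIMD_FUNCTION {
  simd<uint32_t, 32> ByteOffsets(0, sizeof(int));
  simd<int, 32> Src0 = 1; // add 1 in every lane
  simd_mask<32> Mask = 1;
  // N == 32 is split by the new branch into four atomic_update<Op, T, 8>
  // calls before reaching the Gen12 atomic instruction.
  simd<int, 32> Old =
      atomic_update<atomic_op::add, int, 32>(Ptr, ByteOffsets, Src0, Mask);
  (void)Old;
}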
@@ -6443,6 +6479,28 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint, Toffset>(
         p, byte_offset, src1, src0, mask);
+  } else if constexpr (N == 16 || N == 32) {
+    // TODO: The GPU backend supports legalization for any N, even
+    // non-power-of-2, but its implementation is currently buggy. For example,
+    // N=17 is emulated as 2 calls (N=16 and N=1), while it must be 3 calls:
+    // (N=8, N=8, N=1). That is, the Gen12 atomic instruction supports only N
+    // up to 8, but the GPU backend currently assumes up to 16.
+    // Thus we emulate N=16 with 2 calls of N=8 each,
+    // and N=32 with 4 calls of N=8 each.
+    // Task1: Remove the special-case emulation for N=16 and N=32 below when
+    // the GPU driver fixes the error.
+    // Task2: Remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
+    // and let svm.atomic handle any N.
+    simd<T, N> Res;
+    for (int I = 0; I < N; I += 8) {
+      simd_mask<8> Mask8 = mask.template select<8, 1>(I);
+      simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
+      simd<T, 8> Src08 = src0.template select<8, 1>(I);
+      simd<T, 8> Src18 = src1.template select<8, 1>(I);
+      Res.template select<8, 1>(I) =
+          atomic_update<Op, T, 8>(p, ByteOffset8, Src08, Src18, Mask8, props);
+    }
+    return Res;
   } else {
     detail::check_atomic<Op, T, N, 2>();
     simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
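Finally, a sketch for the two-source overload this last hunk touches (illustrative names; operand roles are an assumption): a 16-lane compare-and-exchange. The impl call above forwards (src1, src0), consistent with the legacy ESIMD convention that src0 is the new (swap) value and src1 the expected (compare) value, the reverse of the LSC ordering.

#include <sycl/ext/intel/esimd.hpp>
using namespace sycl::ext::intel::esimd;

// Hypothetical usage: 16-lane cmpxchg. Assuming the legacy ESIMD operand
// order: src0 = new (swap) value, src1 = expected (compare) value.
SYCL_EXTERNAL void cas16(int *Ptr) SYCL_ESIMD_FUNCTION {
  simd<uint32_t, 16> ByteOffsets(0, sizeof(int));
  simd<int, 16> NewVal = 1;   // src0: stored where the compare succeeds
  simd<int, 16> Expected = 0; // src1: value a lane must currently hold
  simd_mask<16> Mask = 1;
  // The new branch splits N == 16 into two 8-lane slices, slicing src0 and
  // src1 alongside the offsets and mask.
  simd<int, 16> Old = atomic_update<atomic_op::cmpxchg, int, 16>(
      Ptr, ByteOffsets, NewVal, Expected, Mask);
  (void)Old; // per-lane prior values; compare with Expected to detect success
}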