Skip to content

Commit 44a74d0

Browse files
authored
[ESIMD] Fix atomic_update() implementation for N=16 and N=32 on Gen12 (#12722)
atomic_update() for USM and ACC with N=16,32 was lowered to SVM/DWORD atomic intrinsics even though the HW instructions on Gen12 support only N up to 8 for USM and up to 16 for ACC. The GPU has a legalization pass for N that splits longer vectors into the smaller sizes available in HW. That GPU optimization/legalization works incorrectly for USM: it splits longer vectors assuming the instruction is available for N=16 in the USM case, which is not correct. The patch here implements splitting of the N=16 and N=32 cases for atomic_update(usm, ...) into N=8 vectors until the GPU fixes the legalization for USM atomic_update. Signed-off-by: Klochkov, Vyacheslav N <[email protected]>
1 parent 6194f3c commit 44a74d0

File tree

2 files changed

+102
-44
lines changed

2 files changed

+102
-44
lines changed

sycl/include/sycl/ext/intel/esimd/memory.hpp

Lines changed: 98 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -5919,7 +5919,6 @@ __ESIMD_API
59195919
/// atomic_update(T *p, simd<Toffset, N> byte_offset,
59205920
/// props = {}); /// (usm-au0-2)
59215921
/// simd<T, N>
5922-
///
59235922
/// atomic_update(T *p, simd_view<OffsetObjT, RegionTy> byte_offset,
59245923
/// simd_mask<N> mask, props = {}); /// (usm-au0-3)
59255924
/// simd<T, N>
@@ -5979,27 +5978,45 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd_mask<N> mask,
59795978
return detail::atomic_update_impl<
59805979
Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint, Toffset>(
59815980
p, byte_offset, mask);
5982-
} else {
5983-
if constexpr (Op == atomic_op::load) {
5984-
if constexpr (std::is_integral_v<T>) {
5985-
return atomic_update<atomic_op::bit_or, T, N>(
5986-
p, byte_offset, simd<T, N>(0), mask, props);
5987-
} else {
5988-
using Tint = detail::uint_type_t<sizeof(T)>;
5989-
simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
5990-
reinterpret_cast<Tint *>(p), byte_offset, simd<Tint, N>(0), mask,
5991-
props);
5992-
return Res.template bit_cast_view<T>();
5993-
}
5994-
} else {
5995-
detail::check_atomic<Op, T, N, 0>();
5981+
} else if constexpr (N == 16 || N == 32) {
5982+
// TODO: In fact GPU BE supports legalization for any N, even for
5983+
// non-power-of-2, but it is implemented with an error now. For example,
5984+
// N=17 is emulated as 2 calls (N=16 and N=1), while it must be 3 calls:
5985+
// (N=8, N=8, N=1). I.e. Gen12 atomic instruction supports only N up to 8
5986+
// and GPU thinks now it is up to 16.
5987+
// Thus we emulate N=16 with 2 calls with N=8 each.
5988+
// N=32 is emulated with 4 calls with N=8 each.
5989+
// Task1: Remove the special-case emulation for N=16 and N=32 below when
5990+
// GPU driver fixes the error.
5991+
// Task2: remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
5992+
// and let svm.atomic for any N.
59965993

5997-
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
5998-
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
5999-
vAddr += offset_i1;
6000-
using Tx = typename detail::__raw_t<T>;
6001-
return __esimd_svm_atomic0<Op, Tx, N>(vAddr.data(), mask.data());
5994+
simd<T, N> Res;
5995+
for (int I = 0; I < N; I += 8) {
5996+
simd_mask<8> Mask8 = mask.template select<8, 1>(I);
5997+
simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
5998+
Res.template select<8, 1>(I) =
5999+
atomic_update<Op, T, 8>(p, ByteOffset8, Mask8, props);
60026000
}
6001+
return Res;
6002+
} else if constexpr (Op == atomic_op::load) {
6003+
if constexpr (std::is_integral_v<T>) {
6004+
return atomic_update<atomic_op::bit_or, T, N>(p, byte_offset,
6005+
simd<T, N>(0), mask, props);
6006+
} else {
6007+
using Tint = detail::uint_type_t<sizeof(T)>;
6008+
simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
6009+
reinterpret_cast<Tint *>(p), byte_offset, simd<Tint, N>(0), mask,
6010+
props);
6011+
return Res.template bit_cast_view<T>();
6012+
}
6013+
} else {
6014+
detail::check_atomic<Op, T, N, 0>();
6015+
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
6016+
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
6017+
vAddr += offset_i1;
6018+
using Tx = typename detail::__raw_t<T>;
6019+
return __esimd_svm_atomic0<Op, Tx, N>(vAddr.data(), mask.data());
60036020
}
60046021
}
60056022

@@ -6197,28 +6214,47 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
61976214
return detail::atomic_update_impl<
61986215
Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint, Toffset>(
61996216
p, byte_offset, src0, mask);
6200-
} else {
6201-
if constexpr (Op == atomic_op::store) {
6202-
if constexpr (std::is_integral_v<T>) {
6203-
return atomic_update<atomic_op::xchg, T, N>(p, byte_offset, src0, mask,
6204-
props);
6205-
} else {
6206-
using Tint = detail::uint_type_t<sizeof(T)>;
6207-
simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
6208-
reinterpret_cast<Tint *>(p), byte_offset,
6209-
src0.template bit_cast_view<Tint>(), mask, props);
6210-
return Res.template bit_cast_view<T>();
6211-
}
6217+
} else if constexpr (N == 16 || N == 32) {
6218+
// TODO: In fact GPU BE supports legalization for any N, even for
6219+
// non-power-of-2, but it is implemented with an error now. For example,
6220+
// N=17 is emulated as 2 calls (N=16 and N=1), while it must be 3 calls:
6221+
// (N=8, N=8, N=1). I.e. Gen12 atomic instruction supports only N up to 8
6222+
// and GPU thinks now it is up to 16.
6223+
// Thus we emulate N=16 with 2 calls with N=8 each.
6224+
// N=32 is emulated with 4 calls with N=8 each.
6225+
// Task1: Remove the special-case emulation for N=16 and N=32 below when
6226+
// GPU driver fixes the error.
6227+
// Task2: remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
6228+
// and let svm.atomic for any N.
6229+
simd<T, N> Res;
6230+
for (int I = 0; I < N; I += 8) {
6231+
simd_mask<8> Mask8 = mask.template select<8, 1>(I);
6232+
simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
6233+
simd<T, 8> Src08 = src0.template select<8, 1>(I);
6234+
Res.template select<8, 1>(I) =
6235+
atomic_update<Op, T, 8>(p, ByteOffset8, Src08, Mask8, props);
6236+
}
6237+
return Res;
6238+
} else if constexpr (Op == atomic_op::store) {
6239+
if constexpr (std::is_integral_v<T>) {
6240+
return atomic_update<atomic_op::xchg, T, N>(p, byte_offset, src0, mask,
6241+
props);
62126242
} else {
6213-
detail::check_atomic<Op, T, N, 1>();
6214-
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
6215-
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
6216-
vAddr += offset_i1;
6217-
6218-
using Tx = typename detail::__raw_t<T>;
6219-
return __esimd_svm_atomic1<Op, Tx, N>(vAddr.data(), src0.data(),
6220-
mask.data());
6243+
using Tint = detail::uint_type_t<sizeof(T)>;
6244+
simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
6245+
reinterpret_cast<Tint *>(p), byte_offset,
6246+
src0.template bit_cast_view<Tint>(), mask, props);
6247+
return Res.template bit_cast_view<T>();
62216248
}
6249+
} else {
6250+
detail::check_atomic<Op, T, N, 1>();
6251+
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));
6252+
simd<uintptr_t, N> offset_i1 = convert<uintptr_t>(byte_offset);
6253+
vAddr += offset_i1;
6254+
6255+
using Tx = typename detail::__raw_t<T>;
6256+
return __esimd_svm_atomic1<Op, Tx, N>(vAddr.data(), src0.data(),
6257+
mask.data());
62226258
}
62236259
}
62246260

@@ -6443,6 +6479,28 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
64436479
return detail::atomic_update_impl<
64446480
Op, T, N, detail::lsc_data_size::default_size, L1Hint, L2Hint, Toffset>(
64456481
p, byte_offset, src1, src0, mask);
6482+
} else if constexpr (N == 16 || N == 32) {
6483+
// TODO: In fact GPU BE supports legalization for any N, even for
6484+
// non-power-of-2, but it is implemented with an error now. For example,
6485+
// N=17 is emulated as 2 calls (N=16 and N=1), while it must be 3 calls:
6486+
// (N=8, N=8, N=1). I.e. Gen12 atomic instruction supports only N up to 8
6487+
// and GPU thinks now it is up to 16.
6488+
// Thus we emulate N=16 with 2 calls with N=8 each.
6489+
// N=32 is emulated with 4 calls with N=8 each.
6490+
// Task1: Remove the special-case emulation for N=16 and N=32 below when
6491+
// GPU driver fixes the error.
6492+
// Task2: remove the condition "!__ESIMD_DNS::isPowerOf2(N, 32)" above
6493+
// and let svm.atomic for any N.
6494+
simd<T, N> Res;
6495+
for (int I = 0; I < N; I += 8) {
6496+
simd_mask<8> Mask8 = mask.template select<8, 1>(I);
6497+
simd<Toffset, 8> ByteOffset8 = byte_offset.template select<8, 1>(I);
6498+
simd<T, 8> Src08 = src0.template select<8, 1>(I);
6499+
simd<T, 8> Src18 = src1.template select<8, 1>(I);
6500+
Res.template select<8, 1>(I) =
6501+
atomic_update<Op, T, 8>(p, ByteOffset8, Src08, Src18, Mask8, props);
6502+
}
6503+
return Res;
64466504
} else {
64476505
detail::check_atomic<Op, T, N, 2>();
64486506
simd<uintptr_t, N> vAddr(reinterpret_cast<uintptr_t>(p));

sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -706,13 +706,13 @@ bool test_int_types_and_sizes(queue q, const Config &cfg) {
706706

707707
passed &=
708708
test_int_types<8, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
709+
passed &=
710+
test_int_types<16, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
711+
passed &=
712+
test_int_types<32, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(q, cfg);
709713

710714
// Supported by LSC atomic:
711715
if constexpr (UseLSCFeatures) {
712-
passed &= test_int_types<16, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
713-
q, cfg);
714-
passed &= test_int_types<32, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
715-
q, cfg);
716716
passed &= test_int_types<64, Op, UseMask, UseLSCFeatures, UseAcc, SignMask>(
717717
q, cfg);
718718
// non power of two values are supported only in newer driver.

0 commit comments

Comments
 (0)