Skip to content

Commit b3a4466

Browse files
authored
[AMDGPU] Implement GFX12 Memory Model (#98591)
- Emit GLOBAL_WB instructions - Reflect synscope on instructions' `scope:` operand Fixes SWDEV-468508 Fixes SWDEV-470735 Fixes SWDEV-468392 Fixes SWDEV-469622
1 parent 2448927 commit b3a4466

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+3604
-1817
lines changed

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,9 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
609609
bool
610610
insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
611611

612+
bool setAtomicScope(const MachineBasicBlock::iterator &MI,
613+
SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
614+
612615
public:
613616
SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
614617

@@ -625,6 +628,28 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
625628
bool IsLastUse) const override;
626629

627630
bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
631+
632+
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
633+
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
634+
Position Pos) const override;
635+
636+
/// Load cache-bypass hook: forwards \p MI, \p Scope and \p AddrSpace to
/// setAtomicScope unchanged.
/// \returns whatever setAtomicScope reports for the instruction.
bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
                           SIAtomicScope Scope,
                           SIAtomicAddrSpace AddrSpace) const override {
  const bool ScopeUpdated = setAtomicScope(MI, Scope, AddrSpace);
  return ScopeUpdated;
}
641+
642+
/// Store cache-bypass hook: forwards \p MI, \p Scope and \p AddrSpace to
/// setAtomicScope unchanged.
/// \returns whatever setAtomicScope reports for the instruction.
bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
                            SIAtomicScope Scope,
                            SIAtomicAddrSpace AddrSpace) const override {
  const bool ScopeUpdated = setAtomicScope(MI, Scope, AddrSpace);
  return ScopeUpdated;
}
647+
648+
/// Read-modify-write cache-bypass hook: forwards \p MI, \p Scope and
/// \p AddrSpace to setAtomicScope unchanged.
/// \returns whatever setAtomicScope reports for the instruction.
bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
                          SIAtomicScope Scope,
                          SIAtomicAddrSpace AddrSpace) const override {
  const bool ScopeUpdated = setAtomicScope(MI, Scope, AddrSpace);
  return ScopeUpdated;
}
628653
};
629654

630655
class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -2429,6 +2454,72 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
24292454
return true;
24302455
}
24312456

2457+
/// Insert the code needed to release memory at \p Scope for \p AddrSpace,
/// positioned before or after \p MI according to \p Pos.
///
/// When a writeback is required, a GLOBAL_WB carrying the cache-policy scope
/// that matches \p Scope is emitted, followed by a wait on prior loads and
/// stores (via insertWait).
///
/// \param MI  Instruction to insert around. It is taken by reference and is
///            temporarily advanced while inserting for Position::AFTER; it is
///            left untouched on every early "nothing to do" return.
/// \param Scope  Synchronization scope of the release.
/// \param AddrSpace  Address spaces being released; only GLOBAL needs work.
/// \param IsCrossAddrSpaceOrdering  Forwarded unchanged to insertWait.
/// \param Pos  Whether to insert before or after \p MI.
/// \returns false if nothing was inserted (no GLOBAL address space involved,
///          or wavefront/single-thread scope); true otherwise.
bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
                                        SIAtomicScope Scope,
                                        SIAtomicAddrSpace AddrSpace,
                                        bool IsCrossAddrSpaceOrdering,
                                        Position Pos) const {
  MachineBasicBlock &MBB = *MI->getParent();
  DebugLoc DL = MI->getDebugLoc();

  // The scratch address space does not need the global memory cache
  // writeback as all memory operations by the same thread are
  // sequentially consistent, and no other thread can access scratch
  // memory.

  // Other address spaces do not have a cache.
  if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) == SIAtomicAddrSpace::NONE)
    return false;

  // Resolve the writeback scope *before* moving MI. MI is a reference owned by
  // the caller, so any early return below must not leave it advanced.
  //
  // GLOBAL_WB is always needed, even for write-through caches, as it
  // additionally ensures all operations have reached the desired cache level.
  bool SkipWB = false;
  AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
  switch (Scope) {
  case SIAtomicScope::SYSTEM:
    ScopeImm = AMDGPU::CPol::SCOPE_SYS;
    break;
  case SIAtomicScope::AGENT:
    ScopeImm = AMDGPU::CPol::SCOPE_DEV;
    break;
  case SIAtomicScope::WORKGROUP:
    // In WGP mode the waves of a work-group can be executing on either CU of
    // the WGP. Therefore we need to ensure all operations have reached L1,
    // hence the SCOPE_SE WB.
    // For CU mode, we need operations to reach L0, so the wait is enough -
    // there are no ways for an operation to report completion without reaching
    // at least L0.
    if (ST.isCuModeEnabled())
      SkipWB = true;
    else
      ScopeImm = AMDGPU::CPol::SCOPE_SE;
    break;
  case SIAtomicScope::WAVEFRONT:
  case SIAtomicScope::SINGLETHREAD:
    // Nothing to write back at these scopes, and no wait is emitted either.
    return false;
  default:
    llvm_unreachable("Unsupported synchronization scope");
  }

  if (Pos == Position::AFTER)
    ++MI;

  if (!SkipWB)
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB)).addImm(ScopeImm);

  if (Pos == Position::AFTER)
    --MI;

  // We always have to wait for previous memory operations (load/store) to
  // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
  // we of course need to wait for that as well.
  insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
             IsCrossAddrSpaceOrdering, Pos);

  return true;
}
2522+
24322523
bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
24332524
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
24342525
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
@@ -2479,6 +2570,44 @@ bool SIGfx12CacheControl::expandSystemScopeStore(
24792570
return false;
24802571
}
24812572

2573+
/// Apply to \p MI the cache-policy scope that corresponds to the atomic
/// \p Scope, but only when \p AddrSpace includes the GLOBAL address space.
///
/// \returns the result of setScope when a scope is applied, false when no
///          scope needs to be set.
bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
                                         SIAtomicScope Scope,
                                         SIAtomicAddrSpace AddrSpace) const {
  // The scratch address space does not need the global memory caches
  // to be bypassed as all memory operations by the same thread are
  // sequentially consistent, and no other thread can access scratch
  // memory.

  // Other address spaces do not have a cache.
  const bool TouchesGlobal =
      (AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
  if (!TouchesGlobal)
    return false;

  switch (Scope) {
  case SIAtomicScope::SYSTEM:
    return setScope(MI, AMDGPU::CPol::SCOPE_SYS);
  case SIAtomicScope::AGENT:
    return setScope(MI, AMDGPU::CPol::SCOPE_DEV);
  case SIAtomicScope::WORKGROUP:
    // Outside CU mode, SCOPE_SE is needed as waves can execute on
    // different CUs that access different L0s.
    if (ST.isCuModeEnabled())
      return false;
    return setScope(MI, AMDGPU::CPol::SCOPE_SE);
  case SIAtomicScope::WAVEFRONT:
  case SIAtomicScope::SINGLETHREAD:
    // No cache to bypass.
    return false;
  default:
    llvm_unreachable("Unsupported synchronization scope");
  }
}
2610+
24822611
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
24832612
if (AtomicPseudoMIs.empty())
24842613
return false;

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 28 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) {
1818
; GFX12-NEXT: s_wait_samplecnt 0x0
1919
; GFX12-NEXT: s_wait_bvhcnt 0x0
2020
; GFX12-NEXT: s_wait_kmcnt 0x0
21+
; GFX12-NEXT: global_wb scope:SCOPE_SE
2122
; GFX12-NEXT: s_wait_storecnt 0x0
2223
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
2324
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -90,6 +91,7 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) {
9091
; GFX12-NEXT: s_wait_samplecnt 0x0
9192
; GFX12-NEXT: s_wait_bvhcnt 0x0
9293
; GFX12-NEXT: s_wait_kmcnt 0x0
94+
; GFX12-NEXT: global_wb scope:SCOPE_SE
9395
; GFX12-NEXT: s_wait_storecnt 0x0
9496
; GFX12-NEXT: ds_max_num_f32 v0, v1
9597
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -162,6 +164,7 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) {
162164
; GFX12-NEXT: s_wait_samplecnt 0x0
163165
; GFX12-NEXT: s_wait_bvhcnt 0x0
164166
; GFX12-NEXT: s_wait_kmcnt 0x0
167+
; GFX12-NEXT: global_wb scope:SCOPE_SE
165168
; GFX12-NEXT: s_wait_storecnt 0x0
166169
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
167170
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -238,6 +241,7 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) {
238241
; GFX12-NEXT: s_wait_samplecnt 0x0
239242
; GFX12-NEXT: s_wait_bvhcnt 0x0
240243
; GFX12-NEXT: s_wait_kmcnt 0x0
244+
; GFX12-NEXT: global_wb scope:SCOPE_SE
241245
; GFX12-NEXT: s_wait_storecnt 0x0
242246
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
243247
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -324,8 +328,9 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
324328
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
325329
; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
326330
; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
331+
; GFX12-NEXT: global_wb scope:SCOPE_DEV
327332
; GFX12-NEXT: s_wait_storecnt 0x0
328-
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
333+
; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
329334
; GFX12-NEXT: s_wait_loadcnt 0x0
330335
; GFX12-NEXT: global_inv scope:SCOPE_DEV
331336
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
@@ -538,8 +543,9 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
538543
; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
539544
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
540545
; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
546+
; GFX12-NEXT: global_wb scope:SCOPE_DEV
541547
; GFX12-NEXT: s_wait_storecnt 0x0
542-
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
548+
; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
543549
; GFX12-NEXT: s_wait_loadcnt 0x0
544550
; GFX12-NEXT: global_inv scope:SCOPE_DEV
545551
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
@@ -746,8 +752,9 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
746752
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
747753
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
748754
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
755+
; GFX12-NEXT: global_wb scope:SCOPE_DEV
749756
; GFX12-NEXT: s_wait_storecnt 0x0
750-
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN
757+
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
751758
; GFX12-NEXT: s_wait_loadcnt 0x0
752759
; GFX12-NEXT: global_inv scope:SCOPE_DEV
753760
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
@@ -972,8 +979,9 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
972979
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
973980
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
974981
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
982+
; GFX12-NEXT: global_wb scope:SCOPE_DEV
975983
; GFX12-NEXT: s_wait_storecnt 0x0
976-
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN
984+
; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
977985
; GFX12-NEXT: s_wait_loadcnt 0x0
978986
; GFX12-NEXT: global_inv scope:SCOPE_DEV
979987
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
@@ -1186,8 +1194,9 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
11861194
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11871195
; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
11881196
; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
1197+
; GFX12-NEXT: global_wb scope:SCOPE_DEV
11891198
; GFX12-NEXT: s_wait_storecnt 0x0
1190-
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
1199+
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
11911200
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
11921201
; GFX12-NEXT: global_inv scope:SCOPE_DEV
11931202
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
@@ -1395,8 +1404,9 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
13951404
; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
13961405
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
13971406
; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
1407+
; GFX12-NEXT: global_wb scope:SCOPE_DEV
13981408
; GFX12-NEXT: s_wait_storecnt 0x0
1399-
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
1409+
; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
14001410
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
14011411
; GFX12-NEXT: global_inv scope:SCOPE_DEV
14021412
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
@@ -1598,8 +1608,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
15981608
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
15991609
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
16001610
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
1611+
; GFX12-NEXT: global_wb scope:SCOPE_DEV
16011612
; GFX12-NEXT: s_wait_storecnt 0x0
1602-
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN
1613+
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
16031614
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
16041615
; GFX12-NEXT: global_inv scope:SCOPE_DEV
16051616
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
@@ -1823,8 +1834,9 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
18231834
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
18241835
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
18251836
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
1837+
; GFX12-NEXT: global_wb scope:SCOPE_DEV
18261838
; GFX12-NEXT: s_wait_storecnt 0x0
1827-
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN
1839+
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
18281840
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
18291841
; GFX12-NEXT: global_inv scope:SCOPE_DEV
18301842
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
@@ -2035,11 +2047,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
20352047
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
20362048
; GFX12-NEXT: s_wait_loadcnt 0x0
20372049
; GFX12-NEXT: v_mov_b32_e32 v5, v0
2050+
; GFX12-NEXT: global_wb scope:SCOPE_DEV
20382051
; GFX12-NEXT: s_wait_storecnt 0x0
2039-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
20402052
; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5
2053+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
20412054
; GFX12-NEXT: v_max_num_f32_e32 v4, v0, v3
2042-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
20432055
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
20442056
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
20452057
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2285,9 +2297,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
22852297
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
22862298
; GFX12-NEXT: s_wait_loadcnt 0x0
22872299
; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v1
2300+
; GFX12-NEXT: global_wb scope:SCOPE_DEV
22882301
; GFX12-NEXT: s_wait_storecnt 0x0
2289-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
22902302
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3
2303+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
22912304
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
22922305
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
22932306
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2527,11 +2540,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
25272540
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
25282541
; GFX12-NEXT: s_wait_loadcnt 0x0
25292542
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
2543+
; GFX12-NEXT: global_wb scope:SCOPE_DEV
25302544
; GFX12-NEXT: s_wait_storecnt 0x0
2531-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25322545
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
2546+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
25332547
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
2534-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
25352548
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
25362549
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
25372550
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -2800,10 +2813,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
28002813
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
28012814
; GFX12-NEXT: s_wait_loadcnt 0x0
28022815
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
2816+
; GFX12-NEXT: global_wb scope:SCOPE_DEV
28032817
; GFX12-NEXT: s_wait_storecnt 0x0
2804-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
28052818
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
28062819
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
2820+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
28072821
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
28082822
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
28092823
; GFX12-NEXT: s_wait_loadcnt 0x0

0 commit comments

Comments
 (0)