Skip to content

Commit c026024

Browse files
authored
AMDGPU: Legalize v2f16 atomicrmw fadd for buffer fat pointers (#95929)
Unfortunately, the v2bf16 case is more complicated: gfx90a does not support it for buffer atomics, even though it does support it for global/flat.
1 parent 1c85c71 commit c026024

File tree

2 files changed

+33
-230
lines changed

2 files changed

+33
-230
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16023,10 +16023,16 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1602316023
// FIXME: Needs to account for no fine-grained memory
1602416024
if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
1602516025
return AtomicExpansionKind::None;
16026-
}
16026+
} else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
16027+
// gfx90a, gfx940, gfx12
16028+
// FIXME: Needs to account for no fine-grained memory
16029+
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
16030+
return AtomicExpansionKind::None;
1602716031

16028-
// TODO: Handle buffer case. gfx90a and gfx940 supports <2 x half>. gfx12
16029-
// supports <2 x half> and <2 x bfloat>.
16032+
// TODO: Handle <2 x bfloat> case. While gfx90a/gfx940 supports it for
16033+
// global/flat, it does not for buffer. gfx12 does have the buffer
16034+
// version.
16035+
}
1603016036

1603116037
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
1603216038
return AtomicExpansionKind::CmpXChg;

llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll

Lines changed: 24 additions & 227 deletions
Original file line numberDiff line numberDiff line change
@@ -4618,58 +4618,21 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
46184618
; GFX12-NEXT: s_wait_samplecnt 0x0
46194619
; GFX12-NEXT: s_wait_bvhcnt 0x0
46204620
; GFX12-NEXT: s_wait_kmcnt 0x0
4621-
; GFX12-NEXT: v_mov_b32_e32 v2, v0
4622-
; GFX12-NEXT: v_mov_b32_e32 v0, s4
4623-
; GFX12-NEXT: s_addk_co_i32 s4, 0x400
4624-
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4625-
; GFX12-NEXT: v_mov_b32_e32 v3, s4
4626-
; GFX12-NEXT: s_mov_b32 s4, 0
4627-
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
4628-
; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
4629-
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
4630-
; GFX12-NEXT: s_wait_loadcnt 0x0
4631-
; GFX12-NEXT: v_mov_b32_e32 v5, v0
4621+
; GFX12-NEXT: v_mov_b32_e32 v1, s4
46324622
; GFX12-NEXT: s_wait_storecnt 0x0
4633-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4634-
; GFX12-NEXT: v_pk_add_f16 v4, v5, v2
4635-
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
4636-
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
4623+
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
46374624
; GFX12-NEXT: s_wait_loadcnt 0x0
46384625
; GFX12-NEXT: global_inv scope:SCOPE_DEV
4639-
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
4640-
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
4641-
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4642-
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
4643-
; GFX12-NEXT: s_cbranch_execnz .LBB12_1
4644-
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
4645-
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
46464626
; GFX12-NEXT: s_setpc_b64 s[30:31]
46474627
;
46484628
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
46494629
; GFX940: ; %bb.0:
46504630
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4651-
; GFX940-NEXT: v_mov_b32_e32 v2, v0
4652-
; GFX940-NEXT: v_mov_b32_e32 v0, s4
4653-
; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
4654-
; GFX940-NEXT: s_add_i32 s6, s4, 0x400
4655-
; GFX940-NEXT: s_mov_b64 s[4:5], 0
4656-
; GFX940-NEXT: v_mov_b32_e32 v3, s6
4657-
; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start
4658-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
4659-
; GFX940-NEXT: s_waitcnt vmcnt(0)
4660-
; GFX940-NEXT: v_mov_b32_e32 v5, v0
4661-
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
4631+
; GFX940-NEXT: v_mov_b32_e32 v1, s4
46624632
; GFX940-NEXT: buffer_wbl2 sc1
4663-
; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
4664-
; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
4633+
; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0
46654634
; GFX940-NEXT: s_waitcnt vmcnt(0)
46664635
; GFX940-NEXT: buffer_inv sc1
4667-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
4668-
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4669-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
4670-
; GFX940-NEXT: s_cbranch_execnz .LBB12_1
4671-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
4672-
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
46734636
; GFX940-NEXT: s_setpc_b64 s[30:31]
46744637
;
46754638
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
@@ -4735,27 +4698,10 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
47354698
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
47364699
; GFX90A: ; %bb.0:
47374700
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4738-
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
4739-
; GFX90A-NEXT: v_mov_b32_e32 v0, s8
4740-
; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
4741-
; GFX90A-NEXT: s_add_i32 s10, s8, 0x400
4742-
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
4743-
; GFX90A-NEXT: v_mov_b32_e32 v3, s10
4744-
; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
4745-
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
4746-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
4747-
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
4748-
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
4749-
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
4750-
; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
4701+
; GFX90A-NEXT: v_mov_b32_e32 v1, s8
4702+
; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc
47514703
; GFX90A-NEXT: s_waitcnt vmcnt(0)
47524704
; GFX90A-NEXT: buffer_wbinvl1
4753-
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
4754-
; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
4755-
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
4756-
; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
4757-
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
4758-
; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
47594705
; GFX90A-NEXT: s_setpc_b64 s[30:31]
47604706
;
47614707
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
@@ -4921,56 +4867,20 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
49214867
; GFX12-NEXT: s_wait_bvhcnt 0x0
49224868
; GFX12-NEXT: s_wait_kmcnt 0x0
49234869
; GFX12-NEXT: v_mov_b32_e32 v1, s4
4924-
; GFX12-NEXT: s_addk_co_i32 s4, 0x400
4925-
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4926-
; GFX12-NEXT: v_mov_b32_e32 v3, s4
4927-
; GFX12-NEXT: s_mov_b32 s4, 0
4928-
; GFX12-NEXT: buffer_load_b32 v2, v1, s[0:3], null offen offset:1024
4929-
; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
4930-
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
4931-
; GFX12-NEXT: s_wait_loadcnt 0x0
4932-
; GFX12-NEXT: v_pk_add_f16 v1, v2, v0
4933-
; GFX12-NEXT: v_mov_b32_e32 v5, v2
49344870
; GFX12-NEXT: s_wait_storecnt 0x0
4935-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
4936-
; GFX12-NEXT: v_mov_b32_e32 v4, v1
4937-
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
4938-
; GFX12-NEXT: s_wait_loadcnt 0x0
4871+
; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
4872+
; GFX12-NEXT: s_wait_storecnt 0x0
49394873
; GFX12-NEXT: global_inv scope:SCOPE_DEV
4940-
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
4941-
; GFX12-NEXT: v_mov_b32_e32 v2, v4
4942-
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
4943-
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4944-
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
4945-
; GFX12-NEXT: s_cbranch_execnz .LBB13_1
4946-
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
4947-
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
49484874
; GFX12-NEXT: s_setpc_b64 s[30:31]
49494875
;
49504876
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
49514877
; GFX940: ; %bb.0:
49524878
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
49534879
; GFX940-NEXT: v_mov_b32_e32 v1, s4
4954-
; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen offset:1024
4955-
; GFX940-NEXT: s_add_i32 s6, s4, 0x400
4956-
; GFX940-NEXT: s_mov_b64 s[4:5], 0
4957-
; GFX940-NEXT: v_mov_b32_e32 v1, s6
4958-
; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
4959-
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
4960-
; GFX940-NEXT: s_waitcnt vmcnt(0)
4961-
; GFX940-NEXT: v_pk_add_f16 v2, v3, v0
49624880
; GFX940-NEXT: buffer_wbl2 sc1
4963-
; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
4964-
; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
4881+
; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024
49654882
; GFX940-NEXT: s_waitcnt vmcnt(0)
49664883
; GFX940-NEXT: buffer_inv sc1
4967-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
4968-
; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4969-
; GFX940-NEXT: v_mov_b32_e32 v3, v4
4970-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
4971-
; GFX940-NEXT: s_cbranch_execnz .LBB13_1
4972-
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
4973-
; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
49744884
; GFX940-NEXT: s_setpc_b64 s[30:31]
49754885
;
49764886
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
@@ -5036,25 +4946,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(
50364946
; GFX90A: ; %bb.0:
50374947
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50384948
; GFX90A-NEXT: v_mov_b32_e32 v1, s8
5039-
; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen offset:1024
5040-
; GFX90A-NEXT: s_add_i32 s10, s8, 0x400
5041-
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
5042-
; GFX90A-NEXT: v_mov_b32_e32 v1, s10
5043-
; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
5044-
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
5045-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
5046-
; GFX90A-NEXT: v_pk_add_f16 v2, v3, v0
5047-
; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
5048-
; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc
4949+
; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024
50494950
; GFX90A-NEXT: s_waitcnt vmcnt(0)
50504951
; GFX90A-NEXT: buffer_wbinvl1
5051-
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
5052-
; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
5053-
; GFX90A-NEXT: v_mov_b32_e32 v3, v4
5054-
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
5055-
; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
5056-
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
5057-
; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
50584952
; GFX90A-NEXT: s_setpc_b64 s[30:31]
50594953
;
50604954
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
@@ -5217,8 +5111,8 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
52175111
; GFX12-NEXT: s_wait_samplecnt 0x0
52185112
; GFX12-NEXT: s_wait_bvhcnt 0x0
52195113
; GFX12-NEXT: s_wait_kmcnt 0x0
5220-
; GFX12-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
52215114
; GFX12-NEXT: s_mov_b32 s1, exec_lo
5115+
; GFX12-NEXT: s_wait_storecnt 0x0
52225116
; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
52235117
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
52245118
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
@@ -5230,59 +5124,24 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
52305124
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
52315125
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
52325126
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
5233-
; GFX12-NEXT: buffer_load_b32 v8, v4, s[4:7], null offen offset:1024
5127+
; GFX12-NEXT: s_wait_loadcnt 0x0
5128+
; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
5129+
; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
52345130
; GFX12-NEXT: ; implicit-def: $vgpr4
52355131
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
52365132
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
52375133
; GFX12-NEXT: ; %bb.2:
52385134
; GFX12-NEXT: s_mov_b32 exec_lo, s1
5239-
; GFX12-NEXT: s_mov_b32 s1, 0
5240-
; GFX12-NEXT: .LBB14_3: ; %atomicrmw.start
5241-
; GFX12-NEXT: ; =>This Loop Header: Depth=1
5242-
; GFX12-NEXT: ; Child Loop BB14_4 Depth 2
5243-
; GFX12-NEXT: s_wait_loadcnt 0x0
5244-
; GFX12-NEXT: v_pk_add_f16 v7, v8, v5
5245-
; GFX12-NEXT: s_mov_b32 s2, exec_lo
5246-
; GFX12-NEXT: s_wait_storecnt 0x0
5247-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
5248-
; GFX12-NEXT: v_mov_b32_e32 v6, v7
5249-
; GFX12-NEXT: v_mov_b32_e32 v7, v8
5250-
; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5251-
; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
5252-
; GFX12-NEXT: v_readfirstlane_b32 s4, v0
5253-
; GFX12-NEXT: v_readfirstlane_b32 s5, v1
5254-
; GFX12-NEXT: v_readfirstlane_b32 s6, v2
5255-
; GFX12-NEXT: v_readfirstlane_b32 s7, v3
5256-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5257-
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
5258-
; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
5259-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5260-
; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
5261-
; GFX12-NEXT: s_and_saveexec_b32 s0, s0
5262-
; GFX12-NEXT: s_wait_loadcnt 0x0
5263-
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], null offen th:TH_ATOMIC_RETURN
5264-
; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
5265-
; GFX12-NEXT: s_cbranch_execnz .LBB14_4
5266-
; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5267-
; GFX12-NEXT: s_mov_b32 exec_lo, s2
52685135
; GFX12-NEXT: s_wait_loadcnt 0x0
5269-
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
5270-
; GFX12-NEXT: v_mov_b32_e32 v8, v6
5136+
; GFX12-NEXT: v_mov_b32_e32 v0, v5
52715137
; GFX12-NEXT: global_inv scope:SCOPE_DEV
5272-
; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
5273-
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5274-
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
5275-
; GFX12-NEXT: s_cbranch_execnz .LBB14_3
5276-
; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
5277-
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
5278-
; GFX12-NEXT: v_mov_b32_e32 v0, v6
52795138
; GFX12-NEXT: s_setpc_b64 s[30:31]
52805139
;
52815140
; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
52825141
; GFX940: ; %bb.0:
52835142
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5284-
; GFX940-NEXT: v_add_u32_e32 v10, 0x400, v4
52855143
; GFX940-NEXT: s_mov_b64 s[2:3], exec
5144+
; GFX940-NEXT: buffer_wbl2 sc1
52865145
; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
52875146
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
52885147
; GFX940-NEXT: v_readfirstlane_b32 s5, v1
@@ -5293,48 +5152,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
52935152
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
52945153
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
52955154
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
5296-
; GFX940-NEXT: buffer_load_dword v9, v4, s[4:7], 0 offen offset:1024
5155+
; GFX940-NEXT: s_waitcnt vmcnt(0)
5156+
; GFX940-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0
5157+
; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
52975158
; GFX940-NEXT: ; implicit-def: $vgpr4
52985159
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
52995160
; GFX940-NEXT: s_cbranch_execnz .LBB14_1
53005161
; GFX940-NEXT: ; %bb.2:
53015162
; GFX940-NEXT: s_mov_b64 exec, s[2:3]
5302-
; GFX940-NEXT: s_mov_b64 s[2:3], 0
5303-
; GFX940-NEXT: .LBB14_3: ; %atomicrmw.start
5304-
; GFX940-NEXT: ; =>This Loop Header: Depth=1
5305-
; GFX940-NEXT: ; Child Loop BB14_4 Depth 2
5306-
; GFX940-NEXT: s_waitcnt vmcnt(0)
5307-
; GFX940-NEXT: v_pk_add_f16 v8, v9, v5
5308-
; GFX940-NEXT: s_mov_b64 s[8:9], exec
5309-
; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[8:9]
5310-
; GFX940-NEXT: buffer_wbl2 sc1
5311-
; GFX940-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5312-
; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
5313-
; GFX940-NEXT: v_readfirstlane_b32 s4, v0
5314-
; GFX940-NEXT: v_readfirstlane_b32 s5, v1
5315-
; GFX940-NEXT: v_readfirstlane_b32 s6, v2
5316-
; GFX940-NEXT: v_readfirstlane_b32 s7, v3
5317-
; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
5318-
; GFX940-NEXT: s_nop 0
5319-
; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
5320-
; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
5321-
; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
5322-
; GFX940-NEXT: s_waitcnt vmcnt(0)
5323-
; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[4:7], 0 offen sc0
5324-
; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
5325-
; GFX940-NEXT: s_cbranch_execnz .LBB14_4
5326-
; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5327-
; GFX940-NEXT: s_mov_b64 exec, s[8:9]
53285163
; GFX940-NEXT: s_waitcnt vmcnt(0)
5329-
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
5330-
; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5331-
; GFX940-NEXT: v_mov_b32_e32 v9, v6
5164+
; GFX940-NEXT: v_mov_b32_e32 v0, v5
53325165
; GFX940-NEXT: buffer_inv sc1
5333-
; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
5334-
; GFX940-NEXT: s_cbranch_execnz .LBB14_3
5335-
; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
5336-
; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
5337-
; GFX940-NEXT: v_mov_b32_e32 v0, v6
53385166
; GFX940-NEXT: s_setpc_b64 s[30:31]
53395167
;
53405168
; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
@@ -5468,7 +5296,6 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
54685296
; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
54695297
; GFX90A: ; %bb.0:
54705298
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5471-
; GFX90A-NEXT: v_add_u32_e32 v10, 0x400, v4
54725299
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
54735300
; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
54745301
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
@@ -5479,47 +5306,17 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall
54795306
; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
54805307
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
54815308
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5482-
; GFX90A-NEXT: s_nop 0
5483-
; GFX90A-NEXT: buffer_load_dword v9, v4, s[8:11], 0 offen offset:1024
5309+
; GFX90A-NEXT: s_waitcnt vmcnt(0)
5310+
; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc
5311+
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
54845312
; GFX90A-NEXT: ; implicit-def: $vgpr4
54855313
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
54865314
; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
54875315
; GFX90A-NEXT: ; %bb.2:
54885316
; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
5489-
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
5490-
; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.start
5491-
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
5492-
; GFX90A-NEXT: ; Child Loop BB14_4 Depth 2
54935317
; GFX90A-NEXT: s_waitcnt vmcnt(0)
5494-
; GFX90A-NEXT: v_pk_add_f16 v8, v9, v5
5495-
; GFX90A-NEXT: s_mov_b64 s[12:13], exec
5496-
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[8:9], v[8:9] op_sel:[0,1]
5497-
; GFX90A-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5498-
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
5499-
; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
5500-
; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
5501-
; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
5502-
; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
5503-
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5504-
; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5505-
; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5506-
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5507-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
5508-
; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v10, s[8:11], 0 offen glc
5509-
; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
5510-
; GFX90A-NEXT: s_cbranch_execnz .LBB14_4
5511-
; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5512-
; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
5513-
; GFX90A-NEXT: s_waitcnt vmcnt(0)
5514-
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v9
5515-
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5516-
; GFX90A-NEXT: v_mov_b32_e32 v9, v6
5318+
; GFX90A-NEXT: v_mov_b32_e32 v0, v5
55175319
; GFX90A-NEXT: buffer_wbinvl1
5518-
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
5519-
; GFX90A-NEXT: s_cbranch_execnz .LBB14_3
5520-
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
5521-
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
5522-
; GFX90A-NEXT: v_mov_b32_e32 v0, v6
55235320
; GFX90A-NEXT: s_setpc_b64 s[30:31]
55245321
;
55255322
; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:

0 commit comments

Comments
 (0)