Skip to content

Commit a2a73d8

Browse files
authored
AMDGPU: Fix no return atomicrmw fadd v2f16 selection for gfx908 (#96948)
Previously, this was always expanded into a cmpxchg loop, but it should be handled under the same conditions as the f32 case (except for the denormal concern).
1 parent 918313d commit a2a73d8

File tree

5 files changed

+23
-126
lines changed

5 files changed

+23
-126
lines changed

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1626,6 +1626,7 @@ defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amd
16261626
}
16271627

16281628
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
1629+
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
16291630
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
16301631
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
16311632
}

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -16219,13 +16219,20 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
1621916219
if (Subtarget->hasGFX90AInsts() && Ty->isDoubleTy())
1622016220
return ReportUnsafeHWInst(AtomicExpansionKind::None);
1622116221

16222-
if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
16223-
// global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16224-
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16225-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16226-
// global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16227-
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16228-
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16222+
if (AS != AMDGPUAS::FLAT_ADDRESS) {
16223+
if (Ty->isFloatTy()) {
16224+
// global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
16225+
if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
16226+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16227+
// global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
16228+
if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
16229+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16230+
} else {
16231+
// gfx908
16232+
if (RMW->use_empty() &&
16233+
Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
16234+
return ReportUnsafeHWInst(AtomicExpansionKind::None);
16235+
}
1622916236
}
1623016237

1623116238
// flat atomic fadd f32: gfx940, gfx11+.

llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll

Lines changed: 2 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6430,26 +6430,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
64306430
; GFX908: ; %bb.0:
64316431
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64326432
; GFX908-NEXT: v_mov_b32_e32 v1, s8
6433-
; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
6434-
; GFX908-NEXT: s_add_i32 s10, s8, 0x400
6435-
; GFX908-NEXT: s_mov_b64 s[8:9], 0
6436-
; GFX908-NEXT: v_mov_b32_e32 v3, s10
6437-
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
6438-
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
6439-
; GFX908-NEXT: s_waitcnt vmcnt(0)
6440-
; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
6441-
; GFX908-NEXT: v_mov_b32_e32 v5, v2
6442-
; GFX908-NEXT: v_mov_b32_e32 v4, v1
6443-
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
6433+
; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024
64446434
; GFX908-NEXT: s_waitcnt vmcnt(0)
64456435
; GFX908-NEXT: buffer_wbinvl1
6446-
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
6447-
; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
6448-
; GFX908-NEXT: v_mov_b32_e32 v2, v4
6449-
; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
6450-
; GFX908-NEXT: s_cbranch_execnz .LBB20_1
6451-
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
6452-
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
64536436
; GFX908-NEXT: s_setpc_b64 s[30:31]
64546437
;
64556438
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
@@ -7912,26 +7895,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
79127895
; GFX908: ; %bb.0:
79137896
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
79147897
; GFX908-NEXT: v_mov_b32_e32 v1, s8
7915-
; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
7916-
; GFX908-NEXT: s_add_i32 s10, s8, 0x400
7917-
; GFX908-NEXT: s_mov_b64 s[8:9], 0
7918-
; GFX908-NEXT: v_mov_b32_e32 v3, s10
7919-
; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
7920-
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
7921-
; GFX908-NEXT: s_waitcnt vmcnt(0)
7922-
; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
7923-
; GFX908-NEXT: v_mov_b32_e32 v5, v2
7924-
; GFX908-NEXT: v_mov_b32_e32 v4, v1
7925-
; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
7898+
; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024
79267899
; GFX908-NEXT: s_waitcnt vmcnt(0)
79277900
; GFX908-NEXT: buffer_wbinvl1
7928-
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
7929-
; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
7930-
; GFX908-NEXT: v_mov_b32_e32 v2, v4
7931-
; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
7932-
; GFX908-NEXT: s_cbranch_execnz .LBB25_1
7933-
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
7934-
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
79357901
; GFX908-NEXT: s_setpc_b64 s[30:31]
79367902
;
79377903
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:

llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll

Lines changed: 5 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -15550,22 +15550,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
1555015550
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
1555115551
; GFX908: ; %bb.0:
1555215552
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15553-
; GFX908-NEXT: global_load_dword v4, v[0:1], off
15554-
; GFX908-NEXT: s_mov_b64 s[4:5], 0
15555-
; GFX908-NEXT: .LBB67_1: ; %atomicrmw.start
15556-
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
15557-
; GFX908-NEXT: s_waitcnt vmcnt(0)
15558-
; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
15559-
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
15553+
; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
1556015554
; GFX908-NEXT: s_waitcnt vmcnt(0)
1556115555
; GFX908-NEXT: buffer_wbinvl1
15562-
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
15563-
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
15564-
; GFX908-NEXT: v_mov_b32_e32 v4, v3
15565-
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
15566-
; GFX908-NEXT: s_cbranch_execnz .LBB67_1
15567-
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
15568-
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
1556915556
; GFX908-NEXT: s_setpc_b64 s[30:31]
1557015557
;
1557115558
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
@@ -15771,22 +15758,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine
1577115758
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
1577215759
; GFX908: ; %bb.0:
1577315760
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15774-
; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
15775-
; GFX908-NEXT: s_mov_b64 s[4:5], 0
15776-
; GFX908-NEXT: .LBB68_1: ; %atomicrmw.start
15777-
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
15778-
; GFX908-NEXT: s_waitcnt vmcnt(0)
15779-
; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
15780-
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
15761+
; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
1578115762
; GFX908-NEXT: s_waitcnt vmcnt(0)
1578215763
; GFX908-NEXT: buffer_wbinvl1
15783-
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
15784-
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
15785-
; GFX908-NEXT: v_mov_b32_e32 v4, v3
15786-
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
15787-
; GFX908-NEXT: s_cbranch_execnz .LBB68_1
15788-
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
15789-
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
1579015764
; GFX908-NEXT: s_setpc_b64 s[30:31]
1579115765
;
1579215766
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -15995,22 +15969,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine
1599515969
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
1599615970
; GFX908: ; %bb.0:
1599715971
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
15998-
; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
15999-
; GFX908-NEXT: s_mov_b64 s[4:5], 0
16000-
; GFX908-NEXT: .LBB69_1: ; %atomicrmw.start
16001-
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
16002-
; GFX908-NEXT: s_waitcnt vmcnt(0)
16003-
; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
16004-
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
15972+
; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
1600515973
; GFX908-NEXT: s_waitcnt vmcnt(0)
1600615974
; GFX908-NEXT: buffer_wbinvl1
16007-
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
16008-
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
16009-
; GFX908-NEXT: v_mov_b32_e32 v4, v3
16010-
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
16011-
; GFX908-NEXT: s_cbranch_execnz .LBB69_1
16012-
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
16013-
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
1601415975
; GFX908-NEXT: s_setpc_b64 s[30:31]
1601515976
;
1601615977
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -16917,22 +16878,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a
1691716878
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
1691816879
; GFX908: ; %bb.0:
1691916880
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16920-
; GFX908-NEXT: global_load_dword v4, v[0:1], off
16921-
; GFX908-NEXT: s_mov_b64 s[4:5], 0
16922-
; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start
16923-
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
16924-
; GFX908-NEXT: s_waitcnt vmcnt(0)
16925-
; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
16926-
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
16881+
; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
1692716882
; GFX908-NEXT: s_waitcnt vmcnt(0)
1692816883
; GFX908-NEXT: buffer_wbinvl1
16929-
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
16930-
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
16931-
; GFX908-NEXT: v_mov_b32_e32 v4, v3
16932-
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
16933-
; GFX908-NEXT: s_cbranch_execnz .LBB73_1
16934-
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
16935-
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
1693616884
; GFX908-NEXT: s_setpc_b64 s[30:31]
1693716885
;
1693816886
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
@@ -17368,22 +17316,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
1736817316
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
1736917317
; GFX908: ; %bb.0:
1737017318
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
17371-
; GFX908-NEXT: global_load_dword v4, v[0:1], off
17372-
; GFX908-NEXT: s_mov_b64 s[4:5], 0
17373-
; GFX908-NEXT: .LBB75_1: ; %atomicrmw.start
17374-
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
17375-
; GFX908-NEXT: s_waitcnt vmcnt(0)
17376-
; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
17377-
; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
17319+
; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
1737817320
; GFX908-NEXT: s_waitcnt vmcnt(0)
1737917321
; GFX908-NEXT: buffer_wbinvl1
17380-
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
17381-
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
17382-
; GFX908-NEXT: v_mov_b32_e32 v4, v3
17383-
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
17384-
; GFX908-NEXT: s_cbranch_execnz .LBB75_1
17385-
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
17386-
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
1738717322
; GFX908-NEXT: s_setpc_b64 s[30:31]
1738817323
;
1738917324
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:

llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5750,19 +5750,7 @@ define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(
57505750
; GFX9-NEXT: ret void
57515751
;
57525752
; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(
5753-
; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
5754-
; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
5755-
; GFX908: atomicrmw.start:
5756-
; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
5757-
; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
5758-
; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
5759-
; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
5760-
; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
5761-
; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
5762-
; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
5763-
; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
5764-
; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
5765-
; GFX908: atomicrmw.end:
5753+
; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
57665754
; GFX908-NEXT: ret void
57675755
;
57685756
; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(

0 commit comments

Comments
 (0)