-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU: Fix no return atomicrmw fadd v2f16 selection for gfx908 #96948
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Fix no return atomicrmw fadd v2f16 selection for gfx908 #96948
Conversation
We previously would always expand this with a cmpxchg loop, while it should be the same conditions as the f32 case (except for the denormal concern).
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu Author: Matt Arsenault (arsenm) ChangesWe previously would always expand this with a cmpxchg loop, while Full diff: https://github.com/llvm/llvm-project/pull/96948.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 98054dde398b3..4bf8f20269a15 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1626,6 +1626,7 @@ defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amd
}
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
+defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 16fa7266a4b7d..12977af0d7e85 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16219,13 +16219,20 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (Subtarget->hasGFX90AInsts() && Ty->isDoubleTy())
return ReportUnsafeHWInst(AtomicExpansionKind::None);
- if (AS != AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
- // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
- if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
- // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
- if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
- return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ if (AS != AMDGPUAS::FLAT_ADDRESS) {
+ if (Ty->isFloatTy()) {
+ // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, gfx11+.
+ if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+.
+ if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ } else {
+ // gfx908
+ if (RMW->use_empty() &&
+ Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() && isHalf2(Ty))
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
+ }
}
// flat atomic fadd f32: gfx940, gfx11+.
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 00a01e8d976bb..d50ba64ba5d47 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -6430,26 +6430,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fin
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s8
-; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s10, s8, 0x400
-; GFX908-NEXT: s_mov_b64 s[8:9], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s10
-; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX908-NEXT: v_mov_b32_e32 v5, v2
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
-; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX908-NEXT: s_cbranch_execnz .LBB20_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
@@ -7912,26 +7895,9 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_rem
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s8
-; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s10, s8, 0x400
-; GFX908-NEXT: s_mov_b64 s[8:9], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s10
-; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX908-NEXT: v_mov_b32_e32 v5, v2
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX908-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
-; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX908-NEXT: s_cbranch_execnz .LBB25_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 62762008b0c23..e312b37b2e0bb 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -15550,22 +15550,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB67_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB67_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
@@ -15771,22 +15758,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB68_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB68_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -15995,22 +15969,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB69_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB69_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -16917,22 +16878,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB73_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
@@ -17368,22 +17316,9 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB75_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB75_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
index 533f0f2691045..021a55f743f66 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -5750,19 +5750,7 @@ define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(
; GFX9-NEXT: ret void
;
; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(
-; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX908: atomicrmw.start:
-; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
-; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
-; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
-; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
-; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
-; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX908: atomicrmw.end:
+; GFX908-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4
; GFX908-NEXT: ret void
;
; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(
|
…#96948) We previously would always expand this with a cmpxchg loop, while it should be the same conditions as the f32 case (except for the denormal concern).
We previously would always expand this with a cmpxchg loop, while
it should be the same conditions as the f32 case (except for the
denormal concern).