AMDGPU: Undo atomicrmw add/sub/xor 0 -> atomicrmw or canonicalization #87533
Conversation
InstCombine canonicalizes an atomicrmw add of 0 into an atomicrmw or of 0. For system atomics this is problematic: PCIe supports atomic add, but it does not support the other operations. Undo this canonicalization for system scope atomics.
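For context, the canonicalization being undone is InstCombine's folding of idempotent atomic read-modify-write operations into an or with zero. A minimal hand-written IR sketch (not taken from the patch) of the before/after:

  ; before InstCombine: an idempotent RMW, used only for the value it returns
  %old = atomicrmw add ptr addrspace(1) %ptr, i32 0 seq_cst

  ; after InstCombine's idempotent-RMW canonicalization
  %old = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst

The fold is semantically equivalent in IR, but at system scope it forces the backend toward an operation PCIe cannot perform atomically.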
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes: InstCombine canonicalizes an atomicrmw add of 0 into an atomicrmw or of 0. For system atomics this is problematic: PCIe supports atomic add, but it does not support the other operations. Undo this canonicalization for system scope atomics.

Patch is 142.77 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/87533.diff

3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 14948ef9ea8d17..6350a6598545a5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -16027,6 +16027,19 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
switch (RMW->getOperation()) {
+ case AtomicRMWInst::Sub:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor: {
+ // Atomic sub/or/xor do not work over PCI express, but atomic add
+ // does. InstCombine transforms these with 0 to or, so undo that.
+ if (HasSystemScope && AMDGPU::isFlatGlobalAddrSpace(AS)) {
+ if (Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
+ ConstVal && ConstVal->isNullValue())
+ return AtomicExpansionKind::Expand;
+ }
+
+ break;
+ }
case AtomicRMWInst::FAdd: {
Type *Ty = RMW->getType();
@@ -16312,6 +16325,16 @@ bool SITargetLowering::checkForPhysRegDependency(
}
void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
+ if (AI->getOperation() == AtomicRMWInst::Sub ||
+ AI->getOperation() == AtomicRMWInst::Or ||
+ AI->getOperation() == AtomicRMWInst::Xor) {
+ // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
+ assert(cast<Constant>(AI->getValOperand())->isNullValue() &&
+ "this cannot be replaced with add");
+ AI->setOperation(AtomicRMWInst::Add);
+ return;
+ }
+
assert(Subtarget->hasAtomicFaddInsts() &&
"target should have atomic fadd instructions");
assert(AI->getType()->isFloatTy() &&
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index 99d02ffaa523d5..4598d23e088b54 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -1403,6 +1403,43 @@ define amdgpu_gfx i32 @global_atomic_sub_i32_ret_offset_scalar(ptr addrspace(1)
ret i32 %result
}
+define i32 @global_atomic_sub_0_i32_ret(ptr addrspace(1) %ptr) {
+; SI-LABEL: global_atomic_sub_0_i32_ret:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_mov_b32 s7, 0xf000
+; SI-NEXT: s_mov_b32 s6, 0
+; SI-NEXT: v_mov_b32_e32 v2, 0
+; SI-NEXT: s_mov_b32 s4, s6
+; SI-NEXT: s_mov_b32 s5, s6
+; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc
+; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: buffer_wbinvl1
+; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: s_waitcnt expcnt(0)
+; SI-NEXT: s_setpc_b64 s[30:31]
+;
+; VI-LABEL: global_atomic_sub_0_i32_ret:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_mov_b32_e32 v2, 0
+; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
+; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: buffer_wbinvl1_vol
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: global_atomic_sub_0_i32_ret:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: global_atomic_add v0, v[0:1], v2, off glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw sub ptr addrspace(1) %ptr, i32 0 seq_cst
+ ret i32 %result
+}
+
; ---------------------------------------------------------------------
; atomicrmw and
; ---------------------------------------------------------------------
@@ -1767,7 +1804,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB40_1: ; %atomicrmw.start
+; SI-NEXT: .LBB41_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, v4, v2
@@ -1782,7 +1819,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB40_1
+; SI-NEXT: s_cbranch_execnz .LBB41_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
@@ -1793,7 +1830,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB40_1: ; %atomicrmw.start
+; VI-NEXT: .LBB41_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v3, v4, v2
@@ -1805,7 +1842,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; VI-NEXT: v_mov_b32_e32 v4, v3
; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB40_1
+; VI-NEXT: s_cbranch_execnz .LBB41_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -1815,7 +1852,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
@@ -1827,7 +1864,7 @@ define void @global_atomic_nand_i32_noret(ptr addrspace(1) %ptr, i32 %in) {
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v4, v3
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB40_1
+; GFX9-NEXT: s_cbranch_execnz .LBB41_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1845,7 +1882,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB41_1: ; %atomicrmw.start
+; SI-NEXT: .LBB42_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v3, v4, v2
@@ -1860,7 +1897,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; SI-NEXT: v_mov_b32_e32 v4, v5
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB41_1
+; SI-NEXT: s_cbranch_execnz .LBB42_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: s_waitcnt expcnt(0)
@@ -1873,7 +1910,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB41_1: ; %atomicrmw.start
+; VI-NEXT: .LBB42_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v3, v4, v2
@@ -1885,7 +1922,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; VI-NEXT: v_mov_b32_e32 v4, v3
; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB41_1
+; VI-NEXT: s_cbranch_execnz .LBB42_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -1895,7 +1932,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v4, v[0:1], off offset:16
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v3, v4, v2
@@ -1907,7 +1944,7 @@ define void @global_atomic_nand_i32_noret_offset(ptr addrspace(1) %out, i32 %in)
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v4, v3
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB41_1
+; GFX9-NEXT: s_cbranch_execnz .LBB42_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -1926,7 +1963,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB42_1: ; %atomicrmw.start
+; SI-NEXT: .LBB43_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, v3
@@ -1941,7 +1978,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB42_1
+; SI-NEXT: s_cbranch_execnz .LBB43_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: v_mov_b32_e32 v0, v3
@@ -1953,7 +1990,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB42_1: ; %atomicrmw.start
+; VI-NEXT: .LBB43_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, v3
@@ -1965,7 +2002,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB42_1
+; VI-NEXT: s_cbranch_execnz .LBB43_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: v_mov_b32_e32 v0, v3
@@ -1976,7 +2013,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v3, v[0:1], off
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, v3
@@ -1988,7 +2025,7 @@ define i32 @global_atomic_nand_i32_ret(ptr addrspace(1) %ptr, i32 %in) {
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB42_1
+; GFX9-NEXT: s_cbranch_execnz .LBB43_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
@@ -2007,7 +2044,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
; SI-NEXT: s_mov_b32 s5, s6
; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:16
; SI-NEXT: s_mov_b64 s[8:9], 0
-; SI-NEXT: .LBB43_1: ; %atomicrmw.start
+; SI-NEXT: .LBB44_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v5, v3
@@ -2022,7 +2059,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; SI-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; SI-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; SI-NEXT: s_cbranch_execnz .LBB43_1
+; SI-NEXT: s_cbranch_execnz .LBB44_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[8:9]
; SI-NEXT: v_mov_b32_e32 v0, v3
@@ -2036,7 +2073,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v0, v[3:4]
; VI-NEXT: s_mov_b64 s[4:5], 0
-; VI-NEXT: .LBB43_1: ; %atomicrmw.start
+; VI-NEXT: .LBB44_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, v0
@@ -2048,7 +2085,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
; VI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; VI-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; VI-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; VI-NEXT: s_cbranch_execnz .LBB43_1
+; VI-NEXT: s_cbranch_execnz .LBB44_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[4:5]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -2058,7 +2095,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dword v3, v[0:1], off offset:16
; GFX9-NEXT: s_mov_b64 s[4:5], 0
-; GFX9-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, v3
@@ -2070,7 +2107,7 @@ define i32 @global_atomic_nand_i32_ret_offset(ptr addrspace(1) %out, i32 %in) {
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT: s_cbranch_execnz .LBB43_1
+; GFX9-NEXT: s_cbranch_execnz .LBB44_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: v_mov_b32_e32 v0, v3
@@ -2095,7 +2132,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: .LBB44_1: ; %atomicrmw.start
+; SI-NEXT: .LBB45_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, s34, v1
@@ -2110,7 +2147,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; SI-NEXT: s_cbranch_execnz .LBB44_1
+; SI-NEXT: s_cbranch_execnz .LBB45_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
; SI-NEXT: v_readlane_b32 s7, v4, 1
@@ -2128,7 +2165,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_mov_b64 s[34:35], 0
-; VI-NEXT: .LBB44_1: ; %atomicrmw.start
+; VI-NEXT: .LBB45_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, s6, v3
@@ -2140,7 +2177,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: v_mov_b32_e32 v3, v2
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; VI-NEXT: s_cbranch_execnz .LBB44_1
+; VI-NEXT: s_cbranch_execnz .LBB45_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -2151,7 +2188,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_dword v1, v2, s[4:5]
; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, s6, v1
@@ -2163,7 +2200,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GFX9-NEXT: s_cbranch_execnz .LBB44_1
+; GFX9-NEXT: s_cbranch_execnz .LBB45_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2186,7 +2223,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: .LBB45_1: ; %atomicrmw.start
+; SI-NEXT: .LBB46_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_and_b32_e32 v0, s34, v1
@@ -2201,7 +2238,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; SI-NEXT: v_mov_b32_e32 v1, v2
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; SI-NEXT: s_cbranch_execnz .LBB45_1
+; SI-NEXT: s_cbranch_execnz .LBB46_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
; SI-NEXT: v_readlane_b32 s7, v4, 1
@@ -2221,7 +2258,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
; VI-NEXT: v_mov_b32_e32 v1, s35
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: s_mov_b64 s[34:35], 0
-; VI-NEXT: .LBB45_1: ; %atomicrmw.start
+; VI-NEXT: .LBB46_1: ; %atomicrmw.start
; VI-NEXT: ; =>This Inner Loop Header: Depth=1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_and_b32_e32 v2, s6, v3
@@ -2233,7 +2270,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
; VI-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; VI-NEXT: v_mov_b32_e32 v3, v2
; VI-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; VI-NEXT: s_cbranch_execnz .LBB45_1
+; VI-NEXT: s_cbranch_execnz .LBB46_1
; VI-NEXT: ; %bb.2: ; %atomicrmw.end
; VI-NEXT: s_or_b64 exec, exec, s[34:35]
; VI-NEXT: s_setpc_b64 s[30:31]
@@ -2244,7 +2281,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: global_load_dword v1, v2, s[4:5] offset:16
; GFX9-NEXT: s_mov_b64 s[34:35], 0
-; GFX9-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX9-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, s6, v1
@@ -2256,7 +2293,7 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
; GFX9-NEXT: v_mov_b32_e32 v1, v0
; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GFX9-NEXT: s_cbranch_execnz .LBB45_1
+; GFX9-NEXT: s_cbranch_execnz .LBB46_1
; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX9-NEXT: s_or_b64 exec, exec, s[34:35]
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -2280,7 +2317,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
; SI-NEXT: s_mov_b64 s[36:37], 0
-; SI-NEXT: .LBB46_1: ; %atomicrmw.start
+; SI-NEXT: .LBB47_1: ; %atomicrmw.start
; SI-NEXT: ; =>This Inner Loop Header: Depth=1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, v0
@@ -2295,7 +2332,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg
; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; SI-NEXT: s_cbranch_execnz .LBB46_1
+; SI-NEXT: s_cbranch_execnz .LBB47_1
; SI-NEXT: ; %bb.2: ; %atomicrmw.end
; SI-NEXT: s_or_b64 exec, exec, s[36:37]
; SI-NEXT: v_readlane_b32 s7, v3, 1
@@ -2315,7 +2352,7 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: ...
[truncated]
  case AtomicRMWInst::Or:
  case AtomicRMWInst::Xor: {
    // Atomic sub/or/xor do not work over PCI express, but atomic add
    // does. InstCombine transforms these with 0 to or, so undo that.
How does this lowering hook work? Does it prevent InstCombine from doing the or canonicalization?
If so, then it's not really undoing it, just preventing it from happening, right?
No, this is a codegen hook for the AtomicExpand pass. InstCombine does this transform, and then we run the expansion in codegen to undo it. Someone could also have written the code this way in the first place.
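To make that pipeline concrete, a rough illustrative sketch for a system-scope global atomic: shouldExpandAtomicRMWInIR flags the zero-operand sub/or/xor for expansion, and emitExpandAtomicRMW then flips the operation back to add before instruction selection.

  ; what reaches codegen after InstCombine
  %r = atomicrmw or ptr addrspace(1) %ptr, i32 0 seq_cst

  ; what AtomicExpand emits through the target hook, so the backend can
  ; select a PCIe-capable atomic add
  %r = atomicrmw add ptr addrspace(1) %ptr, i32 0 seq_cst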
Should we do this for other sync scopes? Or revisit it after we add metadata for no-fine-grained and no-remote?
I wanted to stay as targeted as possible. There will be an interaction with the new metadata, but it shouldn't be necessary outside of system scope.
I believe the scope does not necessarily point to a specific bus. Most uses probably would not request a device scope on an atomic in remote memory, but that is not required. Why not do it for any scope?
Because the question of which bus is involved is only relevant at system scope. The device-local scopes don't have the issue.
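For illustration, the distinction in terms of the IR sync scope (a sketch using AMDGPU's named scopes; not part of the patch):

  ; system scope (no syncscope, or "one-as"): rewritten back to add by this patch
  %a = atomicrmw or ptr addrspace(1) %p, i32 0 seq_cst

  ; agent (device) scope: left alone, since the PCIe limitation does not apply
  %b = atomicrmw or ptr addrspace(1) %p, i32 0 syncscope("agent") seq_cst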
Adding Brian to double-check on this; in the meantime, LGTM.
The scope is only a hint about where the target memory actually lives, but I am OK with the change as-is until there is a complaint.