[AMDGPU][SDAG] Add ISD::PTRADD DAG combines #142739
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-backend-amdgpu

Author: Fabian Ritter (ritter-x2a)

Changes

This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one that is closely connected. The generic DAG combine is based on a part of PR #105669 by @rgwott, which was adapted from work by @jrtc27, @arichardson, and @davidchisnall in the CHERI/Morello LLVM tree.
For SWDEV-516125. Patch is 20.34 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/142739.diff 4 Files Affected:
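As a quick orientation before the diff, the following is a minimal IR sketch of the pattern these combines target; it is adapted from the tests modified below, and the function name is illustrative rather than part of the patch. The inner GEP adds a constant offset and the outer one adds a variable offset; with the new PTRADD combines, the constant ends up on the outermost address computation, so the GFX942 checks below fold it into the immediate offset of global_load_dwordx2 instead of spending an extra v_lshl_add_u64 on it.

define i64 @reassoc_example(ptr addrspace(1) %base, i64 %voffset) {
  ; Constant offset first, then a variable offset on top of it.
  %gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
  %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
  %l = load i64, ptr addrspace(1) %gep1, align 8
  ret i64 %l
}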
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9e418329d15be..e45134a2e5548 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -419,6 +419,7 @@ namespace {
SDValue visitADDLike(SDNode *N);
SDValue visitADDLikeCommutative(SDValue N0, SDValue N1,
SDNode *LocReference);
+ SDValue visitPTRADD(SDNode *N);
SDValue visitSUB(SDNode *N);
SDValue visitADDSAT(SDNode *N);
SDValue visitSUBSAT(SDNode *N);
@@ -1138,7 +1139,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
return true;
}
- if (Opc != ISD::ADD)
+ if (Opc != ISD::ADD && Opc != ISD::PTRADD)
return false;
auto *C2 = dyn_cast<ConstantSDNode>(N1);
@@ -1858,6 +1859,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::TokenFactor: return visitTokenFactor(N);
case ISD::MERGE_VALUES: return visitMERGE_VALUES(N);
case ISD::ADD: return visitADD(N);
+ case ISD::PTRADD: return visitPTRADD(N);
case ISD::SUB: return visitSUB(N);
case ISD::SADDSAT:
case ISD::UADDSAT: return visitADDSAT(N);
@@ -2623,6 +2625,93 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) {
return SDValue();
}
+/// Try to fold a pointer arithmetic node.
+/// This needs to be done separately from normal addition, because pointer
+/// addition is not commutative.
+SDValue DAGCombiner::visitPTRADD(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT PtrVT = N0.getValueType();
+ EVT IntVT = N1.getValueType();
+ SDLoc DL(N);
+
+ // This is already ensured by an assert in SelectionDAG::getNode(). Several
+ // combines here depend on this assumption.
+ assert(PtrVT == IntVT &&
+ "PTRADD with different operand types is not supported");
+
+ // fold (ptradd undef, y) -> undef
+ if (N0.isUndef())
+ return N0;
+
+ // fold (ptradd x, undef) -> undef
+ if (N1.isUndef())
+ return DAG.getUNDEF(PtrVT);
+
+ // fold (ptradd x, 0) -> x
+ if (isNullConstant(N1))
+ return N0;
+
+ // fold (ptradd 0, x) -> x
+ if (isNullConstant(N0))
+ return N1;
+
+ if (N0.getOpcode() == ISD::PTRADD &&
+ !reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, N0, N1)) {
+ SDValue X = N0.getOperand(0);
+ SDValue Y = N0.getOperand(1);
+ SDValue Z = N1;
+ bool N0OneUse = N0.hasOneUse();
+ bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+ bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+
+ // (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) if:
+ // * y is a constant and (ptradd x, y) has one use; or
+ // * y and z are both constants.
+ if ((YIsConstant && N0OneUse) || (YIsConstant && ZIsConstant)) {
+ SDNodeFlags Flags;
+ // If both additions in the original were NUW, the new ones are as well.
+ if (N->getFlags().hasNoUnsignedWrap() &&
+ N0->getFlags().hasNoUnsignedWrap())
+ Flags |= SDNodeFlags::NoUnsignedWrap;
+ SDValue Add = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
+ AddToWorklist(Add.getNode());
+ return DAG.getMemBasePlusOffset(X, Add, DL, Flags);
+ }
+
+ // TODO: There is another possible fold here that was proven useful.
+ // It would be this:
+ //
+ // (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if:
+ // * (ptradd x, y) has one use; and
+ // * y is a constant; and
+ // * z is not a constant.
+ //
+ // In some cases, specifically in AArch64's FEAT_CPA, it exposes the
+ // opportunity to select more complex instructions such as SUBPT and
+ // MSUBPT. However, a hypothetical corner case has been found that we could
+ // not avoid. Consider this (pseudo-POSIX C):
+ //
+ // char *foo(char *x, int z) {return (x + LARGE_CONSTANT) + z;}
+ // char *p = mmap(LARGE_CONSTANT);
+ // char *q = foo(p, -LARGE_CONSTANT);
+ //
+ // Then x + LARGE_CONSTANT is one-past-the-end, so valid, and a
+ // further + z takes it back to the start of the mapping, so valid,
+ // regardless of the address mmap gave back. However, if mmap gives you an
+ // address < LARGE_CONSTANT (ignoring high bits), x - LARGE_CONSTANT will
+ // borrow from the high bits (with the subsequent + z carrying back into
+ // the high bits to give you a well-defined pointer) and thus trip
+ // FEAT_CPA's pointer corruption checks.
+ //
+ // We leave this fold as an opportunity for future work, addressing the
+ // corner case for FEAT_CPA, as well as reconciling the solution with the
+ // more general application of pointer arithmetic in other future targets.
+ }
+
+ return SDValue();
+}
+
/// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
/// a shift and add with a different constant.
static SDValue foldAddSubOfSignBit(SDNode *N, const SDLoc &DL,
@@ -14940,6 +15029,7 @@ SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
default:
break;
case ISD::ADD:
+ case ISD::PTRADD:
case ISD::SUB: {
unsigned AlignShift = Log2(AL);
SDValue LHS = N0.getOperand(0);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 52504b9f5fda1..b285886accdac 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -931,6 +931,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom);
setTargetDAGCombine({ISD::ADD,
+ ISD::PTRADD,
ISD::UADDO_CARRY,
ISD::SUB,
ISD::USUBO_CARRY,
@@ -14833,6 +14834,52 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ if (N1.getOpcode() == ISD::ADD) {
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if z is a constant,
+ // y is not, and (add y, z) is used only once.
+ // (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if y is a constant,
+ // z is not, and (add y, z) is used only once.
+ // The goal is to move constant offsets to the outermost ptradd, to create
+ // more opportunities to fold offsets into memory instructions.
+ // Together with the generic combines in DAGCombiner.cpp, this also
+ // implements (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y)).
+ //
+ // This transform is here instead of in the general DAGCombiner as it can
+ // turn in-bounds pointer arithmetic out-of-bounds, which is problematic for
+ // AArch64's CPA.
+ SDValue X = N0;
+ SDValue Y = N1.getOperand(0);
+ SDValue Z = N1.getOperand(1);
+ bool N1OneUse = N1.hasOneUse();
+ bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
+ bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
+ if ((ZIsConstant != YIsConstant) && N1OneUse) {
+ SDNodeFlags Flags;
+ // If both additions in the original were NUW, the new ones are as well.
+ if (N->getFlags().hasNoUnsignedWrap() &&
+ N1->getFlags().hasNoUnsignedWrap())
+ Flags |= SDNodeFlags::NoUnsignedWrap;
+
+ if (YIsConstant)
+ std::swap(Y, Z);
+
+ SDValue Inner = DAG.getMemBasePlusOffset(X, Y, DL, Flags);
+ DCI.AddToWorklist(Inner.getNode());
+ return DAG.getMemBasePlusOffset(Inner, Z, DL, Flags);
+ }
+ }
+
+ return SDValue();
+}
+
SDValue SITargetLowering::performSubCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -15365,6 +15412,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
case ISD::ADD:
return performAddCombine(N, DCI);
+ case ISD::PTRADD:
+ return performPtrAddCombine(N, DCI);
case ISD::SUB:
return performSubCombine(N, DCI);
case ISD::UADDO_CARRY:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index bd9ec7cb8ec48..a7917acd97a0f 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -217,6 +217,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
DAGCombinerInfo &DCI) const;
SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index 0241be9197e1a..656003f45c54b 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -8,24 +8,14 @@
; Tests reassociation (ptradd N0:(ptradd p, c1), z) where N0 has only one use.
define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
-; GFX942_PTRADD-LABEL: global_load_ZTwoUses:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, 24
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: global_load_ZTwoUses:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_LEGACY-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: global_load_ZTwoUses:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %base, i64 24
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 %voffset
%l = load i64, ptr addrspace(1) %gep1, align 8
@@ -37,9 +27,8 @@ define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
; GFX942_PTRADD: ; %bb.0:
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[2:3], v[2:3], 0, 24
; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off
+; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
;
@@ -60,69 +49,36 @@ define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
; would be folded away in most cases, but the index computation introduced by
; the legalization of wide vector stores can for example introduce them.
define amdgpu_kernel void @store_v16i32(ptr addrspace(1) %out, <16 x i32> %a) {
-; GFX942_PTRADD-LABEL: store_v16i32:
-; GFX942_PTRADD: ; %bb.0: ; %entry
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v4, 0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_add_u32 s2, s0, 32
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s20
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s21
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s22
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s23
-; GFX942_PTRADD-NEXT: s_addc_u32 s3, s1, 0
-; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[2:3] offset:16
-; GFX942_PTRADD-NEXT: s_nop 1
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s16
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s17
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s18
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s19
-; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX942_PTRADD-NEXT: s_nop 1
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s12
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s13
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s14
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s15
-; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942_PTRADD-NEXT: s_nop 1
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, s8
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s9
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, s10
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v3, s11
-; GFX942_PTRADD-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: store_v16i32:
-; GFX942_LEGACY: ; %bb.0: ; %entry
-; GFX942_LEGACY-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v4, 0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s20
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s21
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s22
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s23
-; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
-; GFX942_LEGACY-NEXT: s_nop 1
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s16
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s17
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s18
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s19
-; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
-; GFX942_LEGACY-NEXT: s_nop 1
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s12
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s13
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s14
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s15
-; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
-; GFX942_LEGACY-NEXT: s_nop 1
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, s8
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s9
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, s10
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v3, s11
-; GFX942_LEGACY-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: store_v16i32:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x40
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: v_mov_b32_e32 v4, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v0, s20
+; GFX942-NEXT: v_mov_b32_e32 v1, s21
+; GFX942-NEXT: v_mov_b32_e32 v2, s22
+; GFX942-NEXT: v_mov_b32_e32 v3, s23
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:48
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v0, s16
+; GFX942-NEXT: v_mov_b32_e32 v1, s17
+; GFX942-NEXT: v_mov_b32_e32 v2, s18
+; GFX942-NEXT: v_mov_b32_e32 v3, s19
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:32
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v0, s12
+; GFX942-NEXT: v_mov_b32_e32 v1, s13
+; GFX942-NEXT: v_mov_b32_e32 v2, s14
+; GFX942-NEXT: v_mov_b32_e32 v3, s15
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1] offset:16
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mov_b32_e32 v0, s8
+; GFX942-NEXT: v_mov_b32_e32 v1, s9
+; GFX942-NEXT: v_mov_b32_e32 v2, s10
+; GFX942-NEXT: v_mov_b32_e32 v3, s11
+; GFX942-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
store <16 x i32> %a, ptr addrspace(1) %out
ret void
@@ -131,20 +87,12 @@ entry:
; Tests the (ptradd 0, x) -> x DAG combine.
define void @baseptr_null(i64 %offset, i8 %v) {
-; GFX942_PTRADD-LABEL: baseptr_null:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], 0, 0, v[0:1]
-; GFX942_PTRADD-NEXT: flat_store_byte v[0:1], v2
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: baseptr_null:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: flat_store_byte v[0:1], v2
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: baseptr_null:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: flat_store_byte v[0:1], v2
+; GFX942-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i8, ptr null, i64 %offset
store i8 %v, ptr %gep, align 1
ret void
@@ -153,40 +101,21 @@ define void @baseptr_null(i64 %offset, i8 %v) {
; Taken from implicit-kernarg-backend-usage.ll, tests the PTRADD handling in the
; assertalign DAG combine.
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) #0 {
-; GFX942_PTRADD-LABEL: llvm_amdgcn_queue_ptr:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_add_u32 s8, s4, 8
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v2, 0
-; GFX942_PTRADD-NEXT: s_addc_u32 s9, s5, 0
-; GFX942_PTRADD-NEXT: global_load_ubyte v0, v2, s[2:3] sc0 sc1
-; GFX942_PTRADD-NEXT: global_load_ubyte v0, v2, s[8:9] sc0 sc1
-; GFX942_PTRADD-NEXT: global_load_ubyte v0, v2, s[0:1] sc0 sc1
-; GFX942_PTRADD-NEXT: ; kill: killed $sgpr0_sgpr1
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
-; GFX942_PTRADD-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942_PTRADD-NEXT: ; kill: killed $sgpr8 killed $sgpr9
-; GFX942_PTRADD-NEXT: ; kill: killed $sgpr2_sgpr3
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: llvm_amdgcn_queue_ptr:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v2, 0
-; GFX942_LEGACY-NEXT: global_load_ubyte v0, v2, s[2:3] sc0 sc1
-; GFX942_LEGACY-NEXT: global_load_ubyte v0, v2, s[4:5] offset:8 sc0 sc1
-; GFX942_LEGACY-NEXT: global_load_ubyte v0, v2, s[0:1] sc0 sc1
-; GFX942_LEGACY-NEXT: ; kill: killed $sgpr0_sgpr1
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
-; GFX942_LEGACY-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
-; GFX942_LEGACY-NEXT: ; kill: killed $sgpr2_sgpr3
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: llvm_amdgcn_queue_ptr:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: global_load_ubyte v0, v2, s[2:3] sc0 sc1
+; GFX942-NEXT: global_load_ubyte v0, v2, s[4:5] offset:8 sc0 sc1
+; GFX942-NEXT: global_load_ubyte v0, v2, s[0:1] sc0 sc1
+; GFX942-NEXT: ; kill: killed $sgpr0_sgpr1
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942-NEXT: ; kill: killed $sgpr2_sgpr3
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_endpgm
%queue.ptr = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
%implicitarg.ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%dispatch.ptr = call ptr a...
[truncated]
I don't know if it's important for CHERI to have this or if the IR-level optimisations render it not so needed. But
I agree that there can be valid uses for this transformation, but I think it would make sense to make this target-specific for backends like CHERI that want it. When we make progress with sentinel pointer values in the DataLayout that properly describe non-zero NULL pointers (see #131557), we could enable reassociation for the sentinel pointer.
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm/lib/Target/AMDGPU/SIISelLowering.cpp llvm/lib/Target/AMDGPU/SIISelLowering.h llvm/test/CodeGen/AArch64/cpa-selectiondag.ll llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll

The following files introduce new uses of undef:

Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. In tests, avoid using undef. For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.
LGTM if @jrtc27 is also happy with this version.
// fold (ptradd 0, x) -> x
if (isNullConstant(N0) && PtrVT == IntVT)
  return N1;
Suggested change:
 // fold (ptradd 0, x) -> x
-if (isNullConstant(N0) && PtrVT == IntVT)
+if (PtrVT == IntVT && isNullConstant(N0))
 return N1;
But PtrVT == IntVT was already asserted above?
Yes, @arichardson argued for adding it redundantly here in a thread below, to highlight this constraint more explicitly for when the assertion might be relaxed in the future.
I've applied the suggested change for now.
@arsenm, do you think we should remove the redundant PtrVT == IntVT part here again? That would be fine by me, and @arichardson already said above that he doesn't feel strongly about this and is happy with either.
This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one that is closely connected.

The generic DAG combine is based on a part of PR #105669 by @rgwott, which was adapted from work by @jrtc27, @arichardson, @davidchisnall in the CHERI/Morello LLVM tree. I added some parts and removed several disjuncts from the reassociation condition:
- `isNullConstant(X)`, since there are address spaces where 0 is a perfectly normal value that shouldn't be treated specially,
- `(YIsConstant && ZOneUse)` and `(N0OneUse && ZOneUse && !ZIsConstant)`, since they cause regressions in AMDGPU.

For SWDEV-516125.
Those are folded earlier already.
…ment referring to the target-specific reassociation there, and remove a currently unused variable in the target-specific combine.
This patch focuses on generic DAG combines, plus an AMDGPU-target-specific one that is closely connected.

The generic DAG combine is based on a part of PR #105669 by rgwott, which was adapted from work by jrtc27, arichardson, davidchisnall in the CHERI/Morello LLVM tree. I added some parts and removed several disjuncts from the reassociation condition:
- isNullConstant(X), since there are address spaces where 0 is a perfectly normal value that shouldn't be treated specially,
- (YIsConstant && ZOneUse) and (N0OneUse && ZOneUse && !ZIsConstant), since they cause regressions in AMDGPU.

For SWDEV-516125.