Skip to content

[AMDGPU][SDAG] Add ISD::PTRADD DAG combines #142739

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 84 additions & 1 deletion llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,7 @@ namespace {
SDValue visitADDLike(SDNode *N);
SDValue visitADDLikeCommutative(SDValue N0, SDValue N1,
SDNode *LocReference);
SDValue visitPTRADD(SDNode *N);
SDValue visitSUB(SDNode *N);
SDValue visitADDSAT(SDNode *N);
SDValue visitSUBSAT(SDNode *N);
Expand Down Expand Up @@ -1140,7 +1141,7 @@ bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
return true;
}

if (Opc != ISD::ADD)
if (Opc != ISD::ADD && Opc != ISD::PTRADD)
return false;

auto *C2 = dyn_cast<ConstantSDNode>(N1);
Expand Down Expand Up @@ -1860,6 +1861,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::TokenFactor: return visitTokenFactor(N);
case ISD::MERGE_VALUES: return visitMERGE_VALUES(N);
case ISD::ADD: return visitADD(N);
case ISD::PTRADD: return visitPTRADD(N);
case ISD::SUB: return visitSUB(N);
case ISD::SADDSAT:
case ISD::UADDSAT: return visitADDSAT(N);
Expand Down Expand Up @@ -2630,6 +2632,86 @@ SDValue DAGCombiner::foldSubToAvg(SDNode *N, const SDLoc &DL) {
return SDValue();
}

/// Combine an ISD::PTRADD node.
///
/// Pointer addition is handled apart from ordinary integer addition because
/// it is not commutative: operand 0 is the base pointer and operand 1 is the
/// offset.
///
/// \param N the PTRADD node to simplify.
/// \returns the replacement value, or an empty SDValue if no fold applies.
SDValue DAGCombiner::visitPTRADD(SDNode *N) {
  SDLoc DL(N);
  SDValue Base = N->getOperand(0);
  SDValue Offset = N->getOperand(1);
  EVT PtrVT = Base.getValueType();
  EVT IntVT = Offset.getValueType();

  // SelectionDAG::getNode() already asserts this; several of the combines
  // below rely on both operands having the same type.
  assert(PtrVT == IntVT &&
         "PTRADD with different operand types is not supported");

  // (ptradd x, 0) -> x
  if (isNullConstant(Offset))
    return Base;

  // (ptradd 0, x) -> x
  if (PtrVT == IntVT && isNullConstant(Base))
    return Offset;

  // Everything below reassociates a chain of two PTRADDs.
  if (Base.getOpcode() != ISD::PTRADD)
    return SDValue();
  if (reassociationCanBreakAddressingModePattern(ISD::PTRADD, DL, N, Base,
                                                 Offset))
    return SDValue();

  SDValue X = Base.getOperand(0);
  SDValue Y = Base.getOperand(1);
  SDValue Z = Offset;
  bool InnerHasOneUse = Base.hasOneUse();
  bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
  bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);

  // (ptradd (ptradd x, y), z) -> (ptradd x, (add y, z)) requires y to be a
  // constant and, in addition, either
  //   * (ptradd x, y) to have a single use, or
  //   * z to be a constant as well.
  bool CanMergeOffsets = YIsConstant && (InnerHasOneUse || ZIsConstant);
  if (!CanMergeOffsets) {
    // TODO: Another fold has proven useful at this point:
    //
    //   (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y) if:
    //     * (ptradd x, y) has one use; and
    //     * y is a constant; and
    //     * z is not a constant.
    //
    // In some cases, notably with AArch64's FEAT_CPA, it makes it possible to
    // select more complex instructions such as SUBPT and MSUBPT. However, a
    // hypothetical corner case was found that could not be avoided. Consider
    // this (pseudo-POSIX C):
    //
    //   char *foo(char *x, int z) {return (x + LARGE_CONSTANT) + z;}
    //   char *p = mmap(LARGE_CONSTANT);
    //   char *q = foo(p, -LARGE_CONSTANT);
    //
    // Here x + LARGE_CONSTANT is one-past-the-end and therefore valid, and
    // the following + z moves it back to the start of the mapping, which is
    // also valid, whatever address mmap returned. But if mmap returns an
    // address < LARGE_CONSTANT (ignoring high bits), x - LARGE_CONSTANT
    // borrows from the high bits (the subsequent + z would carry back into
    // them and yield a well-defined pointer), which trips FEAT_CPA's pointer
    // corruption checks.
    //
    // The fold is left as future work: the FEAT_CPA corner case has to be
    // addressed and the solution reconciled with the more general use of
    // pointer arithmetic in other future targets. Until then, every
    // architecture that wants this fold has to implement it in its
    // target-specific code (see e.g. SITargetLowering::performPtrAddCombine).
    return SDValue();
  }

  // The new nodes are NUW only if both original additions were NUW.
  SDNodeFlags Flags =
      (N->getFlags() & Base->getFlags()) & SDNodeFlags::NoUnsignedWrap;
  SDValue MergedOffset = DAG.getNode(ISD::ADD, DL, IntVT, {Y, Z}, Flags);
  AddToWorklist(MergedOffset.getNode());
  return DAG.getMemBasePlusOffset(X, MergedOffset, DL, Flags);
}

/// Try to fold a 'not' shifted sign-bit with add/sub with constant operand into
/// a shift and add with a different constant.
static SDValue foldAddSubOfSignBit(SDNode *N, const SDLoc &DL,
Expand Down Expand Up @@ -15061,6 +15143,7 @@ SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
default:
break;
case ISD::ADD:
case ISD::PTRADD:
case ISD::SUB: {
unsigned AlignShift = Log2(AL);
SDValue LHS = N0.getOperand(0);
Expand Down
46 changes: 46 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -945,6 +945,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}

setTargetDAGCombine({ISD::ADD,
ISD::PTRADD,
ISD::UADDO_CARRY,
ISD::SUB,
ISD::USUBO_CARRY,
Expand Down Expand Up @@ -15095,6 +15096,49 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
return SDValue();
}

/// Target combine for ISD::PTRADD whose offset operand is an ISD::ADD.
///
/// Rewrites
///   (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z)  if only z is constant
///   (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y)  if only y is constant
/// provided (add y, z) has a single use. The aim is to move constant offsets
/// to the outermost ptradd, which creates more opportunities to fold offsets
/// into memory instructions. Combined with the generic folds in
/// DAGCombiner.cpp this also implements
///   (ptradd (ptradd x, y), z) -> (ptradd (ptradd x, z), y).
///
/// The transform is kept here rather than in the generic DAGCombiner because
/// it can turn in-bounds pointer arithmetic out-of-bounds, which is
/// problematic for AArch64's CPA.
///
/// \param N the PTRADD node to combine.
/// \param DCI combiner state providing the DAG and the worklist.
/// \returns the replacement value, or an empty SDValue if nothing changed.
SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);
  SDValue Base = N->getOperand(0);
  SDValue Offset = N->getOperand(1);

  // Only an offset computed by a single-use ADD is of interest.
  if (Offset.getOpcode() != ISD::ADD)
    return SDValue();

  SDValue Y = Offset.getOperand(0);
  SDValue Z = Offset.getOperand(1);
  if (!Offset.hasOneUse())
    return SDValue();

  // Exactly one of the two addends has to be a constant.
  bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
  bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
  if (YIsConstant == ZIsConstant)
    return SDValue();

  // The new nodes are NUW only if both original additions were NUW.
  SDNodeFlags Flags =
      (N->getFlags() & Offset->getFlags()) & SDNodeFlags::NoUnsignedWrap;

  // The variable addend goes into the inner ptradd, the constant one into the
  // outer ptradd.
  SDValue VariablePart = YIsConstant ? Z : Y;
  SDValue ConstantPart = YIsConstant ? Y : Z;

  SDValue Inner = DAG.getMemBasePlusOffset(Base, VariablePart, DL, Flags);
  DCI.AddToWorklist(Inner.getNode());
  return DAG.getMemBasePlusOffset(Inner, ConstantPart, DL, Flags);
}

SDValue SITargetLowering::performSubCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
Expand Down Expand Up @@ -15633,6 +15677,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
case ISD::ADD:
return performAddCombine(N, DCI);
case ISD::PTRADD:
return performPtrAddCombine(N, DCI);
case ISD::SUB:
return performSubCombine(N, DCI);
case ISD::UADDO_CARRY:
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
DAGCombinerInfo &DCI) const;

SDValue performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performPtrAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAddCarrySubCarryCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
Expand Down
107 changes: 46 additions & 61 deletions llvm/test/CodeGen/AArch64/cpa-selectiondag.ll
Original file line number Diff line number Diff line change
Expand Up @@ -119,23 +119,17 @@ define void @msubpt1(i32 %index, i32 %elem) {
; CHECK-CPA-O0: // %bb.0: // %entry
; CHECK-CPA-O0-NEXT: // implicit-def: $x8
; CHECK-CPA-O0-NEXT: mov w8, w0
; CHECK-CPA-O0-NEXT: sxtw x9, w8
; CHECK-CPA-O0-NEXT: mov x8, xzr
; CHECK-CPA-O0-NEXT: subs x8, x8, x9
; CHECK-CPA-O0-NEXT: lsl x8, x8, #1
; CHECK-CPA-O0-NEXT: subs x10, x8, x9
; CHECK-CPA-O0-NEXT: sxtw x8, w8
; CHECK-CPA-O0-NEXT: mov w9, #48 // =0x30
; CHECK-CPA-O0-NEXT: // kill: def $x9 killed $w9
; CHECK-CPA-O0-NEXT: mneg x8, x8, x9
; CHECK-CPA-O0-NEXT: add x8, x8, #288
; CHECK-CPA-O0-NEXT: adrp x9, array2
; CHECK-CPA-O0-NEXT: add x9, x9, :lo12:array2
; CHECK-CPA-O0-NEXT: mov w8, #288 // =0x120
; CHECK-CPA-O0-NEXT: // kill: def $x8 killed $w8
; CHECK-CPA-O0-NEXT: addpt x8, x9, x8
; CHECK-CPA-O0-NEXT: addpt x8, x8, x10, lsl #4
; CHECK-CPA-O0-NEXT: mov w10, #96 // =0x60
; CHECK-CPA-O0-NEXT: // kill: def $x10 killed $w10
; CHECK-CPA-O0-NEXT: addpt x10, x9, x10
; CHECK-CPA-O0-NEXT: ldr q1, [x10, #16]
; CHECK-CPA-O0-NEXT: ldr q2, [x10, #32]
; CHECK-CPA-O0-NEXT: ldr q0, [x9, #96]
; CHECK-CPA-O0-NEXT: ldr q1, [x9, #112]
; CHECK-CPA-O0-NEXT: ldr q2, [x9, #128]
; CHECK-CPA-O0-NEXT: str q2, [x8, #32]
; CHECK-CPA-O0-NEXT: str q1, [x8, #16]
; CHECK-CPA-O0-NEXT: str q0, [x8]
Expand All @@ -144,21 +138,17 @@ define void @msubpt1(i32 %index, i32 %elem) {
; CHECK-CPA-O3-LABEL: msubpt1:
; CHECK-CPA-O3: // %bb.0: // %entry
; CHECK-CPA-O3-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-CPA-O3-NEXT: sxtw x9, w0
; CHECK-CPA-O3-NEXT: adrp x8, array2
; CHECK-CPA-O3-NEXT: add x8, x8, :lo12:array2
; CHECK-CPA-O3-NEXT: mov w11, #96 // =0x60
; CHECK-CPA-O3-NEXT: mov w12, #288 // =0x120
; CHECK-CPA-O3-NEXT: ldr q2, [x8, #96]
; CHECK-CPA-O3-NEXT: neg x10, x9
; CHECK-CPA-O3-NEXT: addpt x11, x8, x11
; CHECK-CPA-O3-NEXT: lsl x10, x10, #1
; CHECK-CPA-O3-NEXT: ldp q1, q0, [x11, #16]
; CHECK-CPA-O3-NEXT: sub x9, x10, x9
; CHECK-CPA-O3-NEXT: addpt x10, x8, x12
; CHECK-CPA-O3-NEXT: addpt x9, x10, x9, lsl #4
; CHECK-CPA-O3-NEXT: stp q1, q0, [x9, #16]
; CHECK-CPA-O3-NEXT: str q2, [x9]
; CHECK-CPA-O3-NEXT: sxtw x8, w0
; CHECK-CPA-O3-NEXT: mov w9, #48 // =0x30
; CHECK-CPA-O3-NEXT: mneg x8, x8, x9
; CHECK-CPA-O3-NEXT: adrp x9, array2
; CHECK-CPA-O3-NEXT: add x9, x9, :lo12:array2
; CHECK-CPA-O3-NEXT: ldp q1, q0, [x9, #112]
; CHECK-CPA-O3-NEXT: ldr q2, [x9, #96]
; CHECK-CPA-O3-NEXT: add x8, x8, #288
; CHECK-CPA-O3-NEXT: addpt x8, x9, x8
; CHECK-CPA-O3-NEXT: stp q1, q0, [x8, #16]
; CHECK-CPA-O3-NEXT: str q2, [x8]
; CHECK-CPA-O3-NEXT: ret
;
; CHECK-NOCPA-O0-LABEL: msubpt1:
Expand Down Expand Up @@ -205,29 +195,29 @@ entry:
define void @subpt1(i32 %index, i32 %elem) {
; CHECK-CPA-O0-LABEL: subpt1:
; CHECK-CPA-O0: // %bb.0: // %entry
; CHECK-CPA-O0-NEXT: adrp x9, array
; CHECK-CPA-O0-NEXT: add x9, x9, :lo12:array
; CHECK-CPA-O0-NEXT: // implicit-def: $x8
; CHECK-CPA-O0-NEXT: mov w8, w0
; CHECK-CPA-O0-NEXT: sxtw x9, w8
; CHECK-CPA-O0-NEXT: mov w8, #96 // =0x60
; CHECK-CPA-O0-NEXT: // kill: def $x8 killed $w8
; CHECK-CPA-O0-NEXT: subs x8, x8, x9, lsl #8
; CHECK-CPA-O0-NEXT: adrp x9, array
; CHECK-CPA-O0-NEXT: add x9, x9, :lo12:array
; CHECK-CPA-O0-NEXT: addpt x8, x9, x8
; CHECK-CPA-O0-NEXT: // implicit-def: $x10
; CHECK-CPA-O0-NEXT: mov w10, w0
; CHECK-CPA-O0-NEXT: sbfiz x10, x10, #8, #32
; CHECK-CPA-O0-NEXT: subpt x8, x8, x10
; CHECK-CPA-O0-NEXT: ldr q0, [x9, #32]
; CHECK-CPA-O0-NEXT: str q0, [x8]
; CHECK-CPA-O0-NEXT: ret
;
; CHECK-CPA-O3-LABEL: subpt1:
; CHECK-CPA-O3: // %bb.0: // %entry
; CHECK-CPA-O3-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-CPA-O3-NEXT: adrp x8, array
; CHECK-CPA-O3-NEXT: add x8, x8, :lo12:array
; CHECK-CPA-O3-NEXT: sxtw x8, w0
; CHECK-CPA-O3-NEXT: mov w9, #96 // =0x60
; CHECK-CPA-O3-NEXT: sbfiz x10, x0, #8, #32
; CHECK-CPA-O3-NEXT: addpt x9, x8, x9
; CHECK-CPA-O3-NEXT: ldr q0, [x8, #32]
; CHECK-CPA-O3-NEXT: subpt x8, x9, x10
; CHECK-CPA-O3-NEXT: sub x8, x9, x8, lsl #8
; CHECK-CPA-O3-NEXT: adrp x9, array
; CHECK-CPA-O3-NEXT: add x9, x9, :lo12:array
; CHECK-CPA-O3-NEXT: ldr q0, [x9, #32]
; CHECK-CPA-O3-NEXT: addpt x8, x9, x8
; CHECK-CPA-O3-NEXT: str q0, [x8]
; CHECK-CPA-O3-NEXT: ret
;
Expand Down Expand Up @@ -264,28 +254,24 @@ entry:
define void @subpt2(i32 %index, i32 %elem) {
; CHECK-CPA-O0-LABEL: subpt2:
; CHECK-CPA-O0: // %bb.0: // %entry
; CHECK-CPA-O0-NEXT: mov x8, xzr
; CHECK-CPA-O0-NEXT: subs x10, x8, w0, sxtw
; CHECK-CPA-O0-NEXT: adrp x9, array
; CHECK-CPA-O0-NEXT: add x9, x9, :lo12:array
; CHECK-CPA-O0-NEXT: mov w8, #96 // =0x60
; CHECK-CPA-O0-NEXT: // kill: def $x8 killed $w8
; CHECK-CPA-O0-NEXT: subs x8, x8, w0, sxtw #4
; CHECK-CPA-O0-NEXT: adrp x9, array
; CHECK-CPA-O0-NEXT: add x9, x9, :lo12:array
; CHECK-CPA-O0-NEXT: addpt x8, x9, x8
; CHECK-CPA-O0-NEXT: addpt x8, x8, x10, lsl #4
; CHECK-CPA-O0-NEXT: ldr q0, [x9, #32]
; CHECK-CPA-O0-NEXT: str q0, [x8]
; CHECK-CPA-O0-NEXT: ret
;
; CHECK-CPA-O3-LABEL: subpt2:
; CHECK-CPA-O3: // %bb.0: // %entry
; CHECK-CPA-O3-NEXT: mov x8, xzr
; CHECK-CPA-O3-NEXT: mov w9, #96 // =0x60
; CHECK-CPA-O3-NEXT: adrp x10, array
; CHECK-CPA-O3-NEXT: add x10, x10, :lo12:array
; CHECK-CPA-O3-NEXT: sub x8, x8, w0, sxtw
; CHECK-CPA-O3-NEXT: addpt x9, x10, x9
; CHECK-CPA-O3-NEXT: ldr q0, [x10, #32]
; CHECK-CPA-O3-NEXT: addpt x8, x9, x8, lsl #4
; CHECK-CPA-O3-NEXT: mov w8, #96 // =0x60
; CHECK-CPA-O3-NEXT: adrp x9, array
; CHECK-CPA-O3-NEXT: add x9, x9, :lo12:array
; CHECK-CPA-O3-NEXT: sub x8, x8, w0, sxtw #4
; CHECK-CPA-O3-NEXT: ldr q0, [x9, #32]
; CHECK-CPA-O3-NEXT: addpt x8, x9, x8
; CHECK-CPA-O3-NEXT: str q0, [x8]
; CHECK-CPA-O3-NEXT: ret
;
Expand Down Expand Up @@ -670,14 +656,13 @@ define hidden void @multidim() {
; CHECK-CPA-O0-NEXT: .cfi_offset w30, -16
; CHECK-CPA-O0-NEXT: adrp x8, b
; CHECK-CPA-O0-NEXT: ldrh w9, [x8, :lo12:b]
; CHECK-CPA-O0-NEXT: // implicit-def: $x8
; CHECK-CPA-O0-NEXT: mov w8, w9
; CHECK-CPA-O0-NEXT: mov w10, w8
; CHECK-CPA-O0-NEXT: ubfiz x8, x8, #1, #32
; CHECK-CPA-O0-NEXT: add x10, x8, #2
; CHECK-CPA-O0-NEXT: adrp x8, a
; CHECK-CPA-O0-NEXT: add x8, x8, :lo12:a
; CHECK-CPA-O0-NEXT: mov w11, #2 // =0x2
; CHECK-CPA-O0-NEXT: // kill: def $x11 killed $w11
; CHECK-CPA-O0-NEXT: addpt x8, x8, x11
; CHECK-CPA-O0-NEXT: addpt x8, x8, x10, lsl #1
; CHECK-CPA-O0-NEXT: addpt x8, x8, x10
; CHECK-CPA-O0-NEXT: add w9, w9, #1
; CHECK-CPA-O0-NEXT: mov w9, w9
; CHECK-CPA-O0-NEXT: // kill: def $x9 killed $w9
Expand All @@ -697,13 +682,13 @@ define hidden void @multidim() {
; CHECK-CPA-O3-LABEL: multidim:
; CHECK-CPA-O3: // %bb.0: // %entry
; CHECK-CPA-O3-NEXT: adrp x8, b
; CHECK-CPA-O3-NEXT: mov w9, #2 // =0x2
; CHECK-CPA-O3-NEXT: adrp x10, a
; CHECK-CPA-O3-NEXT: add x10, x10, :lo12:a
; CHECK-CPA-O3-NEXT: ldrh w8, [x8, :lo12:b]
; CHECK-CPA-O3-NEXT: addpt x9, x10, x9
; CHECK-CPA-O3-NEXT: addpt x9, x9, x8, lsl #1
; CHECK-CPA-O3-NEXT: lsl x9, x8, #1
; CHECK-CPA-O3-NEXT: add x8, x8, #1
; CHECK-CPA-O3-NEXT: add x9, x9, #2
; CHECK-CPA-O3-NEXT: addpt x9, x10, x9
; CHECK-CPA-O3-NEXT: addpt x8, x9, x8
; CHECK-CPA-O3-NEXT: ldrb w8, [x8]
; CHECK-CPA-O3-NEXT: cbz w8, .LBB14_2
Expand Down
Loading
Loading