Skip to content

Commit 222adf3

Browse files
committed
[AArch64][SelectionDAG] Add target-specific implementation of srem
1. Expanding X%C to the equivalent X-X/C*C is not always the fastest path when no matching SDIV node exists, so first check whether the target has a faster lowering for SREM alone. 2. Add a faster AArch64 path for SREM when the divisor is a power of 2. Fix llvm#54649 Reviewed By: efriedma Differential Revision: https://reviews.llvm.org/D122968
1 parent b968c59 commit 222adf3

File tree

8 files changed

+187
-83
lines changed

8 files changed

+187
-83
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4479,6 +4479,14 @@ class TargetLowering : public TargetLoweringBase {
44794479
SelectionDAG &DAG,
44804480
SmallVectorImpl<SDNode *> &Created) const;
44814481

4482+
/// Targets may override this function to provide custom SREM lowering for
4483+
/// power-of-2 denominators. If the target returns an empty SDValue, LLVM
4484+
/// assumes SREM is expensive and replaces it with a series of other integer
4485+
/// operations.
4486+
virtual SDValue BuildSREMPow2(SDNode *N, const APInt &Divisor,
4487+
SelectionDAG &DAG,
4488+
SmallVectorImpl<SDNode *> &Created) const;
4489+
44824490
/// Indicate whether this target prefers to combine FDIVs with the same
44834491
/// divisor. If the transform should never be done, return zero. If the
44844492
/// transform should be done, return the minimum number of divisor uses

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 50 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -569,6 +569,8 @@ namespace {
569569
SDValue BuildSDIV(SDNode *N);
570570
SDValue BuildSDIVPow2(SDNode *N);
571571
SDValue BuildUDIV(SDNode *N);
572+
SDValue BuildSREMPow2(SDNode *N);
573+
SDValue buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N);
572574
SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
573575
SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags);
574576
SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
@@ -4320,12 +4322,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
43204322
return SDValue();
43214323
}
43224324

4323-
SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
4324-
SDLoc DL(N);
4325-
EVT VT = N->getValueType(0);
4326-
EVT CCVT = getSetCCResultType(VT);
4327-
unsigned BitWidth = VT.getScalarSizeInBits();
4328-
4325+
static bool isDivisorPowerOfTwo(SDValue Divisor) {
43294326
// Helper for determining whether a value is a power-2 constant scalar or a
43304327
// vector of such elements.
43314328
auto IsPowerOfTwo = [](ConstantSDNode *C) {
@@ -4338,11 +4335,20 @@ SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
43384335
return false;
43394336
};
43404337

4338+
return ISD::matchUnaryPredicate(Divisor, IsPowerOfTwo);
4339+
}
4340+
4341+
SDValue DAGCombiner::visitSDIVLike(SDValue N0, SDValue N1, SDNode *N) {
4342+
SDLoc DL(N);
4343+
EVT VT = N->getValueType(0);
4344+
EVT CCVT = getSetCCResultType(VT);
4345+
unsigned BitWidth = VT.getScalarSizeInBits();
4346+
43414347
// fold (sdiv X, pow2) -> simple ops after legalize
43424348
// FIXME: We check for the exact bit here because the generic lowering gives
43434349
// better results in that case. The target-specific lowering should learn how
43444350
// to handle exact sdivs efficiently.
4345-
if (!N->getFlags().hasExact() && ISD::matchUnaryPredicate(N1, IsPowerOfTwo)) {
4351+
if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1)) {
43464352
// Target-specific implementation of sdiv x, pow2.
43474353
if (SDValue Res = BuildSDIVPow2(N))
43484354
return Res;
@@ -4498,6 +4504,16 @@ SDValue DAGCombiner::visitUDIVLike(SDValue N0, SDValue N1, SDNode *N) {
44984504
return SDValue();
44994505
}
45004506

4507+
SDValue DAGCombiner::buildOptimizedSREM(SDValue N0, SDValue N1, SDNode *N) {
4508+
if (!N->getFlags().hasExact() && isDivisorPowerOfTwo(N1) &&
4509+
!DAG.doesNodeExist(ISD::SDIV, N->getVTList(), {N0, N1})) {
4510+
// Target-specific implementation of srem x, pow2.
4511+
if (SDValue Res = BuildSREMPow2(N))
4512+
return Res;
4513+
}
4514+
return SDValue();
4515+
}
4516+
45014517
// handles ISD::SREM and ISD::UREM
45024518
SDValue DAGCombiner::visitREM(SDNode *N) {
45034519
unsigned Opcode = N->getOpcode();
@@ -4558,6 +4574,12 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
45584574
// combine will not return a DIVREM. Regardless, checking cheapness here
45594575
// makes sense since the simplification results in fatter code.
45604576
if (DAG.isKnownNeverZero(N1) && !TLI.isIntDivCheap(VT, Attr)) {
4577+
if (isSigned) {
4578+
// check if we can build faster implementation for srem
4579+
SDValue OptimizedRem = buildOptimizedSREM(N0, N1, N);
4580+
if (OptimizedRem.getNode())
4581+
return OptimizedRem;
4582+
}
45614583
SDValue OptimizedDiv =
45624584
isSigned ? visitSDIVLike(N0, N1, N) : visitUDIVLike(N0, N1, N);
45634585
if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != N) {
@@ -23876,6 +23898,27 @@ SDValue DAGCombiner::BuildUDIV(SDNode *N) {
2387623898
return SDValue();
2387723899
}
2387823900

23901+
/// Given an ISD::SREM node expressing a remainder by constant power of 2,
23902+
/// return a DAG expression that will generate the same value.
23903+
SDValue DAGCombiner::BuildSREMPow2(SDNode *N) {
23904+
ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
23905+
if (!C)
23906+
return SDValue();
23907+
23908+
// Avoid division by zero.
23909+
if (C->isZero())
23910+
return SDValue();
23911+
23912+
SmallVector<SDNode *, 8> Built;
23913+
if (SDValue S = TLI.BuildSREMPow2(N, C->getAPIntValue(), DAG, Built)) {
23914+
for (SDNode *N : Built)
23915+
AddToWorklist(N);
23916+
return S;
23917+
}
23918+
23919+
return SDValue();
23920+
}
23921+
2387923922
/// Determines the LogBase2 value for a non-null input value using the
2388023923
/// transform: LogBase2(V) = (EltBits - 1) - ctlz(V).
2388123924
SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5560,6 +5560,17 @@ SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
55605560
return SDValue();
55615561
}
55625562

5563+
SDValue
5564+
TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
5565+
SelectionDAG &DAG,
5566+
SmallVectorImpl<SDNode *> &Created) const {
5567+
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
5568+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5569+
if (TLI.isIntDivCheap(N->getValueType(0), Attr))
5570+
return SDValue(N, 0); // Lower SREM as SREM
5571+
return SDValue();
5572+
}
5573+
55635574
/// Given an ISD::SDIV node expressing a divide by constant,
55645575
/// return a DAG expression to select that will generate the same value by
55655576
/// multiplying by a magic number.

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13567,6 +13567,60 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
1356713567
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
1356813568
}
1356913569

13570+
SDValue
13571+
AArch64TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
13572+
SelectionDAG &DAG,
13573+
SmallVectorImpl<SDNode *> &Created) const {
13574+
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
13575+
if (isIntDivCheap(N->getValueType(0), Attr))
13576+
return SDValue(N, 0); // Lower SREM as SREM
13577+
13578+
EVT VT = N->getValueType(0);
13579+
13580+
// For scalable and fixed types, mark them as cheap so we can handle it much
13581+
// later. This allows us to handle larger than legal types.
13582+
if (VT.isScalableVector() || Subtarget->useSVEForFixedLengthVectors())
13583+
return SDValue(N, 0);
13584+
13585+
// fold (srem X, pow2)
13586+
if ((VT != MVT::i32 && VT != MVT::i64) ||
13587+
!(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
13588+
return SDValue();
13589+
13590+
unsigned Lg2 = Divisor.countTrailingZeros();
13591+
if (Lg2 == 0)
13592+
return SDValue();
13593+
13594+
SDLoc DL(N);
13595+
SDValue N0 = N->getOperand(0);
13596+
SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
13597+
SDValue Zero = DAG.getConstant(0, DL, VT);
13598+
SDValue CCVal, CSNeg;
13599+
if (Lg2 == 1) {
13600+
SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETGE, CCVal, DAG, DL);
13601+
SDValue And = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
13602+
CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, And, And, CCVal, Cmp);
13603+
13604+
Created.push_back(Cmp.getNode());
13605+
Created.push_back(And.getNode());
13606+
} else {
13607+
SDValue CCVal = DAG.getConstant(AArch64CC::MI, DL, MVT_CC);
13608+
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
13609+
13610+
SDValue Negs = DAG.getNode(AArch64ISD::SUBS, DL, VTs, Zero, N0);
13611+
SDValue AndPos = DAG.getNode(ISD::AND, DL, VT, N0, Pow2MinusOne);
13612+
SDValue AndNeg = DAG.getNode(ISD::AND, DL, VT, Negs, Pow2MinusOne);
13613+
CSNeg = DAG.getNode(AArch64ISD::CSNEG, DL, VT, AndPos, AndNeg, CCVal,
13614+
Negs.getValue(1));
13615+
13616+
Created.push_back(Negs.getNode());
13617+
Created.push_back(AndPos.getNode());
13618+
Created.push_back(AndNeg.getNode());
13619+
}
13620+
13621+
return CSNeg;
13622+
}
13623+
1357013624
static bool IsSVECntIntrinsic(SDValue S) {
1357113625
switch(getIntrinsicID(S.getNode())) {
1357213626
default:

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1043,6 +1043,8 @@ class AArch64TargetLowering : public TargetLowering {
10431043

10441044
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
10451045
SmallVectorImpl<SDNode *> &Created) const override;
1046+
SDValue BuildSREMPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
1047+
SmallVectorImpl<SDNode *> &Created) const override;
10461048
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
10471049
int &ExtraSteps, bool &UseOneConst,
10481050
bool Reciprocal) const override;

llvm/test/CodeGen/AArch64/srem-pow2.ll

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,9 @@ define i16 @fold_srem_2_i16(i16 %x) {
4343
define i32 @fold_srem_2_i64(i32 %x) {
4444
; CHECK-LABEL: fold_srem_2_i64:
4545
; CHECK: // %bb.0:
46+
; CHECK-NEXT: and w8, w0, #0x1
4647
; CHECK-NEXT: cmp w0, #0
47-
; CHECK-NEXT: cinc w8, w0, lt
48-
; CHECK-NEXT: and w8, w8, #0xfffffffe
49-
; CHECK-NEXT: sub w0, w0, w8
48+
; CHECK-NEXT: cneg w0, w8, lt
5049
; CHECK-NEXT: ret
5150
%1 = srem i32 %x, 2
5251
ret i32 %1
@@ -55,10 +54,9 @@ define i32 @fold_srem_2_i64(i32 %x) {
5554
define i64 @fold_srem_2_i32(i64 %x) {
5655
; CHECK-LABEL: fold_srem_2_i32:
5756
; CHECK: // %bb.0:
57+
; CHECK-NEXT: and x8, x0, #0x1
5858
; CHECK-NEXT: cmp x0, #0
59-
; CHECK-NEXT: cinc x8, x0, lt
60-
; CHECK-NEXT: and x8, x8, #0xfffffffffffffffe
61-
; CHECK-NEXT: sub x0, x0, x8
59+
; CHECK-NEXT: cneg x0, x8, lt
6260
; CHECK-NEXT: ret
6361
%1 = srem i64 %x, 2
6462
ret i64 %1
@@ -80,11 +78,10 @@ define i16 @fold_srem_pow2_i16(i16 %x) {
8078
define i32 @fold_srem_pow2_i32(i32 %x) {
8179
; CHECK-LABEL: fold_srem_pow2_i32:
8280
; CHECK: // %bb.0:
83-
; CHECK-NEXT: add w8, w0, #63
84-
; CHECK-NEXT: cmp w0, #0
85-
; CHECK-NEXT: csel w8, w8, w0, lt
86-
; CHECK-NEXT: and w8, w8, #0xffffffc0
87-
; CHECK-NEXT: sub w0, w0, w8
81+
; CHECK-NEXT: negs w8, w0
82+
; CHECK-NEXT: and w9, w0, #0x3f
83+
; CHECK-NEXT: and w8, w8, #0x3f
84+
; CHECK-NEXT: csneg w0, w9, w8, mi
8885
; CHECK-NEXT: ret
8986
%1 = srem i32 %x, 64
9087
ret i32 %1
@@ -93,11 +90,10 @@ define i32 @fold_srem_pow2_i32(i32 %x) {
9390
define i64 @fold_srem_pow2_i64(i64 %x) {
9491
; CHECK-LABEL: fold_srem_pow2_i64:
9592
; CHECK: // %bb.0:
96-
; CHECK-NEXT: add x8, x0, #63
97-
; CHECK-NEXT: cmp x0, #0
98-
; CHECK-NEXT: csel x8, x8, x0, lt
99-
; CHECK-NEXT: and x8, x8, #0xffffffffffffffc0
100-
; CHECK-NEXT: sub x0, x0, x8
93+
; CHECK-NEXT: negs x8, x0
94+
; CHECK-NEXT: and x9, x0, #0x3f
95+
; CHECK-NEXT: and x8, x8, #0x3f
96+
; CHECK-NEXT: csneg x0, x9, x8, mi
10197
; CHECK-NEXT: ret
10298
%1 = srem i64 %x, 64
10399
ret i64 %1
@@ -119,12 +115,10 @@ define i16 @fold_srem_smax_i16(i16 %x) {
119115
define i32 @fold_srem_smax_i32(i32 %x) {
120116
; CHECK-LABEL: fold_srem_smax_i32:
121117
; CHECK: // %bb.0:
122-
; CHECK-NEXT: mov w8, #2147483647
123-
; CHECK-NEXT: cmp w0, #0
124-
; CHECK-NEXT: add w8, w0, w8
125-
; CHECK-NEXT: csel w8, w8, w0, lt
126-
; CHECK-NEXT: and w8, w8, #0x80000000
127-
; CHECK-NEXT: add w0, w0, w8
118+
; CHECK-NEXT: negs w8, w0
119+
; CHECK-NEXT: and w9, w0, #0x7fffffff
120+
; CHECK-NEXT: and w8, w8, #0x7fffffff
121+
; CHECK-NEXT: csneg w0, w9, w8, mi
128122
; CHECK-NEXT: ret
129123
%1 = srem i32 %x, 2147483648
130124
ret i32 %1
@@ -133,12 +127,10 @@ define i32 @fold_srem_smax_i32(i32 %x) {
133127
define i64 @fold_srem_smax_i64(i64 %x) {
134128
; CHECK-LABEL: fold_srem_smax_i64:
135129
; CHECK: // %bb.0:
136-
; CHECK-NEXT: mov x8, #9223372036854775807
137-
; CHECK-NEXT: cmp x0, #0
138-
; CHECK-NEXT: add x8, x0, x8
139-
; CHECK-NEXT: csel x8, x8, x0, lt
140-
; CHECK-NEXT: and x8, x8, #0x8000000000000000
141-
; CHECK-NEXT: add x0, x0, x8
130+
; CHECK-NEXT: negs x8, x0
131+
; CHECK-NEXT: and x9, x0, #0x7fffffffffffffff
132+
; CHECK-NEXT: and x8, x8, #0x7fffffffffffffff
133+
; CHECK-NEXT: csneg x0, x9, x8, mi
142134
; CHECK-NEXT: ret
143135
%1 = srem i64 %x, -9223372036854775808
144136
ret i64 %1

llvm/test/CodeGen/AArch64/srem-seteq.ll

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -234,11 +234,11 @@ define i32 @test_srem_one(i32 %X) nounwind {
234234
define i32 @test_srem_pow2(i32 %X) nounwind {
235235
; CHECK-LABEL: test_srem_pow2:
236236
; CHECK: // %bb.0:
237-
; CHECK-NEXT: add w8, w0, #15
238-
; CHECK-NEXT: cmp w0, #0
239-
; CHECK-NEXT: csel w8, w8, w0, lt
240-
; CHECK-NEXT: and w8, w8, #0xfffffff0
241-
; CHECK-NEXT: cmp w0, w8
237+
; CHECK-NEXT: negs w8, w0
238+
; CHECK-NEXT: and w9, w0, #0xf
239+
; CHECK-NEXT: and w8, w8, #0xf
240+
; CHECK-NEXT: csneg w8, w9, w8, mi
241+
; CHECK-NEXT: cmp w8, #0
242242
; CHECK-NEXT: cset w0, eq
243243
; CHECK-NEXT: ret
244244
%srem = srem i32 %X, 16
@@ -251,12 +251,11 @@ define i32 @test_srem_pow2(i32 %X) nounwind {
251251
define i32 @test_srem_int_min(i32 %X) nounwind {
252252
; CHECK-LABEL: test_srem_int_min:
253253
; CHECK: // %bb.0:
254-
; CHECK-NEXT: mov w8, #2147483647
255-
; CHECK-NEXT: cmp w0, #0
256-
; CHECK-NEXT: add w8, w0, w8
257-
; CHECK-NEXT: csel w8, w8, w0, lt
258-
; CHECK-NEXT: and w8, w8, #0x80000000
259-
; CHECK-NEXT: cmn w0, w8
254+
; CHECK-NEXT: negs w8, w0
255+
; CHECK-NEXT: and w9, w0, #0x7fffffff
256+
; CHECK-NEXT: and w8, w8, #0x7fffffff
257+
; CHECK-NEXT: csneg w8, w9, w8, mi
258+
; CHECK-NEXT: cmp w8, #0
260259
; CHECK-NEXT: cset w0, eq
261260
; CHECK-NEXT: ret
262261
%srem = srem i32 %X, 2147483648

0 commit comments

Comments
 (0)