
Commit d1cca72

AZero13 authored and yuxuanchen1997 committed
[CodeGen] Emit more efficient magic numbers for exact udivs (#87161)
Summary: Use a simpler lowering for exact udivs in both SelectionDAG and GlobalISel. The algorithm is the same for unsigned exact division as for signed exact division, except that even divisors are handled with a logical rather than an arithmetic shift, following Hacker's Delight, 2nd Edition, page 242.

Differential Revision: https://phabricator.intern.facebook.com/D60250869
1 parent dbf7910 commit d1cca72
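
The transform can be sketched outside of LLVM as follows. This is a minimal standalone illustration of the Hacker's Delight recipe, not the patch's code; the helper name exactUDivByConst and the Newton-Raphson inverse loop are illustrative choices. Because the division is exact, the dividend is a multiple of the divisor C = 2^k * d with d odd, so a logical shift right by k followed by a multiply with the inverse of d modulo 2^32 recovers the quotient.

// Minimal sketch (not LLVM code): exact unsigned division by a constant,
// assuming a 32-bit type and a non-zero divisor C.
#include <cstdint>

uint32_t exactUDivByConst(uint32_t X, uint32_t C) {
  unsigned Shift = __builtin_ctz(C); // GCC/Clang builtin; C must be non-zero
  uint32_t D = C >> Shift;           // odd part of the divisor
  // Newton-Raphson inverse of D modulo 2^32: an odd D is its own inverse
  // modulo 8, and each step doubles the number of correct low bits, so four
  // steps cover all 32 bits.
  uint32_t Inv = D;
  for (int I = 0; I < 4; ++I)
    Inv *= 2u - D * Inv;
  // X is a multiple of C, so X >> Shift is a multiple of D and the wrapping
  // multiply yields the exact quotient.
  return (X >> Shift) * Inv;
}

For example, 180 = 4 * 45, so the AArch64 tests below expect lsr #2 followed by a multiply with 0xA4FA4FA5, the inverse of 45 modulo 2^32.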

5 files changed: 462 additions, 6 deletions

llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp

Lines changed: 59 additions & 3 deletions
@@ -5183,8 +5183,35 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
       KB ? KB->getKnownBits(LHS).countMinLeadingZeros() : 0;
   auto &MIB = Builder;
 
+  bool UseSRL = false;
   bool UseNPQ = false;
   SmallVector<Register, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
+  SmallVector<Register, 16> Shifts, Factors;
+  auto *RHSDefInstr = cast<GenericMachineInstr>(getDefIgnoringCopies(RHS, MRI));
+  bool IsSplat = getIConstantSplatVal(*RHSDefInstr, MRI).has_value();
+
+  auto BuildExactUDIVPattern = [&](const Constant *C) {
+    // Don't recompute inverses for each splat element.
+    if (IsSplat && !Factors.empty()) {
+      Shifts.push_back(Shifts[0]);
+      Factors.push_back(Factors[0]);
+      return true;
+    }
+
+    auto *CI = cast<ConstantInt>(C);
+    APInt Divisor = CI->getValue();
+    unsigned Shift = Divisor.countr_zero();
+    if (Shift) {
+      Divisor.lshrInPlace(Shift);
+      UseSRL = true;
+    }
+
+    // Calculate the multiplicative inverse modulo BW.
+    APInt Factor = Divisor.multiplicativeInverse();
+    Shifts.push_back(MIB.buildConstant(ScalarShiftAmtTy, Shift).getReg(0));
+    Factors.push_back(MIB.buildConstant(ScalarTy, Factor).getReg(0));
+    return true;
+  };
 
   auto BuildUDIVPattern = [&](const Constant *C) {
     auto *CI = cast<ConstantInt>(C);
@@ -5231,6 +5258,29 @@ MachineInstr *CombinerHelper::buildUDivUsingMul(MachineInstr &MI) {
     return true;
   };
 
+  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+    // Collect all magic values from the build vector.
+    bool Matched = matchUnaryPredicate(MRI, RHS, BuildExactUDIVPattern);
+    (void)Matched;
+    assert(Matched && "Expected unary predicate match to succeed");
+
+    Register Shift, Factor;
+    if (Ty.isVector()) {
+      Shift = MIB.buildBuildVector(ShiftAmtTy, Shifts).getReg(0);
+      Factor = MIB.buildBuildVector(Ty, Factors).getReg(0);
+    } else {
+      Shift = Shifts[0];
+      Factor = Factors[0];
+    }
+
+    Register Res = LHS;
+
+    if (UseSRL)
+      Res = MIB.buildLShr(Ty, Res, Shift, MachineInstr::IsExact).getReg(0);
+
+    return MIB.buildMul(Ty, Res, Factor);
+  }
+
   // Collect the shifts/magic values from each element.
   bool Matched = matchUnaryPredicate(MRI, RHS, BuildUDIVPattern);
   (void)Matched;
@@ -5283,9 +5333,6 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) {
   Register Dst = MI.getOperand(0).getReg();
   Register RHS = MI.getOperand(2).getReg();
   LLT DstTy = MRI.getType(Dst);
-  auto *RHSDef = MRI.getVRegDef(RHS);
-  if (!isConstantOrConstantVector(*RHSDef, MRI))
-    return false;
 
   auto &MF = *MI.getMF();
   AttributeList Attr = MF.getFunction().getAttributes();
@@ -5300,6 +5347,15 @@ bool CombinerHelper::matchUDivByConst(MachineInstr &MI) {
   if (MF.getFunction().hasMinSize())
     return false;
 
+  if (MI.getFlag(MachineInstr::MIFlag::IsExact)) {
+    return matchUnaryPredicate(
+        MRI, RHS, [](const Constant *C) { return C && !C->isNullValue(); });
+  }
+
+  auto *RHSDef = MRI.getVRegDef(RHS);
+  if (!isConstantOrConstantVector(*RHSDef, MRI))
+    return false;
+
   // Don't do this if the types are not going to be legal.
   if (LI) {
     if (!isLegalOrBeforeLegalizer({TargetOpcode::G_MUL, {DstTy, DstTy}}))

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 68 additions & 3 deletions
@@ -6092,6 +6092,7 @@ void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo,
 
 /// Given an exact SDIV by a constant, create a multiplication
 /// with the multiplicative inverse of the constant.
+/// Ref: "Hacker's Delight" by Henry Warren, 2nd Edition, p. 242
 static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
                               const SDLoc &dl, SelectionDAG &DAG,
                               SmallVectorImpl<SDNode *> &Created) {
@@ -6141,10 +6142,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
   }
 
   SDValue Res = Op0;
-
-  // Shift the value upfront if it is even, so the LSB is one.
   if (UseSRA) {
-    // TODO: For UDIV use SRL instead of SRA.
     SDNodeFlags Flags;
     Flags.setExact(true);
     Res = DAG.getNode(ISD::SRA, dl, VT, Res, Shift, Flags);
@@ -6154,6 +6152,69 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDNode *N,
   return DAG.getNode(ISD::MUL, dl, VT, Res, Factor);
 }
 
+/// Given an exact UDIV by a constant, create a multiplication
+/// with the multiplicative inverse of the constant.
+/// Ref: "Hacker's Delight" by Henry Warren, 2nd Edition, p. 242
+static SDValue BuildExactUDIV(const TargetLowering &TLI, SDNode *N,
+                              const SDLoc &dl, SelectionDAG &DAG,
+                              SmallVectorImpl<SDNode *> &Created) {
+  EVT VT = N->getValueType(0);
+  EVT SVT = VT.getScalarType();
+  EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+  EVT ShSVT = ShVT.getScalarType();
+
+  bool UseSRL = false;
+  SmallVector<SDValue, 16> Shifts, Factors;
+
+  auto BuildUDIVPattern = [&](ConstantSDNode *C) {
+    if (C->isZero())
+      return false;
+    APInt Divisor = C->getAPIntValue();
+    unsigned Shift = Divisor.countr_zero();
+    if (Shift) {
+      Divisor.lshrInPlace(Shift);
+      UseSRL = true;
+    }
+    // Calculate the multiplicative inverse modulo BW.
+    APInt Factor = Divisor.multiplicativeInverse();
+    Shifts.push_back(DAG.getConstant(Shift, dl, ShSVT));
+    Factors.push_back(DAG.getConstant(Factor, dl, SVT));
+    return true;
+  };
+
+  SDValue Op1 = N->getOperand(1);
+
+  // Collect all magic values from the build vector.
+  if (!ISD::matchUnaryPredicate(Op1, BuildUDIVPattern))
+    return SDValue();
+
+  SDValue Shift, Factor;
+  if (Op1.getOpcode() == ISD::BUILD_VECTOR) {
+    Shift = DAG.getBuildVector(ShVT, dl, Shifts);
+    Factor = DAG.getBuildVector(VT, dl, Factors);
+  } else if (Op1.getOpcode() == ISD::SPLAT_VECTOR) {
+    assert(Shifts.size() == 1 && Factors.size() == 1 &&
+           "Expected matchUnaryPredicate to return one element for scalable "
+           "vectors");
+    Shift = DAG.getSplatVector(ShVT, dl, Shifts[0]);
+    Factor = DAG.getSplatVector(VT, dl, Factors[0]);
+  } else {
+    assert(isa<ConstantSDNode>(Op1) && "Expected a constant");
+    Shift = Shifts[0];
+    Factor = Factors[0];
+  }
+
+  SDValue Res = N->getOperand(0);
+  if (UseSRL) {
+    SDNodeFlags Flags;
+    Flags.setExact(true);
+    Res = DAG.getNode(ISD::SRL, dl, VT, Res, Shift, Flags);
+    Created.push_back(Res.getNode());
+  }
+
+  return DAG.getNode(ISD::MUL, dl, VT, Res, Factor);
+}
+
 SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                       SelectionDAG &DAG,
                                       SmallVectorImpl<SDNode *> &Created) const {
@@ -6413,6 +6474,10 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
     return SDValue();
   }
 
+  // If the udiv has an 'exact' bit we can use a simpler lowering.
+  if (N->getFlags().hasExact())
+    return BuildExactUDIV(*this, N, dl, DAG, Created);
+
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);

llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.ll

Lines changed: 42 additions & 0 deletions
@@ -269,3 +269,45 @@ define i32 @udiv_div_by_180(i32 %x)
   %udiv = udiv i32 %truncate, 180
   ret i32 %udiv
 }
+
+define i32 @udiv_div_by_180_exact(i32 %x)
+; SDAG-LABEL: udiv_div_by_180_exact:
+; SDAG:       // %bb.0:
+; SDAG-NEXT:    lsr w8, w0, #2
+; SDAG-NEXT:    mov w9, #20389 // =0x4fa5
+; SDAG-NEXT:    movk w9, #42234, lsl #16
+; SDAG-NEXT:    mul w0, w8, w9
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: udiv_div_by_180_exact:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    lsr w8, w0, #2
+; GISEL-NEXT:    mov w9, #20389 // =0x4fa5
+; GISEL-NEXT:    movk w9, #42234, lsl #16
+; GISEL-NEXT:    mul w0, w8, w9
+; GISEL-NEXT:    ret
+{
+  %udiv = udiv exact i32 %x, 180
+  ret i32 %udiv
+}
+
+define <4 x i32> @udiv_div_by_104_exact(<4 x i32> %x)
+; SDAG-LABEL: udiv_div_by_104_exact:
+; SDAG:       // %bb.0:
+; SDAG-NEXT:    adrp x8, .LCPI8_0
+; SDAG-NEXT:    ushr v0.4s, v0.4s, #3
+; SDAG-NEXT:    ldr q1, [x8, :lo12:.LCPI8_0]
+; SDAG-NEXT:    mul v0.4s, v0.4s, v1.4s
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: udiv_div_by_104_exact:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    adrp x8, .LCPI8_0
+; GISEL-NEXT:    ushr v0.4s, v0.4s, #3
+; GISEL-NEXT:    ldr q1, [x8, :lo12:.LCPI8_0]
+; GISEL-NEXT:    mul v0.4s, v0.4s, v1.4s
+; GISEL-NEXT:    ret
+{
+  %udiv = udiv exact <4 x i32> %x, <i32 104, i32 72, i32 104, i32 72>
+  ret <4 x i32> %udiv
+}
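
The magic constants in these checks can be verified with plain unsigned 32-bit arithmetic, which wraps modulo 2^32. The snippet below is an illustrative self-check, not part of the patch; the sample inputs are arbitrary multiples of the divisors.

#include <cassert>
#include <cstdint>

int main() {
  // 180 = 4 * 45: the checks above expect lsr #2 and a multiply by
  // 0xA4FA4FA5 (mov #20389 plus movk #42234, lsl #16), the inverse of 45.
  uint32_t X = 180u * 12345u; // the udiv is exact, so X is a multiple of 180
  assert(uint32_t((X >> 2) * 0xA4FA4FA5u) == X / 180u);

  // 104 = 8 * 13: the scalar MIR checks below use shift 3 and the factor
  // -991146299, i.e. 0xC4EC4EC5, the inverse of 13 modulo 2^32.
  uint32_t Y = 104u * 98765u;
  assert(uint32_t((Y >> 3) * 0xC4EC4EC5u) == Y / 104u);
  return 0;
}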

llvm/test/CodeGen/AArch64/GlobalISel/combine-udiv.mir

Lines changed: 122 additions & 0 deletions
@@ -304,5 +304,127 @@ body: |
     %10:_(<8 x s16>) = G_UDIV %0, %1
     $q0 = COPY %10(<8 x s16>)
     RET_ReallyLR implicit $q0
+...
+---
+name: udiv_exact
+body: |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: udiv_exact
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = exact G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[LSHR]], [[C1]]
+    ; CHECK-NEXT: $w0 = COPY [[MUL]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 104
+    %2:_(s32) = exact G_UDIV %0, %1
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name: udiv_noexact
+body: |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: udiv_noexact
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1321528399
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 5
+    ; CHECK-NEXT: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[COPY]], [[C]]
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[UMULH]], [[C1]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[LSHR]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 104
+    %2:_(s32) = G_UDIV %0, %1
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name: udiv_exact_minsize
+body: |
+  bb.1:
+    liveins: $w0
+
+    ; CHECK-LABEL: name: udiv_exact_minsize
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = exact G_LSHR [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[LSHR]], [[C1]]
+    ; CHECK-NEXT: $w0 = COPY [[MUL]](s32)
+    ; CHECK-NEXT: RET_ReallyLR implicit $w0
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 104
+    %2:_(s32) = exact G_UDIV %0, %1
+    $w0 = COPY %2(s32)
+    RET_ReallyLR implicit $w0
+
+...
+---
+name: div_v4s32
+body: |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: div_v4s32
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
+    ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 954437177
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C2]](s32), [[C1]](s32), [[C2]](s32)
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = exact G_LSHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[LSHR]], [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: $q0 = COPY [[MUL]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %c1:_(s32) = G_CONSTANT i32 104
+    %c2:_(s32) = G_CONSTANT i32 72
+    %1:_(<4 x s32>) = G_BUILD_VECTOR %c1(s32), %c2(s32), %c1(s32), %c2(s32)
+    %3:_(<4 x s32>) = exact G_UDIV %0, %1
+    $q0 = COPY %3(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: div_v4s32_splat
+body: |
+  bb.1:
+    liveins: $q0
+
+    ; CHECK-LABEL: name: div_v4s32_splat
+    ; CHECK: liveins: $q0
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 -991146299
+    ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C]](s32), [[C]](s32), [[C]](s32), [[C]](s32)
+    ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[C1]](s32), [[C1]](s32), [[C1]](s32), [[C1]](s32)
+    ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(<4 x s32>) = exact G_LSHR [[COPY]], [[BUILD_VECTOR]](<4 x s32>)
+    ; CHECK-NEXT: [[MUL:%[0-9]+]]:_(<4 x s32>) = G_MUL [[LSHR]], [[BUILD_VECTOR1]]
+    ; CHECK-NEXT: $q0 = COPY [[MUL]](<4 x s32>)
+    ; CHECK-NEXT: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %c1:_(s32) = G_CONSTANT i32 104
+    %1:_(<4 x s32>) = G_BUILD_VECTOR %c1(s32), %c1(s32), %c1(s32), %c1(s32)
+    %3:_(<4 x s32>) = exact G_UDIV %0, %1
+    $q0 = COPY %3(<4 x s32>)
+    RET_ReallyLR implicit $q0
 
 ...
