Skip to content

Commit 5744502

Browse files
committed
[TargetLowering][RISCV][AArch64][PowerPC] Enable BuildUDIV/BuildSDIV on illegal types before type legalization if we can find a larger legal type that supports MUL.
If we wait until the type is legalized, we'll lose information about the original type and need to use larger magic constants. This gets especially bad on RISCV64 where i64 is the only legal type. I've limited this to simple scalar types so it only works for i8/i16/i32 which are most likely to occur. For more odd types we might want to do a small promotion to a type where MULH is legal instead. Unfortunately, this does prevent some urem/srem+seteq matching since that still requires legal types. Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D96210
1 parent 0557b1b commit 5744502

File tree

9 files changed

+220
-243
lines changed

9 files changed

+220
-243
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 69 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -5083,11 +5083,25 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
50835083
EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
50845084
EVT ShSVT = ShVT.getScalarType();
50855085
unsigned EltBits = VT.getScalarSizeInBits();
5086+
EVT MulVT;
50865087

50875088
// Check to see if we can do this.
50885089
// FIXME: We should be more aggressive here.
5089-
if (!isTypeLegal(VT))
5090-
return SDValue();
5090+
if (!isTypeLegal(VT)) {
5091+
// Limit this to simple scalars for now.
5092+
if (VT.isVector() || !VT.isSimple())
5093+
return SDValue();
5094+
5095+
// If this type will be promoted to a large enough type with a legal
5096+
// multiply operation, we can go ahead and do this transform.
5097+
if (getTypeAction(VT.getSimpleVT()) != TypePromoteInteger)
5098+
return SDValue();
5099+
5100+
MulVT = getTypeToTransformTo(*DAG.getContext(), VT);
5101+
if (MulVT.getSizeInBits() < (2 * EltBits) ||
5102+
!isOperationLegal(ISD::MUL, MulVT))
5103+
return SDValue();
5104+
}
50915105

50925106
// If the sdiv has an 'exact' bit we can use a simpler lowering.
50935107
if (N->getFlags().hasExact())
@@ -5156,15 +5170,32 @@ SDValue TargetLowering::BuildSDIV(SDNode *N, SelectionDAG &DAG,
51565170

51575171
// Multiply the numerator (operand 0) by the magic value.
51585172
// FIXME: We should support doing a MUL in a wider type.
5159-
SDValue Q;
5160-
if (isOperationLegalOrCustom(ISD::MULHS, VT, IsAfterLegalization))
5161-
Q = DAG.getNode(ISD::MULHS, dl, VT, N0, MagicFactor);
5162-
else if (isOperationLegalOrCustom(ISD::SMUL_LOHI, VT, IsAfterLegalization)) {
5163-
SDValue LoHi =
5164-
DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), N0, MagicFactor);
5165-
Q = SDValue(LoHi.getNode(), 1);
5166-
} else
5167-
return SDValue(); // No mulhs or equivalent.
5173+
auto GetMULHS = [&](SDValue X, SDValue Y) {
5174+
// If the type isn't legal, use a wider mul of the type calculated
5175+
// earlier.
5176+
if (!isTypeLegal(VT)) {
5177+
X = DAG.getNode(ISD::SIGN_EXTEND, dl, MulVT, X);
5178+
Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MulVT, Y);
5179+
Y = DAG.getNode(ISD::MUL, dl, MulVT, X, Y);
5180+
Y = DAG.getNode(ISD::SRL, dl, MulVT, Y,
5181+
DAG.getShiftAmountConstant(EltBits, MulVT, dl));
5182+
return DAG.getNode(ISD::TRUNCATE, dl, VT, Y);
5183+
}
5184+
5185+
if (isOperationLegalOrCustom(ISD::MULHS, VT, IsAfterLegalization))
5186+
return DAG.getNode(ISD::MULHS, dl, VT, X, Y);
5187+
if (isOperationLegalOrCustom(ISD::SMUL_LOHI, VT, IsAfterLegalization)) {
5188+
SDValue LoHi =
5189+
DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT), X, Y);
5190+
return SDValue(LoHi.getNode(), 1);
5191+
}
5192+
return SDValue();
5193+
};
5194+
5195+
SDValue Q = GetMULHS(N0, MagicFactor);
5196+
if (!Q)
5197+
return SDValue();
5198+
51685199
Created.push_back(Q.getNode());
51695200

51705201
// (Optionally) Add/subtract the numerator using Factor.
@@ -5199,11 +5230,25 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
51995230
EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
52005231
EVT ShSVT = ShVT.getScalarType();
52015232
unsigned EltBits = VT.getScalarSizeInBits();
5233+
EVT MulVT;
52025234

52035235
// Check to see if we can do this.
52045236
// FIXME: We should be more aggressive here.
5205-
if (!isTypeLegal(VT))
5206-
return SDValue();
5237+
if (!isTypeLegal(VT)) {
5238+
// Limit this to simple scalars for now.
5239+
if (VT.isVector() || !VT.isSimple())
5240+
return SDValue();
5241+
5242+
// If this type will be promoted to a large enough type with a legal
5243+
// multiply operation, we can go ahead and do this transform.
5244+
if (getTypeAction(VT.getSimpleVT()) != TypePromoteInteger)
5245+
return SDValue();
5246+
5247+
MulVT = getTypeToTransformTo(*DAG.getContext(), VT);
5248+
if (MulVT.getSizeInBits() < (2 * EltBits) ||
5249+
!isOperationLegal(ISD::MUL, MulVT))
5250+
return SDValue();
5251+
}
52075252

52085253
bool UseNPQ = false;
52095254
SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
@@ -5283,6 +5328,17 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
52835328

52845329
// FIXME: We should support doing a MUL in a wider type.
52855330
auto GetMULHU = [&](SDValue X, SDValue Y) {
5331+
// If the type isn't legal, use a wider mul of the type calculated
5332+
// earlier.
5333+
if (!isTypeLegal(VT)) {
5334+
X = DAG.getNode(ISD::ZERO_EXTEND, dl, MulVT, X);
5335+
Y = DAG.getNode(ISD::ZERO_EXTEND, dl, MulVT, Y);
5336+
Y = DAG.getNode(ISD::MUL, dl, MulVT, X, Y);
5337+
Y = DAG.getNode(ISD::SRL, dl, MulVT, Y,
5338+
DAG.getShiftAmountConstant(EltBits, MulVT, dl));
5339+
return DAG.getNode(ISD::TRUNCATE, dl, VT, Y);
5340+
}
5341+
52865342
if (isOperationLegalOrCustom(ISD::MULHU, VT, IsAfterLegalization))
52875343
return DAG.getNode(ISD::MULHU, dl, VT, X, Y);
52885344
if (isOperationLegalOrCustom(ISD::UMUL_LOHI, VT, IsAfterLegalization)) {

llvm/lib/Target/BPF/BPFISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ class BPFTargetLowering : public TargetLowering {
104104
return Op.size() >= 8 ? MVT::i64 : MVT::i32;
105105
}
106106

107+
bool isIntDivCheap(EVT VT, AttributeList Attr) const override { return true; }
108+
107109
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
108110
Type *Ty) const override {
109111
return true;

llvm/test/CodeGen/AArch64/srem-seteq.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,10 @@ define i32 @test_srem_odd_bit31(i32 %X) nounwind {
8383
define i16 @test_srem_even(i16 %X) nounwind {
8484
; CHECK-LABEL: test_srem_even:
8585
; CHECK: // %bb.0:
86-
; CHECK-NEXT: mov w9, #9363
8786
; CHECK-NEXT: sxth w8, w0
88-
; CHECK-NEXT: movk w9, #37449, lsl #16
89-
; CHECK-NEXT: smull x9, w8, w9
90-
; CHECK-NEXT: lsr x9, x9, #32
91-
; CHECK-NEXT: add w8, w9, w8
92-
; CHECK-NEXT: asr w9, w8, #3
87+
; CHECK-NEXT: mov w9, #18725
88+
; CHECK-NEXT: mul w8, w8, w9
89+
; CHECK-NEXT: asr w9, w8, #18
9390
; CHECK-NEXT: add w8, w9, w8, lsr #31
9491
; CHECK-NEXT: mov w9, #14
9592
; CHECK-NEXT: msub w8, w8, w9, w0

llvm/test/CodeGen/AArch64/urem-seteq-nonzero.ll

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -195,14 +195,15 @@ define i1 @t32_6_5(i32 %X) nounwind {
195195
define i1 @t16_3_2(i16 %X) nounwind {
196196
; CHECK-LABEL: t16_3_2:
197197
; CHECK: // %bb.0:
198-
; CHECK-NEXT: mov w9, #43691
199198
; CHECK-NEXT: and w8, w0, #0xffff
200-
; CHECK-NEXT: movk w9, #43690, lsl #16
201-
; CHECK-NEXT: mov w10, #-1431655766
202-
; CHECK-NEXT: madd w8, w8, w9, w10
203-
; CHECK-NEXT: mov w9, #1431655765
204-
; CHECK-NEXT: cmp w8, w9
205-
; CHECK-NEXT: cset w0, lo
199+
; CHECK-NEXT: mov w9, #43691
200+
; CHECK-NEXT: mul w8, w8, w9
201+
; CHECK-NEXT: lsr w8, w8, #17
202+
; CHECK-NEXT: add w8, w8, w8, lsl #1
203+
; CHECK-NEXT: sub w8, w0, w8
204+
; CHECK-NEXT: and w8, w8, #0xffff
205+
; CHECK-NEXT: cmp w8, #2 // =2
206+
; CHECK-NEXT: cset w0, eq
206207
; CHECK-NEXT: ret
207208
%urem = urem i16 %X, 3
208209
%cmp = icmp eq i16 %urem, 2
@@ -212,14 +213,15 @@ define i1 @t16_3_2(i16 %X) nounwind {
212213
define i1 @t8_3_2(i8 %X) nounwind {
213214
; CHECK-LABEL: t8_3_2:
214215
; CHECK: // %bb.0:
215-
; CHECK-NEXT: mov w9, #43691
216216
; CHECK-NEXT: and w8, w0, #0xff
217-
; CHECK-NEXT: movk w9, #43690, lsl #16
218-
; CHECK-NEXT: mov w10, #-1431655766
219-
; CHECK-NEXT: madd w8, w8, w9, w10
220-
; CHECK-NEXT: mov w9, #1431655765
221-
; CHECK-NEXT: cmp w8, w9
222-
; CHECK-NEXT: cset w0, lo
217+
; CHECK-NEXT: mov w9, #171
218+
; CHECK-NEXT: mul w8, w8, w9
219+
; CHECK-NEXT: lsr w8, w8, #9
220+
; CHECK-NEXT: add w8, w8, w8, lsl #1
221+
; CHECK-NEXT: sub w8, w0, w8
222+
; CHECK-NEXT: and w8, w8, #0xff
223+
; CHECK-NEXT: cmp w8, #2 // =2
224+
; CHECK-NEXT: cset w0, eq
223225
; CHECK-NEXT: ret
224226
%urem = urem i8 %X, 3
225227
%cmp = icmp eq i8 %urem, 2

llvm/test/CodeGen/AArch64/urem-seteq.ll

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -78,15 +78,14 @@ define i32 @test_urem_odd_bit31(i32 %X) nounwind {
7878
define i16 @test_urem_even(i16 %X) nounwind {
7979
; CHECK-LABEL: test_urem_even:
8080
; CHECK: // %bb.0:
81-
; CHECK-NEXT: mov w9, #28087
82-
; CHECK-NEXT: and w8, w0, #0xffff
83-
; CHECK-NEXT: movk w9, #46811, lsl #16
81+
; CHECK-NEXT: ubfx w8, w0, #1, #15
82+
; CHECK-NEXT: mov w9, #18725
8483
; CHECK-NEXT: mul w8, w8, w9
85-
; CHECK-NEXT: mov w9, #9362
86-
; CHECK-NEXT: ror w8, w8, #1
87-
; CHECK-NEXT: movk w9, #4681, lsl #16
88-
; CHECK-NEXT: cmp w8, w9
89-
; CHECK-NEXT: cset w0, hi
84+
; CHECK-NEXT: lsr w8, w8, #17
85+
; CHECK-NEXT: mov w9, #14
86+
; CHECK-NEXT: msub w8, w8, w9, w0
87+
; CHECK-NEXT: tst w8, #0xffff
88+
; CHECK-NEXT: cset w0, ne
9089
; CHECK-NEXT: ret
9190
%urem = urem i16 %X, 14
9291
%cmp = icmp ne i16 %urem, 0

llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll

Lines changed: 22 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -615,7 +615,6 @@ define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) {
615615
; CHECK-LABEL: test_ds_cross_basic_blocks:
616616
; CHECK: # %bb.0:
617617
; CHECK-NEXT: cmplwi r4, 0
618-
; CHECK-NEXT: std r26, -48(r1) # 8-byte Folded Spill
619618
; CHECK-NEXT: std r27, -40(r1) # 8-byte Folded Spill
620619
; CHECK-NEXT: std r28, -32(r1) # 8-byte Folded Spill
621620
; CHECK-NEXT: std r29, -24(r1) # 8-byte Folded Spill
@@ -627,59 +626,57 @@ define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) {
627626
; CHECK-NEXT: li r7, 1
628627
; CHECK-NEXT: addi r6, r3, 4009
629628
; CHECK-NEXT: ld r5, .LC0@toc@l(r5)
630-
; CHECK-NEXT: iselgt r8, r4, r7
631-
; CHECK-NEXT: lis r4, -21846
629+
; CHECK-NEXT: iselgt r4, r4, r7
632630
; CHECK-NEXT: li r3, 0
633-
; CHECK-NEXT: li r9, -7
634-
; CHECK-NEXT: li r10, -6
631+
; CHECK-NEXT: li r8, -7
632+
; CHECK-NEXT: li r9, -6
633+
; CHECK-NEXT: li r10, 1
635634
; CHECK-NEXT: li r11, 1
636635
; CHECK-NEXT: li r12, 1
637636
; CHECK-NEXT: li r30, 1
638637
; CHECK-NEXT: ld r5, 0(r5)
639-
; CHECK-NEXT: mtctr r8
640-
; CHECK-NEXT: ori r4, r4, 43691
641-
; CHECK-NEXT: li r8, -9
638+
; CHECK-NEXT: mtctr r4
639+
; CHECK-NEXT: li r4, -9
642640
; CHECK-NEXT: li r29, 1
643-
; CHECK-NEXT: li r28, 1
644641
; CHECK-NEXT: addi r5, r5, -1
645642
; CHECK-NEXT: b .LBB6_4
646643
; CHECK-NEXT: .p2align 4
647644
; CHECK-NEXT: .LBB6_2:
648-
; CHECK-NEXT: ldx r0, r6, r8
649-
; CHECK-NEXT: add r28, r0, r28
650-
; CHECK-NEXT: ld r0, -8(r6)
645+
; CHECK-NEXT: ldx r0, r6, r4
651646
; CHECK-NEXT: add r29, r0, r29
647+
; CHECK-NEXT: ld r0, -8(r6)
648+
; CHECK-NEXT: add r30, r0, r30
652649
; CHECK-NEXT: .LBB6_3:
653-
; CHECK-NEXT: mulld r0, r29, r28
650+
; CHECK-NEXT: mulld r0, r30, r29
654651
; CHECK-NEXT: addi r6, r6, 1
655-
; CHECK-NEXT: mulld r0, r0, r30
656652
; CHECK-NEXT: mulld r0, r0, r12
657653
; CHECK-NEXT: mulld r0, r0, r11
654+
; CHECK-NEXT: mulld r0, r0, r10
658655
; CHECK-NEXT: maddld r3, r0, r7, r3
659656
; CHECK-NEXT: bdz .LBB6_9
660657
; CHECK-NEXT: .LBB6_4:
661658
; CHECK-NEXT: lbzu r0, 1(r5)
662-
; CHECK-NEXT: mulhwu r27, r0, r4
663-
; CHECK-NEXT: rlwinm r26, r27, 0, 0, 30
664-
; CHECK-NEXT: srwi r27, r27, 1
665-
; CHECK-NEXT: add r27, r27, r26
666-
; CHECK-NEXT: sub r0, r0, r27
659+
; CHECK-NEXT: mulli r28, r0, 171
660+
; CHECK-NEXT: rlwinm r27, r28, 24, 8, 30
661+
; CHECK-NEXT: srwi r28, r28, 9
662+
; CHECK-NEXT: add r28, r28, r27
663+
; CHECK-NEXT: sub r0, r0, r28
664+
; CHECK-NEXT: clrlwi r0, r0, 24
667665
; CHECK-NEXT: cmplwi r0, 1
668666
; CHECK-NEXT: beq cr0, .LBB6_2
669667
; CHECK-NEXT: # %bb.5:
670-
; CHECK-NEXT: clrlwi r0, r0, 24
671668
; CHECK-NEXT: cmplwi r0, 2
672669
; CHECK-NEXT: bne cr0, .LBB6_7
673670
; CHECK-NEXT: # %bb.6:
674-
; CHECK-NEXT: ldx r0, r6, r9
675-
; CHECK-NEXT: add r30, r0, r30
676-
; CHECK-NEXT: ld r0, -4(r6)
671+
; CHECK-NEXT: ldx r0, r6, r8
677672
; CHECK-NEXT: add r12, r0, r12
673+
; CHECK-NEXT: ld r0, -4(r6)
674+
; CHECK-NEXT: add r11, r0, r11
678675
; CHECK-NEXT: b .LBB6_3
679676
; CHECK-NEXT: .p2align 4
680677
; CHECK-NEXT: .LBB6_7:
681-
; CHECK-NEXT: ldx r0, r6, r10
682-
; CHECK-NEXT: add r11, r0, r11
678+
; CHECK-NEXT: ldx r0, r6, r9
679+
; CHECK-NEXT: add r10, r0, r10
683680
; CHECK-NEXT: ld r0, 0(r6)
684681
; CHECK-NEXT: add r7, r0, r7
685682
; CHECK-NEXT: b .LBB6_3
@@ -690,7 +687,6 @@ define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) {
690687
; CHECK-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
691688
; CHECK-NEXT: ld r28, -32(r1) # 8-byte Folded Reload
692689
; CHECK-NEXT: ld r27, -40(r1) # 8-byte Folded Reload
693-
; CHECK-NEXT: ld r26, -48(r1) # 8-byte Folded Reload
694690
; CHECK-NEXT: blr
695691
%3 = sext i32 %1 to i64
696692
%4 = icmp eq i32 %1, 0

0 commit comments

Comments
 (0)