Skip to content

Commit e30a4fc

Browse files
authored
[TargetLowering] Improve one signature of forceExpandWideMUL. (#123991)
We have two forceExpandWideMUL functions. One takes the low and high half of 2 inputs and calculates the low and high half of their product. This does not calculate the full 2x width product. The other signature takes 2 inputs and calculates the low and high half of their full 2x width product. Previously it did this by sign/zero-extending the inputs to create the high bits and then calling the other function. We can instead copy the algorithm from the other function and use the Signed flag to determine whether we should do SRA or SRL. This avoids the need to multiply the high part of the inputs and add it to the high half of the result. This improves the generated code for signed multiplication. This should improve the performance of #123262. I don't know yet how close we will get to gcc.
1 parent e19261f commit e30a4fc

16 files changed

+3367
-4646
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 63 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -10952,22 +10952,71 @@ void TargetLowering::forceExpandWideMUL(SelectionDAG &DAG, const SDLoc &dl,
1095210952
SDValue &Hi) const {
1095310953
EVT VT = LHS.getValueType();
1095410954
assert(RHS.getValueType() == VT && "Mismatching operand types");
10955+
EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
10956+
// We can fall back to a libcall with an illegal type for the MUL if we
10957+
// have a libcall big enough.
10958+
RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
10959+
if (WideVT == MVT::i16)
10960+
LC = RTLIB::MUL_I16;
10961+
else if (WideVT == MVT::i32)
10962+
LC = RTLIB::MUL_I32;
10963+
else if (WideVT == MVT::i64)
10964+
LC = RTLIB::MUL_I64;
10965+
else if (WideVT == MVT::i128)
10966+
LC = RTLIB::MUL_I128;
1095510967

10956-
SDValue HiLHS;
10957-
SDValue HiRHS;
10958-
if (Signed) {
10959-
// The high part is obtained by SRA'ing all but one of the bits of low
10960-
// part.
10961-
unsigned LoSize = VT.getFixedSizeInBits();
10962-
SDValue Shift = DAG.getShiftAmountConstant(LoSize - 1, VT, dl);
10963-
HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, Shift);
10964-
HiRHS = DAG.getNode(ISD::SRA, dl, VT, RHS, Shift);
10965-
} else {
10966-
HiLHS = DAG.getConstant(0, dl, VT);
10967-
HiRHS = DAG.getConstant(0, dl, VT);
10968+
if (LC != RTLIB::UNKNOWN_LIBCALL && getLibcallName(LC)) {
10969+
SDValue HiLHS, HiRHS;
10970+
if (Signed) {
10971+
// The high part is obtained by SRA'ing all but one of the bits of low
10972+
// part.
10973+
unsigned LoSize = VT.getFixedSizeInBits();
10974+
SDValue Shift = DAG.getShiftAmountConstant(LoSize - 1, VT, dl);
10975+
HiLHS = DAG.getNode(ISD::SRA, dl, VT, LHS, Shift);
10976+
HiRHS = DAG.getNode(ISD::SRA, dl, VT, RHS, Shift);
10977+
} else {
10978+
HiLHS = DAG.getConstant(0, dl, VT);
10979+
HiRHS = DAG.getConstant(0, dl, VT);
10980+
}
10981+
forceExpandWideMUL(DAG, dl, Signed, WideVT, LHS, HiLHS, RHS, HiRHS, Lo, Hi);
10982+
return;
1096810983
}
10969-
EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
10970-
forceExpandWideMUL(DAG, dl, Signed, WideVT, LHS, HiLHS, RHS, HiRHS, Lo, Hi);
10984+
10985+
// Expand the multiplication by brute force. This is a generalized-version of
10986+
// the code from Hacker's Delight (itself derived from Knuth's Algorithm M
10987+
// from section 4.3.1) combined with the Hacker's Delight code
10988+
// for calculating mulhs.
10989+
unsigned Bits = VT.getSizeInBits();
10990+
unsigned HalfBits = Bits / 2;
10991+
SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl, VT);
10992+
SDValue LL = DAG.getNode(ISD::AND, dl, VT, LHS, Mask);
10993+
SDValue RL = DAG.getNode(ISD::AND, dl, VT, RHS, Mask);
10994+
10995+
SDValue T = DAG.getNode(ISD::MUL, dl, VT, LL, RL);
10996+
SDValue TL = DAG.getNode(ISD::AND, dl, VT, T, Mask);
10997+
10998+
SDValue Shift = DAG.getShiftAmountConstant(HalfBits, VT, dl);
10999+
// This is always an unsigned shift.
11000+
SDValue TH = DAG.getNode(ISD::SRL, dl, VT, T, Shift);
11001+
11002+
unsigned ShiftOpc = Signed ? ISD::SRA : ISD::SRL;
11003+
SDValue LH = DAG.getNode(ShiftOpc, dl, VT, LHS, Shift);
11004+
SDValue RH = DAG.getNode(ShiftOpc, dl, VT, RHS, Shift);
11005+
11006+
SDValue U =
11007+
DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RL), TH);
11008+
SDValue UL = DAG.getNode(ISD::AND, dl, VT, U, Mask);
11009+
SDValue UH = DAG.getNode(ShiftOpc, dl, VT, U, Shift);
11010+
11011+
SDValue V =
11012+
DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LL, RH), UL);
11013+
SDValue VH = DAG.getNode(ShiftOpc, dl, VT, V, Shift);
11014+
11015+
Lo = DAG.getNode(ISD::ADD, dl, VT, TL,
11016+
DAG.getNode(ISD::SHL, dl, VT, V, Shift));
11017+
11018+
Hi = DAG.getNode(ISD::ADD, dl, VT, DAG.getNode(ISD::MUL, dl, VT, LH, RH),
11019+
DAG.getNode(ISD::ADD, dl, VT, UH, VH));
1097111020
}
1097211021

1097311022
SDValue

llvm/test/CodeGen/AArch64/i128-math.ll

Lines changed: 77 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -355,40 +355,32 @@ define i128 @i128_mul(i128 %x, i128 %y) {
355355
define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
356356
; CHECK-LABEL: i128_checked_mul:
357357
; CHECK: // %bb.0:
358-
; CHECK-NEXT: asr x8, x1, #63
359-
; CHECK-NEXT: asr x11, x3, #63
360-
; CHECK-NEXT: umulh x13, x0, x2
361-
; CHECK-NEXT: mul x9, x2, x8
362-
; CHECK-NEXT: umulh x10, x2, x8
363-
; CHECK-NEXT: umulh x12, x11, x0
364-
; CHECK-NEXT: mul x14, x1, x2
365-
; CHECK-NEXT: add x10, x10, x9
366-
; CHECK-NEXT: madd x8, x3, x8, x10
367-
; CHECK-NEXT: madd x10, x11, x1, x12
368-
; CHECK-NEXT: mul x11, x11, x0
369-
; CHECK-NEXT: umulh x12, x1, x2
370-
; CHECK-NEXT: mul x15, x0, x3
371-
; CHECK-NEXT: add x10, x10, x11
372-
; CHECK-NEXT: adds x9, x11, x9
373-
; CHECK-NEXT: umulh x16, x0, x3
374-
; CHECK-NEXT: adc x10, x10, x8
375-
; CHECK-NEXT: adds x8, x14, x13
376-
; CHECK-NEXT: cinc x12, x12, hs
377-
; CHECK-NEXT: mul x11, x1, x3
378-
; CHECK-NEXT: adds x8, x15, x8
379-
; CHECK-NEXT: umulh x13, x1, x3
358+
; CHECK-NEXT: asr x9, x1, #63
359+
; CHECK-NEXT: umulh x10, x0, x2
360+
; CHECK-NEXT: asr x13, x3, #63
361+
; CHECK-NEXT: mul x11, x1, x2
362+
; CHECK-NEXT: umulh x8, x1, x2
363+
; CHECK-NEXT: mul x9, x9, x2
364+
; CHECK-NEXT: adds x10, x11, x10
365+
; CHECK-NEXT: mul x14, x0, x3
366+
; CHECK-NEXT: umulh x12, x0, x3
367+
; CHECK-NEXT: adc x9, x8, x9
368+
; CHECK-NEXT: mul x13, x0, x13
369+
; CHECK-NEXT: adds x8, x14, x10
370+
; CHECK-NEXT: mul x15, x1, x3
371+
; CHECK-NEXT: smulh x10, x1, x3
380372
; CHECK-NEXT: mov x1, x8
381-
; CHECK-NEXT: cinc x14, x16, hs
382-
; CHECK-NEXT: adds x12, x12, x14
373+
; CHECK-NEXT: adc x11, x12, x13
374+
; CHECK-NEXT: asr x12, x9, #63
375+
; CHECK-NEXT: asr x13, x11, #63
376+
; CHECK-NEXT: adds x9, x9, x11
377+
; CHECK-NEXT: asr x11, x8, #63
383378
; CHECK-NEXT: mul x0, x0, x2
384-
; CHECK-NEXT: cset w14, hs
385-
; CHECK-NEXT: adds x11, x11, x12
386-
; CHECK-NEXT: asr x12, x8, #63
387-
; CHECK-NEXT: adc x13, x13, x14
388-
; CHECK-NEXT: adds x9, x11, x9
389-
; CHECK-NEXT: adc x10, x13, x10
390-
; CHECK-NEXT: cmp x9, x12
391-
; CHECK-NEXT: ccmp x10, x12, #0, eq
379+
; CHECK-NEXT: adc x12, x12, x13
380+
; CHECK-NEXT: adds x9, x15, x9
381+
; CHECK-NEXT: adc x10, x10, x12
382+
; CHECK-NEXT: cmp x9, x11
383+
; CHECK-NEXT: ccmp x10, x11, #0, eq
392384
; CHECK-NEXT: cset w2, eq
393385
; CHECK-NEXT: ret
394386
%1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
@@ -404,40 +396,32 @@ define { i128, i8 } @i128_checked_mul(i128 %x, i128 %y) {
404396
define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
405397
; CHECK-LABEL: i128_overflowing_mul:
406398
; CHECK: // %bb.0:
407-
; CHECK-NEXT: asr x8, x1, #63
408-
; CHECK-NEXT: asr x11, x3, #63
409-
; CHECK-NEXT: umulh x13, x0, x2
410-
; CHECK-NEXT: mul x9, x2, x8
411-
; CHECK-NEXT: umulh x10, x2, x8
412-
; CHECK-NEXT: umulh x12, x11, x0
413-
; CHECK-NEXT: mul x14, x1, x2
414-
; CHECK-NEXT: add x10, x10, x9
415-
; CHECK-NEXT: madd x8, x3, x8, x10
416-
; CHECK-NEXT: madd x10, x11, x1, x12
417-
; CHECK-NEXT: mul x11, x11, x0
418-
; CHECK-NEXT: umulh x12, x1, x2
419-
; CHECK-NEXT: mul x15, x0, x3
420-
; CHECK-NEXT: add x10, x10, x11
421-
; CHECK-NEXT: adds x9, x11, x9
422-
; CHECK-NEXT: umulh x16, x0, x3
423-
; CHECK-NEXT: adc x10, x10, x8
424-
; CHECK-NEXT: adds x8, x14, x13
425-
; CHECK-NEXT: cinc x12, x12, hs
426-
; CHECK-NEXT: mul x11, x1, x3
427-
; CHECK-NEXT: adds x8, x15, x8
428-
; CHECK-NEXT: umulh x13, x1, x3
399+
; CHECK-NEXT: asr x9, x1, #63
400+
; CHECK-NEXT: umulh x10, x0, x2
401+
; CHECK-NEXT: asr x13, x3, #63
402+
; CHECK-NEXT: mul x11, x1, x2
403+
; CHECK-NEXT: umulh x8, x1, x2
404+
; CHECK-NEXT: mul x9, x9, x2
405+
; CHECK-NEXT: adds x10, x11, x10
406+
; CHECK-NEXT: mul x14, x0, x3
407+
; CHECK-NEXT: umulh x12, x0, x3
408+
; CHECK-NEXT: adc x9, x8, x9
409+
; CHECK-NEXT: mul x13, x0, x13
410+
; CHECK-NEXT: adds x8, x14, x10
411+
; CHECK-NEXT: mul x15, x1, x3
412+
; CHECK-NEXT: smulh x10, x1, x3
429413
; CHECK-NEXT: mov x1, x8
430-
; CHECK-NEXT: cinc x14, x16, hs
431-
; CHECK-NEXT: adds x12, x12, x14
414+
; CHECK-NEXT: adc x11, x12, x13
415+
; CHECK-NEXT: asr x12, x9, #63
416+
; CHECK-NEXT: asr x13, x11, #63
417+
; CHECK-NEXT: adds x9, x9, x11
418+
; CHECK-NEXT: asr x11, x8, #63
432419
; CHECK-NEXT: mul x0, x0, x2
433-
; CHECK-NEXT: cset w14, hs
434-
; CHECK-NEXT: adds x11, x11, x12
435-
; CHECK-NEXT: asr x12, x8, #63
436-
; CHECK-NEXT: adc x13, x13, x14
437-
; CHECK-NEXT: adds x9, x11, x9
438-
; CHECK-NEXT: adc x10, x13, x10
439-
; CHECK-NEXT: cmp x9, x12
440-
; CHECK-NEXT: ccmp x10, x12, #0, eq
420+
; CHECK-NEXT: adc x12, x12, x13
421+
; CHECK-NEXT: adds x9, x15, x9
422+
; CHECK-NEXT: adc x10, x10, x12
423+
; CHECK-NEXT: cmp x9, x11
424+
; CHECK-NEXT: ccmp x10, x11, #0, eq
441425
; CHECK-NEXT: cset w2, ne
442426
; CHECK-NEXT: ret
443427
%1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
@@ -452,46 +436,38 @@ define { i128, i8 } @i128_overflowing_mul(i128 %x, i128 %y) {
452436
define i128 @i128_saturating_mul(i128 %x, i128 %y) {
453437
; CHECK-LABEL: i128_saturating_mul:
454438
; CHECK: // %bb.0:
455-
; CHECK-NEXT: asr x8, x1, #63
456-
; CHECK-NEXT: asr x11, x3, #63
457-
; CHECK-NEXT: umulh x13, x0, x2
458-
; CHECK-NEXT: mul x9, x2, x8
459-
; CHECK-NEXT: umulh x10, x2, x8
460-
; CHECK-NEXT: umulh x12, x11, x0
461-
; CHECK-NEXT: mul x14, x1, x2
462-
; CHECK-NEXT: add x10, x10, x9
463-
; CHECK-NEXT: madd x8, x3, x8, x10
464-
; CHECK-NEXT: madd x10, x11, x1, x12
465-
; CHECK-NEXT: mul x11, x11, x0
466-
; CHECK-NEXT: umulh x12, x1, x2
467-
; CHECK-NEXT: mul x16, x0, x3
468-
; CHECK-NEXT: add x10, x10, x11
469-
; CHECK-NEXT: adds x9, x11, x9
470-
; CHECK-NEXT: umulh x15, x0, x3
471-
; CHECK-NEXT: adc x8, x10, x8
472-
; CHECK-NEXT: adds x10, x14, x13
473-
; CHECK-NEXT: cinc x12, x12, hs
474-
; CHECK-NEXT: mul x17, x1, x3
475-
; CHECK-NEXT: adds x10, x16, x10
476-
; CHECK-NEXT: umulh x11, x1, x3
477-
; CHECK-NEXT: cinc x13, x15, hs
478-
; CHECK-NEXT: adds x12, x12, x13
479-
; CHECK-NEXT: cset w13, hs
480-
; CHECK-NEXT: adds x12, x17, x12
481-
; CHECK-NEXT: adc x11, x11, x13
482-
; CHECK-NEXT: adds x9, x12, x9
483-
; CHECK-NEXT: asr x12, x10, #63
439+
; CHECK-NEXT: asr x9, x1, #63
440+
; CHECK-NEXT: umulh x10, x0, x2
441+
; CHECK-NEXT: asr x13, x3, #63
442+
; CHECK-NEXT: mul x11, x1, x2
443+
; CHECK-NEXT: umulh x8, x1, x2
444+
; CHECK-NEXT: mul x9, x9, x2
445+
; CHECK-NEXT: adds x10, x11, x10
446+
; CHECK-NEXT: mul x14, x0, x3
447+
; CHECK-NEXT: umulh x12, x0, x3
448+
; CHECK-NEXT: adc x8, x8, x9
449+
; CHECK-NEXT: mul x13, x0, x13
450+
; CHECK-NEXT: adds x9, x14, x10
451+
; CHECK-NEXT: mul x11, x1, x3
452+
; CHECK-NEXT: adc x10, x12, x13
453+
; CHECK-NEXT: smulh x12, x1, x3
454+
; CHECK-NEXT: asr x13, x8, #63
455+
; CHECK-NEXT: asr x14, x10, #63
456+
; CHECK-NEXT: adds x8, x8, x10
457+
; CHECK-NEXT: adc x10, x13, x14
458+
; CHECK-NEXT: adds x8, x11, x8
459+
; CHECK-NEXT: asr x11, x9, #63
484460
; CHECK-NEXT: mul x13, x0, x2
485-
; CHECK-NEXT: adc x8, x11, x8
486-
; CHECK-NEXT: eor x11, x3, x1
487-
; CHECK-NEXT: eor x8, x8, x12
488-
; CHECK-NEXT: eor x9, x9, x12
489-
; CHECK-NEXT: asr x11, x11, #63
490-
; CHECK-NEXT: orr x8, x9, x8
491-
; CHECK-NEXT: eor x9, x11, #0x7fffffffffffffff
461+
; CHECK-NEXT: adc x10, x12, x10
462+
; CHECK-NEXT: eor x12, x3, x1
463+
; CHECK-NEXT: eor x8, x8, x11
464+
; CHECK-NEXT: eor x10, x10, x11
465+
; CHECK-NEXT: asr x11, x12, #63
466+
; CHECK-NEXT: orr x8, x8, x10
467+
; CHECK-NEXT: eor x10, x11, #0x7fffffffffffffff
492468
; CHECK-NEXT: cmp x8, #0
493-
; CHECK-NEXT: csel x1, x9, x10, ne
494469
; CHECK-NEXT: csinv x0, x13, x11, eq
470+
; CHECK-NEXT: csel x1, x10, x9, ne
495471
; CHECK-NEXT: ret
496472
%1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %x, i128 %y)
497473
%2 = extractvalue { i128, i1 } %1, 0

llvm/test/CodeGen/AArch64/umulo-128-legalisation-lowering.ll

Lines changed: 23 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -35,41 +35,33 @@ start:
3535
define i128 @__muloti4(i128 %0, i128 %1, ptr nocapture nonnull writeonly align 4 %2) #2 {
3636
; AARCH-LABEL: __muloti4:
3737
; AARCH: // %bb.0: // %Entry
38-
; AARCH-NEXT: asr x10, x1, #63
38+
; AARCH-NEXT: asr x11, x1, #63
3939
; AARCH-NEXT: asr x9, x3, #63
40-
; AARCH-NEXT: umulh x14, x0, x2
40+
; AARCH-NEXT: umulh x12, x0, x2
4141
; AARCH-NEXT: mov x8, x1
4242
; AARCH-NEXT: str wzr, [x4]
43-
; AARCH-NEXT: mul x12, x2, x10
44-
; AARCH-NEXT: umulh x13, x2, x10
45-
; AARCH-NEXT: umulh x11, x9, x0
46-
; AARCH-NEXT: mul x15, x1, x2
47-
; AARCH-NEXT: add x13, x13, x12
48-
; AARCH-NEXT: madd x11, x9, x1, x11
49-
; AARCH-NEXT: mul x9, x9, x0
50-
; AARCH-NEXT: madd x10, x3, x10, x13
51-
; AARCH-NEXT: umulh x13, x1, x2
52-
; AARCH-NEXT: add x11, x11, x9
53-
; AARCH-NEXT: adds x9, x9, x12
54-
; AARCH-NEXT: mul x16, x0, x3
55-
; AARCH-NEXT: adc x10, x11, x10
56-
; AARCH-NEXT: adds x11, x15, x14
57-
; AARCH-NEXT: umulh x17, x0, x3
58-
; AARCH-NEXT: cinc x13, x13, hs
59-
; AARCH-NEXT: mul x12, x1, x3
60-
; AARCH-NEXT: adds x1, x16, x11
61-
; AARCH-NEXT: umulh x11, x8, x3
62-
; AARCH-NEXT: cinc x14, x17, hs
63-
; AARCH-NEXT: adds x13, x13, x14
43+
; AARCH-NEXT: mul x13, x1, x2
44+
; AARCH-NEXT: umulh x10, x1, x2
45+
; AARCH-NEXT: mul x11, x11, x2
46+
; AARCH-NEXT: adds x12, x13, x12
47+
; AARCH-NEXT: mul x15, x0, x3
48+
; AARCH-NEXT: umulh x14, x0, x3
49+
; AARCH-NEXT: adc x10, x10, x11
50+
; AARCH-NEXT: mul x9, x0, x9
51+
; AARCH-NEXT: mul x16, x1, x3
52+
; AARCH-NEXT: adds x1, x15, x12
53+
; AARCH-NEXT: asr x12, x10, #63
54+
; AARCH-NEXT: smulh x11, x8, x3
55+
; AARCH-NEXT: adc x9, x14, x9
56+
; AARCH-NEXT: asr x13, x9, #63
57+
; AARCH-NEXT: adds x9, x10, x9
58+
; AARCH-NEXT: asr x10, x1, #63
6459
; AARCH-NEXT: mul x0, x0, x2
65-
; AARCH-NEXT: cset w14, hs
66-
; AARCH-NEXT: adds x12, x12, x13
67-
; AARCH-NEXT: asr x13, x1, #63
68-
; AARCH-NEXT: adc x11, x11, x14
69-
; AARCH-NEXT: adds x9, x12, x9
70-
; AARCH-NEXT: adc x10, x11, x10
71-
; AARCH-NEXT: cmp x9, x13
72-
; AARCH-NEXT: ccmp x10, x13, #0, eq
60+
; AARCH-NEXT: adc x12, x12, x13
61+
; AARCH-NEXT: adds x9, x16, x9
62+
; AARCH-NEXT: adc x11, x11, x12
63+
; AARCH-NEXT: cmp x9, x10
64+
; AARCH-NEXT: ccmp x11, x10, #0, eq
7365
; AARCH-NEXT: cset w9, ne
7466
; AARCH-NEXT: tbz x8, #63, .LBB1_2
7567
; AARCH-NEXT: // %bb.1: // %Entry

0 commit comments

Comments
 (0)