Skip to content

Commit 1cc9447

Browse files
committed
[RISCV] Expand constant multiplication for targets without M extension
1 parent 8b010e8 commit 1cc9447

19 files changed

+1704
-971
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 82 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -20,17 +20,20 @@
2020
#include "RISCVSelectionDAGInfo.h"
2121
#include "RISCVSubtarget.h"
2222
#include "llvm/ADT/SmallSet.h"
23+
#include "llvm/ADT/SmallVector.h"
2324
#include "llvm/ADT/Statistic.h"
2425
#include "llvm/Analysis/MemoryLocation.h"
2526
#include "llvm/Analysis/ValueTracking.h"
2627
#include "llvm/Analysis/VectorUtils.h"
28+
#include "llvm/CodeGen/ISDOpcodes.h"
2729
#include "llvm/CodeGen/MachineFrameInfo.h"
2830
#include "llvm/CodeGen/MachineFunction.h"
2931
#include "llvm/CodeGen/MachineInstrBuilder.h"
3032
#include "llvm/CodeGen/MachineJumpTableInfo.h"
3133
#include "llvm/CodeGen/MachineRegisterInfo.h"
3234
#include "llvm/CodeGen/SDPatternMatch.h"
3335
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
36+
#include "llvm/CodeGen/SelectionDAGNodes.h"
3437
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
3538
#include "llvm/CodeGen/ValueTypes.h"
3639
#include "llvm/IR/DiagnosticInfo.h"
@@ -15436,6 +15439,73 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
1543615439
return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget);
1543715440
}
1543815441

15442+
// Try to expand a multiply to a sequence of shifts and add/subs,
15443+
// for a machine w/o native mul instruction.
15444+
static SDValue expandMulToBasicOps(SDNode *N, SelectionDAG &DAG,
15445+
uint64_t MulAmt) {
15446+
const uint64_t BitWidth = N->getValueType(0).getFixedSizeInBits();
15447+
SDLoc DL(N);
15448+
15449+
if (MulAmt == 0)
15450+
return DAG.getConstant(0, DL, N->getValueType(0));
15451+
15452+
// Find the Non-adjacent form of the multiplier.
15453+
llvm::SmallVector<std::pair<bool, uint64_t>> Sequence; // {isAdd, shamt}
15454+
uint64_t E = MulAmt;
15455+
uint64_t I = 0;
15456+
while (E > 0) {
15457+
if (E & 1) {
15458+
if (I >= BitWidth)
15459+
break;
15460+
int8_t Z = ((E & 3) == 1) ? 1 : -1;
15461+
Sequence.push_back(std::make_pair((Z == 1), I));
15462+
E -= Z;
15463+
}
15464+
E >>= 1;
15465+
I++;
15466+
}
15467+
15468+
SDValue Result = DAG.getConstant(0, DL, N->getValueType(0));
15469+
SDValue N0 = N->getOperand(0);
15470+
15471+
for (const auto &Op : Sequence) {
15472+
SDValue ShiftVal;
15473+
if (Op.second > 0)
15474+
ShiftVal =
15475+
DAG.getNode(ISD::SHL, DL, N->getValueType(0), N0,
15476+
DAG.getConstant(Op.second, DL, N->getValueType(0)));
15477+
else
15478+
ShiftVal = N0;
15479+
15480+
ISD::NodeType AddSubOp = Op.first ? ISD::ADD : ISD::SUB;
15481+
Result = DAG.getNode(AddSubOp, DL, N->getValueType(0), Result, ShiftVal);
15482+
}
15483+
15484+
return Result;
15485+
}
15486+
15487+
// 2^N +/- 2^M -> (add/sub (shl X, C1), (shl X, C2))
15488+
static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG,
15489+
uint64_t MulAmt) {
15490+
uint64_t MulAmtLowBit = MulAmt & (-MulAmt);
15491+
ISD::NodeType Op;
15492+
if (isPowerOf2_64(MulAmt + MulAmtLowBit))
15493+
Op = ISD::SUB;
15494+
else if (isPowerOf2_64(MulAmt - MulAmtLowBit))
15495+
Op = ISD::ADD;
15496+
else
15497+
return SDValue();
15498+
uint64_t ShiftAmt1 = MulAmt + MulAmtLowBit;
15499+
SDLoc DL(N);
15500+
SDValue Shift1 =
15501+
DAG.getNode(ISD::SHL, DL, N->getValueType(0), N->getOperand(0),
15502+
DAG.getConstant(Log2_64(ShiftAmt1), DL, N->getValueType(0)));
15503+
SDValue Shift2 = DAG.getNode(
15504+
ISD::SHL, DL, N->getValueType(0), N->getOperand(0),
15505+
DAG.getConstant(Log2_64(MulAmtLowBit), DL, N->getValueType(0)));
15506+
return DAG.getNode(Op, DL, N->getValueType(0), Shift1, Shift2);
15507+
}
15508+
1543915509
// Try to expand a scalar multiply to a faster sequence.
1544015510
static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
1544115511
TargetLowering::DAGCombinerInfo &DCI,
@@ -15447,20 +15517,23 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
1544715517
if (DAG.getMachineFunction().getFunction().hasMinSize())
1544815518
return SDValue();
1544915519

15450-
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
15451-
return SDValue();
15452-
1545315520
if (VT != Subtarget.getXLenVT())
1545415521
return SDValue();
1545515522

15456-
const bool HasShlAdd =
15457-
Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa();
15458-
1545915523
ConstantSDNode *CNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
1546015524
if (!CNode)
1546115525
return SDValue();
1546215526
uint64_t MulAmt = CNode->getZExtValue();
1546315527

15528+
if (!Subtarget.hasStdExtM() && !Subtarget.hasStdExtZmmul())
15529+
return expandMulToBasicOps(N, DAG, MulAmt);
15530+
15531+
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
15532+
return SDValue();
15533+
15534+
const bool HasShlAdd =
15535+
Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa();
15536+
1546415537
// WARNING: The code below is knowingly incorrect with regards to undef semantics.
1546515538
// We're adding additional uses of X here, and in principle, we should be freezing
1546615539
// X before doing so. However, adding freeze here causes real regressions, and no
@@ -15569,22 +15642,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
1556915642
return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359);
1557015643
}
1557115644
}
15572-
}
15573-
15574-
// 2^N - 2^M -> (sub (shl X, C1), (shl X, C2))
15575-
uint64_t MulAmtLowBit = MulAmt & (-MulAmt);
15576-
if (isPowerOf2_64(MulAmt + MulAmtLowBit)) {
15577-
uint64_t ShiftAmt1 = MulAmt + MulAmtLowBit;
15578-
SDLoc DL(N);
15579-
SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
15580-
DAG.getConstant(Log2_64(ShiftAmt1), DL, VT));
15581-
SDValue Shift2 =
15582-
DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
15583-
DAG.getConstant(Log2_64(MulAmtLowBit), DL, VT));
15584-
return DAG.getNode(ISD::SUB, DL, VT, Shift1, Shift2);
15585-
}
1558615645

15587-
if (HasShlAdd) {
1558815646
for (uint64_t Divisor : {3, 5, 9}) {
1558915647
if (MulAmt % Divisor != 0)
1559015648
continue;
@@ -15610,6 +15668,9 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
1561015668
}
1561115669
}
1561215670

15671+
if (SDValue V = expandMulToAddOrSubOfShl(N, DAG, MulAmt))
15672+
return V;
15673+
1561315674
return SDValue();
1561415675
}
1561515676

llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -262,20 +262,33 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
262262
; RV64I-NEXT: sext.w a1, a0
263263
; RV64I-NEXT: beqz a1, .LBB2_2
264264
; RV64I-NEXT: # %bb.1: # %cond.false
265-
; RV64I-NEXT: addi sp, sp, -16
266-
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
267-
; RV64I-NEXT: neg a1, a0
265+
; RV64I-NEXT: negw a1, a0
268266
; RV64I-NEXT: and a0, a0, a1
269-
; RV64I-NEXT: lui a1, 30667
270-
; RV64I-NEXT: addiw a1, a1, 1329
271-
; RV64I-NEXT: call __muldi3
267+
; RV64I-NEXT: slli a1, a0, 6
268+
; RV64I-NEXT: slli a2, a0, 8
269+
; RV64I-NEXT: slli a3, a0, 10
270+
; RV64I-NEXT: slli a4, a0, 12
271+
; RV64I-NEXT: add a1, a1, a2
272+
; RV64I-NEXT: slli a2, a0, 16
273+
; RV64I-NEXT: subw a3, a3, a4
274+
; RV64I-NEXT: slli a4, a0, 18
275+
; RV64I-NEXT: subw a2, a2, a4
276+
; RV64I-NEXT: slli a4, a0, 4
277+
; RV64I-NEXT: subw a4, a0, a4
278+
; RV64I-NEXT: add a1, a4, a1
279+
; RV64I-NEXT: slli a4, a0, 14
280+
; RV64I-NEXT: subw a3, a3, a4
281+
; RV64I-NEXT: slli a4, a0, 23
282+
; RV64I-NEXT: subw a2, a2, a4
283+
; RV64I-NEXT: slli a0, a0, 27
284+
; RV64I-NEXT: add a1, a1, a3
285+
; RV64I-NEXT: add a0, a2, a0
286+
; RV64I-NEXT: add a0, a1, a0
272287
; RV64I-NEXT: srliw a0, a0, 27
273288
; RV64I-NEXT: lui a1, %hi(.LCPI2_0)
274289
; RV64I-NEXT: addi a1, a1, %lo(.LCPI2_0)
275290
; RV64I-NEXT: add a0, a1, a0
276291
; RV64I-NEXT: lbu a0, 0(a0)
277-
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
278-
; RV64I-NEXT: addi sp, sp, 16
279292
; RV64I-NEXT: ret
280293
; RV64I-NEXT: .LBB2_2:
281294
; RV64I-NEXT: li a0, 32
@@ -718,20 +731,33 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
718731
;
719732
; RV64I-LABEL: test_cttz_i32_zero_undef:
720733
; RV64I: # %bb.0:
721-
; RV64I-NEXT: addi sp, sp, -16
722-
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
723-
; RV64I-NEXT: neg a1, a0
734+
; RV64I-NEXT: negw a1, a0
724735
; RV64I-NEXT: and a0, a0, a1
725-
; RV64I-NEXT: lui a1, 30667
726-
; RV64I-NEXT: addiw a1, a1, 1329
727-
; RV64I-NEXT: call __muldi3
736+
; RV64I-NEXT: slli a1, a0, 6
737+
; RV64I-NEXT: slli a2, a0, 8
738+
; RV64I-NEXT: slli a3, a0, 10
739+
; RV64I-NEXT: slli a4, a0, 12
740+
; RV64I-NEXT: add a1, a1, a2
741+
; RV64I-NEXT: slli a2, a0, 16
742+
; RV64I-NEXT: subw a3, a3, a4
743+
; RV64I-NEXT: slli a4, a0, 18
744+
; RV64I-NEXT: subw a2, a2, a4
745+
; RV64I-NEXT: slli a4, a0, 4
746+
; RV64I-NEXT: subw a4, a0, a4
747+
; RV64I-NEXT: add a1, a4, a1
748+
; RV64I-NEXT: slli a4, a0, 14
749+
; RV64I-NEXT: subw a3, a3, a4
750+
; RV64I-NEXT: slli a4, a0, 23
751+
; RV64I-NEXT: subw a2, a2, a4
752+
; RV64I-NEXT: slli a0, a0, 27
753+
; RV64I-NEXT: add a1, a1, a3
754+
; RV64I-NEXT: add a0, a2, a0
755+
; RV64I-NEXT: add a0, a1, a0
728756
; RV64I-NEXT: srliw a0, a0, 27
729757
; RV64I-NEXT: lui a1, %hi(.LCPI6_0)
730758
; RV64I-NEXT: addi a1, a1, %lo(.LCPI6_0)
731759
; RV64I-NEXT: add a0, a1, a0
732760
; RV64I-NEXT: lbu a0, 0(a0)
733-
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
734-
; RV64I-NEXT: addi sp, sp, 16
735761
; RV64I-NEXT: ret
736762
;
737763
; RV32M-LABEL: test_cttz_i32_zero_undef:

0 commit comments

Comments
 (0)