Skip to content

Commit f73da9e

Browse files
committed
[RISCV] Expand constant multiplication for targets without M extension
1 parent de0bcd0 commit f73da9e

12 files changed

+1491
-700
lines changed

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 110 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "RISCVSelectionDAGInfo.h"
2121
#include "RISCVSubtarget.h"
2222
#include "llvm/ADT/SmallSet.h"
23+
#include "llvm/ADT/SmallVector.h"
2324
#include "llvm/ADT/Statistic.h"
2425
#include "llvm/Analysis/MemoryLocation.h"
2526
#include "llvm/Analysis/ValueTracking.h"
@@ -15502,6 +15503,105 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
1550215503
return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget);
1550315504
}
1550415505

15506+
/// Expand `X * MulAmt` into a chain of shifts and adds/subs derived from the
/// non-adjacent form (NAF) of the multiplier.
///
/// The multiplier is rewritten with signed digits in {-1, 0, +1}: scanning
/// from the least significant bit, an odd remainder ending in binary `01`
/// contributes `+ (X << I)` and one ending in `11` contributes `- (X << I)`.
/// After either adjustment the remainder ends in `00`, so the digit that
/// follows a non-zero digit is always zero.
///
/// \param N      The multiply node. Operand 0 is the value being multiplied
///               and the node's result type is used for every created node.
/// \param DAG    The DAG in which the replacement nodes are created.
/// \param DL     Debug location attached to every created node.
/// \param MulAmt The constant multiplier. Only the low BitWidth bits matter;
///               the arithmetic is performed modulo 2^BitWidth.
/// \returns The value computing `X * MulAmt`, built as an ADD/SUB chain that
///          is seeded with the constant 0.
static SDValue expandMulToNAFSequence(SDNode *N, SelectionDAG &DAG,
                                      const SDLoc &DL, uint64_t MulAmt) {
  const EVT VT = N->getValueType(0);
  const uint64_t BitWidth = VT.getFixedSizeInBits();
  const SDValue N0 = N->getOperand(0);

  SDValue Result = DAG.getConstant(0, DL, VT);

  // Derive the NAF digits and emit the matching add/sub term in one pass;
  // there is no need to materialize the digit sequence first.
  for (uint64_t E = MulAmt, I = 0; E && I < BitWidth; ++I, E >>= 1) {
    if ((E & 1) == 0)
      continue; // Zero digit: nothing to emit for this bit position.

    // Low bits `01` -> digit +1, low bits `11` -> digit -1.
    const bool IsAdd = (E & 3) == 1;

    // Remove the digit from the remainder. For the -1 digit this adds one,
    // which may wrap an all-ones remainder to zero; that is intended because
    // the multiplication is modulo 2^64 (and modulo 2^BitWidth overall).
    if (IsAdd)
      E -= 1;
    else
      E += 1;

    // Bit position 0 needs no shift at all.
    SDValue Term = N0;
    if (I > 0)
      Term = DAG.getNode(ISD::SHL, DL, VT, N0,
                         DAG.getShiftAmountConstant(I, VT, DL));

    Result =
        DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, Term);
  }

  return Result;
}
15537+
// Try to expand a multiply to a sequence of shifts and add/subs,
15538+
// for a machine without native mul instruction.
15539+
static SDValue expandMulToBasicOps(SDNode *N, SelectionDAG &DAG,
15540+
uint64_t MulAmt) {
15541+
EVT VT = N->getValueType(0);
15542+
const uint64_t BitWidth = VT.getFixedSizeInBits();
15543+
SDLoc DL(N);
15544+
15545+
if (MulAmt == 0)
15546+
return DAG.getConstant(0, DL, N->getValueType(0));
15547+
15548+
// Try to factorize into (2^N) * (2^M_1 +/- 1) * (2^M_2 +/- 1) * ...
15549+
uint64_t TrailingZeros = llvm::countr_zero(MulAmt);
15550+
uint64_t E = MulAmt >> TrailingZeros;
15551+
15552+
llvm::SmallVector<std::pair<bool, uint64_t>> Factors; // {is_2^M+1, M}
15553+
15554+
while (E > 1) {
15555+
bool Found = false;
15556+
for (int64_t I = BitWidth - 1; I >= 2; --I) {
15557+
uint64_t Factor = 1ULL << I;
15558+
15559+
if (E % (Factor + 1) == 0) {
15560+
Factors.push_back({true, I});
15561+
E /= Factor + 1;
15562+
Found = true;
15563+
break;
15564+
}
15565+
if (E % (Factor - 1) == 0) {
15566+
Factors.push_back({false, I});
15567+
E /= Factor - 1;
15568+
Found = true;
15569+
break;
15570+
}
15571+
}
15572+
if (!Found)
15573+
break;
15574+
}
15575+
15576+
SDValue Result;
15577+
SDValue N0 = N->getOperand(0);
15578+
15579+
bool UseFactorization = !Factors.empty() && (Factors.size() < 5);
15580+
15581+
if (UseFactorization) {
15582+
if (E == 1)
15583+
Result = N0;
15584+
else
15585+
Result = expandMulToNAFSequence(N, DAG, DL, E);
15586+
15587+
for (const auto &F : Factors) {
15588+
SDValue ShiftVal = DAG.getNode(ISD::SHL, DL, VT, Result,
15589+
DAG.getConstant(F.second, DL, VT));
15590+
15591+
ISD::NodeType AddSubOp = F.first ? ISD::ADD : ISD::SUB;
15592+
Result = DAG.getNode(AddSubOp, DL, N->getValueType(0), ShiftVal, Result);
15593+
}
15594+
15595+
if (TrailingZeros > 0)
15596+
Result = DAG.getNode(ISD::SHL, DL, VT, Result,
15597+
DAG.getConstant(TrailingZeros, DL, VT));
15598+
15599+
return Result;
15600+
}
15601+
15602+
return expandMulToNAFSequence(N, DAG, DL, MulAmt);
15603+
}
15604+
1550515605
// X * (2^N +/- 2^M) -> (add/sub (shl X, C1), (shl X, C2))
1550615606
static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG,
1550715607
uint64_t MulAmt) {
@@ -15537,21 +15637,24 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
1553715637
if (DAG.getMachineFunction().getFunction().hasMinSize())
1553815638
return SDValue();
1553915639

15540-
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
15541-
return SDValue();
15542-
1554315640
if (VT != Subtarget.getXLenVT())
1554415641
return SDValue();
1554515642

15546-
const bool HasShlAdd = Subtarget.hasStdExtZba() ||
15547-
Subtarget.hasVendorXTHeadBa() ||
15548-
Subtarget.hasVendorXAndesPerf();
15549-
1555015643
ConstantSDNode *CNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
1555115644
if (!CNode)
1555215645
return SDValue();
1555315646
uint64_t MulAmt = CNode->getZExtValue();
1555415647

15648+
if (!Subtarget.hasStdExtM() && !Subtarget.hasStdExtZmmul())
15649+
return expandMulToBasicOps(N, DAG, MulAmt);
15650+
15651+
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
15652+
return SDValue();
15653+
15654+
const bool HasShlAdd = Subtarget.hasStdExtZba() ||
15655+
Subtarget.hasVendorXTHeadBa() ||
15656+
Subtarget.hasVendorXAndesPerf();
15657+
1555515658
// WARNING: The code below is knowingly incorrect with regards to undef semantics.
1555615659
// We're adding additional uses of X here, and in principle, we should be freezing
1555715660
// X before doing so. However, adding freeze here causes real regressions, and no

llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -262,20 +262,33 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
262262
; RV64I-NEXT: sext.w a1, a0
263263
; RV64I-NEXT: beqz a1, .LBB2_2
264264
; RV64I-NEXT: # %bb.1: # %cond.false
265-
; RV64I-NEXT: addi sp, sp, -16
266-
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
267-
; RV64I-NEXT: neg a1, a0
265+
; RV64I-NEXT: negw a1, a0
268266
; RV64I-NEXT: and a0, a0, a1
269-
; RV64I-NEXT: lui a1, 30667
270-
; RV64I-NEXT: addiw a1, a1, 1329
271-
; RV64I-NEXT: call __muldi3
267+
; RV64I-NEXT: slli a1, a0, 6
268+
; RV64I-NEXT: slli a2, a0, 8
269+
; RV64I-NEXT: slli a3, a0, 10
270+
; RV64I-NEXT: slli a4, a0, 12
271+
; RV64I-NEXT: add a1, a1, a2
272+
; RV64I-NEXT: slli a2, a0, 16
273+
; RV64I-NEXT: subw a3, a3, a4
274+
; RV64I-NEXT: slli a4, a0, 18
275+
; RV64I-NEXT: subw a2, a2, a4
276+
; RV64I-NEXT: slli a4, a0, 4
277+
; RV64I-NEXT: subw a4, a0, a4
278+
; RV64I-NEXT: add a1, a4, a1
279+
; RV64I-NEXT: slli a4, a0, 14
280+
; RV64I-NEXT: subw a3, a3, a4
281+
; RV64I-NEXT: slli a4, a0, 23
282+
; RV64I-NEXT: subw a2, a2, a4
283+
; RV64I-NEXT: slli a0, a0, 27
284+
; RV64I-NEXT: add a1, a1, a3
285+
; RV64I-NEXT: add a0, a2, a0
286+
; RV64I-NEXT: add a0, a1, a0
272287
; RV64I-NEXT: srliw a0, a0, 27
273288
; RV64I-NEXT: lui a1, %hi(.LCPI2_0)
274289
; RV64I-NEXT: addi a1, a1, %lo(.LCPI2_0)
275290
; RV64I-NEXT: add a0, a1, a0
276291
; RV64I-NEXT: lbu a0, 0(a0)
277-
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
278-
; RV64I-NEXT: addi sp, sp, 16
279292
; RV64I-NEXT: ret
280293
; RV64I-NEXT: .LBB2_2:
281294
; RV64I-NEXT: li a0, 32
@@ -730,20 +743,33 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
730743
;
731744
; RV64I-LABEL: test_cttz_i32_zero_undef:
732745
; RV64I: # %bb.0:
733-
; RV64I-NEXT: addi sp, sp, -16
734-
; RV64I-NEXT: sd ra, 8(sp) # 8-byte Folded Spill
735-
; RV64I-NEXT: neg a1, a0
746+
; RV64I-NEXT: negw a1, a0
736747
; RV64I-NEXT: and a0, a0, a1
737-
; RV64I-NEXT: lui a1, 30667
738-
; RV64I-NEXT: addiw a1, a1, 1329
739-
; RV64I-NEXT: call __muldi3
748+
; RV64I-NEXT: slli a1, a0, 6
749+
; RV64I-NEXT: slli a2, a0, 8
750+
; RV64I-NEXT: slli a3, a0, 10
751+
; RV64I-NEXT: slli a4, a0, 12
752+
; RV64I-NEXT: add a1, a1, a2
753+
; RV64I-NEXT: slli a2, a0, 16
754+
; RV64I-NEXT: subw a3, a3, a4
755+
; RV64I-NEXT: slli a4, a0, 18
756+
; RV64I-NEXT: subw a2, a2, a4
757+
; RV64I-NEXT: slli a4, a0, 4
758+
; RV64I-NEXT: subw a4, a0, a4
759+
; RV64I-NEXT: add a1, a4, a1
760+
; RV64I-NEXT: slli a4, a0, 14
761+
; RV64I-NEXT: subw a3, a3, a4
762+
; RV64I-NEXT: slli a4, a0, 23
763+
; RV64I-NEXT: subw a2, a2, a4
764+
; RV64I-NEXT: slli a0, a0, 27
765+
; RV64I-NEXT: add a1, a1, a3
766+
; RV64I-NEXT: add a0, a2, a0
767+
; RV64I-NEXT: add a0, a1, a0
740768
; RV64I-NEXT: srliw a0, a0, 27
741769
; RV64I-NEXT: lui a1, %hi(.LCPI6_0)
742770
; RV64I-NEXT: addi a1, a1, %lo(.LCPI6_0)
743771
; RV64I-NEXT: add a0, a1, a0
744772
; RV64I-NEXT: lbu a0, 0(a0)
745-
; RV64I-NEXT: ld ra, 8(sp) # 8-byte Folded Reload
746-
; RV64I-NEXT: addi sp, sp, 16
747773
; RV64I-NEXT: ret
748774
;
749775
; RV32M-LABEL: test_cttz_i32_zero_undef:

0 commit comments

Comments
 (0)