llvm
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
Lines changed: 109 additions & 6 deletions
diff --git a/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll b/llvm/test/CodeGen/RISCV/ctlz-cttz-ctpop.ll
Lines changed: 42 additions & 16 deletions
@@ -20,6 +20,7 @@
 #include "RISCVSelectionDAGInfo.h"
 #include "RISCVSubtarget.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -15456,6 +15457,105 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
   return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget);
 }
 
+// Expand a multiply by MulAmt into shifts and add/subs, one operation per
+// nonzero digit of the non-adjacent form (NAF) of the multiplier.
+static SDValue expandMulToNAFSequence(SDNode *N, SelectionDAG &DAG,
+                                      const SDLoc &DL, uint64_t MulAmt) {
+  EVT VT = N->getValueType(0);
+  const uint64_t NumBits = VT.getFixedSizeInBits();
+  SDValue Acc = DAG.getConstant(0, DL, N->getValueType(0));
+  SDValue X = N->getOperand(0);
+
+  // Scan the multiplier from the least significant bit upwards. An odd
+  // remainder yields a +1 digit (low bits 01) or a -1 digit (low bits 11);
+  // removing that digit clears the two low bits of the remainder.
+  uint64_t Rem = MulAmt;
+  for (uint64_t Pos = 0; Rem != 0 && Pos < NumBits; ++Pos, Rem >>= 1) {
+    if ((Rem & 1) == 0)
+      continue;
+
+    const bool IsAdd = (Rem & 3) == 1;
+    if (IsAdd)
+      Rem -= 1;
+    else
+      Rem += 1; // Unsigned arithmetic: wraps modulo 2^64.
+
+    // The digit at position 0 uses the operand directly, without a shift.
+    SDValue Term = X;
+    if (Pos > 0)
+      Term = DAG.getNode(ISD::SHL, DL, VT, X, DAG.getConstant(Pos, DL, VT));
+    Acc = DAG.getNode(IsAdd ? ISD::ADD : ISD::SUB, DL, VT, Acc, Term);
+  }
+  return Acc;
+}
+// Try to expand a multiply by the constant MulAmt into a sequence of shifts
+// and add/subs, for a machine without a native mul instruction. The
+// multiplier is factorized when possible; otherwise its non-adjacent form is
+// expanded directly.
+static SDValue expandMulToBasicOps(SDNode *N, SelectionDAG &DAG,
+                                   uint64_t MulAmt) {
+  EVT VT = N->getValueType(0);
+  const uint64_t BitWidth = VT.getFixedSizeInBits();
+  SDLoc DL(N);
+
+  // Multiplying by zero needs no arithmetic at all.
+  if (MulAmt == 0)
+    return DAG.getConstant(0, DL, N->getValueType(0));
+
+  // Split the multiplier as
+  //   (2^K) * (2^M_1 +/- 1) * (2^M_2 +/- 1) * ... * Rest.
+  const uint64_t Shift = llvm::countr_zero(MulAmt);
+  uint64_t Rest = MulAmt >> Shift;
+
+  // Each entry is {is_2^M+1, M}.
+  llvm::SmallVector<std::pair<bool, uint64_t>> Factors;
+
+  // Divide Rest by Cand and record the factor when Cand divides it evenly.
+  auto TryFactor = [&](uint64_t Cand, bool IsPlus, int64_t M) {
+    if (Rest % Cand != 0)
+      return false;
+    Factors.push_back({IsPlus, M});
+    Rest /= Cand;
+    return true;
+  };
+
+  // Greedily peel factors: every round restarts at the largest exponent and
+  // tries 2^M + 1 before 2^M - 1; stop once a full round finds nothing.
+  bool Progress = true;
+  while (Rest > 1 && Progress) {
+    Progress = false;
+    for (int64_t M = BitWidth - 1; M >= 2 && !Progress; --M) {
+      const uint64_t Pow2 = 1ULL << M;
+      Progress = TryFactor(Pow2 + 1, true, M) || TryFactor(Pow2 - 1, false, M);
+    }
+  }
+
+  SDValue N0 = N->getOperand(0);
+
+  // Without a usable factorization (no factor found, or five or more of
+  // them), expand the whole multiplier through its non-adjacent form.
+  if (Factors.empty() || Factors.size() >= 5)
+    return expandMulToNAFSequence(N, DAG, DL, MulAmt);
+
+  // Start from operand 0 multiplied by the unfactored remainder.
+  SDValue Result = Rest == 1 ? N0 : expandMulToNAFSequence(N, DAG, DL, Rest);
+
+  // Multiply by each 2^M +/- 1 as (Result << M) +/- Result.
+  for (const auto &F : Factors) {
+    SDValue Shifted = DAG.getNode(ISD::SHL, DL, VT, Result,
+                                  DAG.getConstant(F.second, DL, VT));
+    Result = DAG.getNode(F.first ? ISD::ADD : ISD::SUB, DL,
+                         N->getValueType(0), Shifted, Result);
+  }
+
+  // Finally apply the power-of-two part of the multiplier.
+  if (Shift > 0)
+    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
+                         DAG.getConstant(Shift, DL, VT));
+
+  return Result;
+}
+
 // Try to expand a scalar multiply to a faster sequence.
 static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI,
@@ -15467,20 +15567,23 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
   if (DAG.getMachineFunction().getFunction().hasMinSize())
     return SDValue();
 
-  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
-    return SDValue();
-
   if (VT != Subtarget.getXLenVT())
     return SDValue();
 
-  const bool HasShlAdd =
-      Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa();
-
   ConstantSDNode *CNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
   if (!CNode)
     return SDValue();
   uint64_t MulAmt = CNode->getZExtValue();
 
+  if (!Subtarget.hasStdExtM() && !Subtarget.hasStdExtZmmul())
+    return expandMulToBasicOps(N, DAG, MulAmt);
+
+  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+    return SDValue();
+
+  const bool HasShlAdd =
+      Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa();
+
   // WARNING: The code below is knowingly incorrect with regards to undef semantics.
   // We're adding additional uses of X here, and in principle, we should be freezing
   // X before doing so.  However, adding freeze here causes real regressions, and no
 
@@ -262,20 +262,33 @@ define i32 @test_cttz_i32(i32 %a) nounwind {
 ; RV64I-NEXT:    sext.w a1, a0
 ; RV64I-NEXT:    beqz a1, .LBB2_2
 ; RV64I-NEXT:  # %bb.1: # %cond.false
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    negw a1, a0
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 30667
-; RV64I-NEXT:    addiw a1, a1, 1329
-; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    slli a1, a0, 6
+; RV64I-NEXT:    slli a2, a0, 8
+; RV64I-NEXT:    slli a3, a0, 10
+; RV64I-NEXT:    slli a4, a0, 12
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a0, 16
+; RV64I-NEXT:    subw a3, a3, a4
+; RV64I-NEXT:    slli a4, a0, 18
+; RV64I-NEXT:    subw a2, a2, a4
+; RV64I-NEXT:    slli a4, a0, 4
+; RV64I-NEXT:    subw a4, a0, a4
+; RV64I-NEXT:    add a1, a4, a1
+; RV64I-NEXT:    slli a4, a0, 14
+; RV64I-NEXT:    subw a3, a3, a4
+; RV64I-NEXT:    slli a4, a0, 23
+; RV64I-NEXT:    subw a2, a2, a4
+; RV64I-NEXT:    slli a0, a0, 27
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srliw a0, a0, 27
 ; RV64I-NEXT:    lui a1, %hi(.LCPI2_0)
 ; RV64I-NEXT:    addi a1, a1, %lo(.LCPI2_0)
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    lbu a0, 0(a0)
-; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ; RV64I-NEXT:  .LBB2_2:
 ; RV64I-NEXT:    li a0, 32
@@ -730,20 +743,33 @@ define i32 @test_cttz_i32_zero_undef(i32 %a) nounwind {
 ;
 ; RV64I-LABEL: test_cttz_i32_zero_undef:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    addi sp, sp, -16
-; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
-; RV64I-NEXT:    neg a1, a0
+; RV64I-NEXT:    negw a1, a0
 ; RV64I-NEXT:    and a0, a0, a1
-; RV64I-NEXT:    lui a1, 30667
-; RV64I-NEXT:    addiw a1, a1, 1329
-; RV64I-NEXT:    call __muldi3
+; RV64I-NEXT:    slli a1, a0, 6
+; RV64I-NEXT:    slli a2, a0, 8
+; RV64I-NEXT:    slli a3, a0, 10
+; RV64I-NEXT:    slli a4, a0, 12
+; RV64I-NEXT:    add a1, a1, a2
+; RV64I-NEXT:    slli a2, a0, 16
+; RV64I-NEXT:    subw a3, a3, a4
+; RV64I-NEXT:    slli a4, a0, 18
+; RV64I-NEXT:    subw a2, a2, a4
+; RV64I-NEXT:    slli a4, a0, 4
+; RV64I-NEXT:    subw a4, a0, a4
+; RV64I-NEXT:    add a1, a4, a1
+; RV64I-NEXT:    slli a4, a0, 14
+; RV64I-NEXT:    subw a3, a3, a4
+; RV64I-NEXT:    slli a4, a0, 23
+; RV64I-NEXT:    subw a2, a2, a4
+; RV64I-NEXT:    slli a0, a0, 27
+; RV64I-NEXT:    add a1, a1, a3
+; RV64I-NEXT:    add a0, a2, a0
+; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    srliw a0, a0, 27
 ; RV64I-NEXT:    lui a1, %hi(.LCPI6_0)
 ; RV64I-NEXT:    addi a1, a1, %lo(.LCPI6_0)
 ; RV64I-NEXT:    add a0, a1, a0
 ; RV64I-NEXT:    lbu a0, 0(a0)
-; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
-; RV64I-NEXT:    addi sp, sp, 16
 ; RV64I-NEXT:    ret
 ;
 ; RV32M-LABEL: test_cttz_i32_zero_undef: