[RISCV] Expand constant multiplication for targets without M extension #137195
Conversation
@llvm/pr-subscribers-backend-risc-v

Author: Iris Shi (el-ev)

Changes

Closes #137023

Patch is 48.24 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137195.diff

9 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 02451ee716865..34d789d1ff5c8 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -15436,6 +15436,31 @@ static SDValue performXORCombine(SDNode *N, SelectionDAG &DAG,
return combineSelectAndUseCommutative(N, DAG, /*AllOnes*/ false, Subtarget);
}
+// 2^N +/- 2^M -> (add/sub (shl X, C1), (shl X, C2))
+static SDValue expandMulToAddOrSubOfShl(SDNode *N, SelectionDAG &DAG) {
+ ConstantSDNode *CNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!CNode)
+ return SDValue();
+ uint64_t MulAmt = CNode->getZExtValue();
+ uint64_t MulAmtLowBit = MulAmt & (-MulAmt);
+ ISD::NodeType Op;
+ if (isPowerOf2_64(MulAmt + MulAmtLowBit))
+ Op = ISD::SUB;
+ else if (isPowerOf2_64(MulAmt - MulAmtLowBit))
+ Op = ISD::ADD;
+ else
+ return SDValue();
+ uint64_t ShiftAmt1 = MulAmt + MulAmtLowBit;
+ SDLoc DL(N);
+ SDValue Shift1 =
+ DAG.getNode(ISD::SHL, DL, N->getValueType(0), N->getOperand(0),
+ DAG.getConstant(Log2_64(ShiftAmt1), DL, N->getValueType(0)));
+ SDValue Shift2 = DAG.getNode(
+ ISD::SHL, DL, N->getValueType(0), N->getOperand(0),
+ DAG.getConstant(Log2_64(MulAmtLowBit), DL, N->getValueType(0)));
+ return DAG.getNode(Op, DL, N->getValueType(0), Shift1, Shift2);
+}
+
// Try to expand a scalar multiply to a faster sequence.
static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -15443,18 +15468,24 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
+ const bool HasShlAdd =
+ Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa();
+
// LI + MUL is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction().hasMinSize())
return SDValue();
- if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
- return SDValue();
-
if (VT != Subtarget.getXLenVT())
return SDValue();
- const bool HasShlAdd =
- Subtarget.hasStdExtZba() || Subtarget.hasVendorXTHeadBa();
+ // This may prevent some ShlAdd optimizations. Try this combination
+ // later if we have that.
+ if (!HasShlAdd)
+ if (SDValue V = expandMulToAddOrSubOfShl(N, DAG))
+ return V;
+
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
ConstantSDNode *CNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!CNode)
@@ -15569,22 +15600,7 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::SUB, DL, VT, Shift1, Mul359);
}
}
- }
- // 2^N - 2^M -> (sub (shl X, C1), (shl X, C2))
- uint64_t MulAmtLowBit = MulAmt & (-MulAmt);
- if (isPowerOf2_64(MulAmt + MulAmtLowBit)) {
- uint64_t ShiftAmt1 = MulAmt + MulAmtLowBit;
- SDLoc DL(N);
- SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(ShiftAmt1), DL, VT));
- SDValue Shift2 =
- DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
- DAG.getConstant(Log2_64(MulAmtLowBit), DL, VT));
- return DAG.getNode(ISD::SUB, DL, VT, Shift1, Shift2);
- }
-
- if (HasShlAdd) {
for (uint64_t Divisor : {3, 5, 9}) {
if (MulAmt % Divisor != 0)
continue;
@@ -15608,6 +15624,10 @@ static SDValue expandMul(SDNode *N, SelectionDAG &DAG,
}
}
}
+
+ // Delayed
+ if (SDValue V = expandMulToAddOrSubOfShl(N, DAG))
+ return V;
}
return SDValue();
diff --git a/llvm/test/CodeGen/RISCV/mul.ll b/llvm/test/CodeGen/RISCV/mul.ll
index 548c7e1c6ea8c..15aa522815605 100644
--- a/llvm/test/CodeGen/RISCV/mul.ll
+++ b/llvm/test/CodeGen/RISCV/mul.ll
@@ -464,11 +464,45 @@ define i32 @mulhu_constant(i32 %a) nounwind {
ret i32 %4
}
+define i32 @muli32_p10(i32 %a) nounwind {
+; RV32I-LABEL: muli32_p10:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a1, a0, 1
+; RV32I-NEXT: slli a0, a0, 3
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV32IM-LABEL: muli32_p10:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: slli a1, a0, 1
+; RV32IM-NEXT: slli a0, a0, 3
+; RV32IM-NEXT: add a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV64I-LABEL: muli32_p10:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a1, a0, 1
+; RV64I-NEXT: slli a0, a0, 3
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64IM-LABEL: muli32_p10:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a1, a0, 1
+; RV64IM-NEXT: slli a0, a0, 3
+; RV64IM-NEXT: addw a0, a0, a1
+; RV64IM-NEXT: ret
+ %1 = mul i32 %a, 10
+ ret i32 %1
+}
+
define i32 @muli32_p14(i32 %a) nounwind {
; RV32I-LABEL: muli32_p14:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a1, 14
-; RV32I-NEXT: tail __mulsi3
+; RV32I-NEXT: slli a1, a0, 1
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: ret
;
; RV32IM-LABEL: muli32_p14:
; RV32IM: # %bb.0:
@@ -494,11 +528,45 @@ define i32 @muli32_p14(i32 %a) nounwind {
ret i32 %1
}
+define i32 @muli32_p20(i32 %a) nounwind {
+; RV32I-LABEL: muli32_p20:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a1, a0, 2
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: add a0, a0, a1
+; RV32I-NEXT: ret
+;
+; RV32IM-LABEL: muli32_p20:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: slli a1, a0, 2
+; RV32IM-NEXT: slli a0, a0, 4
+; RV32IM-NEXT: add a0, a0, a1
+; RV32IM-NEXT: ret
+;
+; RV64I-LABEL: muli32_p20:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a1, a0, 2
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64IM-LABEL: muli32_p20:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a1, a0, 2
+; RV64IM-NEXT: slli a0, a0, 4
+; RV64IM-NEXT: addw a0, a0, a1
+; RV64IM-NEXT: ret
+ %1 = mul i32 %a, 20
+ ret i32 %1
+}
+
define i32 @muli32_p28(i32 %a) nounwind {
; RV32I-LABEL: muli32_p28:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a1, 28
-; RV32I-NEXT: tail __mulsi3
+; RV32I-NEXT: slli a1, a0, 2
+; RV32I-NEXT: slli a0, a0, 5
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: ret
;
; RV32IM-LABEL: muli32_p28:
; RV32IM: # %bb.0:
@@ -527,8 +595,10 @@ define i32 @muli32_p28(i32 %a) nounwind {
define i32 @muli32_p30(i32 %a) nounwind {
; RV32I-LABEL: muli32_p30:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a1, 30
-; RV32I-NEXT: tail __mulsi3
+; RV32I-NEXT: slli a1, a0, 1
+; RV32I-NEXT: slli a0, a0, 5
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: ret
;
; RV32IM-LABEL: muli32_p30:
; RV32IM: # %bb.0:
@@ -557,8 +627,10 @@ define i32 @muli32_p30(i32 %a) nounwind {
define i32 @muli32_p56(i32 %a) nounwind {
; RV32I-LABEL: muli32_p56:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a1, 56
-; RV32I-NEXT: tail __mulsi3
+; RV32I-NEXT: slli a1, a0, 3
+; RV32I-NEXT: slli a0, a0, 6
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: ret
;
; RV32IM-LABEL: muli32_p56:
; RV32IM: # %bb.0:
@@ -587,8 +659,10 @@ define i32 @muli32_p56(i32 %a) nounwind {
define i32 @muli32_p60(i32 %a) nounwind {
; RV32I-LABEL: muli32_p60:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a1, 60
-; RV32I-NEXT: tail __mulsi3
+; RV32I-NEXT: slli a1, a0, 2
+; RV32I-NEXT: slli a0, a0, 6
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: ret
;
; RV32IM-LABEL: muli32_p60:
; RV32IM: # %bb.0:
@@ -617,8 +691,10 @@ define i32 @muli32_p60(i32 %a) nounwind {
define i32 @muli32_p62(i32 %a) nounwind {
; RV32I-LABEL: muli32_p62:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a1, 62
-; RV32I-NEXT: tail __mulsi3
+; RV32I-NEXT: slli a1, a0, 1
+; RV32I-NEXT: slli a0, a0, 6
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: ret
;
; RV32IM-LABEL: muli32_p62:
; RV32IM: # %bb.0:
@@ -672,6 +748,34 @@ define i32 @muli32_p65(i32 %a) nounwind {
ret i32 %1
}
+define i32 @muli32_p66(i32 %a) nounwind {
+; RV32I-LABEL: muli32_p66:
+; RV32I: # %bb.0:
+; RV32I-NEXT: slli a1, a0, 6
+; RV32I-NEXT: add a0, a1, a0
+; RV32I-NEXT: ret
+;
+; RV32IM-LABEL: muli32_p66:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: slli a1, a0, 6
+; RV32IM-NEXT: add a0, a1, a0
+; RV32IM-NEXT: ret
+;
+; RV64I-LABEL: muli32_p66:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a1, a0, 6
+; RV64I-NEXT: addw a0, a1, a0
+; RV64I-NEXT: ret
+;
+; RV64IM-LABEL: muli32_p66:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a1, a0, 6
+; RV64IM-NEXT: addw a0, a1, a0
+; RV64IM-NEXT: ret
+ %1 = mul i32 %a, 65
+ ret i32 %1
+}
+
define i32 @muli32_p63(i32 %a) nounwind {
; RV32I-LABEL: muli32_p63:
; RV32I: # %bb.0:
@@ -778,7 +882,89 @@ define i64 @muli64_p63(i64 %a) nounwind {
ret i64 %1
}
+define i64 @muli64_p60(i64 %a) nounwind {
+; RV32I-LABEL: muli64_p60:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li a2, 60
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: call __muldi3
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IM-LABEL: muli64_p60:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a2, 60
+; RV32IM-NEXT: slli a3, a1, 2
+; RV32IM-NEXT: slli a1, a1, 6
+; RV32IM-NEXT: sub a1, a1, a3
+; RV32IM-NEXT: slli a3, a0, 2
+; RV32IM-NEXT: mulhu a2, a0, a2
+; RV32IM-NEXT: slli a0, a0, 6
+; RV32IM-NEXT: add a1, a2, a1
+; RV32IM-NEXT: sub a0, a0, a3
+; RV32IM-NEXT: ret
+;
+; RV64I-LABEL: muli64_p60:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a1, a0, 2
+; RV64I-NEXT: slli a0, a0, 6
+; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64IM-LABEL: muli64_p60:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a1, a0, 2
+; RV64IM-NEXT: slli a0, a0, 6
+; RV64IM-NEXT: sub a0, a0, a1
+; RV64IM-NEXT: ret
+ %1 = mul i64 %a, 60
+ ret i64 %1
+}
+define i64 @muli64_p68(i64 %a) nounwind {
+; RV32I-LABEL: muli64_p68:
+; RV32I: # %bb.0:
+; RV32I-NEXT: addi sp, sp, -16
+; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT: li a2, 68
+; RV32I-NEXT: li a3, 0
+; RV32I-NEXT: call __muldi3
+; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT: addi sp, sp, 16
+; RV32I-NEXT: ret
+;
+; RV32IM-LABEL: muli64_p68:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: li a2, 68
+; RV32IM-NEXT: slli a3, a1, 2
+; RV32IM-NEXT: slli a1, a1, 6
+; RV32IM-NEXT: add a1, a1, a3
+; RV32IM-NEXT: slli a3, a0, 2
+; RV32IM-NEXT: mulhu a2, a0, a2
+; RV32IM-NEXT: slli a0, a0, 6
+; RV32IM-NEXT: add a1, a2, a1
+; RV32IM-NEXT: add a0, a0, a3
+; RV32IM-NEXT: ret
+;
+; RV64I-LABEL: muli64_p68:
+; RV64I: # %bb.0:
+; RV64I-NEXT: slli a1, a0, 2
+; RV64I-NEXT: slli a0, a0, 6
+; RV64I-NEXT: add a0, a0, a1
+; RV64I-NEXT: ret
+;
+; RV64IM-LABEL: muli64_p68:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a1, a0, 2
+; RV64IM-NEXT: slli a0, a0, 6
+; RV64IM-NEXT: add a0, a0, a1
+; RV64IM-NEXT: ret
+ %1 = mul i64 %a, 68
+ ret i64 %1
+}
define i32 @muli32_m63(i32 %a) nounwind {
; RV32I-LABEL: muli32_m63:
@@ -930,8 +1116,10 @@ define i64 @muli64_m65(i64 %a) nounwind {
define i32 @muli32_p384(i32 %a) nounwind {
; RV32I-LABEL: muli32_p384:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a1, 384
-; RV32I-NEXT: tail __mulsi3
+; RV32I-NEXT: slli a1, a0, 7
+; RV32I-NEXT: slli a0, a0, 9
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: ret
;
; RV32IM-LABEL: muli32_p384:
; RV32IM: # %bb.0:
@@ -960,8 +1148,10 @@ define i32 @muli32_p384(i32 %a) nounwind {
define i32 @muli32_p12288(i32 %a) nounwind {
; RV32I-LABEL: muli32_p12288:
; RV32I: # %bb.0:
-; RV32I-NEXT: lui a1, 3
-; RV32I-NEXT: tail __mulsi3
+; RV32I-NEXT: slli a1, a0, 12
+; RV32I-NEXT: slli a0, a0, 14
+; RV32I-NEXT: sub a0, a0, a1
+; RV32I-NEXT: ret
;
; RV32IM-LABEL: muli32_p12288:
; RV32IM: # %bb.0:
@@ -1137,12 +1327,16 @@ define i64 @muli64_p4352(i64 %a) nounwind {
;
; RV32IM-LABEL: muli64_p4352:
; RV32IM: # %bb.0:
+; RV32IM-NEXT: slli a2, a1, 8
+; RV32IM-NEXT: slli a1, a1, 12
+; RV32IM-NEXT: add a1, a1, a2
; RV32IM-NEXT: li a2, 17
; RV32IM-NEXT: slli a2, a2, 8
-; RV32IM-NEXT: mul a1, a1, a2
-; RV32IM-NEXT: mulhu a3, a0, a2
-; RV32IM-NEXT: add a1, a3, a1
-; RV32IM-NEXT: mul a0, a0, a2
+; RV32IM-NEXT: mulhu a2, a0, a2
+; RV32IM-NEXT: add a1, a2, a1
+; RV32IM-NEXT: slli a2, a0, 8
+; RV32IM-NEXT: slli a0, a0, 12
+; RV32IM-NEXT: add a0, a0, a2
; RV32IM-NEXT: ret
;
; RV64I-LABEL: muli64_p4352:
@@ -1327,10 +1521,10 @@ define i128 @muli128_m3840(i128 %a) nounwind {
; RV32I-NEXT: sltu a7, a5, a4
; RV32I-NEXT: sub a6, a6, t2
; RV32I-NEXT: mv t1, a7
-; RV32I-NEXT: beq t0, a3, .LBB36_2
+; RV32I-NEXT: beq t0, a3, .LBB41_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: sltu t1, t0, a3
-; RV32I-NEXT: .LBB36_2:
+; RV32I-NEXT: .LBB41_2:
; RV32I-NEXT: sub a2, a2, a1
; RV32I-NEXT: sub a1, t0, a3
; RV32I-NEXT: sub a5, a5, a4
@@ -1441,10 +1635,10 @@ define i128 @muli128_m63(i128 %a) nounwind {
; RV32I-NEXT: sltu a7, a3, a6
; RV32I-NEXT: or t0, t0, a5
; RV32I-NEXT: mv a5, a7
-; RV32I-NEXT: beq a4, t0, .LBB37_2
+; RV32I-NEXT: beq a4, t0, .LBB42_2
; RV32I-NEXT: # %bb.1:
; RV32I-NEXT: sltu a5, a4, t0
-; RV32I-NEXT: .LBB37_2:
+; RV32I-NEXT: .LBB42_2:
; RV32I-NEXT: srli t1, a4, 26
; RV32I-NEXT: slli t2, a2, 6
; RV32I-NEXT: srli t3, a2, 26
@@ -1869,12 +2063,16 @@ define i64 @muland_demand(i64 %x) nounwind {
; RV64I-NEXT: li a1, -29
; RV64I-NEXT: srli a1, a1, 2
; RV64I-NEXT: and a0, a0, a1
-; RV64I-NEXT: li a1, 12
-; RV64I-NEXT: tail __muldi3
+; RV64I-NEXT: slli a1, a0, 2
+; RV64I-NEXT: slli a0, a0, 4
+; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: ret
;
; RV64IM-LABEL: muland_demand:
; RV64IM: # %bb.0:
-; RV64IM-NEXT: andi a0, a0, -8
+; RV64IM-NEXT: li a1, -29
+; RV64IM-NEXT: srli a1, a1, 2
+; RV64IM-NEXT: and a0, a0, a1
; RV64IM-NEXT: slli a1, a0, 2
; RV64IM-NEXT: slli a0, a0, 4
; RV64IM-NEXT: sub a0, a0, a1
@@ -1905,9 +2103,10 @@ define i64 @mulzext_demand(i32 signext %x) nounwind {
;
; RV64I-LABEL: mulzext_demand:
; RV64I: # %bb.0:
-; RV64I-NEXT: li a1, 3
-; RV64I-NEXT: slli a1, a1, 32
-; RV64I-NEXT: tail __muldi3
+; RV64I-NEXT: slli a1, a0, 32
+; RV64I-NEXT: slli a0, a0, 34
+; RV64I-NEXT: sub a0, a0, a1
+; RV64I-NEXT: ret
;
; RV64IM-LABEL: mulzext_demand:
; RV64IM: # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll
index 44ab0e1fef6c1..0fc0adbfa83d9 100644
--- a/llvm/test/CodeGen/RISCV/rv32xtheadba.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xtheadba.ll
@@ -116,8 +116,9 @@ define i32 @addmul6(i32 %a, i32 %b) {
define i32 @addmul10(i32 %a, i32 %b) {
; RV32I-LABEL: addmul10:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a2, 10
-; RV32I-NEXT: mul a0, a0, a2
+; RV32I-NEXT: slli a2, a0, 1
+; RV32I-NEXT: slli a0, a0, 3
+; RV32I-NEXT: add a0, a0, a2
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
@@ -153,8 +154,9 @@ define i32 @addmul12(i32 %a, i32 %b) {
define i32 @addmul18(i32 %a, i32 %b) {
; RV32I-LABEL: addmul18:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a2, 18
-; RV32I-NEXT: mul a0, a0, a2
+; RV32I-NEXT: slli a2, a0, 1
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: add a0, a0, a2
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
@@ -171,8 +173,9 @@ define i32 @addmul18(i32 %a, i32 %b) {
define i32 @addmul20(i32 %a, i32 %b) {
; RV32I-LABEL: addmul20:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a2, 20
-; RV32I-NEXT: mul a0, a0, a2
+; RV32I-NEXT: slli a2, a0, 2
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: add a0, a0, a2
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
@@ -208,8 +211,9 @@ define i32 @addmul24(i32 %a, i32 %b) {
define i32 @addmul36(i32 %a, i32 %b) {
; RV32I-LABEL: addmul36:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a2, 36
-; RV32I-NEXT: mul a0, a0, a2
+; RV32I-NEXT: slli a2, a0, 2
+; RV32I-NEXT: slli a0, a0, 5
+; RV32I-NEXT: add a0, a0, a2
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
@@ -226,8 +230,9 @@ define i32 @addmul36(i32 %a, i32 %b) {
define i32 @addmul40(i32 %a, i32 %b) {
; RV32I-LABEL: addmul40:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a2, 40
-; RV32I-NEXT: mul a0, a0, a2
+; RV32I-NEXT: slli a2, a0, 3
+; RV32I-NEXT: slli a0, a0, 5
+; RV32I-NEXT: add a0, a0, a2
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
@@ -244,8 +249,9 @@ define i32 @addmul40(i32 %a, i32 %b) {
define i32 @addmul72(i32 %a, i32 %b) {
; RV32I-LABEL: addmul72:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a2, 72
-; RV32I-NEXT: mul a0, a0, a2
+; RV32I-NEXT: slli a2, a0, 3
+; RV32I-NEXT: slli a0, a0, 6
+; RV32I-NEXT: add a0, a0, a2
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
@@ -279,8 +285,9 @@ define i32 @mul96(i32 %a) {
define i32 @mul160(i32 %a) {
; RV32I-LABEL: mul160:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a1, 160
-; RV32I-NEXT: mul a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 5
+; RV32I-NEXT: slli a0, a0, 7
+; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
; RV32XTHEADBA-LABEL: mul160:
@@ -312,8 +319,9 @@ define i32 @mul200(i32 %a) {
define i32 @mul288(i32 %a) {
; RV32I-LABEL: mul288:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a1, 288
-; RV32I-NEXT: mul a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 5
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
; RV32XTHEADBA-LABEL: mul288:
@@ -328,8 +336,9 @@ define i32 @mul288(i32 %a) {
define i32 @mul258(i32 %a) {
; RV32I-LABEL: mul258:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a1, 258
-; RV32I-NEXT: mul a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 1
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
; RV32XTHEADBA-LABEL: mul258:
@@ -344,8 +353,9 @@ define i32 @mul258(i32 %a) {
define i32 @mul260(i32 %a) {
; RV32I-LABEL: mul260:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a1, 260
-; RV32I-NEXT: mul a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 2
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
; RV32XTHEADBA-LABEL: mul260:
@@ -360,8 +370,9 @@ define i32 @mul260(i32 %a) {
define i32 @mul264(i32 %a) {
; RV32I-LABEL: mul264:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a1, 264
-; RV32I-NEXT: mul a0, a0, a1
+; RV32I-NEXT: slli a1, a0, 3
+; RV32I-NEXT: slli a0, a0, 8
+; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
; RV32XTHEADBA-LABEL: mul264:
diff --git a/llvm/test/CodeGen/RISCV/rv32zba.ll b/llvm/test/CodeGen/RISCV/rv32zba.ll
index fec156ac2be27..f8ca41782c6e1 100644
--- a/llvm/test/CodeGen/RISCV/rv32zba.ll
+++ b/llvm/test/CodeGen/RISCV/rv32zba.ll
@@ -82,8 +82,9 @@ define i32 @addmul6(i32 %a, i32 %b) {
define i32 @addmul10(i32 %a, i32 %b) {
; RV32I-LABEL: addmul10:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a2, 10
-; RV32I-NEXT: mul a0, a0, a2
+; RV32I-NEXT: slli a2, a0, 1
+; RV32I-NEXT: slli a0, a0, 3
+; RV32I-NEXT: add a0, a0, a2
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
@@ -119,8 +120,9 @@ define i32 @addmul12(i32 %a, i32 %b) {
define i32 @addmul18(i32 %a, i32 %b) {
; RV32I-LABEL: addmul18:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a2, 18
-; RV32I-NEXT: mul a0, a0, a2
+; RV32I-NEXT: slli a2, a0, 1
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: add a0, a0, a2
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
@@ -137,8 +139,9 @@ define i32 @addmul18(i32 %a, i32 %b) {
define i32 @addmul20(i32 %a, i32 %b) {
; RV32I-LABEL: addmul20:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a2, 20
-; RV32I-NEXT: mul a0, a0, a2
+; RV32I-NEXT: slli a2, a0, 2
+; RV32I-NEXT: slli a0, a0, 4
+; RV32I-NEXT: add a0, a0, a2
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
@@ -174,8 +177,9 @@ define i32 @addmul24(i32 %a, i32 %b) {
define i32 @addmul36(i32 %a, i32 %b) {
; RV32I-LABEL: addmul36:
; RV32I: # %bb.0:
-; RV32I-NEXT: li a2, 36
-; RV32I-NEXT: mul a0, a0, a2
+; RV32I-NEXT: slli a2, a0, 2
+; RV32I-NEXT: slli a0, a0, 5
+; RV32I-NEXT: add a0, a0, a2
; RV32I-NEXT: add a0, a0, a1
; RV32I-NEXT: ret
;
@@ -192,...
[truncated]
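The new combine fires when the constant has the form 2^N ± 2^M. As a rough standalone illustration of that check (plain C++ mirroring the arithmetic above, not SelectionDAG code; decomposeMulAmt and isPow2 are invented names). Note the in-tree patch computes the larger shift as Log2_64(MulAmt + MulAmtLowBit) in both cases; since integer log2 truncates, that is equivalent to the per-case form used here:

```cpp
#include <cassert>
#include <cstdint>

// Sketch: a constant qualifies for the (shl, shl, add/sub) expansion when
// adding or clearing its lowest set bit yields a power of two.
static bool isPow2(uint64_t X) { return X && (X & (X - 1)) == 0; }

// If MulAmt == (1 << Sh1) +/- (1 << Sh2), report the two shift amounts.
static bool decomposeMulAmt(uint64_t MulAmt, unsigned &Sh1, unsigned &Sh2,
                            bool &IsSub) {
  uint64_t LowBit = MulAmt & -MulAmt;   // lowest set bit
  if (isPow2(MulAmt + LowBit)) {        // e.g. 14 = 16 - 2
    IsSub = true;
    Sh1 = __builtin_ctzll(MulAmt + LowBit);
  } else if (isPow2(MulAmt - LowBit)) { // e.g. 20 = 16 + 4
    IsSub = false;
    Sh1 = __builtin_ctzll(MulAmt - LowBit);
  } else {
    return false;
  }
  Sh2 = __builtin_ctzll(LowBit);
  return true;
}

int main() {
  unsigned Sh1, Sh2;
  bool IsSub;
  // 14 * x -> (x << 4) - (x << 1), matching the muli32_p14 test above.
  assert(decomposeMulAmt(14, Sh1, Sh2, IsSub) && IsSub && Sh1 == 4 && Sh2 == 1);
  // 20 * x -> (x << 4) + (x << 2), matching muli32_p20.
  assert(decomposeMulAmt(20, Sh1, Sh2, IsSub) && !IsSub && Sh1 == 4 && Sh2 == 2);
}
```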
Should we be solving the general case? What if there are more than 2 bits set in the multiplicand?
I'm not sure whether it's profitable enough to break down a mul into more than two shifts. However, it may be beneficial if the mul could be broken down to a short shift-and-add sequence.
The original bug report was for CPUs that don't implement M or Zmmul. For those CPUs, we can probably use more instructions to avoid a library call. Unless we're optimizing for size.
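To make that cost argument concrete: even the naive general expansion needs only one shift plus one add per set bit of the constant, which already beats the libcall's dynamic instruction count by a wide margin. A minimal sketch of that naive scheme (illustrative C++, not the PR's code; mulConstBinary is an invented name):

```cpp
#include <cassert>
#include <cstdint>

// Sketch: multiply x by a constant c using one shl + one add per set bit.
// That costs popcount(c) shifts and popcount(c)-1 adds; for c = 0xffffffff
// that is ~63 instructions, versus roughly 200 dynamic instructions for the
// __mulsi3 software loop.
uint64_t mulConstBinary(uint64_t x, uint64_t c) {
  uint64_t acc = 0;
  while (c) {
    uint64_t low = c & -c;            // lowest set bit of the constant
    acc += x << __builtin_ctzll(low); // add the corresponding shifted term
    c -= low;
  }
  return acc;
}

int main() { assert(mulConstBinary(7, 14) == 98); }
```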
Force-pushed from 0336625 to 1cc9447
"2^N + 2^M expanding pattern for mul": force-pushed from d2308b1 to 1af67fb
✅ With the latest revision this PR passed the C/C++ code formatter.
Force-pushed from 79e4586 to 42bd798
This seems to be three independent changes: […]
I suggest splitting into separate PRs. Regarding the second change, why are we doing it? It changes:
[…]
to:
[…]
It makes sense. I will update this PR to include only the first change.
Can you expand your review description with the motivation, and a bit of justification? In particular:
- Why is unconditional expansion the right answer?
- Why not use the simpler sum of powers of two? How much does the complexity of the factorization and NAF lowering really buy us? Feel free to cite a paper/blog if this is "well known".
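For context on the second question (a sketch of textbook non-adjacent-form extraction, not code from this patch; toNAF is an invented name): NAF rewrites the constant with digits in {-1, 0, +1} such that no two adjacent digits are nonzero. That cuts the expected number of nonzero digits from about n/2 for plain binary to about n/3, and it collapses runs of ones entirely, e.g. 0x7fffffff becomes (x << 31) - x.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Sketch: standard NAF digit extraction. The digit is +1 when c % 4 == 1
// and -1 when c % 4 == 3; subtracting the digit clears the low bit before
// each shift. Unsigned wraparound keeps the result correct mod 2^64.
std::vector<int> toNAF(uint64_t c) {
  std::vector<int> digits;
  while (c) {
    int d = 0;
    if (c & 1)
      d = 2 - (int)(c & 3); // +1 or -1
    digits.push_back(d);
    c -= (uint64_t)d;       // d == -1 adds 1 (mod 2^64)
    c >>= 1;
  }
  return digits;
}

int main() {
  // 0x7fffffff has 31 set bits in binary but only two nonzero NAF digits:
  // -1 at bit 0 and +1 at bit 31, i.e. (x << 31) - x.
  for (int d : toNAF(0x7fffffff))
    printf("%d ", d);
  printf("\n");
}
```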
Force-pushed from 6d7a485 to a9379d0
Force-pushed from a9379d0 to 0cb0fad
}
// Try to expand a multiply to a sequence of shifts and add/subs,
// for a machine without native mul instruction.
static SDValue expandMulToBasicOps(SDNode *N, SelectionDAG &DAG,
Would you mind dropping this from the current patch, and using only the expandMulToNAFSequence scheme? You can add back the second option in a second review; I just want to focus review time.
Updated.
Force-pushed from 0cb0fad to 038b1a6
LGTM
Force-pushed from 038b1a6 to e9a9dbc
LGTM
Closes #137023
On RISC-V machines without a native multiply instruction (e.g., the rv32i base ISA), multiplying a variable by a constant integer often compiles to a call to a library routine like __mul{s,d}i3. That library function implements multiplication in software using a loop of shifts and adds, processing the constant bit by bit. On rv32i it executes a minimum of 8 instructions (for a multiply by 0) and up to about 200 (for a multiply by 0xffffffff), and it involves heavy branching and function-call overhead.

When not optimizing for size, we can instead expand the constant multiplication into a sequence of shift and add/sub instructions. For now we use the non-adjacent form for the shift and add/sub sequence, which could save 1/2 to 2/3 of the instructions compared to a shl+add-only sequence.
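Putting the pieces together, here is a hedged end-to-end sketch of the expansion described above, written as plain C++ rather than DAG nodes (mulConstNAF is an invented name; the in-tree implementation builds the equivalent ISD::SHL/ADD/SUB nodes instead):

```cpp
#include <cassert>
#include <cstdint>

// Sketch: multiply x by a constant c using only shifts and add/sub,
// driven by c's non-adjacent form. Each nonzero NAF digit costs one
// shl plus one add or sub, so runs of ones in c collapse to two ops.
uint64_t mulConstNAF(uint64_t x, uint64_t c) {
  uint64_t acc = 0;
  unsigned bit = 0;
  while (c) {
    if (c & 1) {
      if ((c & 3) == 1) { // NAF digit +1
        acc += x << bit;
        c -= 1;
      } else {            // NAF digit -1 (c % 4 == 3)
        acc -= x << bit;
        c += 1;           // may wrap at 2^64; still correct mod 2^64
      }
    }
    c >>= 1;
    ++bit;
  }
  return acc;
}

int main() {
  // 4352 = 2^12 + 2^8 (the muli64_p4352 test): two shifts and one add.
  assert(mulConstNAF(7, 4352) == 7u * 4352u);
  // 0xffffffff -> (x << 32) - x: two instructions instead of a libcall.
  assert(mulConstNAF(7, 0xffffffffull) == 7 * 0xffffffffull);
}
```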