[RISCV][SDAG] Prefer ShortForwardBranch to lower sdiv by pow2 #67364
Conversation
@llvm/pr-subscribers-backend-aarch64 @llvm/pr-subscribers-backend-x86

Changes

This patch lowers `sdiv x, +/-2**k` to `add + select + shift` when the short forward branch optimization is enabled. The latter instruction sequence is faster than the one generated by the target-independent DAGCombiner. The algorithm is described in *Hacker's Delight*, section 10-1. A brief C sketch of the underlying identity follows the diff below.

This patch also removes duplicate logic in the X86 and AArch64 backends. We cannot do the same for the PowerPC backend, since it generates a special instruction, `addze`.

Full diff: https://github.com/llvm/llvm-project/pull/67364.diff

7 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index b2a0a8c15cf14de..b2ebec61c996df1 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -4915,6 +4915,10 @@ class TargetLowering : public TargetLoweringBase {
SmallVectorImpl<SDNode *> &Created) const;
SDValue BuildUDIV(SDNode *N, SelectionDAG &DAG, bool IsAfterLegalization,
SmallVectorImpl<SDNode *> &Created) const;
+ // Build sdiv by power-of-2 with conditional move instructions
+ SDValue buildSDIVPow2WithCMov(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const;
/// Targets may override this function to provide custom SDIV lowering for
/// power-of-2 denominators. If the target returns an empty SDValue, LLVM
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 6aaaa2f68638c7d..7b6f99365512a43 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6009,6 +6009,48 @@ TargetLowering::BuildSREMPow2(SDNode *N, const APInt &Divisor,
return SDValue();
}
+/// Build sdiv by power-of-2 with conditional move instructions
+/// Ref: "Hacker's Delight" by Henry Warren, Section 10-1.
+/// If conditional move/branch is preferred, we lower sdiv x, +/-2**k into:
+/// bgez x, label
+/// add x, x, 2**k-1
+/// label:
+/// sra res, x, k
+/// neg res, res (when the divisor is negative)
+SDValue TargetLowering::buildSDIVPow2WithCMov(
+ SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const {
+ unsigned Lg2 = Divisor.countr_zero();
+ EVT VT = N->getValueType(0);
+
+ SDLoc DL(N);
+ SDValue N0 = N->getOperand(0);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
+ SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
+
+ // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
+ SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
+ SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
+
+ Created.push_back(Cmp.getNode());
+ Created.push_back(Add.getNode());
+ Created.push_back(CMov.getNode());
+
+ // Divide by pow2.
+ SDValue SRA =
+ DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
+
+ // If we're dividing by a positive value, we're done. Otherwise, we must
+ // negate the result.
+ if (Divisor.isNonNegative())
+ return SRA;
+
+ Created.push_back(SRA.getNode());
+ return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
+}
+
/// Given an ISD::SDIV node expressing a divide by constant,
/// return a DAG expression to select that will generate the same value by
/// multiplying by a magic number.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3de6bd1ec94a82a..fb4a5594a04d5f3 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16102,33 +16102,7 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
!(Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()))
return SDValue();
- SDLoc DL(N);
- SDValue N0 = N->getOperand(0);
- unsigned Lg2 = Divisor.countr_zero();
- SDValue Zero = DAG.getConstant(0, DL, VT);
- SDValue Pow2MinusOne = DAG.getConstant((1ULL << Lg2) - 1, DL, VT);
-
- // Add (N0 < 0) ? Pow2 - 1 : 0;
- SDValue CCVal;
- SDValue Cmp = getAArch64Cmp(N0, Zero, ISD::SETLT, CCVal, DAG, DL);
- SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
- SDValue CSel = DAG.getNode(AArch64ISD::CSEL, DL, VT, Add, N0, CCVal, Cmp);
-
- Created.push_back(Cmp.getNode());
- Created.push_back(Add.getNode());
- Created.push_back(CSel.getNode());
-
- // Divide by pow2.
- SDValue SRA =
- DAG.getNode(ISD::SRA, DL, VT, CSel, DAG.getConstant(Lg2, DL, MVT::i64));
-
- // If we're dividing by a positive value, we're done. Otherwise, we must
- // negate the result.
- if (Divisor.isNonNegative())
- return SRA;
-
- Created.push_back(SRA.getNode());
- return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
+ return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
}
SDValue
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 686350de29883aa..b40885f8d597943 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -18803,6 +18803,26 @@ unsigned RISCVTargetLowering::getCustomCtpopCost(EVT VT,
return isCtpopFast(VT) ? 0 : 1;
}
+SDValue
+RISCVTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const {
+ AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
+ if (isIntDivCheap(N->getValueType(0), Attr))
+ return SDValue(N, 0); // Lower SDIV as SDIV
+
+ // Only perform this transform if short forward branch opt is supported.
+ if (!Subtarget.hasShortForwardBranchOpt())
+ return SDValue();
+ EVT VT = N->getValueType(0);
+ if (!(VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit())))
+ return SDValue();
+ unsigned Lg2 = Divisor.countr_zero();
+ // Ensure 2**k-1 < 2048 so the constant fits in a 12-bit signed immediate.
+ if (Lg2 >= 11)
+ return SDValue();
+ return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
+}
namespace llvm::RISCVVIntrinsicsTable {
#define GET_RISCVVIntrinsicsTable_IMPL
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 0b8e6994a876ac8..5f6f22487d8ac3a 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -954,6 +954,9 @@ class RISCVTargetLowering : public TargetLowering {
/// For available scheduling models FDIV + two independent FMULs are much
/// faster than two FDIVs.
unsigned combineRepeatedFPDivisors() const override;
+
+ SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
+ SmallVectorImpl<SDNode *> &Created) const override;
};
namespace RISCV {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 71998140325b1f7..040358a5d1eeb92 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -22624,32 +22624,7 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
if (Lg2 == 1)
return SDValue();
- SDLoc DL(N);
- SDValue N0 = N->getOperand(0);
- SDValue Zero = DAG.getConstant(0, DL, VT);
- APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2);
- SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT);
-
- // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right.
- SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT);
- SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne);
- SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0);
-
- Created.push_back(Cmp.getNode());
- Created.push_back(Add.getNode());
- Created.push_back(CMov.getNode());
-
- // Divide by pow2.
- SDValue SRA =
- DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
-
- // If we're dividing by a positive value, we're done. Otherwise, we must
- // negate the result.
- if (Divisor.isNonNegative())
- return SRA;
-
- Created.push_back(SRA.getNode());
- return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA);
+ return TargetLowering::buildSDIVPow2WithCMov(N, Divisor, DAG, Created);
}
/// Result of 'and' is compared against zero. Change to a BT node if possible.
diff --git a/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll b/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll
new file mode 100644
index 000000000000000..ba46fe800763368
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/sdiv-pow2-cmov.ll
@@ -0,0 +1,386 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv64 -mattr=+c -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefix=NOSFB %s
+; RUN: llc -mtriple=riscv64 -mcpu=sifive-u74 -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefixes=SFB %s
+
+define signext i32 @sdiv2_32(i32 signext %0) {
+; NOSFB-LABEL: sdiv2_32:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: srliw a1, a0, 31
+; NOSFB-NEXT: add a0, a0, a1
+; NOSFB-NEXT: sraiw a0, a0, 1
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: sdiv2_32:
+; SFB: # %bb.0:
+; SFB-NEXT: bgez a0, .LBB0_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a0, a0, 1
+; SFB-NEXT: .LBB0_2:
+; SFB-NEXT: sraiw a0, a0, 1
+; SFB-NEXT: ret
+ %res = sdiv i32 %0, 2
+ ret i32 %res
+}
+
+define signext i32 @sdivneg2_32(i32 signext %0) {
+; NOSFB-LABEL: sdivneg2_32:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: srliw a1, a0, 31
+; NOSFB-NEXT: add a0, a0, a1
+; NOSFB-NEXT: sraiw a0, a0, 1
+; NOSFB-NEXT: neg a0, a0
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: sdivneg2_32:
+; SFB: # %bb.0:
+; SFB-NEXT: bgez a0, .LBB1_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a0, a0, 1
+; SFB-NEXT: .LBB1_2:
+; SFB-NEXT: sraiw a0, a0, 1
+; SFB-NEXT: neg a0, a0
+; SFB-NEXT: ret
+ %res = sdiv i32 %0, -2
+ ret i32 %res
+}
+
+define i64 @sdiv2_64(i64 %0) {
+; NOSFB-LABEL: sdiv2_64:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: srli a1, a0, 63
+; NOSFB-NEXT: add a0, a0, a1
+; NOSFB-NEXT: srai a0, a0, 1
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: sdiv2_64:
+; SFB: # %bb.0:
+; SFB-NEXT: bgez a0, .LBB2_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a0, a0, 1
+; SFB-NEXT: .LBB2_2:
+; SFB-NEXT: srai a0, a0, 1
+; SFB-NEXT: ret
+ %res = sdiv i64 %0, 2
+ ret i64 %res
+}
+
+define i64 @sdivneg2_64(i64 %0) {
+; NOSFB-LABEL: sdivneg2_64:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: srli a1, a0, 63
+; NOSFB-NEXT: add a0, a0, a1
+; NOSFB-NEXT: srai a0, a0, 1
+; NOSFB-NEXT: neg a0, a0
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: sdivneg2_64:
+; SFB: # %bb.0:
+; SFB-NEXT: bgez a0, .LBB3_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a0, a0, 1
+; SFB-NEXT: .LBB3_2:
+; SFB-NEXT: srai a0, a0, 1
+; SFB-NEXT: neg a0, a0
+; SFB-NEXT: ret
+ %res = sdiv i64 %0, -2
+ ret i64 %res
+}
+
+define signext i32 @srem2_32(i32 signext %0) {
+; NOSFB-LABEL: srem2_32:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: srliw a1, a0, 31
+; NOSFB-NEXT: add a1, a1, a0
+; NOSFB-NEXT: andi a1, a1, -2
+; NOSFB-NEXT: subw a0, a0, a1
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: srem2_32:
+; SFB: # %bb.0:
+; SFB-NEXT: mv a1, a0
+; SFB-NEXT: bgez a0, .LBB4_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a1, a0, 1
+; SFB-NEXT: .LBB4_2:
+; SFB-NEXT: andi a1, a1, -2
+; SFB-NEXT: subw a0, a0, a1
+; SFB-NEXT: ret
+ %res = srem i32 %0, 2
+ ret i32 %res
+}
+
+define signext i32 @sremneg2_32(i32 signext %0) {
+; NOSFB-LABEL: sremneg2_32:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: srliw a1, a0, 31
+; NOSFB-NEXT: add a1, a1, a0
+; NOSFB-NEXT: andi a1, a1, -2
+; NOSFB-NEXT: subw a0, a0, a1
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: sremneg2_32:
+; SFB: # %bb.0:
+; SFB-NEXT: mv a1, a0
+; SFB-NEXT: bgez a0, .LBB5_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a1, a0, 1
+; SFB-NEXT: .LBB5_2:
+; SFB-NEXT: andi a1, a1, -2
+; SFB-NEXT: subw a0, a0, a1
+; SFB-NEXT: ret
+ %res = srem i32 %0, -2
+ ret i32 %res
+}
+
+define i64 @srem2_64(i64 %0) {
+; NOSFB-LABEL: srem2_64:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: srli a1, a0, 63
+; NOSFB-NEXT: add a1, a1, a0
+; NOSFB-NEXT: andi a1, a1, -2
+; NOSFB-NEXT: sub a0, a0, a1
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: srem2_64:
+; SFB: # %bb.0:
+; SFB-NEXT: mv a1, a0
+; SFB-NEXT: bgez a0, .LBB6_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a1, a0, 1
+; SFB-NEXT: .LBB6_2:
+; SFB-NEXT: andi a1, a1, -2
+; SFB-NEXT: sub a0, a0, a1
+; SFB-NEXT: ret
+ %res = srem i64 %0, 2
+ ret i64 %res
+}
+
+define i64 @sremneg2_64(i64 %0) {
+; NOSFB-LABEL: sremneg2_64:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: srli a1, a0, 63
+; NOSFB-NEXT: add a1, a1, a0
+; NOSFB-NEXT: andi a1, a1, -2
+; NOSFB-NEXT: sub a0, a0, a1
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: sremneg2_64:
+; SFB: # %bb.0:
+; SFB-NEXT: mv a1, a0
+; SFB-NEXT: bgez a0, .LBB7_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a1, a0, 1
+; SFB-NEXT: .LBB7_2:
+; SFB-NEXT: andi a1, a1, -2
+; SFB-NEXT: sub a0, a0, a1
+; SFB-NEXT: ret
+ %res = srem i64 %0, -2
+ ret i64 %res
+}
+
+define signext i32 @sdiv8_32(i32 signext %0) {
+; NOSFB-LABEL: sdiv8_32:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: slli a1, a0, 1
+; NOSFB-NEXT: srli a1, a1, 61
+; NOSFB-NEXT: add a0, a0, a1
+; NOSFB-NEXT: sraiw a0, a0, 3
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: sdiv8_32:
+; SFB: # %bb.0:
+; SFB-NEXT: bgez a0, .LBB8_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a0, a0, 7
+; SFB-NEXT: .LBB8_2:
+; SFB-NEXT: sraiw a0, a0, 3
+; SFB-NEXT: ret
+ %res = sdiv i32 %0, 8
+ ret i32 %res
+}
+
+define signext i32 @sdivneg8_32(i32 signext %0) {
+; NOSFB-LABEL: sdivneg8_32:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: slli a1, a0, 1
+; NOSFB-NEXT: srli a1, a1, 61
+; NOSFB-NEXT: add a0, a0, a1
+; NOSFB-NEXT: sraiw a0, a0, 3
+; NOSFB-NEXT: neg a0, a0
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: sdivneg8_32:
+; SFB: # %bb.0:
+; SFB-NEXT: bgez a0, .LBB9_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a0, a0, 7
+; SFB-NEXT: .LBB9_2:
+; SFB-NEXT: sraiw a0, a0, 3
+; SFB-NEXT: neg a0, a0
+; SFB-NEXT: ret
+ %res = sdiv i32 %0, -8
+ ret i32 %res
+}
+
+define i64 @sdiv8_64(i64 %0) {
+; NOSFB-LABEL: sdiv8_64:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: srai a1, a0, 63
+; NOSFB-NEXT: srli a1, a1, 61
+; NOSFB-NEXT: add a0, a0, a1
+; NOSFB-NEXT: srai a0, a0, 3
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: sdiv8_64:
+; SFB: # %bb.0:
+; SFB-NEXT: bgez a0, .LBB10_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a0, a0, 7
+; SFB-NEXT: .LBB10_2:
+; SFB-NEXT: srai a0, a0, 3
+; SFB-NEXT: ret
+ %res = sdiv i64 %0, 8
+ ret i64 %res
+}
+
+define i64 @sdivneg8_64(i64 %0) {
+; NOSFB-LABEL: sdivneg8_64:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: srai a1, a0, 63
+; NOSFB-NEXT: srli a1, a1, 61
+; NOSFB-NEXT: add a0, a0, a1
+; NOSFB-NEXT: srai a0, a0, 3
+; NOSFB-NEXT: neg a0, a0
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: sdivneg8_64:
+; SFB: # %bb.0:
+; SFB-NEXT: bgez a0, .LBB11_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a0, a0, 7
+; SFB-NEXT: .LBB11_2:
+; SFB-NEXT: srai a0, a0, 3
+; SFB-NEXT: neg a0, a0
+; SFB-NEXT: ret
+ %res = sdiv i64 %0, -8
+ ret i64 %res
+}
+
+define signext i32 @srem8_32(i32 signext %0) {
+; NOSFB-LABEL: srem8_32:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: slli a1, a0, 1
+; NOSFB-NEXT: srli a1, a1, 61
+; NOSFB-NEXT: add a1, a1, a0
+; NOSFB-NEXT: andi a1, a1, -8
+; NOSFB-NEXT: subw a0, a0, a1
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: srem8_32:
+; SFB: # %bb.0:
+; SFB-NEXT: mv a1, a0
+; SFB-NEXT: bgez a0, .LBB12_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a1, a0, 7
+; SFB-NEXT: .LBB12_2:
+; SFB-NEXT: andi a1, a1, -8
+; SFB-NEXT: subw a0, a0, a1
+; SFB-NEXT: ret
+ %res = srem i32 %0, 8
+ ret i32 %res
+}
+
+define signext i32 @sremneg8_32(i32 signext %0) {
+; NOSFB-LABEL: sremneg8_32:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: slli a1, a0, 1
+; NOSFB-NEXT: srli a1, a1, 61
+; NOSFB-NEXT: add a1, a1, a0
+; NOSFB-NEXT: andi a1, a1, -8
+; NOSFB-NEXT: subw a0, a0, a1
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: sremneg8_32:
+; SFB: # %bb.0:
+; SFB-NEXT: mv a1, a0
+; SFB-NEXT: bgez a0, .LBB13_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a1, a0, 7
+; SFB-NEXT: .LBB13_2:
+; SFB-NEXT: andi a1, a1, -8
+; SFB-NEXT: subw a0, a0, a1
+; SFB-NEXT: ret
+ %res = srem i32 %0, -8
+ ret i32 %res
+}
+
+define i64 @srem8_64(i64 %0) {
+; NOSFB-LABEL: srem8_64:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: srai a1, a0, 63
+; NOSFB-NEXT: srli a1, a1, 61
+; NOSFB-NEXT: add a1, a1, a0
+; NOSFB-NEXT: andi a1, a1, -8
+; NOSFB-NEXT: sub a0, a0, a1
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: srem8_64:
+; SFB: # %bb.0:
+; SFB-NEXT: mv a1, a0
+; SFB-NEXT: bgez a0, .LBB14_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a1, a0, 7
+; SFB-NEXT: .LBB14_2:
+; SFB-NEXT: andi a1, a1, -8
+; SFB-NEXT: sub a0, a0, a1
+; SFB-NEXT: ret
+ %res = srem i64 %0, 8
+ ret i64 %res
+}
+
+define i64 @sremneg8_64(i64 %0) {
+; NOSFB-LABEL: sremneg8_64:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: srai a1, a0, 63
+; NOSFB-NEXT: srli a1, a1, 61
+; NOSFB-NEXT: add a1, a1, a0
+; NOSFB-NEXT: andi a1, a1, -8
+; NOSFB-NEXT: sub a0, a0, a1
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: sremneg8_64:
+; SFB: # %bb.0:
+; SFB-NEXT: mv a1, a0
+; SFB-NEXT: bgez a0, .LBB15_2
+; SFB-NEXT: # %bb.1:
+; SFB-NEXT: addi a1, a0, 7
+; SFB-NEXT: .LBB15_2:
+; SFB-NEXT: andi a1, a1, -8
+; SFB-NEXT: sub a0, a0, a1
+; SFB-NEXT: ret
+ %res = srem i64 %0, -8
+ ret i64 %res
+}
+
+; Negative test: for a divisor of 4096, 2**12-1 = 4095 does not fit in a
+; 12-bit signed immediate, so the SFB lowering is not used.
+define i64 @sdiv4096(i64 %0) {
+; NOSFB-LABEL: sdiv4096:
+; NOSFB: # %bb.0:
+; NOSFB-NEXT: srai a1, a0, 63
+; NOSFB-NEXT: srli a1, a1, 52
+; NOSFB-NEXT: add a0, a0, a1
+; NOSFB-NEXT: srai a0, a0, 12
+; NOSFB-NEXT: ret
+;
+; SFB-LABEL: sdiv4096:
+; SFB: # %bb.0:
+; SFB-NEXT: srai a1, a0, 63
+; SFB-NEXT: srli a1, a1, 52
+; SFB-NEXT: add a0, a0, a1
+; SFB-NEXT: srai a0, a0, 12
+; SFB-NEXT: ret
+ %res = sdiv i64 %0, 4096
+ ret i64 %res
+}
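For readers following the algorithm, here is a minimal C sketch (not part of the patch) of the Hacker's Delight identity that `buildSDIVPow2WithCMov` emits as add + select + shift. It assumes arithmetic right shift for signed integers, which holds on the targets discussed here:

```c
#include <assert.h>

/* Signed divide by 2**k, truncating toward zero (Hacker's Delight 10-1).
 * A plain arithmetic shift rounds toward negative infinity, so negative
 * dividends must first be biased by 2**k - 1. The (x < 0) select is what
 * SFB turns into a short forward branch around the add. */
static int sdiv_pow2(int x, unsigned k) {
  int bias = (1 << k) - 1;       /* e.g. k = 3 -> bias = 7 */
  int t = x < 0 ? x + bias : x;  /* conditional add: the "select" */
  return t >> k;                 /* arithmetic shift right by k */
}

int main(void) {
  /* Worked example: -7 / 4. Plain -7 >> 2 == -2 (rounds down), but C
   * truncation requires -1; (-7 + 3) >> 2 == -4 >> 2 == -1. */
  assert(sdiv_pow2(-7, 2) == -7 / 4);
  assert(sdiv_pow2(21, 3) == 21 / 8);
  /* A negative divisor -2**k is handled by negating the result. */
  assert(-sdiv_pow2(-7, 2) == -7 / -4);
  return 0;
}
```

With SFB, the select lowers to a branch over a single `addi`, which supporting cores execute cheaply; this is why the RISC-V hook gates the transform on `hasShortForwardBranchOpt()`.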
Force-pushed from 2147442 to 5826025.
Address comments.
Force-pushed from d34e7b6 to 9c31964.
Ping @topperc @davemgreen
LGTM
Any more comments? @davemgreen
Yeah, I think I'm happy. So long as nothing is getting worse, this LGTM. Thanks
[RISCV][SDAG] Prefer ShortForwardBranch to lower sdiv by pow2 (#67364)

This patch lowers `sdiv x, +/-2**k` to `add + select + shift` when the short forward branch optimization is enabled. The latter instruction sequence is faster than the one generated by the target-independent DAGCombiner. The algorithm is described in *Hacker's Delight*.

This patch also removes duplicate logic in the X86 and AArch64 backends. We cannot do the same for the PowerPC backend, since it generates a special instruction, `addze`.