Skip to content

Commit dcd246c

Browse files
authored
[ARM] Add scalar add_sat costs. (#100988)
These can usually generate: - qadd / qsub for signed i32 scalars - uqadd16 / qadd16 / uqsub16 / qsub16 with an extend for signed/unsigned i8/i16 - Are expanded to an add + cmp + sel otherwise This can lead to differences in unrolling etc, but should be a better cost for the instructions.
1 parent c649194 commit dcd246c

File tree

4 files changed

+134
-112
lines changed

4 files changed

+134
-112
lines changed

llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1888,7 +1888,8 @@ ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
18881888
InstructionCost
18891889
ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
18901890
TTI::TargetCostKind CostKind) {
1891-
switch (ICA.getID()) {
1891+
unsigned Opc = ICA.getID();
1892+
switch (Opc) {
18921893
case Intrinsic::get_active_lane_mask:
18931894
// Currently we make a somewhat optimistic assumption that
18941895
// active_lane_mask's are always free. In reality it may be freely folded
@@ -1904,17 +1905,38 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
19041905
case Intrinsic::ssub_sat:
19051906
case Intrinsic::uadd_sat:
19061907
case Intrinsic::usub_sat: {
1908+
// Add-vs-sub is independent of signedness: both sadd_sat and uadd_sat add.
bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::uadd_sat);
1909+
bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1910+
Type *RetTy = ICA.getReturnType();
1911+
1912+
if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1913+
if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
1914+
return 1; // qadd / qsub
1915+
if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
1916+
return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
1917+
// Otherwise return the cost of expanding the node. Generally an add +
1918+
// icmp + sel.
1919+
CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1920+
Type *CondTy = RetTy->getWithNewBitWidth(1);
1921+
return getArithmeticInstrCost(IsAdd ? Instruction::Add : Instruction::Sub,
1922+
RetTy, CostKind) +
1923+
2 * getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred,
1924+
CostKind) +
1925+
2 * getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred,
1926+
CostKind);
1927+
}
1928+
19071929
if (!ST->hasMVEIntegerOps())
19081930
break;
1909-
Type *VT = ICA.getReturnType();
19101931

1911-
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
1932+
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
19121933
if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
19131934
LT.second == MVT::v16i8) {
19141935
// This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
19151936
// need to extend the type, as it uses shr(qadd(shl, shl)).
19161937
unsigned Instrs =
1917-
LT.second.getScalarSizeInBits() == VT->getScalarSizeInBits() ? 1 : 4;
1938+
LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
1939+
: 4;
19181940
return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
19191941
}
19201942
break;
@@ -1948,7 +1970,7 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
19481970
case Intrinsic::fptoui_sat: {
19491971
if (ICA.getArgTypes().empty())
19501972
break;
1951-
bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1973+
bool IsSigned = Opc == Intrinsic::fptosi_sat;
19521974
auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
19531975
EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
19541976
// Check for the legal types, with the correct subtarget features.

0 commit comments

Comments
 (0)