@@ -1888,7 +1888,8 @@ ARMTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1888
1888
InstructionCost
1889
1889
ARMTTIImpl::getIntrinsicInstrCost (const IntrinsicCostAttributes &ICA,
1890
1890
TTI::TargetCostKind CostKind) {
1891
- switch (ICA.getID ()) {
1891
+ unsigned Opc = ICA.getID ();
1892
+ switch (Opc) {
1892
1893
case Intrinsic::get_active_lane_mask:
1893
1894
// Currently we make a somewhat optimistic assumption that
1894
1895
// active_lane_mask's are always free. In reality it may be freely folded
@@ -1904,17 +1905,38 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1904
1905
case Intrinsic::ssub_sat:
1905
1906
case Intrinsic::uadd_sat:
1906
1907
case Intrinsic::usub_sat: {
1908
+ bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1909
+ bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
1910
+ Type *RetTy = ICA.getReturnType ();
1911
+
1912
+ if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
1913
+ if (IsSigned && ST->hasDSP () && ITy->getBitWidth () == 32 )
1914
+ return 1 ; // qadd / qsub
1915
+ if (ST->hasDSP () && (ITy->getBitWidth () == 8 || ITy->getBitWidth () == 16 ))
1916
+ return 2 ; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
1917
+ // Otherwise return the cost of expanding the node. Generally an add +
1918
+ // icmp + sel.
1919
+ CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1920
+ Type *CondTy = RetTy->getWithNewBitWidth (1 );
1921
+ return getArithmeticInstrCost (IsAdd ? Instruction::Add : Instruction::Sub,
1922
+ RetTy, CostKind) +
1923
+ 2 * getCmpSelInstrCost (BinaryOperator::ICmp, RetTy, CondTy, Pred,
1924
+ CostKind) +
1925
+ 2 * getCmpSelInstrCost (BinaryOperator::Select, RetTy, CondTy, Pred,
1926
+ CostKind);
1927
+ }
1928
+
1907
1929
if (!ST->hasMVEIntegerOps ())
1908
1930
break ;
1909
- Type *VT = ICA.getReturnType ();
1910
1931
1911
- std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost (VT );
1932
+ std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost (RetTy );
1912
1933
if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
1913
1934
LT.second == MVT::v16i8) {
1914
1935
// This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
1915
1936
// need to extend the type, as it uses shr(qadd(shl, shl)).
1916
1937
unsigned Instrs =
1917
- LT.second .getScalarSizeInBits () == VT->getScalarSizeInBits () ? 1 : 4 ;
1938
+ LT.second .getScalarSizeInBits () == RetTy->getScalarSizeInBits () ? 1
1939
+ : 4 ;
1918
1940
return LT.first * ST->getMVEVectorCostFactor (CostKind) * Instrs;
1919
1941
}
1920
1942
break ;
@@ -1948,7 +1970,7 @@ ARMTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1948
1970
case Intrinsic::fptoui_sat: {
1949
1971
if (ICA.getArgTypes ().empty ())
1950
1972
break ;
1951
- bool IsSigned = ICA. getID () == Intrinsic::fptosi_sat;
1973
+ bool IsSigned = Opc == Intrinsic::fptosi_sat;
1952
1974
auto LT = getTypeLegalizationCost (ICA.getArgTypes ()[0 ]);
1953
1975
EVT MTy = TLI->getValueType (DL, ICA.getReturnType ());
1954
1976
// Check for the legal types, with the correct subtarget features.
0 commit comments