Skip to content

Commit cceda2b

Browse files
committed
[AArch64] Consider negated powers of 2 when calculating throughput cost
Negated powers of 2 lower to code that is similar to the power-of-2 case for sdiv, and identical to it for srem. For sdiv, the only difference is that the result is negated at the end.
1 parent d34b392 commit cceda2b

File tree

5 files changed

+126
-115
lines changed

5 files changed

+126
-115
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4008,22 +4008,33 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
40084008
if (Op2Info.isPowerOf2()) {
40094009
return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
40104010
: (3 * AsrCost + AddCost);
4011+
} else if (Op2Info.isNegatedPowerOf2()) {
4012+
return ISD == ISD::SDIV
4013+
? (3 * AddCost + 2 * AsrCost) // One more for the neg
4014+
: (3 * AsrCost + AddCost); // Same as Power of 2 for Rem
40114015
} else {
40124016
return MulCost + AsrCost + 2 * AddCost;
40134017
}
40144018
} else if (VT.isVector()) {
40154019
InstructionCost UsraCost = 2 * AsrCost;
4016-
if (Op2Info.isPowerOf2()) {
4020+
if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
40174021
// Division with scalable types corresponds to native 'asrd'
40184022
// instruction when SVE is available.
40194023
// e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4024+
4025+
// One more for the negation in SDIV
4026+
InstructionCost cost =
4027+
(Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
40204028
if (Ty->isScalableTy() && ST->hasSVE())
4021-
return 2 * AsrCost;
4022-
return UsraCost +
4023-
(ISD == ISD::SDIV
4024-
? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) *
4025-
AsrCost
4026-
: 2 * AddCost);
4029+
cost += 2 * AsrCost;
4030+
else {
4031+
cost +=
4032+
UsraCost +
4033+
(ISD == ISD::SDIV
4034+
? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4035+
: 2 * AddCost);
4036+
}
4037+
return cost;
40274038
} else if (LT.second == MVT::v2i64) {
40284039
return VT.getVectorNumElements() *
40294040
getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,

llvm/test/Analysis/CostModel/AArch64/div.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -870,27 +870,27 @@ define void @sdiv_uniformconstnegpow2() {
870870
; CHECK-LABEL: 'sdiv_uniformconstnegpow2'
871871
; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %I128 = sdiv i128 undef, -16
872872
; CHECK-NEXT: Cost Model: Found costs of 4 for: %I64 = sdiv i64 undef, -16
873-
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = sdiv <2 x i64> undef, splat (i64 -16)
874-
; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16)
875-
; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16)
873+
; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = sdiv <2 x i64> undef, splat (i64 -16)
874+
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16)
875+
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16)
876876
; CHECK-NEXT: Cost Model: Found costs of 4 for: %I32 = sdiv i32 undef, -16
877-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16)
878-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16)
879-
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16)
880-
; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16)
877+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16)
878+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16)
879+
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16)
880+
; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16)
881881
; CHECK-NEXT: Cost Model: Found costs of 4 for: %I16 = sdiv i16 undef, -16
882-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16)
883-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16)
884-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16)
885-
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16)
886-
; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16)
882+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16)
883+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16)
884+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16)
885+
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16)
886+
; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16)
887887
; CHECK-NEXT: Cost Model: Found costs of 4 for: %I8 = sdiv i8 undef, -16
888-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16)
889-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, splat (i8 -16)
890-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16)
891-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16)
892-
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16)
893-
; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16)
888+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16)
889+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i8 = sdiv <4 x i8> undef, splat (i8 -16)
890+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16)
891+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16)
892+
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16)
893+
; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16)
894894
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
895895
;
896896
%I128 = sdiv i128 undef, -16

llvm/test/Analysis/CostModel/AArch64/rem.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -870,27 +870,27 @@ define void @srem_uniformconstnegpow2() {
870870
; CHECK-LABEL: 'srem_uniformconstnegpow2'
871871
; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %I128 = srem i128 undef, -16
872872
; CHECK-NEXT: Cost Model: Found costs of 4 for: %I64 = srem i64 undef, -16
873-
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = srem <2 x i64> undef, splat (i64 -16)
874-
; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, splat (i64 -16)
875-
; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, splat (i64 -16)
873+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i64 = srem <2 x i64> undef, splat (i64 -16)
874+
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, splat (i64 -16)
875+
; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, splat (i64 -16)
876876
; CHECK-NEXT: Cost Model: Found costs of 4 for: %I32 = srem i32 undef, -16
877-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = srem <2 x i32> undef, splat (i32 -16)
878-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = srem <4 x i32> undef, splat (i32 -16)
879-
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, splat (i32 -16)
880-
; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, splat (i32 -16)
877+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i32 = srem <2 x i32> undef, splat (i32 -16)
878+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i32 = srem <4 x i32> undef, splat (i32 -16)
879+
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, splat (i32 -16)
880+
; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, splat (i32 -16)
881881
; CHECK-NEXT: Cost Model: Found costs of 4 for: %I16 = srem i16 undef, -16
882-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = srem <2 x i16> undef, splat (i16 -16)
883-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = srem <4 x i16> undef, splat (i16 -16)
884-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = srem <8 x i16> undef, splat (i16 -16)
885-
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, splat (i16 -16)
886-
; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, splat (i16 -16)
882+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i16 = srem <2 x i16> undef, splat (i16 -16)
883+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i16 = srem <4 x i16> undef, splat (i16 -16)
884+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i16 = srem <8 x i16> undef, splat (i16 -16)
885+
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, splat (i16 -16)
886+
; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, splat (i16 -16)
887887
; CHECK-NEXT: Cost Model: Found costs of 4 for: %I8 = srem i8 undef, -16
888-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = srem <2 x i8> undef, splat (i8 -16)
889-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = srem <4 x i8> undef, splat (i8 -16)
890-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = srem <8 x i8> undef, splat (i8 -16)
891-
; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = srem <16 x i8> undef, splat (i8 -16)
892-
; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, splat (i8 -16)
893-
; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, splat (i8 -16)
888+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i8 = srem <2 x i8> undef, splat (i8 -16)
889+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i8 = srem <4 x i8> undef, splat (i8 -16)
890+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i8 = srem <8 x i8> undef, splat (i8 -16)
891+
; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16i8 = srem <16 x i8> undef, splat (i8 -16)
892+
; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, splat (i8 -16)
893+
; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, splat (i8 -16)
894894
; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void
895895
;
896896
%I128 = srem i128 undef, -16

0 commit comments

Comments
 (0)