Skip to content

Commit 168a44a

Browse files
committed
[CostModel][X86] Improve extract/insert element costs (PR43605)
This tries to improve the accuracy of extract/insert element costs by accounting for subvector extraction/insertion for >128-bit vectors and the shuffling of elements to/from the 0th index. It also adds INSERTPS costs for f32 types and PINSR/PEXTR costs for integer types (at the moment we assume the same cost as MOVD/MOVQ, which isn't always true).
Differential Revision: https://reviews.llvm.org/D74976
1 parent 965ba42 commit 168a44a

31 files changed

+2789
-1831
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 49 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -2393,10 +2393,11 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
23932393
};
23942394

23952395
assert(Val->isVectorTy() && "This must be a vector type");
2396-
23972396
Type *ScalarType = Val->getScalarType();
2397+
int RegisterFileMoveCost = 0;
23982398

2399-
if (Index != -1U) {
2399+
if (Index != -1U && (Opcode == Instruction::ExtractElement ||
2400+
Opcode == Instruction::InsertElement)) {
24002401
// Legalize the type.
24012402
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
24022403

@@ -2405,32 +2406,69 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
24052406
return 0;
24062407

24072408
// The type may be split. Normalize the index to the new type.
2408-
unsigned Width = LT.second.getVectorNumElements();
2409-
Index = Index % Width;
2409+
unsigned NumElts = LT.second.getVectorNumElements();
2410+
unsigned SubNumElts = NumElts;
2411+
Index = Index % NumElts;
2412+
2413+
// For >128-bit vectors, we need to extract higher 128-bit subvectors.
2414+
// For inserts, we also need to insert the subvector back.
2415+
if (LT.second.getSizeInBits() > 128) {
2416+
assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
2417+
unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
2418+
SubNumElts = NumElts / NumSubVecs;
2419+
if (SubNumElts <= Index) {
2420+
RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
2421+
Index %= SubNumElts;
2422+
}
2423+
}
24102424

24112425
if (Index == 0) {
24122426
// Floating point scalars are already located in index #0.
2427+
// Many insertions to #0 can fold away for scalar fp-ops, so let's assume
2428+
// true for all.
24132429
if (ScalarType->isFloatingPointTy())
2414-
return 0;
2430+
return RegisterFileMoveCost;
24152431

2416-
// Assume movd/movq XMM <-> GPR is relatively cheap on all targets.
2417-
if (ScalarType->isIntegerTy())
2418-
return 1;
2432+
// Assume movd/movq XMM -> GPR is relatively cheap on all targets.
2433+
if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
2434+
return 1 + RegisterFileMoveCost;
24192435
}
24202436

24212437
int ISD = TLI->InstructionOpcodeToISD(Opcode);
24222438
assert(ISD && "Unexpected vector opcode");
24232439
MVT MScalarTy = LT.second.getScalarType();
24242440
if (ST->isSLM())
24252441
if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
2426-
return Entry->Cost;
2442+
return Entry->Cost + RegisterFileMoveCost;
2443+
2444+
// Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
2445+
if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
2446+
(MScalarTy.isInteger() && ST->hasSSE41()))
2447+
return 1 + RegisterFileMoveCost;
2448+
2449+
// Assume insertps is relatively cheap on all targets.
2450+
if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
2451+
Opcode == Instruction::InsertElement)
2452+
return 1 + RegisterFileMoveCost;
2453+
2454+
// For extractions we just need to shuffle the element to index 0, which
2455+
// should be very cheap (assume cost = 1). For insertions we need to shuffle
2456+
// the element to its destination. In both cases we must handle the
2457+
// subvector move(s).
2458+
// TODO: Under what circumstances should we shuffle using the full width?
2459+
int ShuffleCost = 1;
2460+
if (Opcode == Instruction::InsertElement) {
2461+
Type *SubTy = VectorType::get(Val->getVectorElementType(), SubNumElts);
2462+
ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
2463+
}
2464+
int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
2465+
return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
24272466
}
24282467

24292468
// Add to the base cost if we know that the extracted element of a vector is
24302469
// destined to be moved to and used in the integer register file.
2431-
int RegisterFileMoveCost = 0;
24322470
if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
2433-
RegisterFileMoveCost = 1;
2471+
RegisterFileMoveCost += 1;
24342472

24352473
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
24362474
}

llvm/test/Analysis/CostModel/X86/arith-fp.ll

Lines changed: 24 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -673,9 +673,9 @@ define i32 @fdiv(i32 %arg) {
673673
define i32 @frem(i32 %arg) {
674674
; SSE1-LABEL: 'frem'
675675
; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
676-
; SSE1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
677-
; SSE1-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
678-
; SSE1-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
676+
; SSE1-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef
677+
; SSE1-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef
678+
; SSE1-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef
679679
; SSE1-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
680680
; SSE1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2F64 = frem <2 x double> undef, undef
681681
; SSE1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4F64 = frem <4 x double> undef, undef
@@ -684,9 +684,9 @@ define i32 @frem(i32 %arg) {
684684
;
685685
; SSE2-LABEL: 'frem'
686686
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
687-
; SSE2-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
688-
; SSE2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F32 = frem <8 x float> undef, undef
689-
; SSE2-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %V16F32 = frem <16 x float> undef, undef
687+
; SSE2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4F32 = frem <4 x float> undef, undef
688+
; SSE2-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V8F32 = frem <8 x float> undef, undef
689+
; SSE2-NEXT: Cost Model: Found an estimated cost of 68 for instruction: %V16F32 = frem <16 x float> undef, undef
690690
; SSE2-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
691691
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
692692
; SSE2-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F64 = frem <4 x double> undef, undef
@@ -707,23 +707,23 @@ define i32 @frem(i32 %arg) {
707707
; AVX-LABEL: 'frem'
708708
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
709709
; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
710-
; AVX-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = frem <8 x float> undef, undef
711-
; AVX-NEXT: Cost Model: Found an estimated cost of 60 for instruction: %V16F32 = frem <16 x float> undef, undef
710+
; AVX-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef
711+
; AVX-NEXT: Cost Model: Found an estimated cost of 80 for instruction: %V16F32 = frem <16 x float> undef, undef
712712
; AVX-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
713713
; AVX-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
714-
; AVX-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = frem <4 x double> undef, undef
715-
; AVX-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8F64 = frem <8 x double> undef, undef
714+
; AVX-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = frem <4 x double> undef, undef
715+
; AVX-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %V8F64 = frem <8 x double> undef, undef
716716
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
717717
;
718718
; AVX512-LABEL: 'frem'
719719
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
720720
; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F32 = frem <4 x float> undef, undef
721-
; AVX512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8F32 = frem <8 x float> undef, undef
722-
; AVX512-NEXT: Cost Model: Found an estimated cost of 62 for instruction: %V16F32 = frem <16 x float> undef, undef
721+
; AVX512-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8F32 = frem <8 x float> undef, undef
722+
; AVX512-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V16F32 = frem <16 x float> undef, undef
723723
; AVX512-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
724724
; AVX512-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F64 = frem <2 x double> undef, undef
725-
; AVX512-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V4F64 = frem <4 x double> undef, undef
726-
; AVX512-NEXT: Cost Model: Found an estimated cost of 30 for instruction: %V8F64 = frem <8 x double> undef, undef
725+
; AVX512-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V4F64 = frem <4 x double> undef, undef
726+
; AVX512-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V8F64 = frem <8 x double> undef, undef
727727
; AVX512-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
728728
;
729729
; SLM-LABEL: 'frem'
@@ -1059,9 +1059,9 @@ define i32 @fcopysign(i32 %arg) {
10591059
define i32 @fma(i32 %arg) {
10601060
; SSE1-LABEL: 'fma'
10611061
; SSE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
1062-
; SSE1-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
1063-
; SSE1-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
1064-
; SSE1-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
1062+
; SSE1-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
1063+
; SSE1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
1064+
; SSE1-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
10651065
; SSE1-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
10661066
; SSE1-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
10671067
; SSE1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
@@ -1070,9 +1070,9 @@ define i32 @fma(i32 %arg) {
10701070
;
10711071
; SSE2-LABEL: 'fma'
10721072
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
1073-
; SSE2-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
1074-
; SSE2-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
1075-
; SSE2-NEXT: Cost Model: Found an estimated cost of 172 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
1073+
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
1074+
; SSE2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
1075+
; SSE2-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
10761076
; SSE2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
10771077
; SSE2-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
10781078
; SSE2-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
@@ -1093,12 +1093,12 @@ define i32 @fma(i32 %arg) {
10931093
; AVX-LABEL: 'fma'
10941094
; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
10951095
; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
1096-
; AVX-NEXT: Cost Model: Found an estimated cost of 87 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
1097-
; AVX-NEXT: Cost Model: Found an estimated cost of 174 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
1096+
; AVX-NEXT: Cost Model: Found an estimated cost of 94 for instruction: %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
1097+
; AVX-NEXT: Cost Model: Found an estimated cost of 188 for instruction: %V16F32 = call <16 x float> @llvm.fma.v16f32(<16 x float> undef, <16 x float> undef, <16 x float> undef)
10981098
; AVX-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
10991099
; AVX-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
1100-
; AVX-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
1101-
; AVX-NEXT: Cost Model: Found an estimated cost of 86 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
1100+
; AVX-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
1101+
; AVX-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V8F64 = call <8 x double> @llvm.fma.v8f64(<8 x double> undef, <8 x double> undef, <8 x double> undef)
11021102
; AVX-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
11031103
;
11041104
; AVX512-LABEL: 'fma'

0 commit comments

Comments
 (0)