Skip to content

Commit 5cbba0f

Browse files
committed
[TTI] getCommonMaskedMemoryOpCost - consistently use getScalarizationOverhead instead of ExtractElement costs for address/mask extraction.
These aren't unknown extraction indices; we will be extracting every address/mask element in sequence.
1 parent 08bb121 commit 5cbba0f

File tree

14 files changed

+1084
-923
lines changed

14 files changed

+1084
-923
lines changed

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 13 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -223,15 +223,16 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
223223
// First, compute the cost of the individual memory operations.
224224
InstructionCost AddrExtractCost =
225225
IsGatherScatter
226-
? getVectorInstrCost(
227-
Instruction::ExtractElement,
226+
? getScalarizationOverhead(
228227
FixedVectorType::get(
229228
PointerType::get(VT->getElementType(), 0), VF),
230-
CostKind, -1, nullptr, nullptr)
229+
/*Insert=*/false, /*Extract=*/true, CostKind)
231230
: 0;
232-
InstructionCost LoadCost =
233-
VF * (AddrExtractCost + getMemoryOpCost(Opcode, VT->getElementType(),
234-
Alignment, 0, CostKind));
231+
232+
// The cost of the scalar loads/stores.
233+
InstructionCost MemoryOpCost =
234+
VF *
235+
getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind);
235236

236237
// Next, compute the cost of packing the result in a vector.
237238
InstructionCost PackingCost =
@@ -247,16 +248,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
247248
// operations accurately is quite difficult and the current solution
248249
// provides a very rough estimate only.
249250
ConditionalCost =
250-
VF *
251-
(getVectorInstrCost(
252-
Instruction::ExtractElement,
253-
FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF),
254-
CostKind, -1, nullptr, nullptr) +
255-
getCFInstrCost(Instruction::Br, CostKind) +
256-
getCFInstrCost(Instruction::PHI, CostKind));
251+
getScalarizationOverhead(
252+
FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF),
253+
/*Insert=*/false, /*Extract=*/true, CostKind) +
254+
VF * (getCFInstrCost(Instruction::Br, CostKind) +
255+
getCFInstrCost(Instruction::PHI, CostKind));
257256
}
258257

259-
return LoadCost + PackingCost + ConditionalCost;
258+
return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
260259
}
261260

262261
protected:

llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll

Lines changed: 18 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -5,24 +5,24 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
55

66
define void @fixed() {
77
; CHECK-LABEL: 'fixed'
8-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef)
9-
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef)
10-
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef)
11-
; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
12-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef)
13-
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef)
14-
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
15-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
16-
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
17-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
18-
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
19-
; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
20-
; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
21-
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
22-
; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
23-
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
24-
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
25-
; CHECK-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
8+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef)
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef)
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef)
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
12+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef)
13+
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef)
14+
; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
15+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
16+
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
17+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
18+
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
19+
; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
20+
; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
21+
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
22+
; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
23+
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
24+
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
25+
; CHECK-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
2626
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
2727
;
2828
entry:

0 commit comments

Comments
 (0)