Skip to content

Commit 77b8cc9

Browse files
committed
[TTI] getCommonMaskedMemoryOpCost - consistently use getScalarizationOverhead instead of ExtractElement costs for address/mask extraction.
These aren't unknown extraction indices, we will be extracting every address/mask element in sequence.
1 parent 3f71d29 commit 77b8cc9

File tree

14 files changed

+1084
-924
lines changed

14 files changed

+1084
-924
lines changed

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -224,16 +224,16 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
224224
// First, compute the cost of the individual memory operations.
225225
InstructionCost AddrExtractCost =
226226
IsGatherScatter
227-
? getVectorInstrCost(
228-
Instruction::ExtractElement,
227+
? getScalarizationOverhead(
229228
FixedVectorType::get(
230229
PointerType::get(VT->getElementType(), 0), VF),
231-
CostKind, -1, nullptr, nullptr)
230+
/*Insert=*/false, /*Extract=*/true, CostKind)
232231
: 0;
233-
InstructionCost LoadCost =
234-
VF *
235-
(AddrExtractCost + getMemoryOpCost(Opcode, VT->getElementType(),
236-
Alignment, AddressSpace, CostKind));
232+
233+
// The cost of the scalar loads/stores.
234+
InstructionCost MemoryOpCost =
235+
VF * getMemoryOpCost(Opcode, VT->getElementType(), Alignment,
236+
AddressSpace, CostKind);
237237

238238
// Next, compute the cost of packing the result in a vector.
239239
InstructionCost PackingCost =
@@ -249,16 +249,14 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
249249
// operations accurately is quite difficult and the current solution
250250
// provides a very rough estimate only.
251251
ConditionalCost =
252-
VF *
253-
(getVectorInstrCost(
254-
Instruction::ExtractElement,
255-
FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF),
256-
CostKind, -1, nullptr, nullptr) +
257-
getCFInstrCost(Instruction::Br, CostKind) +
258-
getCFInstrCost(Instruction::PHI, CostKind));
252+
getScalarizationOverhead(
253+
FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VF),
254+
/*Insert=*/false, /*Extract=*/true, CostKind) +
255+
VF * (getCFInstrCost(Instruction::Br, CostKind) +
256+
getCFInstrCost(Instruction::PHI, CostKind));
259257
}
260258

261-
return LoadCost + PackingCost + ConditionalCost;
259+
return AddrExtractCost + MemoryOpCost + PackingCost + ConditionalCost;
262260
}
263261

264262
protected:

llvm/test/Analysis/CostModel/AArch64/masked_ldst.ll

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,24 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
55

66
define void @fixed() {
77
; CHECK-LABEL: 'fixed'
8-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef)
9-
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef)
10-
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef)
11-
; CHECK-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
12-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef)
13-
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef)
14-
; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
15-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
16-
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
17-
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
18-
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
19-
; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
20-
; CHECK-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
21-
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
22-
; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
23-
; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
24-
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
25-
; CHECK-NEXT: Cost Model: Found an estimated cost of 184 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
8+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i8 = call <2 x i8> @llvm.masked.load.v2i8.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i8> undef)
9+
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i8 = call <4 x i8> @llvm.masked.load.v4i8.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i8> undef)
10+
; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v8i8 = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i8> undef)
11+
; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %v16i8 = call <16 x i8> @llvm.masked.load.v16i8.p0(ptr undef, i32 8, <16 x i1> undef, <16 x i8> undef)
12+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i16 = call <2 x i16> @llvm.masked.load.v2i16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i16> undef)
13+
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i16 = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i16> undef)
14+
; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %v8i16 = call <8 x i16> @llvm.masked.load.v8i16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x i16> undef)
15+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i32 = call <2 x i32> @llvm.masked.load.v2i32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i32> undef)
16+
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i32 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i32> undef)
17+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %v2i64 = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x i64> undef)
18+
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f16 = call <2 x half> @llvm.masked.load.v2f16.p0(ptr undef, i32 8, <2 x i1> undef, <2 x half> undef)
19+
; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v4f16 = call <4 x half> @llvm.masked.load.v4f16.p0(ptr undef, i32 8, <4 x i1> undef, <4 x half> undef)
20+
; CHECK-NEXT: Cost Model: Found an estimated cost of 54 for instruction: %v8f16 = call <8 x half> @llvm.masked.load.v8f16.p0(ptr undef, i32 8, <8 x i1> undef, <8 x half> undef)
21+
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f32 = call <2 x float> @llvm.masked.load.v2f32.p0(ptr undef, i32 8, <2 x i1> undef, <2 x float> undef)
22+
; CHECK-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v4f32 = call <4 x float> @llvm.masked.load.v4f32.p0(ptr undef, i32 8, <4 x i1> undef, <4 x float> undef)
23+
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v2f64 = call <2 x double> @llvm.masked.load.v2f64.p0(ptr undef, i32 8, <2 x i1> undef, <2 x double> undef)
24+
; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %v4i64 = call <4 x i64> @llvm.masked.load.v4i64.p0(ptr undef, i32 8, <4 x i1> undef, <4 x i64> undef)
25+
; CHECK-NEXT: Cost Model: Found an estimated cost of 216 for instruction: %v32f16 = call <32 x half> @llvm.masked.load.v32f16.p0(ptr undef, i32 8, <32 x i1> undef, <32 x half> undef)
2626
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
2727
;
2828
entry:

0 commit comments

Comments
 (0)