Skip to content

Commit 23fe1fc

Browse files
committed
[TTI][X86] getGSScalarCost - don't bother with adding cost of ICMP for each i1 mask element
These compares can nearly always be folded into the existing cost of the branch, which brings the throughput costs of the scalarised gather/scatter code much closer to the llvm-mca/uiCA estimates.
1 parent 079fdef commit 23fe1fc

File tree

6 files changed

+657
-660
lines changed

6 files changed

+657
-660
lines changed

llvm/lib/Target/X86/X86TargetTransformInfo.cpp

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5864,11 +5864,8 @@ InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode,
58645864
FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
58655865
MaskUnpackCost = getScalarizationOverhead(
58665866
MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
5867-
InstructionCost ScalarCompareCost = getCmpSelInstrCost(
5868-
Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
5869-
CmpInst::BAD_ICMP_PREDICATE, CostKind);
58705867
InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
5871-
MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
5868+
MaskUnpackCost += VF * BranchCost;
58725869
}
58735870

58745871
InstructionCost AddressUnpackCost = getScalarizationOverhead(

llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -310,7 +310,7 @@ define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x
310310

311311
define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) {
312312
; THRU-LABEL: 'maskedgather'
313-
; THRU-NEXT: Cost Model: Found an estimated cost of 77 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
313+
; THRU-NEXT: Cost Model: Found an estimated cost of 61 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
314314
; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
315315
;
316316
; LATE-LABEL: 'maskedgather'
@@ -331,7 +331,7 @@ define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) {
331331

332332
define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) {
333333
; THRU-LABEL: 'maskedscatter'
334-
; THRU-NEXT: Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc)
334+
; THRU-NEXT: Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc)
335335
; THRU-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
336336
;
337337
; LATE-LABEL: 'maskedscatter'

llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ define void @test() {
4949
;
5050
; AVX512-LABEL: 'test'
5151
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i32, ptr %inB, align 4
52-
; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i32, ptr %inB, align 4
53-
; AVX512: LV: Found an estimated cost of 21 for VF 4 For instruction: %valB.loaded = load i32, ptr %inB, align 4
52+
; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB.loaded = load i32, ptr %inB, align 4
53+
; AVX512: LV: Found an estimated cost of 17 for VF 4 For instruction: %valB.loaded = load i32, ptr %inB, align 4
5454
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i32, ptr %inB, align 4
5555
; AVX512: LV: Found an estimated cost of 18 for VF 16 For instruction: %valB.loaded = load i32, ptr %inB, align 4
5656
; AVX512: LV: Found an estimated cost of 36 for VF 32 For instruction: %valB.loaded = load i32, ptr %inB, align 4

llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,8 +49,8 @@ define void @test() {
4949
;
5050
; AVX512-LABEL: 'test'
5151
; AVX512: LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i64, ptr %inB, align 8
52-
; AVX512: LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i64, ptr %inB, align 8
53-
; AVX512: LV: Found an estimated cost of 22 for VF 4 For instruction: %valB.loaded = load i64, ptr %inB, align 8
52+
; AVX512: LV: Found an estimated cost of 8 for VF 2 For instruction: %valB.loaded = load i64, ptr %inB, align 8
53+
; AVX512: LV: Found an estimated cost of 18 for VF 4 For instruction: %valB.loaded = load i64, ptr %inB, align 8
5454
; AVX512: LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i64, ptr %inB, align 8
5555
; AVX512: LV: Found an estimated cost of 20 for VF 16 For instruction: %valB.loaded = load i64, ptr %inB, align 8
5656
; AVX512: LV: Found an estimated cost of 40 for VF 32 For instruction: %valB.loaded = load i64, ptr %inB, align 8

0 commit comments

Comments
 (0)