[TTI][X86] getGSScalarCost - don't bother with adding cost of ICMP for each i1 mask element

RKSimon · RKSimon · commit 23fe1fc6b78b · 2024-05-11T13:37:48.000+01:00
These can nearly always be folded into the existing cost of the branch, and brings the throughput costs of the scalarised gather/scatter code much closer to the llvm-mca/uica estimates
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5864,11 +5864,8 @@ InstructionCost X86TTIImpl::getGSScalarCost(unsigned Opcode,
         FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
     MaskUnpackCost = getScalarizationOverhead(
         MaskTy, DemandedElts, /*Insert=*/false, /*Extract=*/true, CostKind);
-    InstructionCost ScalarCompareCost = getCmpSelInstrCost(
-        Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()), nullptr,
-        CmpInst::BAD_ICMP_PREDICATE, CostKind);
     InstructionCost BranchCost = getCFInstrCost(Instruction::Br, CostKind);
-    MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
+    MaskUnpackCost += VF * BranchCost;
   }
 
   InstructionCost AddressUnpackCost = getScalarizationOverhead(
diff --git a/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll b/llvm/test/Analysis/CostModel/X86/intrinsic-cost-kinds.ll
@@ -310,7 +310,7 @@ define void @fshl(i32 %a, i32 %b, i32 %c, <16 x i32> %va, <16 x i32> %vb, <16 x
 
 define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) {
 ; THRU-LABEL: 'maskedgather'
-; THRU-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
+; THRU-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: %v = call <16 x float> @llvm.masked.gather.v16f32.v16p0(<16 x ptr> %va, i32 1, <16 x i1> %vb, <16 x float> %vc)
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'maskedgather'
@@ -331,7 +331,7 @@ define void @maskedgather(<16 x ptr> %va, <16 x i1> %vb, <16 x float> %vc) {
 
 define void @maskedscatter(<16 x float> %va, <16 x ptr> %vb, <16 x i1> %vc) {
 ; THRU-LABEL: 'maskedscatter'
-; THRU-NEXT:  Cost Model: Found an estimated cost of 77 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc)
+; THRU-NEXT:  Cost Model: Found an estimated cost of 61 for instruction: call void @llvm.masked.scatter.v16f32.v16p0(<16 x float> %va, <16 x ptr> %vb, i32 1, <16 x i1> %vc)
 ; THRU-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; LATE-LABEL: 'maskedscatter'
diff --git a/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-i32-with-i8-index.ll
@@ -49,8 +49,8 @@ define void @test() {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i32, ptr %inB, align 4
-; AVX512:  LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i32, ptr %inB, align 4
-; AVX512:  LV: Found an estimated cost of 21 for VF 4 For instruction: %valB.loaded = load i32, ptr %inB, align 4
+; AVX512:  LV: Found an estimated cost of 8 for VF 2 For instruction: %valB.loaded = load i32, ptr %inB, align 4
+; AVX512:  LV: Found an estimated cost of 17 for VF 4 For instruction: %valB.loaded = load i32, ptr %inB, align 4
 ; AVX512:  LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i32, ptr %inB, align 4
 ; AVX512:  LV: Found an estimated cost of 18 for VF 16 For instruction: %valB.loaded = load i32, ptr %inB, align 4
 ; AVX512:  LV: Found an estimated cost of 36 for VF 32 For instruction: %valB.loaded = load i32, ptr %inB, align 4
diff --git a/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll b/llvm/test/Analysis/CostModel/X86/masked-gather-i64-with-i8-index.ll
@@ -49,8 +49,8 @@ define void @test() {
 ;
 ; AVX512-LABEL: 'test'
 ; AVX512:  LV: Found an estimated cost of 1 for VF 1 For instruction: %valB.loaded = load i64, ptr %inB, align 8
-; AVX512:  LV: Found an estimated cost of 10 for VF 2 For instruction: %valB.loaded = load i64, ptr %inB, align 8
-; AVX512:  LV: Found an estimated cost of 22 for VF 4 For instruction: %valB.loaded = load i64, ptr %inB, align 8
+; AVX512:  LV: Found an estimated cost of 8 for VF 2 For instruction: %valB.loaded = load i64, ptr %inB, align 8
+; AVX512:  LV: Found an estimated cost of 18 for VF 4 For instruction: %valB.loaded = load i64, ptr %inB, align 8
 ; AVX512:  LV: Found an estimated cost of 10 for VF 8 For instruction: %valB.loaded = load i64, ptr %inB, align 8
 ; AVX512:  LV: Found an estimated cost of 20 for VF 16 For instruction: %valB.loaded = load i64, ptr %inB, align 8
 ; AVX512:  LV: Found an estimated cost of 40 for VF 32 For instruction: %valB.loaded = load i64, ptr %inB, align 8
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost-inseltpoison.ll
diff --git a/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll b/llvm/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll