Skip to content

Commit 14d39c0

Browse files
committed
[LV] Move check if any vector insts will be generated to VPlan.
This patch moves the check if any vector instructions will be generated from getInstructionCost to be based on VPlan. This simplifies getInstructionCost, is more accurate as we check the final result and also allows us to exit early once we visit a recipe that generates vector instructions. The helper can then be re-used by the VPlan-based cost model to match the legacy selectVectorizationFactor behavior, thus fixing a crash and paving the way to recommit #92555.
1 parent 4acc8ee commit 14d39c0

File tree

6 files changed

+96
-100
lines changed

6 files changed

+96
-100
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 81 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -1648,12 +1648,7 @@ class LoopVectorizationCostModel {
16481648

16491649
/// Returns the execution time cost of an instruction for a given vector
16501650
/// width. Vector width of one means scalar.
1651-
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1652-
1653-
/// The cost-computation logic from getInstructionCost which provides
1654-
/// the vector type as an output parameter.
1655-
InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1656-
Type *&VectorTy);
1651+
InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
16571652

16581653
/// Return the cost of instructions in an inloop reduction pattern, if I is
16591654
/// part of that pattern.
@@ -4879,6 +4874,52 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
48794874
} while (!Tail.empty());
48804875
}
48814876

4877+
static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
4878+
const TargetTransformInfo &TTI) {
4879+
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
4880+
Plan.getCanonicalIV()->getScalarType()->getContext());
4881+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4882+
vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
4883+
for (VPRecipeBase &R : *VPBB) {
4884+
if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe, VPScalarCastRecipe,
4885+
VPReplicateRecipe, VPInstruction, VPActiveLaneMaskPHIRecipe,
4886+
VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
4887+
VPVectorPointerRecipe>(&R))
4888+
continue;
4889+
4890+
auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
4891+
Type *ScalarTy = TypeInfo.inferScalarType(VPV);
4892+
Type *VectorTy = ToVectorTy(ScalarTy, VF);
4893+
unsigned NumParts = TTI.getNumberOfParts(VectorTy);
4894+
if (!NumParts)
4895+
return false;
4896+
if (VF.isScalable())
4897+
// <vscale x 1 x iN> is assumed to be profitable over iN because
4898+
// scalable registers are a distinct register class from scalar ones.
4899+
// If we ever find a target which wants to lower scalable vectors
4900+
// back to scalars, we'll need to update this code to explicitly
4901+
// ask TTI about the register class uses for each part.
4902+
return NumParts <= VF.getKnownMinValue();
4903+
else
4904+
return NumParts < VF.getKnownMinValue();
4905+
};
4906+
SmallVector<VPValue *> VPValuesToCheck;
4907+
if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
4908+
VPValuesToCheck.push_back(WidenStore->getOperand(1));
4909+
} else if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
4910+
append_range(VPValuesToCheck, IG->getStoredValues());
4911+
} else {
4912+
append_range(VPValuesToCheck, R.definedValues());
4913+
}
4914+
if (any_of(VPValuesToCheck,
4915+
[&WillWiden](VPValue *VPV) { return WillWiden(VPV); }))
4916+
return true;
4917+
}
4918+
}
4919+
4920+
return false;
4921+
}
4922+
48824923
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
48834924
InstructionCost ExpectedCost =
48844925
CM.expectedCost(ElementCount::getFixed(1)).first;
@@ -4929,7 +4970,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
49294970
LLVM_DEBUG(dbgs() << ".\n");
49304971
#endif
49314972

4932-
if (!C.second && !ForceVectorization) {
4973+
if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {
49334974
LLVM_DEBUG(
49344975
dbgs()
49354976
<< "LV: Not considering vector loop of width " << VF
@@ -5801,15 +5842,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
58015842

58025843
// Compute the cost of the vector instruction. Note that this cost already
58035844
// includes the scalarization overhead of the predicated instruction.
5804-
InstructionCost VectorCost = getInstructionCost(I, VF).first;
5845+
InstructionCost VectorCost = getInstructionCost(I, VF);
58055846

58065847
// Compute the cost of the scalarized instruction. This cost is the cost of
58075848
// the instruction as if it wasn't if-converted and instead remained in the
58085849
// predicated block. We will scale this cost by block probability after
58095850
// computing the scalarization overhead.
58105851
InstructionCost ScalarCost =
5811-
VF.getFixedValue() *
5812-
getInstructionCost(I, ElementCount::getFixed(1)).first;
5852+
VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
58135853

58145854
// Compute the scalarization overhead of needed insertelement instructions
58155855
// and phi nodes.
@@ -5869,22 +5909,19 @@ LoopVectorizationCostModel::expectedCost(
58695909
(VF.isVector() && VecValuesToIgnore.count(&I)))
58705910
continue;
58715911

5872-
VectorizationCostTy C = getInstructionCost(&I, VF);
5912+
InstructionCost C = getInstructionCost(&I, VF);
58735913

58745914
// Check if we should override the cost.
5875-
if (C.first.isValid() &&
5876-
ForceTargetInstructionCost.getNumOccurrences() > 0)
5877-
C.first = InstructionCost(ForceTargetInstructionCost);
5915+
if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
5916+
C = InstructionCost(ForceTargetInstructionCost);
58785917

58795918
// Keep a list of instructions with invalid costs.
5880-
if (Invalid && !C.first.isValid())
5919+
if (Invalid && !C.isValid())
58815920
Invalid->emplace_back(&I, VF);
58825921

5883-
BlockCost.first += C.first;
5884-
BlockCost.second |= C.second;
5885-
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5886-
<< " for VF " << VF << " For instruction: " << I
5887-
<< '\n');
5922+
BlockCost.first += C;
5923+
LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
5924+
<< VF << " For instruction: " << I << '\n');
58885925
}
58895926

58905927
// If we are vectorizing a predicated block, it will have been
@@ -6297,49 +6334,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
62976334
return getWideningCost(I, VF);
62986335
}
62996336

6300-
LoopVectorizationCostModel::VectorizationCostTy
6301-
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6302-
ElementCount VF) {
6303-
// If we know that this instruction will remain uniform, check the cost of
6304-
// the scalar version.
6305-
if (isUniformAfterVectorization(I, VF))
6306-
VF = ElementCount::getFixed(1);
6307-
6308-
if (VF.isVector() && isProfitableToScalarize(I, VF))
6309-
return VectorizationCostTy(InstsToScalarize[VF][I], false);
6310-
6311-
// Forced scalars do not have any scalarization overhead.
6312-
auto ForcedScalar = ForcedScalars.find(VF);
6313-
if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6314-
auto InstSet = ForcedScalar->second;
6315-
if (InstSet.count(I))
6316-
return VectorizationCostTy(
6317-
(getInstructionCost(I, ElementCount::getFixed(1)).first *
6318-
VF.getKnownMinValue()),
6319-
false);
6320-
}
6321-
6322-
Type *VectorTy;
6323-
InstructionCost C = getInstructionCost(I, VF, VectorTy);
6324-
6325-
bool TypeNotScalarized = false;
6326-
if (VF.isVector() && VectorTy->isVectorTy()) {
6327-
if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6328-
if (VF.isScalable())
6329-
// <vscale x 1 x iN> is assumed to be profitable over iN because
6330-
// scalable registers are a distinct register class from scalar ones.
6331-
// If we ever find a target which wants to lower scalable vectors
6332-
// back to scalars, we'll need to update this code to explicitly
6333-
// ask TTI about the register class uses for each part.
6334-
TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6335-
else
6336-
TypeNotScalarized = NumParts < VF.getKnownMinValue();
6337-
} else
6338-
C = InstructionCost::getInvalid();
6339-
}
6340-
return VectorizationCostTy(C, TypeNotScalarized);
6341-
}
6342-
63436337
InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
63446338
Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
63456339

@@ -6730,8 +6724,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
67306724
}
67316725

67326726
InstructionCost
6733-
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6734-
Type *&VectorTy) {
6727+
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6728+
ElementCount VF) {
6729+
// If we know that this instruction will remain uniform, check the cost of
6730+
// the scalar version.
6731+
if (isUniformAfterVectorization(I, VF))
6732+
VF = ElementCount::getFixed(1);
6733+
6734+
if (VF.isVector() && isProfitableToScalarize(I, VF))
6735+
return InstsToScalarize[VF][I];
6736+
6737+
// Forced scalars do not have any scalarization overhead.
6738+
auto ForcedScalar = ForcedScalars.find(VF);
6739+
if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6740+
auto InstSet = ForcedScalar->second;
6741+
if (InstSet.count(I))
6742+
return getInstructionCost(I, ElementCount::getFixed(1)) *
6743+
VF.getKnownMinValue();
6744+
}
6745+
67356746
Type *RetTy = I->getType();
67366747
if (canTruncateToMinimalBitwidth(I, VF))
67376748
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6754,6 +6765,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
67546765
};
67556766
(void) hasSingleCopyAfterVectorization;
67566767

6768+
Type *VectorTy;
67576769
if (isScalarAfterVectorization(I, VF)) {
67586770
// With the exception of GEPs and PHIs, after scalarization there should
67596771
// only be one copy of the instruction generated in the loop. This is
@@ -6769,6 +6781,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
67696781
} else
67706782
VectorTy = ToVectorTy(RetTy, VF);
67716783

6784+
if (VF.isVector() && VectorTy->isVectorTy() &&
6785+
!TTI.getNumberOfParts(VectorTy))
6786+
return InstructionCost::getInvalid();
6787+
67726788
// TODO: We need to estimate the cost of intrinsic calls.
67736789
switch (I->getOpcode()) {
67746790
case Instruction::GetElementPtr:

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
6868
case VPInstruction::PtrAdd:
6969
// Return the type based on the pointer argument (i.e. first operand).
7070
return inferScalarType(R->getOperand(0));
71+
case VPInstruction::BranchOnCond:
72+
case VPInstruction::BranchOnCount:
73+
return Type::getVoidTy(Ctx);
7174
default:
7275
break;
7376
}
@@ -248,8 +251,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
248251
})
249252
.Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
250253
[](const auto *R) { return R->getScalarType(); })
251-
.Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
252-
VPWidenGEPRecipe>([this](const VPRecipeBase *R) {
254+
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
255+
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
256+
VPWidenCanonicalIVRecipe>([this](const VPRecipeBase *R) {
253257
return inferScalarType(R->getOperand(0));
254258
})
255259
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,

llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -vectorizer-min-trip-count=8 < %s | FileCheck %s
22

33
define i32 @main(i32 %arg, ptr nocapture readnone %arg1) #0 {
4-
;CHECK: vector.body:
4+
; CHECK-NOT: vector.body:
55
entry:
66
%0 = alloca i8, align 1
77
br label %loop

llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll

Lines changed: 3 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -622,38 +622,15 @@ define void @wide_iv_trunc_reuse(ptr %dst) {
622622
; CHECK-LABEL: define void @wide_iv_trunc_reuse(
623623
; CHECK-SAME: ptr [[DST:%.*]]) {
624624
; CHECK-NEXT: entry:
625-
; CHECK-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
626-
; CHECK: vector.ph:
627-
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
628-
; CHECK: vector.body:
629-
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
630-
; CHECK-NEXT: [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
631-
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
632-
; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 1
633-
; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 2
634-
; CHECK-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], 3
635-
; CHECK-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], 4
636-
; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 5
637-
; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[OFFSET_IDX]], 6
638-
; CHECK-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], 7
639-
; CHECK-NEXT: store i32 [[TMP7]], ptr [[DST]], align 4
640-
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
641-
; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
642-
; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
643-
; CHECK: middle.block:
644-
; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
645-
; CHECK: scalar.ph:
646-
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
647-
; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
648625
; CHECK-NEXT: br label [[LOOP:%.*]]
649626
; CHECK: loop:
650-
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
651-
; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
627+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
628+
; CHECK-NEXT: [[IV_2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
652629
; CHECK-NEXT: store i32 [[IV_2]], ptr [[DST]], align 4
653630
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
654631
; CHECK-NEXT: [[EC:%.*]] = icmp eq i64 [[IV]], 0
655632
; CHECK-NEXT: [[IV_TRUNC]] = trunc i64 [[IV]] to i32
656-
; CHECK-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
633+
; CHECK-NEXT: br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
657634
; CHECK: exit:
658635
; CHECK-NEXT: ret void
659636
;
@@ -701,6 +678,4 @@ attributes #0 = { "min-legal-vector-width"="0" "target-cpu"="skylake-avx512" }
701678
; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
702679
; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
703680
; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
704-
; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
705-
; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
706681
;.

llvm/test/Transforms/LoopVectorize/pr32859.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
1+
; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s
22

33
; Out of the LCSSA form we could have 'phi i32 [ loop-invariant, %for.inc.2.i ]'
44
; but the IR Verifier requires for PHI one entry for each predecessor of

llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2-
; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
2+
; RUN: opt -passes=loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
33

44
; This test used to crash due to missing Or/Not cases in inferScalarTypeForRecipe.
55
define void @vplan_incomplete_cases_tc2(i8 %x, i8 %y) {
@@ -65,8 +65,9 @@ define void @vplan_incomplete_cases_tc3(i8 %x, i8 %y) {
6565
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
6666
; CHECK: [[VECTOR_BODY]]:
6767
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
68-
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
69-
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
68+
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2
69+
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
70+
; CHECK-NEXT: br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
7071
; CHECK: [[MIDDLE_BLOCK]]:
7172
; CHECK-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
7273
; CHECK: [[SCALAR_PH]]:

0 commit comments

Comments
 (0)