Commit 82f5e6a

[LV] Move check if any vector insts will be generated to VPlan.
This patch moves the check of whether any vector instructions will be generated from getInstructionCost to a VPlan-based helper. This simplifies getInstructionCost, is more accurate (the final VPlan is checked rather than a per-instruction estimate), and allows exiting early once a recipe that generates vector instructions is visited. The helper can then be re-used by the VPlan-based cost model to match the legacy selectVectorizationFactor behavior, thus fixing a crash and paving the way to recommit #92555.
1 parent daaea12 commit 82f5e6a
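At its core, the change replaces the legacy model's per-instruction (cost, will-widen) pair with a per-value query on the VPlan: does a value's vector type survive legalization, or does it decompose back into VF scalar parts? For orientation, the per-value test below is lightly condensed from the helper added in the LoopVectorize.cpp hunk further down; it is a sketch of what the patch adds, not additional code:

```cpp
// Per-value widening test used by willGenerateVectorInstructions (condensed
// from the diff below; TypeInfo is a VPTypeAnalysis, TTI the target info).
auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
  Type *ScalarTy = TypeInfo.inferScalarType(VPV);
  Type *VectorTy = ToVectorTy(ScalarTy, VF);
  unsigned NumParts = TTI.getNumberOfParts(VectorTy);
  if (!NumParts) // the vector type cannot be legalized at all
    return false;
  // Scalable VFs: even <vscale x 1 x iN> counts as vector code, since
  // scalable registers are a register class distinct from scalars.
  // Fixed VFs: NumParts == VF.getKnownMinValue() means full scalarization.
  return VF.isScalable() ? NumParts <= VF.getKnownMinValue()
                         : NumParts < VF.getKnownMinValue();
};
```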

6 files changed: +96 −100 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 81 additions & 65 deletions
@@ -1642,12 +1642,7 @@ class LoopVectorizationCostModel {
 
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
-  /// The cost-computation logic from getInstructionCost which provides
-  /// the vector type as an output parameter.
-  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
-                                     Type *&VectorTy);
+  InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
 
   /// Return the cost of instructions in an inloop reduction pattern, if I is
   /// part of that pattern.
@@ -4873,6 +4868,52 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
   } while (!Tail.empty());
 }
 
+static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
+                                           const TargetTransformInfo &TTI) {
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
+                          Plan.getCanonicalIV()->getScalarType()->getContext());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : *VPBB) {
+      if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe, VPScalarCastRecipe,
+              VPReplicateRecipe, VPInstruction, VPActiveLaneMaskPHIRecipe,
+              VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
+              VPVectorPointerRecipe>(&R))
+        continue;
+
+      auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
+        Type *ScalarTy = TypeInfo.inferScalarType(VPV);
+        Type *VectorTy = ToVectorTy(ScalarTy, VF);
+        unsigned NumParts = TTI.getNumberOfParts(VectorTy);
+        if (!NumParts)
+          return false;
+        if (VF.isScalable())
+          // <vscale x 1 x iN> is assumed to be profitable over iN because
+          // scalable registers are a distinct register class from scalar ones.
+          // If we ever find a target which wants to lower scalable vectors
+          // back to scalars, we'll need to update this code to explicitly
+          // ask TTI about the register class uses for each part.
+          return NumParts <= VF.getKnownMinValue();
+        else
+          return NumParts < VF.getKnownMinValue();
+      };
+      SmallVector<VPValue *> VPValuesToCheck;
+      if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
+        VPValuesToCheck.push_back(WidenStore->getOperand(1));
+      } else if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
+        append_range(VPValuesToCheck, IG->getStoredValues());
+      } else {
+        append_range(VPValuesToCheck, R.definedValues());
+      }
+      if (any_of(VPValuesToCheck,
+                 [&WillWiden](VPValue *VPV) { return WillWiden(VPV); }))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
   InstructionCost ExpectedCost =
       CM.expectedCost(ElementCount::getFixed(1)).first;
@@ -4923,7 +4964,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
     LLVM_DEBUG(dbgs() << ".\n");
 #endif
 
-    if (!C.second && !ForceVectorization) {
+    if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {
       LLVM_DEBUG(
           dbgs()
           << "LV: Not considering vector loop of width " << VF
@@ -5795,15 +5836,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
 
     // Compute the cost of the vector instruction. Note that this cost already
     // includes the scalarization overhead of the predicated instruction.
-    InstructionCost VectorCost = getInstructionCost(I, VF).first;
+    InstructionCost VectorCost = getInstructionCost(I, VF);
 
     // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
     InstructionCost ScalarCost =
-        VF.getFixedValue() *
-        getInstructionCost(I, ElementCount::getFixed(1)).first;
+        VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
 
     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
@@ -5863,22 +5903,19 @@ LoopVectorizationCostModel::expectedCost(
           (VF.isVector() && VecValuesToIgnore.count(&I)))
         continue;
 
-      VectorizationCostTy C = getInstructionCost(&I, VF);
+      InstructionCost C = getInstructionCost(&I, VF);
 
       // Check if we should override the cost.
-      if (C.first.isValid() &&
-          ForceTargetInstructionCost.getNumOccurrences() > 0)
-        C.first = InstructionCost(ForceTargetInstructionCost);
+      if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
+        C = InstructionCost(ForceTargetInstructionCost);
 
       // Keep a list of instructions with invalid costs.
-      if (Invalid && !C.first.isValid())
+      if (Invalid && !C.isValid())
         Invalid->emplace_back(&I, VF);
 
-      BlockCost.first += C.first;
-      BlockCost.second |= C.second;
-      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
-                        << " for VF " << VF << " For instruction: " << I
-                        << '\n');
+      BlockCost.first += C;
+      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
+                        << VF << " For instruction: " << I << '\n');
     }
 
     // If we are vectorizing a predicated block, it will have been
@@ -6291,49 +6328,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
   return getWideningCost(I, VF);
 }
 
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I,
-                                               ElementCount VF) {
-  // If we know that this instruction will remain uniform, check the cost of
-  // the scalar version.
-  if (isUniformAfterVectorization(I, VF))
-    VF = ElementCount::getFixed(1);
-
-  if (VF.isVector() && isProfitableToScalarize(I, VF))
-    return VectorizationCostTy(InstsToScalarize[VF][I], false);
-
-  // Forced scalars do not have any scalarization overhead.
-  auto ForcedScalar = ForcedScalars.find(VF);
-  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
-    auto InstSet = ForcedScalar->second;
-    if (InstSet.count(I))
-      return VectorizationCostTy(
-          (getInstructionCost(I, ElementCount::getFixed(1)).first *
-           VF.getKnownMinValue()),
-          false);
-  }
-
-  Type *VectorTy;
-  InstructionCost C = getInstructionCost(I, VF, VectorTy);
-
-  bool TypeNotScalarized = false;
-  if (VF.isVector() && VectorTy->isVectorTy()) {
-    if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
-      if (VF.isScalable())
-        // <vscale x 1 x iN> is assumed to be profitable over iN because
-        // scalable registers are a distinct register class from scalar ones.
-        // If we ever find a target which wants to lower scalable vectors
-        // back to scalars, we'll need to update this code to explicitly
-        // ask TTI about the register class uses for each part.
-        TypeNotScalarized = NumParts <= VF.getKnownMinValue();
-      else
-        TypeNotScalarized = NumParts < VF.getKnownMinValue();
-    } else
-      C = InstructionCost::getInvalid();
-  }
-  return VectorizationCostTy(C, TypeNotScalarized);
-}
-
 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
 
@@ -6724,8 +6718,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
 }
 
 InstructionCost
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
-                                               Type *&VectorTy) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+                                               ElementCount VF) {
+  // If we know that this instruction will remain uniform, check the cost of
+  // the scalar version.
+  if (isUniformAfterVectorization(I, VF))
+    VF = ElementCount::getFixed(1);
+
+  if (VF.isVector() && isProfitableToScalarize(I, VF))
+    return InstsToScalarize[VF][I];
+
+  // Forced scalars do not have any scalarization overhead.
+  auto ForcedScalar = ForcedScalars.find(VF);
+  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
+    auto InstSet = ForcedScalar->second;
+    if (InstSet.count(I))
+      return getInstructionCost(I, ElementCount::getFixed(1)) *
+             VF.getKnownMinValue();
+  }
+
   Type *RetTy = I->getType();
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6748,6 +6759,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   };
   (void) hasSingleCopyAfterVectorization;
 
+  Type *VectorTy;
   if (isScalarAfterVectorization(I, VF)) {
     // With the exception of GEPs and PHIs, after scalarization there should
     // only be one copy of the instruction generated in the loop. This is
@@ -6763,6 +6775,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   } else
     VectorTy = ToVectorTy(RetTy, VF);
 
+  if (VF.isVector() && VectorTy->isVectorTy() &&
+      !TTI.getNumberOfParts(VectorTy))
+    return InstructionCost::getInvalid();
+
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
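
Two details of the helper above are easy to miss: store recipes define no VPValues, so the stored operand is checked instead (operand 1 of a VPWidenStoreRecipe, or the stored values of an interleave group), and the NumParts comparison is what separates genuine vector code from types that legalization scalarizes again. A purely illustrative reading of that comparison, assuming a hypothetical target with 128-bit vector registers (the numbers are examples, not from the patch):

```cpp
// Illustrative only: NumParts at VF = 4 on a hypothetical target whose
// vector registers are 128 bits wide.
//   <4 x i32>  (128 bits) -> NumParts = 1; 1 < 4  -> widens
//   <4 x i64>  (256 bits) -> NumParts = 2; 2 < 4  -> widens (split, still vector)
//   <4 x i128> (512 bits) -> NumParts = 4; 4 == 4 -> one part per lane, no vector code
```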

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 6 additions & 2 deletions
@@ -68,6 +68,9 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   case VPInstruction::PtrAdd:
     // Return the type based on the pointer argument (i.e. first operand).
     return inferScalarType(R->getOperand(0));
+  case VPInstruction::BranchOnCond:
+  case VPInstruction::BranchOnCount:
+    return Type::getVoidTy(Ctx);
   default:
     break;
   }
@@ -248,8 +251,9 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
           })
           .Case<VPWidenIntOrFpInductionRecipe, VPDerivedIVRecipe>(
               [](const auto *R) { return R->getScalarType(); })
-          .Case<VPPredInstPHIRecipe, VPWidenPHIRecipe, VPScalarIVStepsRecipe,
-                VPWidenGEPRecipe>([this](const VPRecipeBase *R) {
+          .Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
+                VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
+                VPWidenCanonicalIVRecipe>([this](const VPRecipeBase *R) {
             return inferScalarType(R->getOperand(0));
           })
           .Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
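
These additions exist because the new helper runs VPTypeAnalysis over every recipe in the vector loop region, so recipe kinds whose types were never queried before now need inference rules: terminators produce no value and report void, while reductions, vector pointers, and widened canonical IVs take their first operand's scalar type. A minimal sketch, assuming Plan is the candidate VPlan, of the kind of traversal that exercises these cases:

```cpp
// Sketch: infer a scalar type for every value defined in the vector loop
// region; a recipe kind without an inference rule would fail here.
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
                        Plan.getCanonicalIV()->getScalarType()->getContext());
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
         vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry())))
  for (VPRecipeBase &R : *VPBB)
    for (VPValue *Def : R.definedValues())
      (void)TypeInfo.inferScalarType(Def);
```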

llvm/test/Transforms/LoopVectorize/SystemZ/zero_unroll.ll

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 ; RUN: opt -S -passes=loop-vectorize -mtriple=s390x-linux-gnu -vectorizer-min-trip-count=8 < %s | FileCheck %s
 
 define i32 @main(i32 %arg, ptr nocapture readnone %arg1) #0 {
-;CHECK: vector.body:
+; CHECK-NOT: vector.body:
 entry:
   %0 = alloca i8, align 1
   br label %loop

llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll

Lines changed: 3 additions & 28 deletions
@@ -622,38 +622,15 @@ define void @wide_iv_trunc_reuse(ptr %dst) {
 ; CHECK-LABEL: define void @wide_iv_trunc_reuse(
 ; CHECK-SAME: ptr [[DST:%.*]]) {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = trunc i64 [[INDEX]] to i32
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], 3
-; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], 4
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 5
-; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[OFFSET_IDX]], 6
-; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], 7
-; CHECK-NEXT:    store i32 [[TMP7]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1, [[MIDDLE_BLOCK]] ], [ 1, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i32 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[IV_2:%.*]] = phi i32 [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ 1, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[IV_2:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[IV_TRUNC:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    store i32 [[IV_2]], ptr [[DST]], align 4
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 0
 ; CHECK-NEXT:    [[IV_TRUNC]] = trunc i64 [[IV]] to i32
-; CHECK-NEXT:    br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP27:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
@@ -701,6 +678,4 @@ attributes #0 = { "min-legal-vector-width"="0" "target-cpu"="skylake-avx512" }
 ; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
-; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
-; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
 ;.

llvm/test/Transforms/LoopVectorize/pr32859.ll

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -S | FileCheck %s
 
 ; Out of the LCSSA form we could have 'phi i32 [ loop-invariant, %for.inc.2.i ]'
 ; but the IR Verifier requires for PHI one entry for each predecessor of

llvm/test/Transforms/LoopVectorize/vplan-incomplete-cases.ll

Lines changed: 4 additions & 3 deletions
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -S %s | FileCheck %s
 
 ; This test used to crash due to missing Or/Not cases in inferScalarTypeForRecipe.
 define void @vplan_incomplete_cases_tc2(i8 %x, i8 %y) {
@@ -65,8 +65,9 @@ define void @vplan_incomplete_cases_tc3(i8 %x, i8 %y) {
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX_NEXT]], 4
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
