Skip to content

Commit 8a0393b

Browse files
committed
Address review comments
1 parent 13c73c4 commit 8a0393b

File tree

3 files changed

+78
-29
lines changed

3 files changed

+78
-29
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 34 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10175,6 +10175,12 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
1017510175
}
1017610176
}
1017710177

10178+
/// For loops with uncountable early exits, find the cost of doing work when
10179+
/// exiting the loop early, such as calculating the final exit values of
10180+
/// variables used outside the loop.
10181+
/// TODO: This is currently overly pessimistic because the loop may not take
10182+
/// the early exit, but better to keep this conservative for now. In future,
10183+
/// it might be possible to relax this by using branch probabilities.
1017810184
static InstructionCost calculateEarlyExitCost(LoopVectorizationCostModel &CM,
1017910185
VPlan &Plan, ElementCount VF) {
1018010186
InstructionCost Cost = 0;
@@ -10183,37 +10189,44 @@ static InstructionCost calculateEarlyExitCost(LoopVectorizationCostModel &CM,
1018310189
LLVM_DEBUG(
1018410190
dbgs() << "Calculating cost of work in vector early exit block:\n");
1018510191
for (auto *ExitVPBB : Plan.getExitBlocks()) {
10186-
for (auto *PredVPBB : ExitVPBB->getPredecessors())
10192+
for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
10193+
// If the predecessor is not the middle.block, then it must be the
10194+
// vector.early.exit block, which may contain work to calculate the exit
10195+
// values of variables used outside the loop.
1018710196
if (PredVPBB != Plan.getMiddleBlock())
1018810197
for (auto &R : *(cast<VPBasicBlock>(PredVPBB)))
1018910198
Cost += R.cost(VF, CostCtx);
10199+
}
1019010200
}
1019110201
return Cost;
1019210202
}
1019310203

10204+
/// This function determines whether or not it's still profitable to vectorize
10205+
/// the loop given the extra work we have to do outside of the loop:
10206+
/// 1. Perform the runtime checks before entering the loop to ensure it's safe
10207+
/// to vectorize.
10208+
/// 2. In the case of loops with uncountable early exits, we may have to do
10209+
/// extra work when exiting the loop early, such as calculating the final
10210+
/// exit values of variables used outside the loop.
1019410211
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
10195-
VectorizationFactor &VF, Loop *L,
10196-
const TargetTransformInfo &TTI,
10212+
VectorizationFactor &VF,
10213+
LoopVectorizationCostModel &CM,
1019710214
PredicatedScalarEvolution &PSE,
10198-
ScalarEpilogueLowering SEL,
10199-
std::optional<unsigned> VScale,
10200-
InstructionCost EarlyExitCost) {
10201-
InstructionCost CheckCost = Checks.getCost();
10202-
if (!CheckCost.isValid() && !EarlyExitCost.isValid())
10215+
VPlan &Plan,
10216+
ScalarEpilogueLowering SEL) {
10217+
InstructionCost TotalCost = Checks.getCost();
10218+
if (!TotalCost.isValid())
1020310219
return false;
1020410220

10205-
InstructionCost TotalCost = 0;
10206-
if (CheckCost.isValid())
10207-
TotalCost += CheckCost;
10208-
1020910221
// Add on the cost of work required in the vector early exit block, if one
1021010222
// exists.
10211-
if (EarlyExitCost.isValid())
10212-
TotalCost += EarlyExitCost;
10223+
if (CM.Legal->hasUncountableEarlyExit())
10224+
TotalCost += calculateEarlyExitCost(CM, Plan, VF.Width);
1021310225

1021410226
// When interleaving only, scalar and vector cost will be equal, which in turn
1021510227
// would lead to a divide by 0. Fall back to hard threshold.
1021610228
if (VF.Width.isScalar()) {
10229+
// TODO: Should we rename VectorizeMemoryCheckThreshold?
1021710230
if (TotalCost > VectorizeMemoryCheckThreshold) {
1021810231
LLVM_DEBUG(
1021910232
dbgs()
@@ -10240,7 +10253,9 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
1024010253
// The total cost of the vector loop is
1024110254
// RtC + VecC * (TC / VF) + EpiC
1024210255
// where
10243-
// * RtC is the cost of the generated runtime checks
10256+
// * RtC is the cost of the generated runtime checks plus the cost of
10257+
// performing any additional work in the vector.early.exit block for loops
10258+
// with uncountable early exits.
1024410259
// * VecC is the cost of a single vector iteration.
1024510260
// * TC is the actual trip count of the loop
1024610261
// * VF is the vectorization factor
@@ -10257,7 +10272,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
1025710272
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
1025810273
// the computations are performed on doubles, not integers, and the result
1025910274
// is rounded up, hence we get an upper estimate of the TC.
10260-
unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
10275+
unsigned IntVF = getEstimatedRuntimeVF(VF.Width, CM.getVScaleForTuning());
1026110276
uint64_t RtC = *TotalCost.getValue();
1026210277
uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
1026310278
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
@@ -10285,7 +10300,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
1028510300

1028610301
// Skip vectorization if the expected trip count is less than the minimum
1028710302
// required trip count.
10288-
if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
10303+
if (auto ExpectedTC = getSmallBestKnownTC(PSE, CM.TheLoop)) {
1028910304
if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
1029010305
VF.MinProfitableTripCount)) {
1029110306
LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
@@ -10682,17 +10697,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1068210697
if (VF.Width.isVector() || SelectedIC > 1)
1068310698
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
1068410699

10685-
InstructionCost EarlyExitCost = InstructionCost::getInvalid();
10686-
if (VF.Width.isVector() && LVL.hasUncountableEarlyExit())
10687-
EarlyExitCost =
10688-
calculateEarlyExitCost(CM, LVP.getPlanFor(VF.Width), VF.Width);
10689-
1069010700
// Check if it is profitable to vectorize with runtime checks.
1069110701
bool ForceVectorization =
1069210702
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
1069310703
if (!ForceVectorization &&
10694-
!isOutsideLoopWorkProfitable(Checks, VF, L, *TTI, PSE, SEL,
10695-
CM.getVScaleForTuning(), EarlyExitCost)) {
10704+
!isOutsideLoopWorkProfitable(Checks, VF, CM, PSE,
10705+
LVP.getPlanFor(VF.Width), SEL)) {
1069610706
ORE->emit([&]() {
1069710707
return OptimizationRemarkAnalysisAliasing(
1069810708
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -746,9 +746,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
746746
case VPInstruction::ExtractFirstActive: {
747747
// Calculate the cost of determining the lane index.
748748
auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(1)), VF);
749-
IntrinsicCostAttributes Attrs(
750-
Intrinsic::experimental_cttz_elts, Type::getInt64Ty(Ctx.LLVMCtx),
751-
{PoisonValue::get(PredTy), ConstantInt::getTrue(Ctx.LLVMCtx)});
749+
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
750+
Type::getInt64Ty(Ctx.LLVMCtx),
751+
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
752752
InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
753753
// Add on the cost of extracting the element.
754754
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);

llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu"
77

88
declare void @init_mem(ptr, i64);
99

10-
define i64 @same_exit_block_pre_inc_use1() #1 {
11-
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1'
10+
define i64 @same_exit_block_pre_inc_use1_sve() #1 {
11+
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_sve'
1212
; CHECK: LV: Selecting VF: vscale x 16
1313
; CHECK: Calculating cost of work in vector early exit block:
1414
; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
@@ -44,4 +44,43 @@ loop.end:
4444
ret i64 %retval
4545
}
4646

47+
define i64 @same_exit_block_pre_inc_use1_nosve() {
48+
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_nosve'
49+
; CHECK: LV: Selecting VF: 16
50+
; CHECK: Calculating cost of work in vector early exit block:
51+
; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active
52+
; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active
53+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:176
54+
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 176)
55+
; CHECK-NEXT: LV: Too many memory checks needed.
56+
entry:
57+
%p1 = alloca [1024 x i8]
58+
%p2 = alloca [1024 x i8]
59+
call void @init_mem(ptr %p1, i64 1024)
60+
call void @init_mem(ptr %p2, i64 1024)
61+
br label %loop
62+
63+
loop:
64+
%index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
65+
%index2 = phi i64 [ %index2.next, %loop.inc ], [ 15, %entry ]
66+
%arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
67+
%ld1 = load i8, ptr %arrayidx, align 1
68+
%arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
69+
%ld2 = load i8, ptr %arrayidx1, align 1
70+
%cmp3 = icmp eq i8 %ld1, %ld2
71+
br i1 %cmp3, label %loop.inc, label %loop.end
72+
73+
loop.inc:
74+
%index.next = add i64 %index, 1
75+
%index2.next = add i64 %index2, 2
76+
%exitcond = icmp ne i64 %index.next, 67
77+
br i1 %exitcond, label %loop, label %loop.end
78+
79+
loop.end:
80+
%val1 = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
81+
%val2 = phi i64 [ %index2, %loop ], [ 98, %loop.inc ]
82+
%retval = add i64 %val1, %val2
83+
ret i64 %retval
84+
}
85+
4786
attributes #1 = { "target-features"="+sve" vscale_range(1,16) }

0 commit comments

Comments
 (0)