Skip to content

Commit de1c2f2

Browse files
authored
[LoopVectorizer][AArch64] Move getMinTripCountTailFoldingThreshold later. (#132170)
This moves the check of getMinTripCountTailFoldingThreshold later, into the calculation of whether to tail fold. This allows the vectorizer to first check whether tail predication is required at all, for either scalable or fixed-width vectors. The threshold is only specified for AArch64, where it returns a minimum of 5. This patch aims to allow the vectorization of TC=4 loops, preventing them from performing worse when SVE is present.
1 parent 3ab70e3 commit de1c2f2

File tree

2 files changed

+141
-27
lines changed

2 files changed

+141
-27
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 40 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4025,11 +4025,8 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40254025
MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
40264026
}
40274027

4028-
if (MaxPowerOf2RuntimeVF && *MaxPowerOf2RuntimeVF > 0) {
4029-
assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4030-
"MaxFixedVF must be a power of 2");
4031-
unsigned MaxVFtimesIC =
4032-
UserIC ? *MaxPowerOf2RuntimeVF * UserIC : *MaxPowerOf2RuntimeVF;
4028+
auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
4029+
unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
40334030
ScalarEvolution *SE = PSE.getSE();
40344031
// Currently only loops with countable exits are vectorized, but calling
40354032
// getSymbolicMaxBackedgeTakenCount allows enablement work for loops with
@@ -4043,13 +4040,41 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
40434040
const SCEV *Rem = SE->getURemExpr(
40444041
SE->applyLoopGuards(ExitCount, TheLoop),
40454042
SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
4046-
if (Rem->isZero()) {
4043+
return Rem->isZero();
4044+
};
4045+
4046+
if (MaxPowerOf2RuntimeVF > 0) {
4047+
assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
4048+
"MaxFixedVF must be a power of 2");
4049+
if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
40474050
// Accept MaxFixedVF if we do not have a tail.
40484051
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
40494052
return MaxFactors;
40504053
}
40514054
}
40524055

4056+
auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
4057+
if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
4058+
if (MaxPowerOf2RuntimeVF > 0) {
4059+
// If we have a low-trip-count, and the fixed-width VF is known to divide
4060+
// the trip count but the scalable factor does not, use the fixed-width
4061+
// factor in preference to allow the generation of a non-predicated loop.
4062+
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
4063+
NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
4064+
LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
4065+
"remain for any chosen VF.\n");
4066+
MaxFactors.ScalableVF = ElementCount::getScalable(0);
4067+
return MaxFactors;
4068+
}
4069+
}
4070+
4071+
reportVectorizationFailure(
4072+
"The trip count is below the minial threshold value.",
4073+
"loop trip count is too low, avoiding vectorization", "LowTripCount",
4074+
ORE, TheLoop);
4075+
return FixedScalableVFPair::getNone();
4076+
}
4077+
40534078
// If we don't know the precise trip count, or if the trip count that we
40544079
// found modulo the vectorization factor is not zero, try to fold the tail
40554080
// by masking.
@@ -10604,26 +10629,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1060410629
if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
1060510630
LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
1060610631
else {
10607-
if (*ExpectedTC > TTI->getMinTripCountTailFoldingThreshold()) {
10608-
LLVM_DEBUG(dbgs() << "\n");
10609-
// Predicate tail-folded loops are efficient even when the loop
10610-
// iteration count is low. However, setting the epilogue policy to
10611-
// `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10612-
// with runtime checks. It's more effective to let
10613-
// `isOutsideLoopWorkProfitable` determine if vectorization is
10614-
// beneficial for the loop.
10615-
if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10616-
SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10617-
} else {
10618-
LLVM_DEBUG(dbgs() << " But the target considers the trip count too "
10619-
"small to consider vectorizing.\n");
10620-
reportVectorizationFailure(
10621-
"The trip count is below the minial threshold value.",
10622-
"loop trip count is too low, avoiding vectorization",
10623-
"LowTripCount", ORE, L);
10624-
Hints.emitRemarkWithHints();
10625-
return false;
10626-
}
10632+
LLVM_DEBUG(dbgs() << "\n");
10633+
// Predicate tail-folded loops are efficient even when the loop
10634+
// iteration count is low. However, setting the epilogue policy to
10635+
// `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10636+
// with runtime checks. It's more effective to let
10637+
// `isOutsideLoopWorkProfitable` determine if vectorization is
10638+
// beneficial for the loop.
10639+
if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10640+
SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
1062710641
}
1062810642
}
1062910643

llvm/test/Transforms/LoopVectorize/AArch64/low_trip_count_predicates.ll

Lines changed: 101 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ target triple = "aarch64-unknown-linux-gnu"
1818

1919
; DEBUG-LABEL: LV: Checking a loop in 'trip_count_too_small'
2020
; DEBUG: LV: Found a loop with a very small trip count. This loop is worth vectorizing only if no scalar iteration overheads are incurred.
21-
; DEBUG: LV: Not vectorizing: The trip count is below the minial threshold value..
21+
; DEBUG: LV: Not vectorizing: Runtime SCEV check is required with -Os/-Oz.
2222

2323
; DEBUG-LABEL: LV: Checking a loop in 'too_many_runtime_checks'
2424
; DEBUG: LV: Found trip count: 0
@@ -490,9 +490,103 @@ while.end:
490490
ret void
491491
}
492492

493+
; This has a trip-count of 4, and should vectorize with vf==4.
494+
define i32 @tc4(ptr noundef readonly captures(none) %tmp) vscale_range(1,16) {
495+
; CHECK-LABEL: define i32 @tc4(
496+
; CHECK-SAME: ptr noundef readonly captures(none) [[TMP:%.*]]) #[[ATTR1]] {
497+
; CHECK-NEXT: [[ENTRY:.*]]:
498+
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
499+
; CHECK: [[VECTOR_PH]]:
500+
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
501+
; CHECK: [[VECTOR_BODY]]:
502+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
503+
; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP3:%.*]], %[[VECTOR_BODY]] ]
504+
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[INDEX]]
505+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i32, ptr [[ARRAYIDX1]], i32 0
506+
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4
507+
; CHECK-NEXT: [[TMP3]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]]
508+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
509+
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
510+
; CHECK: [[MIDDLE_BLOCK]]:
511+
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
512+
; CHECK-NEXT: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
513+
; CHECK: [[SCALAR_PH]]:
514+
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 4, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
515+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP4]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
516+
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
517+
; CHECK: [[FOR_COND_CLEANUP]]:
518+
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ], [ [[TMP4]], %[[MIDDLE_BLOCK]] ]
519+
; CHECK-NEXT: ret i32 [[ADD_LCSSA]]
520+
; CHECK: [[FOR_BODY]]:
521+
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
522+
; CHECK-NEXT: [[SUM_0179:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD]], %[[FOR_BODY]] ]
523+
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[INDVARS_IV]]
524+
; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
525+
; CHECK-NEXT: [[ADD]] = add i32 [[SUM_0179]], [[TMP5]]
526+
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
527+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 4
528+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
529+
;
530+
entry:
531+
br label %for.body
532+
533+
for.cond.cleanup: ; preds = %for.body
534+
%add.lcssa = phi i32 [ %add, %for.body ]
535+
ret i32 %add.lcssa
536+
537+
for.body: ; preds = %entry, %for.body
538+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
539+
%sum.0179 = phi i32 [ 0, %entry ], [ %add, %for.body ]
540+
%arrayidx1 = getelementptr inbounds nuw [4 x i32], ptr %tmp, i64 0, i64 %indvars.iv
541+
%0 = load i32, ptr %arrayidx1, align 4
542+
%add = add i32 %sum.0179, %0
543+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
544+
%exitcond.not = icmp eq i64 %indvars.iv.next, 4
545+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
546+
}
547+
548+
; This has an estimated trip-count of 4 derived from profile data (the branch weights in !2), and is expected to remain scalar.
549+
define i32 @tc4_from_profile(ptr noundef readonly captures(none) %tmp, i64 %N) vscale_range(1,16) {
550+
; CHECK-LABEL: define i32 @tc4_from_profile(
551+
; CHECK-SAME: ptr noundef readonly captures(none) [[TMP:%.*]], i64 [[N:%.*]]) #[[ATTR1]] {
552+
; CHECK-NEXT: [[ENTRY:.*]]:
553+
; CHECK-NEXT: br label %[[FOR_BODY:.*]]
554+
; CHECK: [[FOR_COND_CLEANUP:.*]]:
555+
; CHECK-NEXT: [[TMP4:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ]
556+
; CHECK-NEXT: ret i32 [[TMP4]]
557+
; CHECK: [[FOR_BODY]]:
558+
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
559+
; CHECK-NEXT: [[SUM_0179:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[ADD]], %[[FOR_BODY]] ]
560+
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds nuw [4 x i32], ptr [[TMP]], i64 0, i64 [[INDVARS_IV]]
561+
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4
562+
; CHECK-NEXT: [[ADD]] = add i32 [[SUM_0179]], [[TMP0]]
563+
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
564+
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
565+
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9:![0-9]+]]
566+
;
567+
entry:
568+
br label %for.body
569+
570+
for.cond.cleanup: ; preds = %for.body
571+
%add.lcssa = phi i32 [ %add, %for.body ]
572+
ret i32 %add.lcssa
573+
574+
for.body: ; preds = %entry, %for.body
575+
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
576+
%sum.0179 = phi i32 [ 0, %entry ], [ %add, %for.body ]
577+
%arrayidx1 = getelementptr inbounds nuw [4 x i32], ptr %tmp, i64 0, i64 %indvars.iv
578+
%0 = load i32, ptr %arrayidx1, align 4
579+
%add = add i32 %sum.0179, %0
580+
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
581+
%exitcond.not = icmp eq i64 %indvars.iv.next, %N
582+
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !prof !2
583+
}
584+
493585

494586
!0 = distinct !{!0, !1}
495587
!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
588+
!2 = !{!"branch_weights", i32 10, i32 30}
589+
496590
;.
497591
; CHECK-VS1: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
498592
; CHECK-VS1: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -501,6 +595,9 @@ while.end:
501595
; CHECK-VS1: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
502596
; CHECK-VS1: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
503597
; CHECK-VS1: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]}
598+
; CHECK-VS1: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
599+
; CHECK-VS1: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
600+
; CHECK-VS1: [[PROF9]] = !{!"branch_weights", i32 10, i32 30}
504601
;.
505602
; CHECK-VS2: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
506603
; CHECK-VS2: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
@@ -509,4 +606,7 @@ while.end:
509606
; CHECK-VS2: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]]}
510607
; CHECK-VS2: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
511608
; CHECK-VS2: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]]}
609+
; CHECK-VS2: [[LOOP7]] = distinct !{[[LOOP7]], [[META1]], [[META2]]}
610+
; CHECK-VS2: [[LOOP8]] = distinct !{[[LOOP8]], [[META2]], [[META1]]}
611+
; CHECK-VS2: [[PROF9]] = !{!"branch_weights", i32 10, i32 30}
512612
;.

0 commit comments

Comments
 (0)