Skip to content

[LoopUtils] Don't wrap in getLoopEstimatedTripCount #129080

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Mar 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions llvm/include/llvm/Transforms/Utils/LoopUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -318,8 +318,9 @@ void addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
/// Returns a loop's estimated trip count based on branch weight metadata.
/// In addition if \p EstimatedLoopInvocationWeight is not null it is
/// initialized with weight of loop's latch leading to the exit.
/// Returns 0 when the count is estimated to be 0, or std::nullopt when a
/// meaningful estimate can not be made.
/// Returns a valid positive trip count, or std::nullopt when a meaningful
/// estimate cannot be made (including when the trip count wouldn't fit in the
/// result type).
std::optional<unsigned>
getLoopEstimatedTripCount(Loop *L,
unsigned *EstimatedLoopInvocationWeight = nullptr);
Expand Down
24 changes: 11 additions & 13 deletions llvm/lib/Transforms/Utils/LoopPeel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -640,20 +640,18 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
LLVM_DEBUG(dbgs() << "Profile-based estimated trip count is "
<< *EstimatedTripCount << "\n");

if (*EstimatedTripCount) {
if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount) {
unsigned PeelCount = *EstimatedTripCount;
LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n");
PP.PeelCount = PeelCount;
return;
}
LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n");
LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
LLVM_DEBUG(dbgs() << "Max peel count by cost: "
<< (Threshold / LoopSize - 1) << "\n");
if (*EstimatedTripCount + AlreadyPeeled <= MaxPeelCount) {
unsigned PeelCount = *EstimatedTripCount;
LLVM_DEBUG(dbgs() << "Peeling first " << PeelCount << " iterations.\n");
PP.PeelCount = PeelCount;
return;
}
LLVM_DEBUG(dbgs() << "Already peel count: " << AlreadyPeeled << "\n");
LLVM_DEBUG(dbgs() << "Max peel count: " << UnrollPeelMaxCount << "\n");
LLVM_DEBUG(dbgs() << "Loop cost: " << LoopSize << "\n");
LLVM_DEBUG(dbgs() << "Max peel cost: " << Threshold << "\n");
LLVM_DEBUG(dbgs() << "Max peel count by cost: "
<< (Threshold / LoopSize - 1) << "\n");
}
}

Expand Down
7 changes: 6 additions & 1 deletion llvm/lib/Transforms/Utils/LoopUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -820,7 +820,7 @@ static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {

/// Return the estimated trip count for any exiting branch which dominates
/// the loop latch.
static std::optional<uint64_t> getEstimatedTripCount(BranchInst *ExitingBranch,
static std::optional<unsigned> getEstimatedTripCount(BranchInst *ExitingBranch,
Loop *L,
uint64_t &OrigExitWeight) {
// To estimate the number of times the loop body was executed, we want to
Expand All @@ -842,6 +842,11 @@ static std::optional<uint64_t> getEstimatedTripCount(BranchInst *ExitingBranch,
// Estimated exit count is a ratio of the loop weight by the weight of the
// edge exiting the loop, rounded to nearest.
uint64_t ExitCount = llvm::divideNearest(LoopWeight, ExitWeight);

// When ExitCount + 1 would wrap in unsigned, return std::nullopt instead.
if (ExitCount >= std::numeric_limits<unsigned>::max())
return std::nullopt;

// Estimated trip count is one plus estimated exit count.
return ExitCount + 1;
}
Expand Down
9 changes: 5 additions & 4 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -421,8 +421,10 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
/// Returns "best known" trip count, which is either a valid positive trip count
/// or std::nullopt when an estimate cannot be made (including when the trip
/// count would overflow), for the specified loop \p L as defined by the
/// following procedure:
/// 1) Returns exact trip count if it is known.
/// 2) Returns expected trip count according to profile data if any.
/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
Expand Down Expand Up @@ -2032,7 +2034,6 @@ class GeneratedRTChecks {
PSE, OuterLoop, /* CanUseConstantMax = */ false))
BestTripCount = *EstimatedTC;

BestTripCount = std::max(BestTripCount, 1U);
InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;

// Let's ensure the cost is always at least 1.
Expand Down Expand Up @@ -5029,7 +5030,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
if (TailTripCountUB == TailTripCountLB)
MaxInterleaveCount = InterleaveCountUB;
}
} else if (BestKnownTC && *BestKnownTC > 0) {
} else if (BestKnownTC) {
// At least one iteration must be scalar when this constraint holds. So the
// maximum available iterations for interleaving is one less.
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ target triple = "aarch64-linux-gnu"

%pair = type { i8, i8 }

; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
; For a loop with a profile-guided estimated TC of 32, when the auto-vectorizer chooses VF 16,
; it should conservatively choose IC 1 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
Expand All @@ -29,7 +29,7 @@ for.end:
ret void
}

; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
; For a loop with a profile-guided estimated TC of 33, when the auto-vectorizer chooses VF 16,
; it should conservatively choose IC 1 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
Expand All @@ -53,7 +53,7 @@ for.end:
ret void
}

; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
; For a loop with a profile-guided estimated TC of 48, when the auto-vectorizer chooses VF 16,
; it should conservatively choose IC 1 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_profile_tc_48(ptr noalias %p, ptr noalias %q, i64 %n) {
Expand All @@ -77,7 +77,7 @@ for.end:
ret void
}

; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
; For a loop with a profile-guided estimated TC of 63, when the auto-vectorizer chooses VF 16,
; it should conservatively choose IC 1 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_profile_tc_63(ptr noalias %p, ptr noalias %q, i64 %n) {
Expand All @@ -101,7 +101,7 @@ for.end:
ret void
}

; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
; For a loop with a profile-guided estimated TC of 64, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 2 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_profile_tc_64(ptr noalias %p, ptr noalias %q, i64 %n) {
Expand All @@ -125,10 +125,10 @@ for.end:
ret void
}

; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
; This has the same profile-guided estimated trip count as loop_with_profile_tc_64 but since the
; resulting interleaved group in this case may access memory out-of-bounds, it requires a scalar
; epilogue iteration for correctness, making at most 63 iterations available for interleaving.
; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
; When the auto-vectorizer chooses VF 16, it should choose IC 1 to leave a smaller scalar
; remainder than IC 2
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 1)
define void @loop_with_profile_tc_64_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
Expand All @@ -149,7 +149,7 @@ for.end:
ret void
}

; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
; For a loop with a profile-guided estimated TC of 100, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 2 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_profile_tc_100(ptr noalias %p, ptr noalias %q, i64 %n) {
Expand All @@ -173,7 +173,7 @@ for.end:
ret void
}

; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
; For a loop with a profile-guided estimated TC of 128, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 4 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
define void @loop_with_profile_tc_128(ptr noalias %p, ptr noalias %q, i64 %n) {
Expand Down Expand Up @@ -206,11 +206,11 @@ for.end:
ret void
}

; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
; the resulting interleaved group in this case may access memory out-of-bounds, it requires
; a scalar epilogue iteration for correctness, making at most 127 iterations available for
; This has the same profile-guided estimated trip count as loop_with_profile_tc_128 but since
; the resulting interleaved group in this case may access memory out-of-bounds, it requires
; a scalar epilogue iteration for correctness, making at most 127 iterations available for
; interleaving.
; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
; When the auto-vectorizer chooses VF 16, it should choose IC 2 to leave a smaller scalar
; remainder than IC 4
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
define void @loop_with_profile_tc_128_scalar_epilogue_reqd(ptr noalias %p, ptr noalias %q, i64 %n) {
Expand Down Expand Up @@ -240,7 +240,7 @@ for.end:
ret void
}

; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
; For a loop with a profile-guided estimated TC of 129, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 4 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
define void @loop_with_profile_tc_129(ptr noalias %p, ptr noalias %q, i64 %n) {
Expand All @@ -264,7 +264,7 @@ for.end:
ret void
}

; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
; For a loop with a profile-guided estimated TC of 180, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 4 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
define void @loop_with_profile_tc_180(ptr noalias %p, ptr noalias %q, i64 %n) {
Expand All @@ -288,7 +288,7 @@ for.end:
ret void
}

; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
; For a loop with a profile-guided estimated TC of 193, when the auto-vectorizer chooses VF 16,
; it should choose conservatively IC 4 so that the vector loop runs twice at least
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 4)
define void @loop_with_profile_tc_193(ptr noalias %p, ptr noalias %q, i64 %n) {
Expand All @@ -312,7 +312,7 @@ for.end:
ret void
}

; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
; For a loop with a profile-guided estimated TC of 1000, when the auto-vectorizer chooses VF 16,
; the IC will be capped by the target-specific maximum interleave count
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
define void @loop_with_profile_tc_1000(ptr noalias %p, ptr noalias %q, i64 %n) {
Expand All @@ -336,6 +336,30 @@ for.end:
ret void
}

; When the loop weight is UINT_MAX, and the exit count is 1, the trip count
; computation could wrap.
; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 8)
define void @loop_with_profile_wrap(ptr noalias %p, ptr noalias %q, i64 %n) {
entry:
br label %for.body

for.body:
%i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
%tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
%tmp1 = load i8, ptr %tmp0, align 1
%tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
%tmp3 = load i8, ptr %tmp2, align 1
%add = add i8 %tmp1, %tmp3
%qi = getelementptr i8, ptr %q, i64 %i
store i8 %add, ptr %qi, align 1
%i.next = add nuw nsw i64 %i, 1
%cond = icmp eq i64 %i.next, %n
br i1 %cond, label %for.end, label %for.body, !prof !11

for.end:
ret void
}

!0 = !{!"branch_weights", i32 1, i32 31}
!1 = !{!"branch_weights", i32 1, i32 32}
!2 = !{!"branch_weights", i32 1, i32 47}
Expand All @@ -347,3 +371,4 @@ for.end:
!8 = !{!"branch_weights", i32 1, i32 179}
!9 = !{!"branch_weights", i32 1, i32 192}
!10 = !{!"branch_weights", i32 1, i32 999}
!11 = !{!"branch_weights", i32 1, i32 -1}
Original file line number Diff line number Diff line change
Expand Up @@ -180,8 +180,8 @@ outer.exit:
define void @outer_pgo_minus1(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_minus1'
; CHECK: Calculating cost of runtime checks:
; CHECK-NOT: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced
; CHECK: Total cost of runtime checks: 6
; CHECK: We expect runtime memory checks to be hoisted out of the outer loop. Cost reduced from 6 to 3
; CHECK: Total cost of runtime checks: 3
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
entry:
br label %outer.loop
Expand Down