@@ -4930,7 +4930,6 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4930
4930
if (Legal->hasUncountableEarlyExit ())
4931
4931
return 1 ;
4932
4932
4933
- auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop);
4934
4933
const bool HasReductions = !Legal->getReductionVars ().empty ();
4935
4934
4936
4935
// If we did not calculate the cost for VF (because the user selected the VF)
@@ -5006,25 +5005,33 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5006
5005
}
5007
5006
5008
5007
unsigned EstimatedVF = getEstimatedRuntimeVF (VF, VScaleForTuning);
5009
- unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
5010
- if (KnownTC > 0 ) {
5011
- // At least one iteration must be scalar when this constraint holds. So the
5012
- // maximum available iterations for interleaving is one less.
5013
- unsigned AvailableTC =
5014
- requiresScalarEpilogue (VF.isVector ()) ? KnownTC - 1 : KnownTC;
5015
-
5016
- // If trip count is known we select between two prospective ICs, where
5008
+
5009
+ // Try to get the exact trip count, or an estimate based on profiling data or
5010
+ // ConstantMax from PSE, failing that.
5011
+ if (auto BestKnownTC = getSmallBestKnownTC (PSE, TheLoop)) {
5012
+ // At least one iteration must be scalar when a scalar epilogue is required,
5013
+ // so the maximum available iterations for interleaving is one less.
5014
+ unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
5015
+ ? (*BestKnownTC) - 1
5016
+ : *BestKnownTC;
5017
+
5018
+ unsigned InterleaveCountLB = bit_floor (std::max (
5019
+ 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5020
+
5021
+ if (PSE.getSE ()->getSmallConstantTripCount (TheLoop) > 0 ) {
5022
+ // If the estimated trip count is actually an exact one we select between
5023
+ // two prospective ICs, where
5024
+ //
5017
5025
// 1) the aggressive IC is capped by the trip count divided by VF
5018
5026
// 2) the conservative IC is capped by the trip count divided by (VF * 2)
5027
+ //
5019
5028
// The final IC is selected in a way that the epilogue loop trip count is
5020
5029
// minimized while maximizing the IC itself, so that we either run the
5021
- // vector loop at least once if it generates a small epilogue loop, or else
5022
- // we run the vector loop at least twice.
5030
+ // vector loop at least once if it generates a small epilogue loop, or
5031
+ // else we run the vector loop at least twice.
5023
5032
5024
- unsigned InterleaveCountUB = bit_floor (
5025
- std::max (1u , std::min (AvailableTC / EstimatedVF, MaxInterleaveCount)));
5026
- unsigned InterleaveCountLB = bit_floor (std::max (
5027
- 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5033
+ unsigned InterleaveCountUB = bit_floor (std::max (
5034
+ 1u , std::min (AvailableTC / EstimatedVF, MaxInterleaveCount)));
5028
5035
MaxInterleaveCount = InterleaveCountLB;
5029
5036
5030
5037
if (InterleaveCountUB != InterleaveCountLB) {
@@ -5037,20 +5044,14 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5037
5044
if (TailTripCountUB == TailTripCountLB)
5038
5045
MaxInterleaveCount = InterleaveCountUB;
5039
5046
}
5040
- } else if (BestKnownTC) {
5041
- // At least one iteration must be scalar when this constraint holds. So the
5042
- // maximum available iterations for interleaving is one less.
5043
- unsigned AvailableTC = requiresScalarEpilogue (VF.isVector ())
5044
- ? (*BestKnownTC) - 1
5045
- : *BestKnownTC;
5046
-
5047
- // If trip count is an estimated compile time constant, limit the
5048
- // IC to be capped by the trip count divided by VF * 2, such that the vector
5049
- // loop runs at least twice to make interleaving seem profitable when there
5050
- // is an epilogue loop present. Since exact Trip count is not known we
5051
- // choose to be conservative in our IC estimate.
5052
- MaxInterleaveCount = bit_floor (std::max (
5053
- 1u , std::min (AvailableTC / (EstimatedVF * 2 ), MaxInterleaveCount)));
5047
+ } else {
5048
+ // If trip count is an estimated compile time constant, limit the
5049
+ // IC to be capped by the trip count divided by VF * 2, such that the
5050
+ // vector loop runs at least twice to make interleaving seem profitable
5051
+ // when there is an epilogue loop present. Since the exact trip count is not
5052
+ // known we choose to be conservative in our IC estimate.
5053
+ MaxInterleaveCount = InterleaveCountLB;
5054
+ }
5054
5055
}
5055
5056
5056
5057
assert (MaxInterleaveCount > 0 &&
0 commit comments