@@ -4863,250 +4863,6 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
4863
4863
}
4864
4864
}
4865
4865
4866
- /// Get the VF scaling factor applied to the recipe's output, if the recipe has
4867
- /// one.
4868
- static unsigned getVFScaleFactor (VPRecipeBase *R) {
4869
- if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
4870
- return RR->getVFScaleFactor ();
4871
- if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
4872
- return RR->getVFScaleFactor ();
4873
- return 1 ;
4874
- }
4875
-
4876
- /// Estimate the register usage for \p Plan and vectorization factors in \p VFs
4877
- /// by calculating the highest number of values that are live at a single
4878
- /// location as a rough estimate. Returns the register usage for each VF in \p
4879
- /// VFs.
4880
- static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 >
4881
- calculateRegisterUsage (VPlan &Plan, ArrayRef<ElementCount> VFs,
4882
- const TargetTransformInfo &TTI,
4883
- const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
4884
- // Each 'key' in the map opens a new interval. The values
4885
- // of the map are the index of the 'last seen' usage of the
4886
- // recipe that is the key.
4887
- using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned , 16 >;
4888
-
4889
- // Maps indices to recipes.
4890
- SmallVector<VPRecipeBase *, 64 > Idx2Recipe;
4891
- // Marks the end of each interval.
4892
- IntervalMap EndPoint;
4893
- // Saves the set of recipes whose values are used in the loop.
4894
- SmallPtrSet<VPRecipeBase *, 8 > Ends;
4895
- // Saves the list of values that are used in the loop but are defined outside
4896
- // the loop (not including non-recipe values such as arguments and
4897
- // constants).
4898
- SmallSetVector<VPValue *, 8 > LoopInvariants;
4899
- LoopInvariants.insert (&Plan.getVectorTripCount ());
4900
-
4901
- // We scan the loop in topological order and assign a number to
4902
- // each recipe. We use RPO to ensure that defs are met before their users. We
4903
- // assume that each recipe that has in-loop users starts an interval. We
4904
- // record every time that an in-loop value is used, so we have a list of the
4905
- // first and last occurrences of each recipe.
4906
- ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT (
4907
- Plan.getVectorLoopRegion ());
4908
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4909
- if (!VPBB->getParent ())
4910
- break ;
4911
- for (VPRecipeBase &R : *VPBB) {
4912
- Idx2Recipe.push_back (&R);
4913
-
4914
- // Save the end location of each USE.
4915
- for (VPValue *U : R.operands ()) {
4916
- auto *DefR = U->getDefiningRecipe ();
4917
-
4918
- // Ignore non-recipe values such as arguments, constants, etc.
4919
- // FIXME: Might need some motivation why these values are ignored. If
4920
- // for example an argument is used inside the loop it will increase the
4921
- // register pressure (so shouldn't we add it to LoopInvariants).
4922
- if (!DefR && (!U->getLiveInIRValue () ||
4923
- !isa<Instruction>(U->getLiveInIRValue ())))
4924
- continue ;
4925
-
4926
- // If this recipe is outside the loop then record it and continue.
4927
- if (!DefR) {
4928
- LoopInvariants.insert (U);
4929
- continue ;
4930
- }
4931
-
4932
- // Overwrite previous end points.
4933
- EndPoint[DefR] = Idx2Recipe.size ();
4934
- Ends.insert (DefR);
4935
- }
4936
- }
4937
- if (VPBB == Plan.getVectorLoopRegion ()->getExiting ()) {
4938
- // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
4939
- // exiting block, where their increment will get materialized eventually.
4940
- for (auto &R : Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
4941
- if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
4942
- EndPoint[&R] = Idx2Recipe.size ();
4943
- Ends.insert (&R);
4944
- }
4945
- }
4946
- }
4947
- }
4948
-
4949
- // Saves the list of intervals that end with the index in 'key'.
4950
- using RecipeList = SmallVector<VPRecipeBase *, 2 >;
4951
- SmallDenseMap<unsigned , RecipeList, 16 > TransposeEnds;
4952
-
4953
- // Next, we transpose the EndPoints into a multi map that holds the list of
4954
- // intervals that *end* at a specific location.
4955
- for (auto &Interval : EndPoint)
4956
- TransposeEnds[Interval.second ].push_back (Interval.first );
4957
-
4958
- SmallPtrSet<VPRecipeBase *, 8 > OpenIntervals;
4959
- SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 > RUs (VFs.size ());
4960
- SmallVector<SmallMapVector<unsigned , unsigned , 4 >, 8 > MaxUsages (VFs.size ());
4961
-
4962
- LLVM_DEBUG (dbgs () << " LV(REG): Calculating max register usage:\n " );
4963
-
4964
- VPTypeAnalysis TypeInfo (Plan.getCanonicalIV ()->getScalarType ());
4965
-
4966
- const auto &TTICapture = TTI;
4967
- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
4968
- if (Ty->isTokenTy () || !VectorType::isValidElementType (Ty) ||
4969
- (VF.isScalable () &&
4970
- !TTICapture.isElementTypeLegalForScalableVector (Ty)))
4971
- return 0 ;
4972
- return TTICapture.getRegUsageForType (VectorType::get (Ty, VF));
4973
- };
4974
-
4975
- // We scan the instructions linearly and record each time that a new interval
4976
- // starts, by placing it in a set. If we find this value in TransposeEnds then
4977
- // we remove it from the set. The max register usage is the maximum register
4978
- // usage of the recipes of the set.
4979
- for (unsigned int Idx = 0 , Sz = Idx2Recipe.size (); Idx < Sz; ++Idx) {
4980
- VPRecipeBase *R = Idx2Recipe[Idx];
4981
-
4982
- // Remove all of the recipes that end at this location.
4983
- RecipeList &List = TransposeEnds[Idx];
4984
- for (VPRecipeBase *ToRemove : List)
4985
- OpenIntervals.erase (ToRemove);
4986
-
4987
- // Ignore recipes that are never used within the loop and do not have side
4988
- // effects.
4989
- if (!Ends.count (R) && !R->mayHaveSideEffects ())
4990
- continue ;
4991
-
4992
- // Skip recipes for ignored values.
4993
- // TODO: Should mark recipes for ephemeral values that cannot be removed
4994
- // explicitly in VPlan.
4995
- if (isa<VPSingleDefRecipe>(R) &&
4996
- ValuesToIgnore.contains (
4997
- cast<VPSingleDefRecipe>(R)->getUnderlyingValue ()))
4998
- continue ;
4999
-
5000
- // For each VF find the maximum usage of registers.
5001
- for (unsigned J = 0 , E = VFs.size (); J < E; ++J) {
5002
- // Count the number of registers used, per register class, given all open
5003
- // intervals.
5004
- // Note that elements in this SmallMapVector will be default constructed
5005
- // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5006
- // there is no previous entry for ClassID.
5007
- SmallMapVector<unsigned , unsigned , 4 > RegUsage;
5008
-
5009
- for (auto *R : OpenIntervals) {
5010
- // Skip recipes that weren't present in the original loop.
5011
- // TODO: Remove after removing the legacy
5012
- // LoopVectorizationCostModel::calculateRegisterUsage
5013
- if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
5014
- VPBranchOnMaskRecipe>(R))
5015
- continue ;
5016
-
5017
- if (VFs[J].isScalar () ||
5018
- isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
5019
- VPScalarIVStepsRecipe>(R) ||
5020
- (isa<VPInstruction>(R) &&
5021
- all_of (cast<VPSingleDefRecipe>(R)->users (), [&](VPUser *U) {
5022
- return cast<VPRecipeBase>(U)->usesScalars (R->getVPSingleValue ());
5023
- }))) {
5024
- unsigned ClassID = TTI.getRegisterClassForType (
5025
- false , TypeInfo.inferScalarType (R->getVPSingleValue ()));
5026
- // FIXME: The target might use more than one register for the type
5027
- // even in the scalar case.
5028
- RegUsage[ClassID] += 1 ;
5029
- } else {
5030
- // The output from scaled phis and scaled reductions actually has
5031
- // fewer lanes than the VF.
5032
- unsigned ScaleFactor = getVFScaleFactor (R);
5033
- ElementCount VF = VFs[J].divideCoefficientBy (ScaleFactor);
5034
- LLVM_DEBUG (if (VF != VFs[J]) {
5035
- dbgs () << " LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
5036
- << " for " << *R << " \n " ;
5037
- });
5038
-
5039
- for (VPValue *DefV : R->definedValues ()) {
5040
- Type *ScalarTy = TypeInfo.inferScalarType (DefV);
5041
- unsigned ClassID = TTI.getRegisterClassForType (true , ScalarTy);
5042
- RegUsage[ClassID] += GetRegUsage (ScalarTy, VF);
5043
- }
5044
- }
5045
- }
5046
-
5047
- for (const auto &Pair : RegUsage) {
5048
- auto &Entry = MaxUsages[J][Pair.first ];
5049
- Entry = std::max (Entry, Pair.second );
5050
- }
5051
- }
5052
-
5053
- LLVM_DEBUG (dbgs () << " LV(REG): At #" << Idx << " Interval # "
5054
- << OpenIntervals.size () << ' \n ' );
5055
-
5056
- // Add the current recipe to the list of open intervals.
5057
- OpenIntervals.insert (R);
5058
- }
5059
-
5060
- // We also search for instructions that are defined outside the loop, but are
5061
- // used inside the loop. We need this number separately from the max-interval
5062
- // usage number because when we unroll, loop-invariant values do not take
5063
- // more registers.
5064
- LoopVectorizationCostModel::RegisterUsage RU;
5065
- for (unsigned Idx = 0 , End = VFs.size (); Idx < End; ++Idx) {
5066
- // Note that elements in this SmallMapVector will be default constructed
5067
- // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5068
- // there is no previous entry for ClassID.
5069
- SmallMapVector<unsigned , unsigned , 4 > Invariant;
5070
-
5071
- for (auto *In : LoopInvariants) {
5072
- // FIXME: The target might use more than one register for the type
5073
- // even in the scalar case.
5074
- bool IsScalar = all_of (In->users (), [&](VPUser *U) {
5075
- return cast<VPRecipeBase>(U)->usesScalars (In);
5076
- });
5077
-
5078
- ElementCount VF = IsScalar ? ElementCount::getFixed (1 ) : VFs[Idx];
5079
- unsigned ClassID = TTI.getRegisterClassForType (
5080
- VF.isVector (), TypeInfo.inferScalarType (In));
5081
- Invariant[ClassID] += GetRegUsage (TypeInfo.inferScalarType (In), VF);
5082
- }
5083
-
5084
- LLVM_DEBUG ({
5085
- dbgs () << " LV(REG): VF = " << VFs[Idx] << ' \n ' ;
5086
- dbgs () << " LV(REG): Found max usage: " << MaxUsages[Idx].size ()
5087
- << " item\n " ;
5088
- for (const auto &pair : MaxUsages[Idx]) {
5089
- dbgs () << " LV(REG): RegisterClass: "
5090
- << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5091
- << " registers\n " ;
5092
- }
5093
- dbgs () << " LV(REG): Found invariant usage: " << Invariant.size ()
5094
- << " item\n " ;
5095
- for (const auto &pair : Invariant) {
5096
- dbgs () << " LV(REG): RegisterClass: "
5097
- << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
5098
- << " registers\n " ;
5099
- }
5100
- });
5101
-
5102
- RU.LoopInvariantRegs = Invariant;
5103
- RU.MaxLocalUsers = MaxUsages[Idx];
5104
- RUs[Idx] = RU;
5105
- }
5106
-
5107
- return RUs;
5108
- }
5109
-
5110
4866
unsigned
5111
4867
LoopVectorizationCostModel::selectInterleaveCount (VPlan &Plan, ElementCount VF,
5112
4868
InstructionCost LoopCost) {
@@ -5158,8 +4914,8 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
5158
4914
return 1 ;
5159
4915
}
5160
4916
5161
- RegisterUsage R =
5162
- ::calculateRegisterUsage (Plan, {VF}, TTI, ValuesToIgnore)[0];
4917
+ VPRegisterUsage R =
4918
+ calculateRegisterUsageForVPlan (Plan, {VF}, TTI, ValuesToIgnore)[0 ];
5163
4919
// We divide by these constants so assume that we have at least one
5164
4920
// instruction that uses at least one register.
5165
4921
for (auto &Pair : R.MaxLocalUsers ) {
0 commit comments