@@ -987,25 +987,6 @@ class LoopVectorizationCostModel {
987
987
// / decision in a map for use in planning and plan execution.
988
988
void setVectorizedCallDecision (ElementCount VF);
989
989
990
- // / A struct that represents some properties of the register usage
991
- // / of a loop.
992
- struct RegisterUsage {
993
- // / Holds the number of loop invariant values that are used in the loop.
994
- // / The key is ClassID of target-provided register class.
995
- SmallMapVector<unsigned , unsigned , 4 > LoopInvariantRegs;
996
- // / Holds the maximum number of concurrent live intervals in the loop.
997
- // / The key is ClassID of target-provided register class.
998
- SmallMapVector<unsigned , unsigned , 4 > MaxLocalUsers;
999
-
1000
- // / Check if any of the tracked live intervals exceeds the number of
1001
- // / available registers for the target.
1002
- bool exceedsMaxNumRegs (const TargetTransformInfo &TTI) const {
1003
- return any_of (MaxLocalUsers, [&TTI](auto &LU) {
1004
- return LU.second > TTI.getNumberOfRegisters (LU.first );
1005
- });
1006
- }
1007
- };
1008
-
1009
990
// / Collect values we want to ignore in the cost model.
1010
991
void collectValuesToIgnore ();
1011
992
@@ -4343,15 +4324,6 @@ static bool hasReplicatorRegion(VPlan &Plan) {
4343
4324
}
4344
4325
4345
4326
#ifndef NDEBUG
4346
- // / Estimate the register usage for \p Plan and vectorization factors in \p VFs
4347
- // / by calculating the highest number of values that are live at a single
4348
- // / location as a rough estimate. Returns the register usage for each VF in \p
4349
- // / VFs.
4350
- static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 >
4351
- calculateRegisterUsage (VPlan &Plan, ArrayRef<ElementCount> VFs,
4352
- const TargetTransformInfo &TTI,
4353
- const SmallPtrSetImpl<const Value *> &ValuesToIgnore);
4354
-
4355
4327
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor () {
4356
4328
InstructionCost ExpectedCost = CM.expectedCost (ElementCount::getFixed (1 ));
4357
4329
LLVM_DEBUG (dbgs () << " LV: Scalar loop costs: " << ExpectedCost << " .\n " );
@@ -4377,7 +4349,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4377
4349
for (auto &P : VPlans) {
4378
4350
ArrayRef<ElementCount> VFs (P->vectorFactors ().begin (),
4379
4351
P->vectorFactors ().end ());
4380
- auto RUs = :: calculateRegisterUsage (*P, VFs, TTI, CM.ValuesToIgnore );
4352
+ auto RUs = calculateRegisterUsageForPlan (*P, VFs, TTI, CM.ValuesToIgnore );
4381
4353
for (auto [VF, RU] : zip_equal (VFs, RUs)) {
4382
4354
// The cost for scalar VF=1 is already calculated, so ignore it.
4383
4355
if (VF.isScalar ())
@@ -4704,254 +4676,6 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
4704
4676
}
4705
4677
}
4706
4678
4707
- // / Get the VF scaling factor applied to the recipe's output, if the recipe has
4708
- // / one.
4709
- static unsigned getVFScaleFactor (VPRecipeBase *R) {
4710
- if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
4711
- return RR->getVFScaleFactor ();
4712
- if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
4713
- return RR->getVFScaleFactor ();
4714
- return 1 ;
4715
- }
4716
-
4717
- // / Estimate the register usage for \p Plan and vectorization factors in \p VFs
4718
- // / by calculating the highest number of values that are live at a single
4719
- // / location as a rough estimate. Returns the register usage for each VF in \p
4720
- // / VFs.
4721
- static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 >
4722
- calculateRegisterUsage (VPlan &Plan, ArrayRef<ElementCount> VFs,
4723
- const TargetTransformInfo &TTI,
4724
- const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
4725
- // Each 'key' in the map opens a new interval. The values
4726
- // of the map are the index of the 'last seen' usage of the
4727
- // recipe that is the key.
4728
- using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned , 16 >;
4729
-
4730
- // Maps indices to recipes.
4731
- SmallVector<VPRecipeBase *, 64 > Idx2Recipe;
4732
- // Marks the end of each interval.
4733
- IntervalMap EndPoint;
4734
- // Saves the set of recipes that are used in the loop.
4735
- SmallPtrSet<VPRecipeBase *, 8 > Ends;
4736
- // Saves the list of values that are used in the loop but are defined outside
4737
- // the loop (not including non-recipe values such as arguments and
4738
- // constants).
4739
- SmallSetVector<VPValue *, 8 > LoopInvariants;
4740
- LoopInvariants.insert (&Plan.getVectorTripCount ());
4741
-
4742
- // We scan the loop in topological order and assign a number to
4743
- // each recipe. We use RPO to ensure that defs are met before their users. We
4744
- // assume that each recipe that has in-loop users starts an interval. We
4745
- // record every time that an in-loop value is used, so we have a list of the
4746
- // first and last occurrences of each recipe.
4747
- ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT (
4748
- Plan.getVectorLoopRegion ());
4749
- for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4750
- if (!VPBB->getParent ())
4751
- break ;
4752
- for (VPRecipeBase &R : *VPBB) {
4753
- Idx2Recipe.push_back (&R);
4754
-
4755
- // Save the end location of each USE.
4756
- for (VPValue *U : R.operands ()) {
4757
- auto *DefR = U->getDefiningRecipe ();
4758
-
4759
- // Ignore non-recipe values such as arguments, constants, etc.
4760
- // FIXME: Might need some motivation why these values are ignored. If
4761
- // for example an argument is used inside the loop it will increase the
4762
- // register pressure (so shouldn't we add it to LoopInvariants?).
4763
- if (!DefR && (!U->getLiveInIRValue () ||
4764
- !isa<Instruction>(U->getLiveInIRValue ())))
4765
- continue ;
4766
-
4767
- // If this recipe is outside the loop then record it and continue.
4768
- if (!DefR) {
4769
- LoopInvariants.insert (U);
4770
- continue ;
4771
- }
4772
-
4773
- // Overwrite previous end points.
4774
- EndPoint[DefR] = Idx2Recipe.size ();
4775
- Ends.insert (DefR);
4776
- }
4777
- }
4778
- if (VPBB == Plan.getVectorLoopRegion ()->getExiting ()) {
4779
- // VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
4780
- // exiting block, where their increment will get materialized eventually.
4781
- for (auto &R : Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
4782
- if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
4783
- EndPoint[&R] = Idx2Recipe.size ();
4784
- Ends.insert (&R);
4785
- }
4786
- }
4787
- }
4788
- }
4789
-
4790
- // Saves the list of intervals that end with the index in 'key'.
4791
- using RecipeList = SmallVector<VPRecipeBase *, 2 >;
4792
- SmallDenseMap<unsigned , RecipeList, 16 > TransposeEnds;
4793
-
4794
- // Next, we transpose the EndPoints into a multi map that holds the list of
4795
- // intervals that *end* at a specific location.
4796
- for (auto &Interval : EndPoint)
4797
- TransposeEnds[Interval.second ].push_back (Interval.first );
4798
-
4799
- SmallPtrSet<VPRecipeBase *, 8 > OpenIntervals;
4800
- SmallVector<LoopVectorizationCostModel::RegisterUsage, 8 > RUs (VFs.size ());
4801
- SmallVector<SmallMapVector<unsigned , unsigned , 4 >, 8 > MaxUsages (VFs.size ());
4802
-
4803
- LLVM_DEBUG (dbgs () << " LV(REG): Calculating max register usage:\n " );
4804
-
4805
- VPTypeAnalysis TypeInfo (Plan.getCanonicalIV ()->getScalarType ());
4806
-
4807
- const auto &TTICapture = TTI;
4808
- auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
4809
- if (Ty->isTokenTy () || !VectorType::isValidElementType (Ty) ||
4810
- (VF.isScalable () &&
4811
- !TTICapture.isElementTypeLegalForScalableVector (Ty)))
4812
- return 0 ;
4813
- return TTICapture.getRegUsageForType (VectorType::get (Ty, VF));
4814
- };
4815
-
4816
- // We scan the instructions linearly and record each time that a new interval
4817
- // starts, by placing it in a set. If we find this value in TransposeEnds then
4818
- // we remove it from the set. The max register usage is the maximum register
4819
- // usage of the recipes of the set.
4820
- for (unsigned int Idx = 0 , Sz = Idx2Recipe.size (); Idx < Sz; ++Idx) {
4821
- VPRecipeBase *R = Idx2Recipe[Idx];
4822
-
4823
- // Remove all of the recipes that end at this location.
4824
- RecipeList &List = TransposeEnds[Idx];
4825
- for (VPRecipeBase *ToRemove : List)
4826
- OpenIntervals.erase (ToRemove);
4827
-
4828
- // Ignore recipes that are never used within the loop and do not have side
4829
- // effects.
4830
- if (!Ends.count (R) && !R->mayHaveSideEffects ())
4831
- continue ;
4832
-
4833
- // Skip recipes for ignored values.
4834
- // TODO: Should mark recipes for ephemeral values that cannot be removed
4835
- // explicitly in VPlan.
4836
- if (isa<VPSingleDefRecipe>(R) &&
4837
- ValuesToIgnore.contains (
4838
- cast<VPSingleDefRecipe>(R)->getUnderlyingValue ()))
4839
- continue ;
4840
-
4841
- // For each VF find the maximum usage of registers.
4842
- for (unsigned J = 0 , E = VFs.size (); J < E; ++J) {
4843
- // Count the number of registers used, per register class, given all open
4844
- // intervals.
4845
- // Note that elements in this SmallMapVector will be default constructed
4846
- // as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
4847
- // there is no previous entry for ClassID.
4848
- SmallMapVector<unsigned , unsigned , 4 > RegUsage;
4849
-
4850
- for (auto *R : OpenIntervals) {
4851
- // Skip recipes that weren't present in the original loop.
4852
- // TODO: Remove after removing the legacy
4853
- // LoopVectorizationCostModel::calculateRegisterUsage
4854
- if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
4855
- VPBranchOnMaskRecipe>(R))
4856
- continue ;
4857
-
4858
- if (VFs[J].isScalar () ||
4859
- isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
4860
- VPScalarIVStepsRecipe>(R) ||
4861
- (isa<VPInstruction>(R) &&
4862
- all_of (cast<VPSingleDefRecipe>(R)->users (),
4863
- [&](VPUser *U) {
4864
- return cast<VPRecipeBase>(U)->usesScalars (
4865
- R->getVPSingleValue ());
4866
- })) ||
4867
- (isa<VPReductionPHIRecipe>(R) &&
4868
- (cast<VPReductionPHIRecipe>(R))->isInLoop ())) {
4869
- unsigned ClassID = TTI.getRegisterClassForType (
4870
- false , TypeInfo.inferScalarType (R->getVPSingleValue ()));
4871
- // FIXME: The target might use more than one register for the type
4872
- // even in the scalar case.
4873
- RegUsage[ClassID] += 1 ;
4874
- } else {
4875
- // The output from scaled phis and scaled reductions actually has
4876
- // fewer lanes than the VF.
4877
- unsigned ScaleFactor = getVFScaleFactor (R);
4878
- ElementCount VF = VFs[J].divideCoefficientBy (ScaleFactor);
4879
- LLVM_DEBUG (if (VF != VFs[J]) {
4880
- dbgs () << " LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
4881
- << " for " << *R << " \n " ;
4882
- });
4883
-
4884
- for (VPValue *DefV : R->definedValues ()) {
4885
- Type *ScalarTy = TypeInfo.inferScalarType (DefV);
4886
- unsigned ClassID = TTI.getRegisterClassForType (true , ScalarTy);
4887
- RegUsage[ClassID] += GetRegUsage (ScalarTy, VF);
4888
- }
4889
- }
4890
- }
4891
-
4892
- for (const auto &Pair : RegUsage) {
4893
- auto &Entry = MaxUsages[J][Pair.first ];
4894
- Entry = std::max (Entry, Pair.second );
4895
- }
4896
- }
4897
-
4898
- LLVM_DEBUG (dbgs () << " LV(REG): At #" << Idx << " Interval # "
4899
- << OpenIntervals.size () << ' \n ' );
4900
-
4901
- // Add the current recipe to the list of open intervals.
4902
- OpenIntervals.insert (R);
4903
- }
4904
-
4905
- // We also search for instructions that are defined outside the loop, but are
4906
- // used inside the loop. We need this number separately from the max-interval
4907
- // usage number because when we unroll, loop-invariant values do not take
4908
- // more registers.
4909
- LoopVectorizationCostModel::RegisterUsage RU;
4910
- for (unsigned Idx = 0 , End = VFs.size (); Idx < End; ++Idx) {
4911
- // Note that elements in this SmallMapVector will be default constructed
4912
- // as 0. So we can use "Invariant[ClassID] += n" in the code below even if
4913
- // there is no previous entry for ClassID.
4914
- SmallMapVector<unsigned , unsigned , 4 > Invariant;
4915
-
4916
- for (auto *In : LoopInvariants) {
4917
- // FIXME: The target might use more than one register for the type
4918
- // even in the scalar case.
4919
- bool IsScalar = all_of (In->users (), [&](VPUser *U) {
4920
- return cast<VPRecipeBase>(U)->usesScalars (In);
4921
- });
4922
-
4923
- ElementCount VF = IsScalar ? ElementCount::getFixed (1 ) : VFs[Idx];
4924
- unsigned ClassID = TTI.getRegisterClassForType (
4925
- VF.isVector (), TypeInfo.inferScalarType (In));
4926
- Invariant[ClassID] += GetRegUsage (TypeInfo.inferScalarType (In), VF);
4927
- }
4928
-
4929
- LLVM_DEBUG ({
4930
- dbgs () << " LV(REG): VF = " << VFs[Idx] << ' \n ' ;
4931
- dbgs () << " LV(REG): Found max usage: " << MaxUsages[Idx].size ()
4932
- << " item\n " ;
4933
- for (const auto &pair : MaxUsages[Idx]) {
4934
- dbgs () << " LV(REG): RegisterClass: "
4935
- << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
4936
- << " registers\n " ;
4937
- }
4938
- dbgs () << " LV(REG): Found invariant usage: " << Invariant.size ()
4939
- << " item\n " ;
4940
- for (const auto &pair : Invariant) {
4941
- dbgs () << " LV(REG): RegisterClass: "
4942
- << TTI.getRegisterClassName (pair.first ) << " , " << pair.second
4943
- << " registers\n " ;
4944
- }
4945
- });
4946
-
4947
- RU.LoopInvariantRegs = Invariant;
4948
- RU.MaxLocalUsers = MaxUsages[Idx];
4949
- RUs[Idx] = RU;
4950
- }
4951
-
4952
- return RUs;
4953
- }
4954
-
4955
4679
unsigned
4956
4680
LoopVectorizationCostModel::selectInterleaveCount (VPlan &Plan, ElementCount VF,
4957
4681
InstructionCost LoopCost) {
@@ -5002,8 +4726,8 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
5002
4726
return 1 ;
5003
4727
}
5004
4728
5005
- RegisterUsage R =
5006
- ::calculateRegisterUsage (Plan, {VF}, TTI, ValuesToIgnore)[0];
4729
+ VPRegisterUsage R =
4730
+ calculateRegisterUsageForPlan (Plan, {VF}, TTI, ValuesToIgnore)[0 ];
5007
4731
// We divide by these constants so assume that we have at least one
5008
4732
// instruction that uses at least one register.
5009
4733
for (auto &Pair : R.MaxLocalUsers ) {
@@ -7380,7 +7104,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
7380
7104
for (auto &P : VPlans) {
7381
7105
ArrayRef<ElementCount> VFs (P->vectorFactors ().begin (),
7382
7106
P->vectorFactors ().end ());
7383
- auto RUs = :: calculateRegisterUsage (*P, VFs, TTI, CM.ValuesToIgnore );
7107
+ auto RUs = calculateRegisterUsageForPlan (*P, VFs, TTI, CM.ValuesToIgnore );
7384
7108
for (auto [VF, RU] : zip_equal (VFs, RUs)) {
7385
7109
if (VF.isScalar ())
7386
7110
continue ;
0 commit comments