@@ -218,15 +218,18 @@ static bool allSameBlock(ArrayRef<Value *> VL) {
218
218
return true ;
219
219
}
220
220
221
+ // / \returns True if \p V is a Constant that is neither a ConstantExpr nor a
222
+ // / GlobalValue.
223
+ static bool isConstant (Value *V) {
224
+ return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V);
225
+ }
226
+
221
227
// / \returns True if all of the values in \p VL are constants (but not
222
228
// / globals/constant expressions).
223
229
static bool allConstant (ArrayRef<Value *> VL) {
224
230
// Constant expressions and globals can't be vectorized like normal integer/FP
225
231
// constants.
226
- for (Value *i : VL)
227
- if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i))
228
- return false ;
229
- return true ;
232
+ return all_of (VL, isConstant);
230
233
}
231
234
232
235
// / \returns True if all of the values in \p VL are identical.
@@ -4725,6 +4728,8 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
4725
4728
// Iterate in reverse order to consider insert elements with the high cost.
4726
4729
for (unsigned I = VL.size (); I > 0 ; --I) {
4727
4730
unsigned Idx = I - 1 ;
4731
+ if (isConstant (VL[Idx]))
4732
+ continue ;
4728
4733
if (!UniqueElements.insert (VL[Idx]).second )
4729
4734
ShuffledElements.insert (Idx);
4730
4735
}
@@ -4810,108 +4815,79 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
4810
4815
}
4811
4816
4812
4817
Value *BoUpSLP::gather (ArrayRef<Value *> VL) {
4813
- Value *Val0 =
4814
- isa<StoreInst>(VL[0 ]) ? cast<StoreInst>(VL[0 ])->getValueOperand () : VL[0 ];
4815
- FixedVectorType *VecTy = FixedVectorType::get (Val0->getType (), VL.size ());
4816
- Value *Vec = PoisonValue::get (VecTy);
4817
- unsigned InsIndex = 0 ;
4818
- for (Value *Val : VL) {
4819
- Vec = Builder.CreateInsertElement (Vec, Val, Builder.getInt32 (InsIndex++));
4818
+ // List of instructions/lanes from current block and/or the blocks which are
4819
+ // part of the current loop. These instructions will be inserted at the end to
4820
+ // make it possible to optimize loops and hoist invariant instructions out of
4821
+ // the loop's body with better chances for success.
4822
+ SmallVector<std::pair<Value *, unsigned >, 4 > PostponedInsts;
4823
+ SmallSet<int , 4 > PostponedIndices;
4824
+ Loop *L = LI->getLoopFor (Builder.GetInsertBlock ());
4825
+ auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
4826
+ SmallPtrSet<BasicBlock *, 4 > Visited;
4827
+ while (InsertBB && InsertBB != InstBB && Visited.insert (InsertBB).second )
4828
+ InsertBB = InsertBB->getSinglePredecessor ();
4829
+ return InsertBB && InsertBB == InstBB;
4830
+ };
4831
+ for (int I = 0 , E = VL.size (); I < E; ++I) {
4832
+ if (auto *Inst = dyn_cast<Instruction>(VL[I]))
4833
+ if ((CheckPredecessor (Inst->getParent (), Builder.GetInsertBlock ()) ||
4834
+ getTreeEntry (Inst) || (L && (L->contains (Inst)))) &&
4835
+ PostponedIndices.insert (I).second )
4836
+ PostponedInsts.emplace_back (Inst, I);
4837
+ }
4838
+
4839
+ auto &&CreateInsertElement = [this ](Value *Vec, Value *V, unsigned Pos) {
4840
+ // No need to insert undef elements - return the vector unchanged.
4841
+ if (isa<UndefValue>(V))
4842
+ return Vec;
4843
+ Vec = Builder.CreateInsertElement (Vec, V, Builder.getInt32 (Pos));
4820
4844
auto *InsElt = dyn_cast<InsertElementInst>(Vec);
4821
4845
if (!InsElt)
4822
- continue ;
4846
+ return Vec ;
4823
4847
GatherSeq.insert (InsElt);
4824
4848
CSEBlocks.insert (InsElt->getParent ());
4825
4849
// Add to our 'need-to-extract' list.
4826
- if (TreeEntry *Entry = getTreeEntry (Val )) {
4850
+ if (TreeEntry *Entry = getTreeEntry (V )) {
4827
4851
// Find which lane we need to extract.
4828
- int FoundLane =
4829
- findLaneForValue (Entry->Scalars , Entry->ReuseShuffleIndices , Val);
4830
- ExternalUses.push_back (ExternalUser (Val, InsElt, FoundLane));
4831
- }
4832
- }
4833
-
4834
- return Vec;
4835
- }
4836
-
4837
- Value *BoUpSLP::vectorizeTree (ArrayRef<Value *> VL) {
4838
- InstructionsState S = getSameOpcode (VL);
4839
- if (S.getOpcode ()) {
4840
- if (TreeEntry *E = getTreeEntry (S.OpValue )) {
4841
- if (E->isSame (VL)) {
4842
- Value *V = vectorizeTree (E);
4843
- if (VL.size () == E->Scalars .size () && !E->ReuseShuffleIndices .empty ()) {
4844
- // Reshuffle to get only unique values.
4845
- // If some of the scalars are duplicated in the vectorization tree
4846
- // entry, we do not vectorize them but instead generate a mask for the
4847
- // reuses. But if there are several users of the same entry, they may
4848
- // have different vectorization factors. This is especially important
4849
- // for PHI nodes. In this case, we need to adapt the resulting
4850
- // instruction for the user vectorization factor and have to reshuffle
4851
- // it again to take only unique elements of the vector. Without this
4852
- // code the function incorrectly returns reduced vector instruction
4853
- // with the same elements, not with the unique ones.
4854
- // block:
4855
- // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
4856
- // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1>
4857
- // ... (use %2)
4858
- // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2}
4859
- // br %block
4860
- SmallVector<int , 4 > UniqueIdxs;
4861
- SmallSet<int , 4 > UsedIdxs;
4862
- int Pos = 0 ;
4863
- for (int Idx : E->ReuseShuffleIndices ) {
4864
- if (UsedIdxs.insert (Idx).second )
4865
- UniqueIdxs.emplace_back (Pos);
4866
- ++Pos;
4867
- }
4868
- V = Builder.CreateShuffleVector (V, UniqueIdxs, " shrink.shuffle" );
4869
- }
4870
- return V;
4852
+ unsigned FoundLane =
4853
+ std::distance (Entry->Scalars .begin (), find (Entry->Scalars , V));
4854
+ assert (FoundLane < Entry->Scalars .size () && " Couldn't find extract lane" );
4855
+ if (!Entry->ReuseShuffleIndices .empty ()) {
4856
+ FoundLane = std::distance (Entry->ReuseShuffleIndices .begin (),
4857
+ find (Entry->ReuseShuffleIndices , FoundLane));
4871
4858
}
4859
+ ExternalUses.emplace_back (V, InsElt, FoundLane);
4872
4860
}
4861
+ return Vec;
4862
+ };
4863
+ Value *Val0 =
4864
+ isa<StoreInst>(VL[0 ]) ? cast<StoreInst>(VL[0 ])->getValueOperand () : VL[0 ];
4865
+ FixedVectorType *VecTy = FixedVectorType::get (Val0->getType (), VL.size ());
4866
+ Value *Vec = PoisonValue::get (VecTy);
4867
+ for (int I = 0 , E = VL.size (); I < E; ++I) {
4868
+ if (PostponedIndices.contains (I))
4869
+ continue ;
4870
+ Vec = CreateInsertElement (Vec, VL[I], I);
4873
4871
}
4872
+ // Append instructions, which are/may be part of the loop, at the end to make
4873
+ // it possible to hoist non-loop-based instructions.
4874
+ for (const std::pair<Value *, unsigned > &Pair : PostponedInsts)
4875
+ Vec = CreateInsertElement (Vec, Pair.first , Pair.second );
4874
4876
4875
- // Check that every instruction appears once in this bundle.
4876
- SmallVector<int , 4 > ReuseShuffleIndicies;
4877
- SmallVector<Value *, 4 > UniqueValues;
4878
- if (VL.size () > 2 ) {
4879
- DenseMap<Value *, unsigned > UniquePositions;
4880
- for (Value *V : VL) {
4881
- auto Res = UniquePositions.try_emplace (V, UniqueValues.size ());
4882
- ReuseShuffleIndicies.emplace_back (Res.first ->second );
4883
- if (Res.second || isa<Constant>(V))
4884
- UniqueValues.emplace_back (V);
4885
- }
4886
- // Do not shuffle single element or if number of unique values is not power
4887
- // of 2.
4888
- if (UniqueValues.size () == VL.size () || UniqueValues.size () <= 1 ||
4889
- !llvm::isPowerOf2_32 (UniqueValues.size ()))
4890
- ReuseShuffleIndicies.clear ();
4891
- else
4892
- VL = UniqueValues;
4893
- }
4894
-
4895
- Value *Vec = gather (VL);
4896
- if (!ReuseShuffleIndicies.empty ()) {
4897
- Vec = Builder.CreateShuffleVector (Vec, ReuseShuffleIndicies, " shuffle" );
4898
- if (auto *I = dyn_cast<Instruction>(Vec)) {
4899
- GatherSeq.insert (I);
4900
- CSEBlocks.insert (I->getParent ());
4901
- }
4902
- }
4903
4877
return Vec;
4904
4878
}
4905
4879
4906
4880
namespace {
4907
4881
// / Merges shuffle masks and emits final shuffle instruction, if required.
4908
4882
class ShuffleInstructionBuilder {
4909
4883
IRBuilderBase &Builder;
4884
+ const unsigned VF = 0 ;
4910
4885
bool IsFinalized = false ;
4911
4886
SmallVector<int , 4 > Mask;
4912
4887
4913
4888
public:
4914
- ShuffleInstructionBuilder (IRBuilderBase &Builder) : Builder(Builder) {}
4889
+ ShuffleInstructionBuilder (IRBuilderBase &Builder, unsigned VF)
4890
+ : Builder(Builder), VF(VF) {}
4915
4891
4916
4892
// / Adds a mask, inverting it before applying.
4917
4893
void addInversedMask (ArrayRef<unsigned > SubMask) {
@@ -4938,8 +4914,9 @@ class ShuffleInstructionBuilder {
4938
4914
SmallVector<int , 4 > NewMask (SubMask.size (), SubMask.size ());
4939
4915
int TermValue = std::min (Mask.size (), SubMask.size ());
4940
4916
for (int I = 0 , E = SubMask.size (); I < E; ++I) {
4941
- if (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue) {
4942
- NewMask[I] = E;
4917
+ if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
4918
+ Mask[SubMask[I]] >= TermValue) {
4919
+ NewMask[I] = UndefMaskElem;
4943
4920
continue ;
4944
4921
}
4945
4922
NewMask[I] = Mask[SubMask[I]];
@@ -4949,7 +4926,14 @@ class ShuffleInstructionBuilder {
4949
4926
4950
4927
Value *finalize (Value *V) {
4951
4928
IsFinalized = true ;
4952
- if (Mask.empty ())
4929
+ unsigned ValueVF = cast<FixedVectorType>(V->getType ())->getNumElements ();
4930
+ if (VF == ValueVF && Mask.empty ())
4931
+ return V;
4932
+ SmallVector<int , 4 > NormalizedMask (VF, UndefMaskElem);
4933
+ std::iota (NormalizedMask.begin (), NormalizedMask.end (), 0 );
4934
+ addMask (NormalizedMask);
4935
+
4936
+ if (VF == ValueVF && ShuffleVectorInst::isIdentityMask (Mask))
4953
4937
return V;
4954
4938
return Builder.CreateShuffleVector (V, Mask, " shuffle" );
4955
4939
}
@@ -4961,6 +4945,120 @@ class ShuffleInstructionBuilder {
4961
4945
};
4962
4946
} // namespace
4963
4947
4948
+ Value *BoUpSLP::vectorizeTree (ArrayRef<Value *> VL) { // Returns a vector for VL: reuses a matching tree entry, else gathers.
4949
+ unsigned VF = VL.size (); // Requested width; may be raised to PowerOf2Ceil(NumValues) below.
4950
+ InstructionsState S = getSameOpcode (VL);
4951
+ if (S.getOpcode ()) {
4952
+ if (TreeEntry *E = getTreeEntry (S.OpValue ))
4953
+ if (E->isSame (VL)) {
4954
+ Value *V = vectorizeTree (E);
4955
+ if (VF != cast<FixedVectorType>(V->getType ())->getNumElements ()) { // Entry's vector width differs from the requested VF.
4956
+ if (!E->ReuseShuffleIndices .empty ()) {
4957
+ // Reshuffle to get only unique values.
4958
+ // If some of the scalars are duplicated in the vectorization tree
4959
+ // entry, we do not vectorize them but instead generate a mask for
4960
+ // the reuses. But if there are several users of the same entry,
4961
+ // they may have different vectorization factors. This is especially
4962
+ // important for PHI nodes. In this case, we need to adapt the
4963
+ // resulting instruction for the user vectorization factor and have
4964
+ // to reshuffle it again to take only unique elements of the vector.
4965
+ // Without this code the function incorrectly returns reduced vector
4966
+ // instruction with the same elements, not with the unique ones.
4967
+
4968
+ // block:
4969
+ // %phi = phi <2 x > { .., %entry} {%shuffle, %block}
4970
+ // %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1>
4971
+ // ... (use %2)
4972
+ // %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2}
4973
+ // br %block
4974
+ SmallVector<int > UniqueIdxs;
4975
+ SmallSet<int , 4 > UsedIdxs;
4976
+ int Pos = 0 ;
4977
+ int Sz = VL.size ();
4978
+ for (int Idx : E->ReuseShuffleIndices ) {
4979
+ if (Idx != Sz && UsedIdxs.insert (Idx).second ) // Keep the first position of each index; indices equal to VL.size() are skipped.
4980
+ UniqueIdxs.emplace_back (Pos);
4981
+ ++Pos;
4982
+ }
4983
+ assert (VF >= UsedIdxs.size () && " Expected vectorization factor "
4984
+ " less than original vector size." ); // NOTE(review): message wording does not match the condition checked - confirm.
4985
+ UniqueIdxs.append (VF - UsedIdxs.size (), UndefMaskElem); // Pad the mask with undef lanes up to VF.
4986
+ V = Builder.CreateShuffleVector (V, UniqueIdxs, " shrink.shuffle" );
4987
+ } else {
4988
+ assert (VF < cast<FixedVectorType>(V->getType ())->getNumElements () &&
4989
+ " Expected vectorization factor less "
4990
+ " than original vector size." );
4991
+ SmallVector<int > UniformMask (VF, 0 );
4992
+ std::iota (UniformMask.begin (), UniformMask.end (), 0 ); // Mask 0..VF-1: take the first VF lanes of V.
4993
+ V = Builder.CreateShuffleVector (V, UniformMask, " shrink.shuffle" );
4994
+ }
4995
+ }
4996
+ return V;
4997
+ }
4998
+ }
4999
+
5000
+ // Check that every instruction appears once in this bundle; repeats are recorded in ReuseShuffleIndicies.
5001
+ SmallVector<int > ReuseShuffleIndicies;
5002
+ SmallVector<Value *> UniqueValues;
5003
+ if (VL.size () > 2 ) {
5004
+ DenseMap<Value *, unsigned > UniquePositions;
5005
+ unsigned NumValues =
5006
+ std::distance (VL.begin (), find_if (reverse (VL), [](Value *V) {
5007
+ return !isa<UndefValue>(V);
5008
+ }).base ()); // Number of leading values up to and including the last non-undef one.
5009
+ VF = std::max<unsigned >(VF, PowerOf2Ceil (NumValues));
5010
+ int UniqueVals = 0 ;
5011
+ bool HasUndefs = false ;
5012
+ for (Value *V : VL.drop_back (VL.size () - VF)) { // NOTE(review): drops nothing when VF == VL.size(); if PowerOf2Ceil(NumValues) > VL.size() the subtraction would wrap - confirm.
5013
+ if (isa<UndefValue>(V)) {
5014
+ ReuseShuffleIndicies.emplace_back (UndefMaskElem);
5015
+ HasUndefs = true ;
5016
+ continue ;
5017
+ }
5018
+ if (isConstant (V)) { // Constants are not deduplicated: each one gets its own slot.
5019
+ ReuseShuffleIndicies.emplace_back (UniqueValues.size ());
5020
+ UniqueValues.emplace_back (V);
5021
+ continue ;
5022
+ }
5023
+ auto Res = UniquePositions.try_emplace (V, UniqueValues.size ());
5024
+ ReuseShuffleIndicies.emplace_back (Res.first ->second );
5025
+ if (Res.second ) {
5026
+ UniqueValues.emplace_back (V);
5027
+ ++UniqueVals;
5028
+ }
5029
+ }
5030
+ if (HasUndefs && UniqueVals == 1 && UniqueValues.size () == 1 ) {
5031
+ // Emit pure splat vector.
5032
+ // FIXME: why is this not identified as an identity?
5033
+ unsigned NumUndefs = count (ReuseShuffleIndicies, UndefMaskElem);
5034
+ if (NumUndefs == ReuseShuffleIndicies.size () - 1 )
5035
+ ReuseShuffleIndicies.append (VF - ReuseShuffleIndicies.size (),
5036
+ UndefMaskElem);
5037
+ else
5038
+ ReuseShuffleIndicies.assign (VF, 0 );
5039
+ } else if (UniqueValues.size () >= VF - 1 || UniqueValues.size () <= 1 ) {
5040
+ ReuseShuffleIndicies.clear (); // No reuse mask in this case: gather the first NumValues of VL as-is.
5041
+ UniqueValues.clear ();
5042
+ UniqueValues.append (VL.begin (), std::next (VL.begin (), NumValues));
5043
+ }
5044
+ UniqueValues.append (VF - UniqueValues.size (),
5045
+ UndefValue::get (VL[0 ]->getType ())); // Pad with undef values up to VF elements.
5046
+ VL = UniqueValues;
5047
+ }
5048
+
5049
+ ShuffleInstructionBuilder ShuffleBuilder (Builder, VF);
5050
+ Value *Vec = gather (VL);
5051
+ if (!ReuseShuffleIndicies.empty ()) {
5052
+ ShuffleBuilder.addMask (ReuseShuffleIndicies);
5053
+ Vec = ShuffleBuilder.finalize (Vec);
5054
+ if (auto *I = dyn_cast<Instruction>(Vec)) { // Record the result in GatherSeq/CSEBlocks when it is an instruction.
5055
+ GatherSeq.insert (I);
5056
+ CSEBlocks.insert (I->getParent ());
5057
+ }
5058
+ }
5059
+ return Vec;
5060
+ }
5061
+
4964
5062
Value *BoUpSLP::vectorizeTree (TreeEntry *E) {
4965
5063
IRBuilder<>::InsertPointGuard Guard (Builder);
4966
5064
@@ -4969,8 +5067,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
4969
5067
return E->VectorizedValue ;
4970
5068
}
4971
5069
4972
- ShuffleInstructionBuilder ShuffleBuilder (Builder);
4973
5070
bool NeedToShuffleReuses = !E->ReuseShuffleIndices .empty ();
5071
+ unsigned VF = E->Scalars .size ();
5072
+ if (NeedToShuffleReuses)
5073
+ VF = E->ReuseShuffleIndices .size ();
5074
+ ShuffleInstructionBuilder ShuffleBuilder (Builder, VF);
4974
5075
if (E->State == TreeEntry::NeedToGather) {
4975
5076
setInsertPointAfterBundle (E);
4976
5077
Value *Vec;
0 commit comments