@@ -147,6 +147,20 @@ static cl::opt<unsigned> MinTreeSize(
147
147
" slp-min-tree-size" , cl::init(3 ), cl::Hidden,
148
148
cl::desc(" Only vectorize small trees if they are fully vectorizable" ));
149
149
150
+ // The maximum depth that the look-ahead score heuristic will explore.
151
+ // The higher this value, the higher the compilation time overhead.
152
+ static cl::opt<int > LookAheadMaxDepth (
153
+ " slp-max-look-ahead-depth" , cl::init(2 ), cl::Hidden,
154
+ cl::desc(" The maximum look-ahead depth for operand reordering scores" ));
155
+
156
+ // The Look-ahead heuristic goes through the users of the bundle to calculate
157
+ // the users cost in getExternalUsesCost(). To avoid compilation time increase
158
+ // we limit the number of users visited to this value.
159
+ static cl::opt<unsigned > LookAheadUsersBudget (
160
+ " slp-look-ahead-users-budget" , cl::init(2 ), cl::Hidden,
161
+ cl::desc(" The maximum number of users to visit while visiting the "
162
+ " predecessors. This prevents compilation time increase." ));
163
+
150
164
// Hidden option controlling whether the SLP trees are displayed with
// Graphviz.
static cl::opt<bool>
    ViewSLPTree("view-slp-tree", cl::Hidden,
                cl::desc("Display the SLP trees with Graphviz"));
@@ -721,6 +735,7 @@ class BoUpSLP {
721
735
722
736
const DataLayout &DL;
723
737
ScalarEvolution &SE;
738
+ const BoUpSLP &R;
724
739
725
740
// / \returns the operand data at \p OpIdx and \p Lane.
726
741
OperandData &getData (unsigned OpIdx, unsigned Lane) {
@@ -746,6 +761,227 @@ class BoUpSLP {
746
761
std::swap (OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
747
762
}
748
763
764
// The absolute values of the scores below matter little. Scoring one
// sub-tree against another essentially counts how many values match, so a
// decent matching would still come out if every score were set to 1.
// What the distinct values add is a way to break ties: they encode the
// order of preference, e.g. when choosing between matching loads and
// matching opcodes.

/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
static const int ScoreConsecutiveLoads = 3;
/// ExtractElementInst from the same vector with consecutive indexes.
static const int ScoreConsecutiveExtracts = 3;
/// A pair of constants.
static const int ScoreConstants = 2;
/// Instructions that share the same opcode.
static const int ScoreSameOpcode = 2;
/// Instructions with alternate opcodes (e.g., add + sub).
static const int ScoreAltOpcodes = 1;
/// The very same instruction in both lanes (a.k.a. splat or broadcast).
static const int ScoreSplat = 1;
/// Matching with an undef is preferable to failing.
static const int ScoreUndef = 1;
/// Score for failing to find a decent match.
static const int ScoreFail = 0;
/// Cost of a user that is external to the vectorized code.
static const int ExternalUseCost = 1;
/// Cost of a user that is internal but sits in a different lane.
static const int UserInDiffLaneCost = ExternalUseCost;
792
+
793
+ // / \returns the score of placing \p V1 and \p V2 in consecutive lanes.
794
+ static int getShallowScore (Value *V1, Value *V2, const DataLayout &DL,
795
+ ScalarEvolution &SE) {
796
+ auto *LI1 = dyn_cast<LoadInst>(V1);
797
+ auto *LI2 = dyn_cast<LoadInst>(V2);
798
+ if (LI1 && LI2)
799
+ return isConsecutiveAccess (LI1, LI2, DL, SE)
800
+ ? VLOperands::ScoreConsecutiveLoads
801
+ : VLOperands::ScoreFail;
802
+
803
+ auto *C1 = dyn_cast<Constant>(V1);
804
+ auto *C2 = dyn_cast<Constant>(V2);
805
+ if (C1 && C2)
806
+ return VLOperands::ScoreConstants;
807
+
808
+ // Extracts from consecutive indexes of the same vector better score as
809
+ // the extracts could be optimized away.
810
+ auto *Ex1 = dyn_cast<ExtractElementInst>(V1);
811
+ auto *Ex2 = dyn_cast<ExtractElementInst>(V2);
812
+ if (Ex1 && Ex2 && Ex1->getVectorOperand () == Ex2->getVectorOperand () &&
813
+ cast<ConstantInt>(Ex1->getIndexOperand ())->getZExtValue () + 1 ==
814
+ cast<ConstantInt>(Ex2->getIndexOperand ())->getZExtValue ()) {
815
+ return VLOperands::ScoreConsecutiveExtracts;
816
+ }
817
+
818
+ auto *I1 = dyn_cast<Instruction>(V1);
819
+ auto *I2 = dyn_cast<Instruction>(V2);
820
+ if (I1 && I2) {
821
+ if (I1 == I2)
822
+ return VLOperands::ScoreSplat;
823
+ InstructionsState S = getSameOpcode ({I1, I2});
824
+ // Note: Only consider instructions with <= 2 operands to avoid
825
+ // complexity explosion.
826
+ if (S.getOpcode () && S.MainOp ->getNumOperands () <= 2 )
827
+ return S.isAltShuffle () ? VLOperands::ScoreAltOpcodes
828
+ : VLOperands::ScoreSameOpcode;
829
+ }
830
+
831
+ if (isa<UndefValue>(V2))
832
+ return VLOperands::ScoreUndef;
833
+
834
+ return VLOperands::ScoreFail;
835
+ }
836
+
837
+ // / Holds the values and their lane that are taking part in the look-ahead
838
+ // / score calculation. This is used in the external uses cost calculation.
839
+ SmallDenseMap<Value *, int > InLookAheadValues;
840
+
841
+ // / \Returns the additinal cost due to uses of \p LHS and \p RHS that are
842
+ // / either external to the vectorized code, or require shuffling.
843
+ int getExternalUsesCost (const std::pair<Value *, int > &LHS,
844
+ const std::pair<Value *, int > &RHS) {
845
+ int Cost = 0 ;
846
+ SmallVector<std::pair<Value *, int >, 2 > Values = {LHS, RHS};
847
+ for (int Idx = 0 , IdxE = Values.size (); Idx != IdxE; ++Idx) {
848
+ Value *V = Values[Idx].first ;
849
+ // Calculate the absolute lane, using the minimum relative lane of LHS
850
+ // and RHS as base and Idx as the offset.
851
+ int Ln = std::min (LHS.second , RHS.second ) + Idx;
852
+ assert (Ln >= 0 && " Bad lane calculation" );
853
+ unsigned UsersBudget = LookAheadUsersBudget;
854
+ for (User *U : V->users ()) {
855
+ if (const TreeEntry *UserTE = R.getTreeEntry (U)) {
856
+ // The user is in the VectorizableTree. Check if we need to insert.
857
+ auto It = llvm::find (UserTE->Scalars , U);
858
+ assert (It != UserTE->Scalars .end () && " U is in UserTE" );
859
+ int UserLn = std::distance (UserTE->Scalars .begin (), It);
860
+ assert (UserLn >= 0 && " Bad lane" );
861
+ if (UserLn != Ln)
862
+ Cost += UserInDiffLaneCost;
863
+ } else {
864
+ // Check if the user is in the look-ahead code.
865
+ auto It2 = InLookAheadValues.find (U);
866
+ if (It2 != InLookAheadValues.end ()) {
867
+ // The user is in the look-ahead code. Check the lane.
868
+ if (It2->second != Ln)
869
+ Cost += UserInDiffLaneCost;
870
+ } else {
871
+ // The user is neither in SLP tree nor in the look-ahead code.
872
+ Cost += ExternalUseCost;
873
+ }
874
+ }
875
+ // Limit the number of visited uses to cap compilation time.
876
+ if (--UsersBudget == 0 )
877
+ break ;
878
+ }
879
+ }
880
+ return Cost;
881
+ }
882
+
883
+ // / Go through the operands of \p LHS and \p RHS recursively until \p
884
+ // / MaxLevel, and return the cummulative score. For example:
885
+ // / \verbatim
886
+ // / A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
887
+ // / \ / \ / \ / \ /
888
+ // / + + + +
889
+ // / G1 G2 G3 G4
890
+ // / \endverbatim
891
+ // / The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
892
+ // / each level recursively, accumulating the score. It starts from matching
893
+ // / the additions at level 0, then moves on to the loads (level 1). The
894
+ // / score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
895
+ // / {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
896
+ // / {A[0],C[0]} has a score of VLOperands::ScoreFail.
897
+ // / Please note that the order of the operands does not matter, as we
898
+ // / evaluate the score of all profitable combinations of operands. In
899
+ // / other words the score of G1 and G4 is the same as G1 and G2. This
900
+ // / heuristic is based on ideas described in:
901
+ // / Look-ahead SLP: Auto-vectorization in the presence of commutative
902
+ // / operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
903
+ // / Luís F. W. Góes
904
+ int getScoreAtLevelRec (const std::pair<Value *, int > &LHS,
905
+ const std::pair<Value *, int > &RHS, int CurrLevel,
906
+ int MaxLevel) {
907
+
908
+ Value *V1 = LHS.first ;
909
+ Value *V2 = RHS.first ;
910
+ // Get the shallow score of V1 and V2.
911
+ int ShallowScoreAtThisLevel =
912
+ std::max ((int )ScoreFail, getShallowScore (V1, V2, DL, SE) -
913
+ getExternalUsesCost (LHS, RHS));
914
+ int Lane1 = LHS.second ;
915
+ int Lane2 = RHS.second ;
916
+
917
+ // If reached MaxLevel,
918
+ // or if V1 and V2 are not instructions,
919
+ // or if they are SPLAT,
920
+ // or if they are not consecutive, early return the current cost.
921
+ auto *I1 = dyn_cast<Instruction>(V1);
922
+ auto *I2 = dyn_cast<Instruction>(V2);
923
+ if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
924
+ ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
925
+ (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
926
+ return ShallowScoreAtThisLevel;
927
+ assert (I1 && I2 && " Should have early exited." );
928
+
929
+ // Keep track of in-tree values for determining the external-use cost.
930
+ InLookAheadValues[V1] = Lane1;
931
+ InLookAheadValues[V2] = Lane2;
932
+
933
+ // Contains the I2 operand indexes that got matched with I1 operands.
934
+ SmallSet<unsigned , 4 > Op2Used;
935
+
936
+ // Recursion towards the operands of I1 and I2. We are trying all possbile
937
+ // operand pairs, and keeping track of the best score.
938
+ for (unsigned OpIdx1 = 0 , NumOperands1 = I1->getNumOperands ();
939
+ OpIdx1 != NumOperands1; ++OpIdx1) {
940
+ // Try to pair op1I with the best operand of I2.
941
+ int MaxTmpScore = 0 ;
942
+ unsigned MaxOpIdx2 = 0 ;
943
+ bool FoundBest = false ;
944
+ // If I2 is commutative try all combinations.
945
+ unsigned FromIdx = isCommutative (I2) ? 0 : OpIdx1;
946
+ unsigned ToIdx = isCommutative (I2)
947
+ ? I2->getNumOperands ()
948
+ : std::min (I2->getNumOperands (), OpIdx1 + 1 );
949
+ assert (FromIdx <= ToIdx && " Bad index" );
950
+ for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
951
+ // Skip operands already paired with OpIdx1.
952
+ if (Op2Used.count (OpIdx2))
953
+ continue ;
954
+ // Recursively calculate the cost at each level
955
+ int TmpScore = getScoreAtLevelRec ({I1->getOperand (OpIdx1), Lane1},
956
+ {I2->getOperand (OpIdx2), Lane2},
957
+ CurrLevel + 1 , MaxLevel);
958
+ // Look for the best score.
959
+ if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
960
+ MaxTmpScore = TmpScore;
961
+ MaxOpIdx2 = OpIdx2;
962
+ FoundBest = true ;
963
+ }
964
+ }
965
+ if (FoundBest) {
966
+ // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
967
+ Op2Used.insert (MaxOpIdx2);
968
+ ShallowScoreAtThisLevel += MaxTmpScore;
969
+ }
970
+ }
971
+ return ShallowScoreAtThisLevel;
972
+ }
973
+
974
+ // / \Returns the look-ahead score, which tells us how much the sub-trees
975
+ // / rooted at \p LHS and \p RHS match, the more they match the higher the
976
+ // / score. This helps break ties in an informed way when we cannot decide on
977
+ // / the order of the operands by just considering the immediate
978
+ // / predecessors.
979
+ int getLookAheadScore (const std::pair<Value *, int > &LHS,
980
+ const std::pair<Value *, int > &RHS) {
981
+ InLookAheadValues.clear ();
982
+ return getScoreAtLevelRec (LHS, RHS, 1 , LookAheadMaxDepth);
983
+ }
984
+
749
985
// Search all operands in Ops[*][Lane] for the one that matches best
750
986
// Ops[OpIdx][LastLane] and return its operand index.
751
987
// If no good match can be found, return None.
@@ -763,9 +999,6 @@ class BoUpSLP {
763
999
// The linearized opcode of the operand at OpIdx, Lane.
764
1000
bool OpIdxAPO = getData (OpIdx, Lane).APO ;
765
1001
766
- const unsigned BestScore = 2 ;
767
- const unsigned GoodScore = 1 ;
768
-
769
1002
// The best operand index and its score.
770
1003
// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
771
1004
// are using the score to differentiate between the two.
@@ -794,41 +1027,19 @@ class BoUpSLP {
794
1027
// Look for an operand that matches the current mode.
795
1028
switch (RMode) {
796
1029
case ReorderingMode::Load:
797
- if (isa<LoadInst>(Op)) {
798
- // Figure out which is left and right, so that we can check for
799
- // consecutive loads
800
- bool LeftToRight = Lane > LastLane;
801
- Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
802
- Value *OpRight = (LeftToRight) ? Op : OpLastLane;
803
- if (isConsecutiveAccess (cast<LoadInst>(OpLeft),
804
- cast<LoadInst>(OpRight), DL, SE))
805
- BestOp.Idx = Idx;
806
- }
807
- break ;
808
- case ReorderingMode::Opcode:
809
- // We accept both Instructions and Undefs, but with different scores.
810
- if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
811
- cast<Instruction>(Op)->getOpcode () ==
812
- cast<Instruction>(OpLastLane)->getOpcode ()) ||
813
- (isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
814
- isa<UndefValue>(Op)) {
815
- // An instruction has a higher score than an undef.
816
- unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
817
- if (Score > BestOp.Score ) {
818
- BestOp.Idx = Idx;
819
- BestOp.Score = Score;
820
- }
821
- }
822
- break ;
823
1030
case ReorderingMode::Constant:
824
- if (isa<Constant>(Op)) {
825
- unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
826
- if (Score > BestOp.Score ) {
827
- BestOp.Idx = Idx;
828
- BestOp.Score = Score;
829
- }
1031
+ case ReorderingMode::Opcode: {
1032
+ bool LeftToRight = Lane > LastLane;
1033
+ Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1034
+ Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1035
+ unsigned Score =
1036
+ getLookAheadScore ({OpLeft, LastLane}, {OpRight, Lane});
1037
+ if (Score > BestOp.Score ) {
1038
+ BestOp.Idx = Idx;
1039
+ BestOp.Score = Score;
830
1040
}
831
1041
break ;
1042
+ }
832
1043
case ReorderingMode::Splat:
833
1044
if (Op == OpLastLane)
834
1045
BestOp.Idx = Idx;
@@ -959,8 +1170,8 @@ class BoUpSLP {
959
1170
public:
960
1171
/// Initialize with all the operands of the instruction vector \p RootVL.
/// The references \p DL, \p SE and \p R are stored in the members of the
/// same name; the operands are then collected by appendOperandsOfVL().
961
1172
VLOperands (ArrayRef<Value *> RootVL, const DataLayout &DL,
962
- ScalarEvolution &SE)
963
- : DL(DL), SE(SE) {
1173
+ ScalarEvolution &SE, const BoUpSLP &R )
1174
+ : DL(DL), SE(SE), R(R) {
964
1175
// Append all the operands of RootVL.
965
1176
appendOperandsOfVL (RootVL);
966
1177
}
@@ -1189,7 +1400,8 @@ class BoUpSLP {
1189
1400
SmallVectorImpl<Value *> &Left,
1190
1401
SmallVectorImpl<Value *> &Right,
1191
1402
const DataLayout &DL,
1192
- ScalarEvolution &SE);
1403
+ ScalarEvolution &SE,
1404
+ const BoUpSLP &R);
1193
1405
struct TreeEntry {
1194
1406
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8 >;
1195
1407
TreeEntry (VecTreeTy &Container) : Container(Container) {}
@@ -2550,7 +2762,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
2550
2762
// Commutative predicate - collect + sort operands of the instructions
2551
2763
// so that each side is more likely to have the same opcode.
2552
2764
assert (P0 == SwapP0 && " Commutative Predicate mismatch" );
2553
- reorderInputsAccordingToOpcode (VL, Left, Right, *DL, *SE);
2765
+ reorderInputsAccordingToOpcode (VL, Left, Right, *DL, *SE, * this );
2554
2766
} else {
2555
2767
// Collect operands - commute if it uses the swapped predicate.
2556
2768
for (Value *V : VL) {
@@ -2597,7 +2809,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
2597
2809
// have the same opcode.
2598
2810
if (isa<BinaryOperator>(VL0) && VL0->isCommutative ()) {
2599
2811
ValueList Left, Right;
2600
- reorderInputsAccordingToOpcode (VL, Left, Right, *DL, *SE);
2812
+ reorderInputsAccordingToOpcode (VL, Left, Right, *DL, *SE, * this );
2601
2813
TE->setOperand (0 , Left);
2602
2814
TE->setOperand (1 , Right);
2603
2815
buildTree_rec (Left, Depth + 1 , {TE, 0 });
@@ -2789,7 +3001,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
2789
3001
// Reorder operands if reordering would enable vectorization.
2790
3002
if (isa<BinaryOperator>(VL0)) {
2791
3003
ValueList Left, Right;
2792
- reorderInputsAccordingToOpcode (VL, Left, Right, *DL, *SE);
3004
+ reorderInputsAccordingToOpcode (VL, Left, Right, *DL, *SE, * this );
2793
3005
TE->setOperand (0 , Left);
2794
3006
TE->setOperand (1 , Right);
2795
3007
buildTree_rec (Left, Depth + 1 , {TE, 0 });
@@ -3550,13 +3762,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
3550
3762
3551
3763
// Perform operand reordering on the instructions in VL and return the reordered
3552
3764
// operands in Left and Right.
3553
- void BoUpSLP::reorderInputsAccordingToOpcode (
3554
- ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
3555
- SmallVectorImpl<Value *> &Right, const DataLayout &DL,
3556
- ScalarEvolution &SE) {
3765
+ void BoUpSLP::reorderInputsAccordingToOpcode (ArrayRef<Value *> VL,
3766
+ SmallVectorImpl<Value *> &Left,
3767
+ SmallVectorImpl<Value *> &Right,
3768
+ const DataLayout &DL,
3769
+ ScalarEvolution &SE,
3770
+ const BoUpSLP &R) {
3557
3771
if (VL.empty ())
3558
3772
return ;
3559
- VLOperands Ops (VL, DL, SE);
3773
+ VLOperands Ops (VL, DL, SE, R );
3560
3774
// Reorder the operands in place.
3561
3775
Ops.reorder ();
3562
3776
Left = Ops.getVL (0 );
0 commit comments