Skip to content

Commit 2a30e24

Browse files
author
git apple-llvm automerger
committed
Merge commit 'adc5db6a3b0b' from apple/master into swift/master-next
2 parents cc45f76 + adc5db6 commit 2a30e24

File tree

3 files changed

+524
-137
lines changed

3 files changed

+524
-137
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 260 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,20 @@ static cl::opt<unsigned> MinTreeSize(
147147
"slp-min-tree-size", cl::init(3), cl::Hidden,
148148
cl::desc("Only vectorize small trees if they are fully vectorizable"));
149149

150+
// The maximum depth that the look-ahead score heuristic will explore.
151+
// The higher this value, the higher the compilation time overhead.
152+
static cl::opt<int> LookAheadMaxDepth(
153+
"slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
154+
cl::desc("The maximum look-ahead depth for operand reordering scores"));
155+
156+
// The Look-ahead heuristic goes through the users of the bundle to calculate
157+
// the users cost in getExternalUsesCost(). To avoid compilation time increase
158+
// we limit the number of users visited to this value.
159+
static cl::opt<unsigned> LookAheadUsersBudget(
160+
"slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
161+
cl::desc("The maximum number of users to visit while visiting the "
162+
"predecessors. This prevents compilation time increase."));
163+
150164
static cl::opt<bool>
151165
ViewSLPTree("view-slp-tree", cl::Hidden,
152166
cl::desc("Display the SLP trees with Graphviz"));
@@ -721,6 +735,7 @@ class BoUpSLP {
721735

722736
const DataLayout &DL;
723737
ScalarEvolution &SE;
738+
const BoUpSLP &R;
724739

725740
/// \returns the operand data at \p OpIdx and \p Lane.
726741
OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -746,6 +761,227 @@ class BoUpSLP {
746761
std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
747762
}
748763

764+
// The hard-coded scores listed here are not very important. When computing
765+
// the scores of matching one sub-tree with another, we are basically
766+
// counting the number of values that are matching. So even if all scores
767+
// are set to 1, we would still get a decent matching result.
768+
// However, sometimes we have to break ties. For example we may have to
769+
// choose between matching loads vs matching opcodes. This is what these
770+
// scores are helping us with: they provide the order of preference.
771+
772+
/// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
773+
static const int ScoreConsecutiveLoads = 3;
774+
/// ExtractElementInst from same vector and consecutive indexes.
775+
static const int ScoreConsecutiveExtracts = 3;
776+
/// Constants.
777+
static const int ScoreConstants = 2;
778+
/// Instructions with the same opcode.
779+
static const int ScoreSameOpcode = 2;
780+
/// Instructions with alt opcodes (e.g, add + sub).
781+
static const int ScoreAltOpcodes = 1;
782+
/// Identical instructions (a.k.a. splat or broadcast).
783+
static const int ScoreSplat = 1;
784+
/// Matching with an undef is preferable to failing.
785+
static const int ScoreUndef = 1;
786+
/// Score for failing to find a decent match.
787+
static const int ScoreFail = 0;
788+
/// User external to the vectorized code.
789+
static const int ExternalUseCost = 1;
790+
/// The user is internal but in a different lane.
791+
static const int UserInDiffLaneCost = ExternalUseCost;
792+
793+
/// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
794+
static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
795+
ScalarEvolution &SE) {
796+
auto *LI1 = dyn_cast<LoadInst>(V1);
797+
auto *LI2 = dyn_cast<LoadInst>(V2);
798+
if (LI1 && LI2)
799+
return isConsecutiveAccess(LI1, LI2, DL, SE)
800+
? VLOperands::ScoreConsecutiveLoads
801+
: VLOperands::ScoreFail;
802+
803+
auto *C1 = dyn_cast<Constant>(V1);
804+
auto *C2 = dyn_cast<Constant>(V2);
805+
if (C1 && C2)
806+
return VLOperands::ScoreConstants;
807+
808+
// Extracts from consecutive indexes of the same vector better score as
809+
// the extracts could be optimized away.
810+
auto *Ex1 = dyn_cast<ExtractElementInst>(V1);
811+
auto *Ex2 = dyn_cast<ExtractElementInst>(V2);
812+
if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() &&
813+
cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 ==
814+
cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) {
815+
return VLOperands::ScoreConsecutiveExtracts;
816+
}
817+
818+
auto *I1 = dyn_cast<Instruction>(V1);
819+
auto *I2 = dyn_cast<Instruction>(V2);
820+
if (I1 && I2) {
821+
if (I1 == I2)
822+
return VLOperands::ScoreSplat;
823+
InstructionsState S = getSameOpcode({I1, I2});
824+
// Note: Only consider instructions with <= 2 operands to avoid
825+
// complexity explosion.
826+
if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
827+
return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
828+
: VLOperands::ScoreSameOpcode;
829+
}
830+
831+
if (isa<UndefValue>(V2))
832+
return VLOperands::ScoreUndef;
833+
834+
return VLOperands::ScoreFail;
835+
}
836+
837+
/// Holds the values and their lane that are taking part in the look-ahead
838+
/// score calculation. This is used in the external uses cost calculation.
839+
SmallDenseMap<Value *, int> InLookAheadValues;
840+
841+
/// \Returns the additinal cost due to uses of \p LHS and \p RHS that are
842+
/// either external to the vectorized code, or require shuffling.
843+
int getExternalUsesCost(const std::pair<Value *, int> &LHS,
844+
const std::pair<Value *, int> &RHS) {
845+
int Cost = 0;
846+
SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
847+
for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
848+
Value *V = Values[Idx].first;
849+
// Calculate the absolute lane, using the minimum relative lane of LHS
850+
// and RHS as base and Idx as the offset.
851+
int Ln = std::min(LHS.second, RHS.second) + Idx;
852+
assert(Ln >= 0 && "Bad lane calculation");
853+
unsigned UsersBudget = LookAheadUsersBudget;
854+
for (User *U : V->users()) {
855+
if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
856+
// The user is in the VectorizableTree. Check if we need to insert.
857+
auto It = llvm::find(UserTE->Scalars, U);
858+
assert(It != UserTE->Scalars.end() && "U is in UserTE");
859+
int UserLn = std::distance(UserTE->Scalars.begin(), It);
860+
assert(UserLn >= 0 && "Bad lane");
861+
if (UserLn != Ln)
862+
Cost += UserInDiffLaneCost;
863+
} else {
864+
// Check if the user is in the look-ahead code.
865+
auto It2 = InLookAheadValues.find(U);
866+
if (It2 != InLookAheadValues.end()) {
867+
// The user is in the look-ahead code. Check the lane.
868+
if (It2->second != Ln)
869+
Cost += UserInDiffLaneCost;
870+
} else {
871+
// The user is neither in SLP tree nor in the look-ahead code.
872+
Cost += ExternalUseCost;
873+
}
874+
}
875+
// Limit the number of visited uses to cap compilation time.
876+
if (--UsersBudget == 0)
877+
break;
878+
}
879+
}
880+
return Cost;
881+
}
882+
883+
/// Go through the operands of \p LHS and \p RHS recursively until \p
884+
/// MaxLevel, and return the cummulative score. For example:
885+
/// \verbatim
886+
/// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
887+
/// \ / \ / \ / \ /
888+
/// + + + +
889+
/// G1 G2 G3 G4
890+
/// \endverbatim
891+
/// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
892+
/// each level recursively, accumulating the score. It starts from matching
893+
/// the additions at level 0, then moves on to the loads (level 1). The
894+
/// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
895+
/// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
896+
/// {A[0],C[0]} has a score of VLOperands::ScoreFail.
897+
/// Please note that the order of the operands does not matter, as we
898+
/// evaluate the score of all profitable combinations of operands. In
899+
/// other words the score of G1 and G4 is the same as G1 and G2. This
900+
/// heuristic is based on ideas described in:
901+
/// Look-ahead SLP: Auto-vectorization in the presence of commutative
902+
/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
903+
/// Luís F. W. Góes
904+
int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
905+
const std::pair<Value *, int> &RHS, int CurrLevel,
906+
int MaxLevel) {
907+
908+
Value *V1 = LHS.first;
909+
Value *V2 = RHS.first;
910+
// Get the shallow score of V1 and V2.
911+
int ShallowScoreAtThisLevel =
912+
std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
913+
getExternalUsesCost(LHS, RHS));
914+
int Lane1 = LHS.second;
915+
int Lane2 = RHS.second;
916+
917+
// If reached MaxLevel,
918+
// or if V1 and V2 are not instructions,
919+
// or if they are SPLAT,
920+
// or if they are not consecutive, early return the current cost.
921+
auto *I1 = dyn_cast<Instruction>(V1);
922+
auto *I2 = dyn_cast<Instruction>(V2);
923+
if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
924+
ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
925+
(isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
926+
return ShallowScoreAtThisLevel;
927+
assert(I1 && I2 && "Should have early exited.");
928+
929+
// Keep track of in-tree values for determining the external-use cost.
930+
InLookAheadValues[V1] = Lane1;
931+
InLookAheadValues[V2] = Lane2;
932+
933+
// Contains the I2 operand indexes that got matched with I1 operands.
934+
SmallSet<unsigned, 4> Op2Used;
935+
936+
// Recursion towards the operands of I1 and I2. We are trying all possbile
937+
// operand pairs, and keeping track of the best score.
938+
for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
939+
OpIdx1 != NumOperands1; ++OpIdx1) {
940+
// Try to pair op1I with the best operand of I2.
941+
int MaxTmpScore = 0;
942+
unsigned MaxOpIdx2 = 0;
943+
bool FoundBest = false;
944+
// If I2 is commutative try all combinations.
945+
unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
946+
unsigned ToIdx = isCommutative(I2)
947+
? I2->getNumOperands()
948+
: std::min(I2->getNumOperands(), OpIdx1 + 1);
949+
assert(FromIdx <= ToIdx && "Bad index");
950+
for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
951+
// Skip operands already paired with OpIdx1.
952+
if (Op2Used.count(OpIdx2))
953+
continue;
954+
// Recursively calculate the cost at each level
955+
int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
956+
{I2->getOperand(OpIdx2), Lane2},
957+
CurrLevel + 1, MaxLevel);
958+
// Look for the best score.
959+
if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
960+
MaxTmpScore = TmpScore;
961+
MaxOpIdx2 = OpIdx2;
962+
FoundBest = true;
963+
}
964+
}
965+
if (FoundBest) {
966+
// Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
967+
Op2Used.insert(MaxOpIdx2);
968+
ShallowScoreAtThisLevel += MaxTmpScore;
969+
}
970+
}
971+
return ShallowScoreAtThisLevel;
972+
}
973+
974+
/// \Returns the look-ahead score, which tells us how much the sub-trees
975+
/// rooted at \p LHS and \p RHS match, the more they match the higher the
976+
/// score. This helps break ties in an informed way when we cannot decide on
977+
/// the order of the operands by just considering the immediate
978+
/// predecessors.
979+
int getLookAheadScore(const std::pair<Value *, int> &LHS,
980+
const std::pair<Value *, int> &RHS) {
981+
InLookAheadValues.clear();
982+
return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
983+
}
984+
749985
// Search all operands in Ops[*][Lane] for the one that matches best
750986
// Ops[OpIdx][LastLane] and return its operand index.
751987
// If no good match can be found, return None.
@@ -763,9 +999,6 @@ class BoUpSLP {
763999
// The linearized opcode of the operand at OpIdx, Lane.
7641000
bool OpIdxAPO = getData(OpIdx, Lane).APO;
7651001

766-
const unsigned BestScore = 2;
767-
const unsigned GoodScore = 1;
768-
7691002
// The best operand index and its score.
7701003
// Sometimes we have more than one option (e.g., Opcode and Undefs), so we
7711004
// are using the score to differentiate between the two.
@@ -794,41 +1027,19 @@ class BoUpSLP {
7941027
// Look for an operand that matches the current mode.
7951028
switch (RMode) {
7961029
case ReorderingMode::Load:
797-
if (isa<LoadInst>(Op)) {
798-
// Figure out which is left and right, so that we can check for
799-
// consecutive loads
800-
bool LeftToRight = Lane > LastLane;
801-
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
802-
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
803-
if (isConsecutiveAccess(cast<LoadInst>(OpLeft),
804-
cast<LoadInst>(OpRight), DL, SE))
805-
BestOp.Idx = Idx;
806-
}
807-
break;
808-
case ReorderingMode::Opcode:
809-
// We accept both Instructions and Undefs, but with different scores.
810-
if ((isa<Instruction>(Op) && isa<Instruction>(OpLastLane) &&
811-
cast<Instruction>(Op)->getOpcode() ==
812-
cast<Instruction>(OpLastLane)->getOpcode()) ||
813-
(isa<UndefValue>(OpLastLane) && isa<Instruction>(Op)) ||
814-
isa<UndefValue>(Op)) {
815-
// An instruction has a higher score than an undef.
816-
unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
817-
if (Score > BestOp.Score) {
818-
BestOp.Idx = Idx;
819-
BestOp.Score = Score;
820-
}
821-
}
822-
break;
8231030
case ReorderingMode::Constant:
824-
if (isa<Constant>(Op)) {
825-
unsigned Score = (isa<UndefValue>(Op)) ? GoodScore : BestScore;
826-
if (Score > BestOp.Score) {
827-
BestOp.Idx = Idx;
828-
BestOp.Score = Score;
829-
}
1031+
case ReorderingMode::Opcode: {
1032+
bool LeftToRight = Lane > LastLane;
1033+
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
1034+
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
1035+
unsigned Score =
1036+
getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
1037+
if (Score > BestOp.Score) {
1038+
BestOp.Idx = Idx;
1039+
BestOp.Score = Score;
8301040
}
8311041
break;
1042+
}
8321043
case ReorderingMode::Splat:
8331044
if (Op == OpLastLane)
8341045
BestOp.Idx = Idx;
@@ -959,8 +1170,8 @@ class BoUpSLP {
9591170
public:
9601171
/// Initialize with all the operands of the instruction vector \p RootVL.
9611172
VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
962-
ScalarEvolution &SE)
963-
: DL(DL), SE(SE) {
1173+
ScalarEvolution &SE, const BoUpSLP &R)
1174+
: DL(DL), SE(SE), R(R) {
9641175
// Append all the operands of RootVL.
9651176
appendOperandsOfVL(RootVL);
9661177
}
@@ -1189,7 +1400,8 @@ class BoUpSLP {
11891400
SmallVectorImpl<Value *> &Left,
11901401
SmallVectorImpl<Value *> &Right,
11911402
const DataLayout &DL,
1192-
ScalarEvolution &SE);
1403+
ScalarEvolution &SE,
1404+
const BoUpSLP &R);
11931405
struct TreeEntry {
11941406
using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
11951407
TreeEntry(VecTreeTy &Container) : Container(Container) {}
@@ -2550,7 +2762,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
25502762
// Commutative predicate - collect + sort operands of the instructions
25512763
// so that each side is more likely to have the same opcode.
25522764
assert(P0 == SwapP0 && "Commutative Predicate mismatch");
2553-
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
2765+
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
25542766
} else {
25552767
// Collect operands - commute if it uses the swapped predicate.
25562768
for (Value *V : VL) {
@@ -2597,7 +2809,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
25972809
// have the same opcode.
25982810
if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
25992811
ValueList Left, Right;
2600-
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
2812+
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
26012813
TE->setOperand(0, Left);
26022814
TE->setOperand(1, Right);
26032815
buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -2789,7 +3001,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
27893001
// Reorder operands if reordering would enable vectorization.
27903002
if (isa<BinaryOperator>(VL0)) {
27913003
ValueList Left, Right;
2792-
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE);
3004+
reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
27933005
TE->setOperand(0, Left);
27943006
TE->setOperand(1, Right);
27953007
buildTree_rec(Left, Depth + 1, {TE, 0});
@@ -3550,13 +3762,15 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
35503762

35513763
// Perform operand reordering on the instructions in VL and return the reordered
35523764
// operands in Left and Right.
3553-
void BoUpSLP::reorderInputsAccordingToOpcode(
3554-
ArrayRef<Value *> VL, SmallVectorImpl<Value *> &Left,
3555-
SmallVectorImpl<Value *> &Right, const DataLayout &DL,
3556-
ScalarEvolution &SE) {
3765+
void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
3766+
SmallVectorImpl<Value *> &Left,
3767+
SmallVectorImpl<Value *> &Right,
3768+
const DataLayout &DL,
3769+
ScalarEvolution &SE,
3770+
const BoUpSLP &R) {
35573771
if (VL.empty())
35583772
return;
3559-
VLOperands Ops(VL, DL, SE);
3773+
VLOperands Ops(VL, DL, SE, R);
35603774
// Reorder the operands in place.
35613775
Ops.reorder();
35623776
Left = Ops.getVL(0);

0 commit comments

Comments
 (0)