Skip to content

Commit 576261a

Browse files
[SLP]Improve reordering for consts, splats and ops from same nodes + improved analysis.
Improved detection of const/splat candidates, their matching, and the analysis of instructions from the same nodes.

Metric: size..text

| Program | size..text (results) | size..text (results0) | diff |
|---|---|---|---|
| test-suite :: MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE.test | 92952.00 | 93096.00 | 0.2% |
| test-suite :: External/SPEC/CINT2006/464.h264ref/464.h264ref.test | 779832.00 | 780136.00 | 0.0% |
| test-suite :: MultiSource/Applications/JM/lencod/lencod.test | 839923.00 | 840179.00 | 0.0% |
| test-suite :: MultiSource/Applications/JM/ldecod/ldecod.test | 392708.00 | 392740.00 | 0.0% |
| test-suite :: External/SPEC/CFP2017rate/511.povray_r/511.povray_r.test | 1171131.00 | 1171147.00 | 0.0% |
| test-suite :: External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r.test | 1391089.00 | 1391073.00 | -0.0% |
| test-suite :: External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s.test | 1391089.00 | 1391073.00 | -0.0% |
| test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test | 12352780.00 | 12352636.00 | -0.0% |

Notes per benchmark:

- MultiSource/Benchmarks/DOE-ProxyApps-C++/miniFE/miniFE - small reordering.
- External/SPEC/CINT2006/464.h264ref/464.h264ref - slightly better code after reordering.
- MultiSource/Applications/JM/lencod/lencod - smaller code with fewer shuffles.
- MultiSource/Applications/JM/ldecod/ldecod - same.
- External/SPEC/CFP2017rate/511.povray_r/511.povray_r - 2 extra loads vectorized, smaller code.
- External/SPEC/CFP2017rate/538.imagick_r/538.imagick_r - better code; size increased because of more constant vectors.
- External/SPEC/CFP2017speed/638.imagick_s/638.imagick_s - same.
- External/SPEC/CFP2017rate/526.blender_r/526.blender_r - small change in the vectorized code; some code is a bit better, some a bit worse.

Reviewers: RKSimon

Reviewed By: RKSimon

Pull Request: #87091
1 parent 67e726a commit 576261a

13 files changed

+229
-181
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 76 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1395,12 +1395,19 @@ class BoUpSLP {
13951395
return LookAheadHeuristics::ScoreSplat;
13961396
}
13971397

1398+
auto CheckSameEntryOrFail = [&]() {
1399+
if (const TreeEntry *TE1 = R.getTreeEntry(V1);
1400+
TE1 && TE1 == R.getTreeEntry(V2))
1401+
return LookAheadHeuristics::ScoreSplatLoads;
1402+
return LookAheadHeuristics::ScoreFail;
1403+
};
1404+
13981405
auto *LI1 = dyn_cast<LoadInst>(V1);
13991406
auto *LI2 = dyn_cast<LoadInst>(V2);
14001407
if (LI1 && LI2) {
14011408
if (LI1->getParent() != LI2->getParent() || !LI1->isSimple() ||
14021409
!LI2->isSimple())
1403-
return LookAheadHeuristics::ScoreFail;
1410+
return CheckSameEntryOrFail();
14041411

14051412
std::optional<int> Dist = getPointersDiff(
14061413
LI1->getType(), LI1->getPointerOperand(), LI2->getType(),
@@ -1412,7 +1419,7 @@ class BoUpSLP {
14121419
FixedVectorType::get(LI1->getType(), NumLanes),
14131420
LI1->getAlign()))
14141421
return LookAheadHeuristics::ScoreMaskedGatherCandidate;
1415-
return LookAheadHeuristics::ScoreFail;
1422+
return CheckSameEntryOrFail();
14161423
}
14171424
// The distance is too large - still may be profitable to use masked
14181425
// loads/gathers.
@@ -1469,14 +1476,14 @@ class BoUpSLP {
14691476
}
14701477
return LookAheadHeuristics::ScoreAltOpcodes;
14711478
}
1472-
return LookAheadHeuristics::ScoreFail;
1479+
return CheckSameEntryOrFail();
14731480
}
14741481

14751482
auto *I1 = dyn_cast<Instruction>(V1);
14761483
auto *I2 = dyn_cast<Instruction>(V2);
14771484
if (I1 && I2) {
14781485
if (I1->getParent() != I2->getParent())
1479-
return LookAheadHeuristics::ScoreFail;
1486+
return CheckSameEntryOrFail();
14801487
SmallVector<Value *, 4> Ops(MainAltOps.begin(), MainAltOps.end());
14811488
Ops.push_back(I1);
14821489
Ops.push_back(I2);
@@ -1497,7 +1504,7 @@ class BoUpSLP {
14971504
if (isa<UndefValue>(V2))
14981505
return LookAheadHeuristics::ScoreUndef;
14991506

1500-
return LookAheadHeuristics::ScoreFail;
1507+
return CheckSameEntryOrFail();
15011508
}
15021509

15031510
/// Go through the operands of \p LHS and \p RHS recursively until
@@ -1660,6 +1667,7 @@ class BoUpSLP {
16601667
const DataLayout &DL;
16611668
ScalarEvolution &SE;
16621669
const BoUpSLP &R;
1670+
const Loop *L = nullptr;
16631671

16641672
/// \returns the operand data at \p OpIdx and \p Lane.
16651673
OperandData &getData(unsigned OpIdx, unsigned Lane) {
@@ -1828,8 +1836,9 @@ class BoUpSLP {
18281836
// Track if the operand must be marked as used. If the operand is set to
18291837
// Score 1 explicitly (because of non power-of-2 unique scalars, we may
18301838
// want to reestimate the operands again on the following iterations).
1831-
bool IsUsed =
1832-
RMode == ReorderingMode::Splat || RMode == ReorderingMode::Constant;
1839+
bool IsUsed = RMode == ReorderingMode::Splat ||
1840+
RMode == ReorderingMode::Constant ||
1841+
RMode == ReorderingMode::Load;
18331842
// Iterate through all unused operands and look for the best.
18341843
for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
18351844
// Get the operand at Idx and Lane.
@@ -1850,23 +1859,44 @@ class BoUpSLP {
18501859
// Look for an operand that matches the current mode.
18511860
switch (RMode) {
18521861
case ReorderingMode::Load:
1853-
case ReorderingMode::Constant:
18541862
case ReorderingMode::Opcode: {
18551863
bool LeftToRight = Lane > LastLane;
18561864
Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
18571865
Value *OpRight = (LeftToRight) ? Op : OpLastLane;
18581866
int Score = getLookAheadScore(OpLeft, OpRight, MainAltOps, Lane,
18591867
OpIdx, Idx, IsUsed);
1860-
if (Score > static_cast<int>(BestOp.Score)) {
1868+
if (Score > static_cast<int>(BestOp.Score) ||
1869+
(Score > 0 && Score == static_cast<int>(BestOp.Score) &&
1870+
Idx == OpIdx)) {
18611871
BestOp.Idx = Idx;
18621872
BestOp.Score = Score;
18631873
BestScoresPerLanes[std::make_pair(OpIdx, Lane)] = Score;
18641874
}
18651875
break;
18661876
}
1877+
case ReorderingMode::Constant:
1878+
if (isa<Constant>(Op) ||
1879+
(!BestOp.Score && L && L->isLoopInvariant(Op))) {
1880+
BestOp.Idx = Idx;
1881+
if (isa<Constant>(Op)) {
1882+
BestOp.Score = LookAheadHeuristics::ScoreConstants;
1883+
BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1884+
LookAheadHeuristics::ScoreConstants;
1885+
}
1886+
if (isa<UndefValue>(Op) || !isa<Constant>(Op))
1887+
IsUsed = false;
1888+
}
1889+
break;
18671890
case ReorderingMode::Splat:
1868-
if (Op == OpLastLane)
1891+
if (Op == OpLastLane || (!BestOp.Score && isa<Constant>(Op))) {
1892+
IsUsed = Op == OpLastLane;
1893+
if (Op == OpLastLane) {
1894+
BestOp.Score = LookAheadHeuristics::ScoreSplat;
1895+
BestScoresPerLanes[std::make_pair(OpIdx, Lane)] =
1896+
LookAheadHeuristics::ScoreSplat;
1897+
}
18691898
BestOp.Idx = Idx;
1899+
}
18701900
break;
18711901
case ReorderingMode::Failed:
18721902
llvm_unreachable("Not expected Failed reordering mode.");
@@ -2059,10 +2089,12 @@ class BoUpSLP {
20592089
void clear() { OpsVec.clear(); }
20602090

20612091
/// \returns true if there are enough operands identical to \p Op to fill
2062-
/// the whole vector.
2092+
/// the whole vector (possibly mixed with constants or loop-invariant values).
20632093
/// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
20642094
bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
20652095
bool OpAPO = getData(OpIdx, Lane).APO;
2096+
bool IsInvariant = L && L->isLoopInvariant(Op);
2097+
unsigned Cnt = 0;
20662098
for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
20672099
if (Ln == Lane)
20682100
continue;
@@ -2072,22 +2104,51 @@ class BoUpSLP {
20722104
OperandData &Data = getData(OpI, Ln);
20732105
if (Data.APO != OpAPO || Data.IsUsed)
20742106
continue;
2075-
if (Data.V == Op) {
2107+
Value *OpILane = getValue(OpI, Lane);
2108+
bool IsConstantOp = isa<Constant>(OpILane);
2109+
// Consider the broadcast candidate if:
2110+
// 1. Same value is found in one of the operands.
2111+
if (Data.V == Op ||
2112+
// 2. The operand in the given lane is not constant but there is a
2113+
// constant operand in another lane (which can be moved to the
2114+
// given lane). In this case we can represent it as a simple
2115+
// permutation of constant and broadcast.
2116+
(!IsConstantOp &&
2117+
((Lns > 2 && isa<Constant>(Data.V)) ||
2118+
// 2.1. If we have only 2 lanes, need to check that value in the
2119+
// next lane does not build same opcode sequence.
2120+
(Lns == 2 &&
2121+
!getSameOpcode({Op, getValue((OpI + 1) % OpE, Ln)}, TLI)
2122+
.getOpcode() &&
2123+
isa<Constant>(Data.V)))) ||
2124+
// 3. The operand in the current lane is loop invariant (can be
2125+
// hoisted out) and another operand is also a loop invariant
2126+
// (though not a constant). In this case the whole vector can be
2127+
// hoisted out.
2128+
// FIXME: need to teach the cost model about this case for better
2129+
// estimation.
2130+
(IsInvariant && !isa<Constant>(Data.V) &&
2131+
!getSameOpcode({Op, Data.V}, TLI).getOpcode() &&
2132+
L->isLoopInvariant(Data.V))) {
20762133
FoundCandidate = true;
2077-
Data.IsUsed = true;
2134+
Data.IsUsed = Data.V == Op;
2135+
if (Data.V == Op)
2136+
++Cnt;
20782137
break;
20792138
}
20802139
}
20812140
if (!FoundCandidate)
20822141
return false;
20832142
}
2084-
return true;
2143+
return getNumLanes() == 2 || Cnt > 1;
20852144
}
20862145

20872146
public:
20882147
/// Initialize with all the operands of the instruction vector \p RootVL.
20892148
VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
2090-
: TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R) {
2149+
: TLI(*R.TLI), DL(*R.DL), SE(*R.SE), R(R),
2150+
L(R.LI->getLoopFor(
2151+
(cast<Instruction>(RootVL.front())->getParent()))) {
20912152
// Append all the operands of RootVL.
20922153
appendOperandsOfVL(RootVL);
20932154
}
@@ -2219,8 +2280,6 @@ class BoUpSLP {
22192280
// getBestOperand().
22202281
swap(OpIdx, *BestIdx, Lane);
22212282
} else {
2222-
// We failed to find a best operand, set mode to 'Failed'.
2223-
ReorderingModes[OpIdx] = ReorderingMode::Failed;
22242283
// Enable the second pass.
22252284
StrategyFailed = true;
22262285
}

0 commit comments

Comments
 (0)