Skip to content

Commit a0086ad

Browse files
committed
[SLP]Improve gathering of scalar elements.
1. Better sorting of scalars to be gathered: constants, arguments, and instructions defined outside the loop are inserted first, and only then the instructions that are inside the loop. This improves hoisting of invariant insertelement instructions. 2. Better detection of shuffle candidates in the gathering function. 3. The cost of insertelement for constants is 0. Part of D57059. Differential Revision: https://reviews.llvm.org/D103458
1 parent 0120e6c commit a0086ad

31 files changed

+596
-611
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 190 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -218,15 +218,18 @@ static bool allSameBlock(ArrayRef<Value *> VL) {
218218
return true;
219219
}
220220

221+
/// \returns True if the value is a constant (but not globals/constant
222+
/// expressions).
223+
static bool isConstant(Value *V) {
224+
return isa<Constant>(V) && !isa<ConstantExpr>(V) && !isa<GlobalValue>(V);
225+
}
226+
221227
/// \returns True if all of the values in \p VL are constants (but not
222228
/// globals/constant expressions).
223229
static bool allConstant(ArrayRef<Value *> VL) {
224230
// Constant expressions and globals can't be vectorized like normal integer/FP
225231
// constants.
226-
for (Value *i : VL)
227-
if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i))
228-
return false;
229-
return true;
232+
return all_of(VL, isConstant);
230233
}
231234

232235
/// \returns True if all of the values in \p VL are identical.
@@ -4725,6 +4728,8 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
47254728
// Iterate in reverse order to consider insert elements with the high cost.
47264729
for (unsigned I = VL.size(); I > 0; --I) {
47274730
unsigned Idx = I - 1;
4731+
if (isConstant(VL[Idx]))
4732+
continue;
47284733
if (!UniqueElements.insert(VL[Idx]).second)
47294734
ShuffledElements.insert(Idx);
47304735
}
@@ -4810,108 +4815,79 @@ void BoUpSLP::setInsertPointAfterBundle(const TreeEntry *E) {
48104815
}
48114816

48124817
Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
4813-
Value *Val0 =
4814-
isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
4815-
FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
4816-
Value *Vec = PoisonValue::get(VecTy);
4817-
unsigned InsIndex = 0;
4818-
for (Value *Val : VL) {
4819-
Vec = Builder.CreateInsertElement(Vec, Val, Builder.getInt32(InsIndex++));
4818+
// List of instructions/lanes from current block and/or the blocks which are
4819+
// part of the current loop. These instructions will be inserted at the end to
4820+
// make it possible to optimize loops and hoist invariant instructions out of
4821+
// the loops body with better chances for success.
4822+
SmallVector<std::pair<Value *, unsigned>, 4> PostponedInsts;
4823+
SmallSet<int, 4> PostponedIndices;
4824+
Loop *L = LI->getLoopFor(Builder.GetInsertBlock());
4825+
auto &&CheckPredecessor = [](BasicBlock *InstBB, BasicBlock *InsertBB) {
4826+
SmallPtrSet<BasicBlock *, 4> Visited;
4827+
while (InsertBB && InsertBB != InstBB && Visited.insert(InsertBB).second)
4828+
InsertBB = InsertBB->getSinglePredecessor();
4829+
return InsertBB && InsertBB == InstBB;
4830+
};
4831+
for (int I = 0, E = VL.size(); I < E; ++I) {
4832+
if (auto *Inst = dyn_cast<Instruction>(VL[I]))
4833+
if ((CheckPredecessor(Inst->getParent(), Builder.GetInsertBlock()) ||
4834+
getTreeEntry(Inst) || (L && (L->contains(Inst)))) &&
4835+
PostponedIndices.insert(I).second)
4836+
PostponedInsts.emplace_back(Inst, I);
4837+
}
4838+
4839+
auto &&CreateInsertElement = [this](Value *Vec, Value *V, unsigned Pos) {
4840+
// No need to insert undefs elements - exit.
4841+
if (isa<UndefValue>(V))
4842+
return Vec;
4843+
Vec = Builder.CreateInsertElement(Vec, V, Builder.getInt32(Pos));
48204844
auto *InsElt = dyn_cast<InsertElementInst>(Vec);
48214845
if (!InsElt)
4822-
continue;
4846+
return Vec;
48234847
GatherSeq.insert(InsElt);
48244848
CSEBlocks.insert(InsElt->getParent());
48254849
// Add to our 'need-to-extract' list.
4826-
if (TreeEntry *Entry = getTreeEntry(Val)) {
4850+
if (TreeEntry *Entry = getTreeEntry(V)) {
48274851
// Find which lane we need to extract.
4828-
int FoundLane =
4829-
findLaneForValue(Entry->Scalars, Entry->ReuseShuffleIndices, Val);
4830-
ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane));
4831-
}
4832-
}
4833-
4834-
return Vec;
4835-
}
4836-
4837-
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
4838-
InstructionsState S = getSameOpcode(VL);
4839-
if (S.getOpcode()) {
4840-
if (TreeEntry *E = getTreeEntry(S.OpValue)) {
4841-
if (E->isSame(VL)) {
4842-
Value *V = vectorizeTree(E);
4843-
if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
4844-
// Reshuffle to get only unique values.
4845-
// If some of the scalars are duplicated in the vectorization tree
4846-
// entry, we do not vectorize them but instead generate a mask for the
4847-
// reuses. But if there are several users of the same entry, they may
4848-
// have different vectorization factors. This is especially important
4849-
// for PHI nodes. In this case, we need to adapt the resulting
4850-
// instruction for the user vectorization factor and have to reshuffle
4851-
// it again to take only unique elements of the vector. Without this
4852-
// code the function incorrectly returns reduced vector instruction
4853-
// with the same elements, not with the unique ones.
4854-
// block:
4855-
// %phi = phi <2 x > { .., %entry} {%shuffle, %block}
4856-
// %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1>
4857-
// ... (use %2)
4858-
// %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2}
4859-
// br %block
4860-
SmallVector<int, 4> UniqueIdxs;
4861-
SmallSet<int, 4> UsedIdxs;
4862-
int Pos = 0;
4863-
for (int Idx : E->ReuseShuffleIndices) {
4864-
if (UsedIdxs.insert(Idx).second)
4865-
UniqueIdxs.emplace_back(Pos);
4866-
++Pos;
4867-
}
4868-
V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle");
4869-
}
4870-
return V;
4852+
unsigned FoundLane =
4853+
std::distance(Entry->Scalars.begin(), find(Entry->Scalars, V));
4854+
assert(FoundLane < Entry->Scalars.size() && "Couldn't find extract lane");
4855+
if (!Entry->ReuseShuffleIndices.empty()) {
4856+
FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(),
4857+
find(Entry->ReuseShuffleIndices, FoundLane));
48714858
}
4859+
ExternalUses.emplace_back(V, InsElt, FoundLane);
48724860
}
4861+
return Vec;
4862+
};
4863+
Value *Val0 =
4864+
isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
4865+
FixedVectorType *VecTy = FixedVectorType::get(Val0->getType(), VL.size());
4866+
Value *Vec = PoisonValue::get(VecTy);
4867+
for (int I = 0, E = VL.size(); I < E; ++I) {
4868+
if (PostponedIndices.contains(I))
4869+
continue;
4870+
Vec = CreateInsertElement(Vec, VL[I], I);
48734871
}
4872+
// Append instructions, which are/may be part of the loop, in the end to make
4873+
// it possible to hoist non-loop-based instructions.
4874+
for (const std::pair<Value *, unsigned> &Pair : PostponedInsts)
4875+
Vec = CreateInsertElement(Vec, Pair.first, Pair.second);
48744876

4875-
// Check that every instruction appears once in this bundle.
4876-
SmallVector<int, 4> ReuseShuffleIndicies;
4877-
SmallVector<Value *, 4> UniqueValues;
4878-
if (VL.size() > 2) {
4879-
DenseMap<Value *, unsigned> UniquePositions;
4880-
for (Value *V : VL) {
4881-
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
4882-
ReuseShuffleIndicies.emplace_back(Res.first->second);
4883-
if (Res.second || isa<Constant>(V))
4884-
UniqueValues.emplace_back(V);
4885-
}
4886-
// Do not shuffle single element or if number of unique values is not power
4887-
// of 2.
4888-
if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
4889-
!llvm::isPowerOf2_32(UniqueValues.size()))
4890-
ReuseShuffleIndicies.clear();
4891-
else
4892-
VL = UniqueValues;
4893-
}
4894-
4895-
Value *Vec = gather(VL);
4896-
if (!ReuseShuffleIndicies.empty()) {
4897-
Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle");
4898-
if (auto *I = dyn_cast<Instruction>(Vec)) {
4899-
GatherSeq.insert(I);
4900-
CSEBlocks.insert(I->getParent());
4901-
}
4902-
}
49034877
return Vec;
49044878
}
49054879

49064880
namespace {
49074881
/// Merges shuffle masks and emits final shuffle instruction, if required.
49084882
class ShuffleInstructionBuilder {
49094883
IRBuilderBase &Builder;
4884+
const unsigned VF = 0;
49104885
bool IsFinalized = false;
49114886
SmallVector<int, 4> Mask;
49124887

49134888
public:
4914-
ShuffleInstructionBuilder(IRBuilderBase &Builder) : Builder(Builder) {}
4889+
ShuffleInstructionBuilder(IRBuilderBase &Builder, unsigned VF)
4890+
: Builder(Builder), VF(VF) {}
49154891

49164892
/// Adds a mask, inverting it before applying.
49174893
void addInversedMask(ArrayRef<unsigned> SubMask) {
@@ -4938,8 +4914,9 @@ class ShuffleInstructionBuilder {
49384914
SmallVector<int, 4> NewMask(SubMask.size(), SubMask.size());
49394915
int TermValue = std::min(Mask.size(), SubMask.size());
49404916
for (int I = 0, E = SubMask.size(); I < E; ++I) {
4941-
if (SubMask[I] >= TermValue || Mask[SubMask[I]] >= TermValue) {
4942-
NewMask[I] = E;
4917+
if (SubMask[I] >= TermValue || SubMask[I] == UndefMaskElem ||
4918+
Mask[SubMask[I]] >= TermValue) {
4919+
NewMask[I] = UndefMaskElem;
49434920
continue;
49444921
}
49454922
NewMask[I] = Mask[SubMask[I]];
@@ -4949,7 +4926,14 @@ class ShuffleInstructionBuilder {
49494926

49504927
Value *finalize(Value *V) {
49514928
IsFinalized = true;
4952-
if (Mask.empty())
4929+
unsigned ValueVF = cast<FixedVectorType>(V->getType())->getNumElements();
4930+
if (VF == ValueVF && Mask.empty())
4931+
return V;
4932+
SmallVector<int, 4> NormalizedMask(VF, UndefMaskElem);
4933+
std::iota(NormalizedMask.begin(), NormalizedMask.end(), 0);
4934+
addMask(NormalizedMask);
4935+
4936+
if (VF == ValueVF && ShuffleVectorInst::isIdentityMask(Mask))
49534937
return V;
49544938
return Builder.CreateShuffleVector(V, Mask, "shuffle");
49554939
}
@@ -4961,6 +4945,120 @@ class ShuffleInstructionBuilder {
49614945
};
49624946
} // namespace
49634947

4948+
Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
4949+
unsigned VF = VL.size();
4950+
InstructionsState S = getSameOpcode(VL);
4951+
if (S.getOpcode()) {
4952+
if (TreeEntry *E = getTreeEntry(S.OpValue))
4953+
if (E->isSame(VL)) {
4954+
Value *V = vectorizeTree(E);
4955+
if (VF != cast<FixedVectorType>(V->getType())->getNumElements()) {
4956+
if (!E->ReuseShuffleIndices.empty()) {
4957+
// Reshuffle to get only unique values.
4958+
// If some of the scalars are duplicated in the vectorization tree
4959+
// entry, we do not vectorize them but instead generate a mask for
4960+
// the reuses. But if there are several users of the same entry,
4961+
// they may have different vectorization factors. This is especially
4962+
// important for PHI nodes. In this case, we need to adapt the
4963+
// resulting instruction for the user vectorization factor and have
4964+
// to reshuffle it again to take only unique elements of the vector.
4965+
// Without this code the function incorrectly returns reduced vector
4966+
// instruction with the same elements, not with the unique ones.
4967+
4968+
// block:
4969+
// %phi = phi <2 x > { .., %entry} {%shuffle, %block}
4970+
// %2 = shuffle <2 x > %phi, %poison, <4 x > <0, 0, 1, 1>
4971+
// ... (use %2)
4972+
// %shuffle = shuffle <2 x> %2, poison, <2 x> {0, 2}
4973+
// br %block
4974+
SmallVector<int> UniqueIdxs;
4975+
SmallSet<int, 4> UsedIdxs;
4976+
int Pos = 0;
4977+
int Sz = VL.size();
4978+
for (int Idx : E->ReuseShuffleIndices) {
4979+
if (Idx != Sz && UsedIdxs.insert(Idx).second)
4980+
UniqueIdxs.emplace_back(Pos);
4981+
++Pos;
4982+
}
4983+
assert(VF >= UsedIdxs.size() && "Expected vectorization factor "
4984+
"less than original vector size.");
4985+
UniqueIdxs.append(VF - UsedIdxs.size(), UndefMaskElem);
4986+
V = Builder.CreateShuffleVector(V, UniqueIdxs, "shrink.shuffle");
4987+
} else {
4988+
assert(VF < cast<FixedVectorType>(V->getType())->getNumElements() &&
4989+
"Expected vectorization factor less "
4990+
"than original vector size.");
4991+
SmallVector<int> UniformMask(VF, 0);
4992+
std::iota(UniformMask.begin(), UniformMask.end(), 0);
4993+
V = Builder.CreateShuffleVector(V, UniformMask, "shrink.shuffle");
4994+
}
4995+
}
4996+
return V;
4997+
}
4998+
}
4999+
5000+
// Check that every instruction appears once in this bundle.
5001+
SmallVector<int> ReuseShuffleIndicies;
5002+
SmallVector<Value *> UniqueValues;
5003+
if (VL.size() > 2) {
5004+
DenseMap<Value *, unsigned> UniquePositions;
5005+
unsigned NumValues =
5006+
std::distance(VL.begin(), find_if(reverse(VL), [](Value *V) {
5007+
return !isa<UndefValue>(V);
5008+
}).base());
5009+
VF = std::max<unsigned>(VF, PowerOf2Ceil(NumValues));
5010+
int UniqueVals = 0;
5011+
bool HasUndefs = false;
5012+
for (Value *V : VL.drop_back(VL.size() - VF)) {
5013+
if (isa<UndefValue>(V)) {
5014+
ReuseShuffleIndicies.emplace_back(UndefMaskElem);
5015+
HasUndefs = true;
5016+
continue;
5017+
}
5018+
if (isConstant(V)) {
5019+
ReuseShuffleIndicies.emplace_back(UniqueValues.size());
5020+
UniqueValues.emplace_back(V);
5021+
continue;
5022+
}
5023+
auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
5024+
ReuseShuffleIndicies.emplace_back(Res.first->second);
5025+
if (Res.second) {
5026+
UniqueValues.emplace_back(V);
5027+
++UniqueVals;
5028+
}
5029+
}
5030+
if (HasUndefs && UniqueVals == 1 && UniqueValues.size() == 1) {
5031+
// Emit pure splat vector.
5032+
// FIXME: why it is not identified as an identity.
5033+
unsigned NumUndefs = count(ReuseShuffleIndicies, UndefMaskElem);
5034+
if (NumUndefs == ReuseShuffleIndicies.size() - 1)
5035+
ReuseShuffleIndicies.append(VF - ReuseShuffleIndicies.size(),
5036+
UndefMaskElem);
5037+
else
5038+
ReuseShuffleIndicies.assign(VF, 0);
5039+
} else if (UniqueValues.size() >= VF - 1 || UniqueValues.size() <= 1) {
5040+
ReuseShuffleIndicies.clear();
5041+
UniqueValues.clear();
5042+
UniqueValues.append(VL.begin(), std::next(VL.begin(), NumValues));
5043+
}
5044+
UniqueValues.append(VF - UniqueValues.size(),
5045+
UndefValue::get(VL[0]->getType()));
5046+
VL = UniqueValues;
5047+
}
5048+
5049+
ShuffleInstructionBuilder ShuffleBuilder(Builder, VF);
5050+
Value *Vec = gather(VL);
5051+
if (!ReuseShuffleIndicies.empty()) {
5052+
ShuffleBuilder.addMask(ReuseShuffleIndicies);
5053+
Vec = ShuffleBuilder.finalize(Vec);
5054+
if (auto *I = dyn_cast<Instruction>(Vec)) {
5055+
GatherSeq.insert(I);
5056+
CSEBlocks.insert(I->getParent());
5057+
}
5058+
}
5059+
return Vec;
5060+
}
5061+
49645062
Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
49655063
IRBuilder<>::InsertPointGuard Guard(Builder);
49665064

@@ -4969,8 +5067,11 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
49695067
return E->VectorizedValue;
49705068
}
49715069

4972-
ShuffleInstructionBuilder ShuffleBuilder(Builder);
49735070
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
5071+
unsigned VF = E->Scalars.size();
5072+
if (NeedToShuffleReuses)
5073+
VF = E->ReuseShuffleIndices.size();
5074+
ShuffleInstructionBuilder ShuffleBuilder(Builder, VF);
49745075
if (E->State == TreeEntry::NeedToGather) {
49755076
setInsertPointAfterBundle(E);
49765077
Value *Vec;

llvm/test/Transforms/SLPVectorizer/AArch64/insertelement-inseltpoison.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ target triple = "aarch64-unknown-linux-gnu"
66

77
define <2 x float> @insertelement-fixed-vector() {
88
; CHECK-LABEL: @insertelement-fixed-vector(
9-
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
9+
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> poison)
1010
; CHECK-NEXT: ret <2 x float> [[TMP1]]
1111
;
1212
%f0 = tail call fast float @llvm.fabs.f32(float undef)

llvm/test/Transforms/SLPVectorizer/AArch64/insertelement.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ target triple = "aarch64-unknown-linux-gnu"
66

77
define <2 x float> @insertelement-fixed-vector() {
88
; CHECK-LABEL: @insertelement-fixed-vector(
9-
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> undef)
9+
; CHECK-NEXT: [[TMP1:%.*]] = call fast <2 x float> @llvm.fabs.v2f32(<2 x float> poison)
1010
; CHECK-NEXT: ret <2 x float> [[TMP1]]
1111
;
1212
%f0 = tail call fast float @llvm.fabs.f32(float undef)

llvm/test/Transforms/SLPVectorizer/AArch64/trunc-insertion.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,21 +8,21 @@ define dso_local void @l() local_unnamed_addr {
88
; CHECK-NEXT: bb:
99
; CHECK-NEXT: br label [[BB1:%.*]]
1010
; CHECK: bb1:
11-
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ undef, [[BB:%.*]] ], [ [[TMP11:%.*]], [[BB25:%.*]] ]
11+
; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ poison, [[BB:%.*]] ], [ [[TMP11:%.*]], [[BB25:%.*]] ]
1212
; CHECK-NEXT: br i1 undef, label [[BB3:%.*]], label [[BB11:%.*]]
1313
; CHECK: bb3:
1414
; CHECK-NEXT: [[I4:%.*]] = zext i1 undef to i32
15-
; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i16> [[TMP0]], undef
15+
; CHECK-NEXT: [[TMP1:%.*]] = xor <2 x i16> [[TMP0]], poison
1616
; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt <2 x i16> [[TMP1]], <i16 8, i16 8>
1717
; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i1> [[TMP2]] to <2 x i32>
1818
; CHECK-NEXT: br label [[BB25]]
1919
; CHECK: bb11:
2020
; CHECK-NEXT: [[I12:%.*]] = zext i1 undef to i32
21-
; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i16> [[TMP0]], undef
21+
; CHECK-NEXT: [[TMP4:%.*]] = xor <2 x i16> [[TMP0]], poison
2222
; CHECK-NEXT: [[TMP5:%.*]] = sext <2 x i16> [[TMP4]] to <2 x i64>
23-
; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <2 x i64> undef, [[TMP5]]
23+
; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <2 x i64> poison, [[TMP5]]
2424
; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i1> [[TMP6]] to <2 x i32>
25-
; CHECK-NEXT: [[TMP8:%.*]] = icmp ult <2 x i32> undef, [[TMP7]]
25+
; CHECK-NEXT: [[TMP8:%.*]] = icmp ult <2 x i32> poison, [[TMP7]]
2626
; CHECK-NEXT: [[TMP9:%.*]] = zext <2 x i1> [[TMP8]] to <2 x i32>
2727
; CHECK-NEXT: br label [[BB25]]
2828
; CHECK: bb25:

0 commit comments

Comments
 (0)