Skip to content

Commit 3851186

Browse files
committed
[SLP]Remove operands upon marking instruction for deletion.
If an instruction is marked for deletion, it is better to drop all of its operands and mark them for deletion too (if allowed). This exposes more vectorizable patterns and generates fewer useless extractelement instructions. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #97409
1 parent 9dca3ac commit 3851186

23 files changed

+242
-94
lines changed

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

Lines changed: 187 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1186,6 +1186,12 @@ class BoUpSLP {
11861186
return VectorizableTree.front()->Scalars;
11871187
}
11881188

1189+
/// Checks if the root graph node can be emitted with narrower bitwidth at
1190+
/// codegen and returns its signedness, if so.
1191+
bool isSignedMinBitwidthRootNode() const {
1192+
return MinBWs.at(VectorizableTree.front().get()).second;
1193+
}
1194+
11891195
/// Builds external uses of the vectorized scalars, i.e. the list of
11901196
/// vectorized scalars to be extracted, their lanes and their scalar users. \p
11911197
/// ExternallyUsedValues contains additional list of external uses to handle
@@ -2453,6 +2459,90 @@ class BoUpSLP {
24532459
DeletedInstructions.insert(I);
24542460
}
24552461

2462+
/// Remove instructions from the parent function and clear the operands of \p
2463+
/// DeadVals instructions, marking for deletion trivially dead operands.
2464+
template <typename T>
2465+
void removeInstructionsAndOperands(ArrayRef<T *> DeadVals) {
2466+
SmallVector<WeakTrackingVH> DeadInsts;
2467+
for (T *V : DeadVals) {
2468+
auto *I = cast<Instruction>(V);
2469+
DeletedInstructions.insert(I);
2470+
}
2471+
for (T *V : DeadVals) {
2472+
if (!V)
2473+
continue;
2474+
auto *I = cast<Instruction>(V);
2475+
salvageDebugInfo(*I);
2476+
SmallVector<const TreeEntry *> Entries;
2477+
if (const TreeEntry *Entry = getTreeEntry(I)) {
2478+
Entries.push_back(Entry);
2479+
auto It = MultiNodeScalars.find(I);
2480+
if (It != MultiNodeScalars.end())
2481+
Entries.append(It->second.begin(), It->second.end());
2482+
}
2483+
for (Use &U : I->operands()) {
2484+
if (auto *OpI = dyn_cast_if_present<Instruction>(U.get());
2485+
OpI && !DeletedInstructions.contains(OpI) && OpI->hasOneUser() &&
2486+
wouldInstructionBeTriviallyDead(OpI, TLI) &&
2487+
(Entries.empty() || none_of(Entries, [&](const TreeEntry *Entry) {
2488+
return Entry->VectorizedValue == OpI;
2489+
})))
2490+
DeadInsts.push_back(OpI);
2491+
}
2492+
I->dropAllReferences();
2493+
}
2494+
for (T *V : DeadVals) {
2495+
auto *I = cast<Instruction>(V);
2496+
if (!I->getParent())
2497+
continue;
2498+
assert((I->use_empty() || all_of(I->uses(),
2499+
[&](Use &U) {
2500+
return isDeleted(
2501+
cast<Instruction>(U.getUser()));
2502+
})) &&
2503+
"trying to erase instruction with users.");
2504+
I->removeFromParent();
2505+
SE->forgetValue(I);
2506+
}
2507+
// Process the dead instruction list until empty.
2508+
while (!DeadInsts.empty()) {
2509+
Value *V = DeadInsts.pop_back_val();
2510+
Instruction *VI = cast_or_null<Instruction>(V);
2511+
if (!VI || !VI->getParent())
2512+
continue;
2513+
assert(isInstructionTriviallyDead(VI, TLI) &&
2514+
"Live instruction found in dead worklist!");
2515+
assert(VI->use_empty() && "Instructions with uses are not dead.");
2516+
2517+
// Don't lose the debug info while deleting the instructions.
2518+
salvageDebugInfo(*VI);
2519+
2520+
// Null out all of the instruction's operands to see if any operand
2521+
// becomes dead as we go.
2522+
for (Use &OpU : VI->operands()) {
2523+
Value *OpV = OpU.get();
2524+
if (!OpV)
2525+
continue;
2526+
OpU.set(nullptr);
2527+
2528+
if (!OpV->use_empty())
2529+
continue;
2530+
2531+
// If the operand is an instruction that became dead as we nulled out
2532+
// the operand, and if it is 'trivially' dead, delete it in a future
2533+
// loop iteration.
2534+
if (auto *OpI = dyn_cast<Instruction>(OpV))
2535+
if (!DeletedInstructions.contains(OpI) &&
2536+
isInstructionTriviallyDead(OpI, TLI))
2537+
DeadInsts.push_back(OpI);
2538+
}
2539+
2540+
VI->removeFromParent();
2541+
DeletedInstructions.insert(VI);
2542+
SE->forgetValue(VI);
2543+
}
2544+
}
2545+
24562546
/// Checks if the instruction was already analyzed for being possible
24572547
/// reduction root.
24582548
bool isAnalyzedReductionRoot(Instruction *I) const {
@@ -3987,6 +4077,10 @@ template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
39874077
BoUpSLP::~BoUpSLP() {
39884078
SmallVector<WeakTrackingVH> DeadInsts;
39894079
for (auto *I : DeletedInstructions) {
4080+
if (!I->getParent()) {
4081+
I->insertBefore(F->getEntryBlock().getTerminator());
4082+
continue;
4083+
}
39904084
for (Use &U : I->operands()) {
39914085
auto *Op = dyn_cast<Instruction>(U.get());
39924086
if (Op && !DeletedInstructions.count(Op) && Op->hasOneUser() &&
@@ -14075,11 +14169,8 @@ Value *BoUpSLP::vectorizeTree(
1407514169
}
1407614170
#endif
1407714171
LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
14078-
eraseInstruction(cast<Instruction>(Scalar));
14079-
// Retain to-be-deleted instructions for some debug-info
14080-
// bookkeeping. NOTE: eraseInstruction only marks the instruction for
14081-
// deletion - instructions are not deleted until later.
14082-
RemovedInsts.push_back(cast<Instruction>(Scalar));
14172+
auto *I = cast<Instruction>(Scalar);
14173+
RemovedInsts.push_back(I);
1408314174
}
1408414175
}
1408514176

@@ -14088,6 +14179,22 @@ Value *BoUpSLP::vectorizeTree(
1408814179
if (auto *V = dyn_cast<Instruction>(VectorizableTree[0]->VectorizedValue))
1408914180
V->mergeDIAssignID(RemovedInsts);
1409014181

14182+
// Clear up reduction references, if any.
14183+
if (UserIgnoreList) {
14184+
for (Instruction *I : RemovedInsts) {
14185+
if (getTreeEntry(I)->Idx != 0)
14186+
continue;
14187+
I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) {
14188+
return UserIgnoreList->contains(U.getUser());
14189+
});
14190+
}
14191+
}
14192+
// Retain to-be-deleted instructions for some debug-info bookkeeping and alias
14193+
// cache correctness.
14194+
// NOTE: removeInstructionsAndOperands only marks the instruction for deletion
14195+
// - instructions are not deleted until later.
14196+
removeInstructionsAndOperands(ArrayRef(RemovedInsts));
14197+
1409114198
Builder.ClearInsertionPoint();
1409214199
InstrElementSize.clear();
1409314200

@@ -16137,15 +16244,18 @@ bool SLPVectorizerPass::vectorizeStores(
1613716244
Res.first = Idx;
1613816245
Res.second.emplace(Idx, 0);
1613916246
};
16140-
StoreInst *PrevStore = Stores.front();
16247+
Type *PrevValTy = nullptr;
1614116248
for (auto [I, SI] : enumerate(Stores)) {
16249+
if (R.isDeleted(SI))
16250+
continue;
16251+
if (!PrevValTy)
16252+
PrevValTy = SI->getValueOperand()->getType();
1614216253
// Check that we do not try to vectorize stores of different types.
16143-
if (PrevStore->getValueOperand()->getType() !=
16144-
SI->getValueOperand()->getType()) {
16254+
if (PrevValTy != SI->getValueOperand()->getType()) {
1614516255
for (auto &Set : SortedStores)
1614616256
TryToVectorize(Set.second);
1614716257
SortedStores.clear();
16148-
PrevStore = SI;
16258+
PrevValTy = SI->getValueOperand()->getType();
1614916259
}
1615016260
FillStoresSet(I, SI);
1615116261
}
@@ -17028,9 +17138,12 @@ class HorizontalReduction {
1702817138
Value *VectorizedTree = nullptr;
1702917139
bool CheckForReusedReductionOps = false;
1703017140
// Try to vectorize elements based on their type.
17141+
SmallVector<InstructionsState> States;
17142+
for (ArrayRef<Value *> RV : ReducedVals)
17143+
States.push_back(getSameOpcode(RV, TLI));
1703117144
for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
1703217145
ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
17033-
InstructionsState S = getSameOpcode(OrigReducedVals, TLI);
17146+
InstructionsState S = States[I];
1703417147
SmallVector<Value *> Candidates;
1703517148
Candidates.reserve(2 * OrigReducedVals.size());
1703617149
DenseMap<Value *, Value *> TrackedToOrig(2 * OrigReducedVals.size());
@@ -17355,14 +17468,11 @@ class HorizontalReduction {
1735517468
Value *ReducedSubTree =
1735617469
emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
1735717470
if (ReducedSubTree->getType() != VL.front()->getType()) {
17358-
ReducedSubTree = Builder.CreateIntCast(
17359-
ReducedSubTree, VL.front()->getType(), any_of(VL, [&](Value *R) {
17360-
KnownBits Known = computeKnownBits(
17361-
R, cast<Instruction>(ReductionOps.front().front())
17362-
->getModule()
17363-
->getDataLayout());
17364-
return !Known.isNonNegative();
17365-
}));
17471+
assert(ReducedSubTree->getType() != VL.front()->getType() &&
17472+
"Expected different reduction type.");
17473+
ReducedSubTree =
17474+
Builder.CreateIntCast(ReducedSubTree, VL.front()->getType(),
17475+
V.isSignedMinBitwidthRootNode());
1736617476
}
1736717477

1736817478
// Improved analysis for add/fadd/xor reductions with same scale factor
@@ -17524,11 +17634,11 @@ class HorizontalReduction {
1752417634
}
1752517635
#endif
1752617636
if (!Ignore->use_empty()) {
17527-
Value *Undef = UndefValue::get(Ignore->getType());
17528-
Ignore->replaceAllUsesWith(Undef);
17637+
Value *P = PoisonValue::get(Ignore->getType());
17638+
Ignore->replaceAllUsesWith(P);
1752917639
}
17530-
V.eraseInstruction(cast<Instruction>(Ignore));
1753117640
}
17641+
V.removeInstructionsAndOperands(RdxOps);
1753217642
}
1753317643
} else if (!CheckForReusedReductionOps) {
1753417644
for (ReductionOpsType &RdxOps : ReductionOps)
@@ -18076,6 +18186,8 @@ bool SLPVectorizerPass::vectorizeHorReduction(
1807618186
Stack.emplace(I, Level);
1807718187
continue;
1807818188
}
18189+
if (R.isDeleted(Inst))
18190+
continue;
1807918191
} else {
1808018192
// We could not vectorize `Inst` so try to use it as a future seed.
1808118193
if (!TryAppendToPostponedInsts(Inst)) {
@@ -18161,15 +18273,28 @@ static bool tryToVectorizeSequence(
1816118273

1816218274
// Try to vectorize elements based on their type.
1816318275
SmallVector<T *> Candidates;
18164-
for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;) {
18276+
SmallVector<T *> VL;
18277+
for (auto *IncIt = Incoming.begin(), *E = Incoming.end(); IncIt != E;
18278+
VL.clear()) {
1816518279
// Look for the next elements with the same type, parent and operand
1816618280
// kinds.
18281+
auto *I = dyn_cast<Instruction>(*IncIt);
18282+
if (!I || R.isDeleted(I)) {
18283+
++IncIt;
18284+
continue;
18285+
}
1816718286
auto *SameTypeIt = IncIt;
18168-
while (SameTypeIt != E && AreCompatible(*SameTypeIt, *IncIt))
18287+
while (SameTypeIt != E && (!isa<Instruction>(*SameTypeIt) ||
18288+
R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18289+
AreCompatible(*SameTypeIt, *IncIt))) {
18290+
auto *I = dyn_cast<Instruction>(*SameTypeIt);
1816918291
++SameTypeIt;
18292+
if (I && !R.isDeleted(I))
18293+
VL.push_back(cast<T>(I));
18294+
}
1817018295

1817118296
// Try to vectorize them.
18172-
unsigned NumElts = (SameTypeIt - IncIt);
18297+
unsigned NumElts = VL.size();
1817318298
LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at nodes ("
1817418299
<< NumElts << ")\n");
1817518300
// The vectorization is a 3-state attempt:
@@ -18181,10 +18306,15 @@ static bool tryToVectorizeSequence(
1818118306
// 3. Final attempt to try to vectorize all instructions with the
1818218307
// same/alternate ops only, this may result in some extra final
1818318308
// vectorization.
18184-
if (NumElts > 1 &&
18185-
TryToVectorizeHelper(ArrayRef(IncIt, NumElts), MaxVFOnly)) {
18309+
if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL), MaxVFOnly)) {
1818618310
// Success start over because instructions might have been changed.
1818718311
Changed = true;
18312+
VL.swap(Candidates);
18313+
Candidates.clear();
18314+
for (T *V : VL) {
18315+
if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18316+
Candidates.push_back(V);
18317+
}
1818818318
} else {
1818918319
/// \Returns the minimum number of elements that we will attempt to
1819018320
/// vectorize.
@@ -18195,7 +18325,10 @@ static bool tryToVectorizeSequence(
1819518325
if (NumElts < GetMinNumElements(*IncIt) &&
1819618326
(Candidates.empty() ||
1819718327
Candidates.front()->getType() == (*IncIt)->getType())) {
18198-
Candidates.append(IncIt, std::next(IncIt, NumElts));
18328+
for (T *V : VL) {
18329+
if (auto *I = dyn_cast<Instruction>(V); I && !R.isDeleted(I))
18330+
Candidates.push_back(V);
18331+
}
1819918332
}
1820018333
}
1820118334
// Final attempt to vectorize instructions with the same types.
@@ -18206,13 +18339,26 @@ static bool tryToVectorizeSequence(
1820618339
Changed = true;
1820718340
} else if (MaxVFOnly) {
1820818341
// Try to vectorize using small vectors.
18209-
for (auto *It = Candidates.begin(), *End = Candidates.end();
18210-
It != End;) {
18342+
SmallVector<T *> VL;
18343+
for (auto *It = Candidates.begin(), *End = Candidates.end(); It != End;
18344+
VL.clear()) {
18345+
auto *I = dyn_cast<Instruction>(*It);
18346+
if (!I || R.isDeleted(I)) {
18347+
++It;
18348+
continue;
18349+
}
1821118350
auto *SameTypeIt = It;
18212-
while (SameTypeIt != End && AreCompatible(*SameTypeIt, *It))
18351+
while (SameTypeIt != End &&
18352+
(!isa<Instruction>(*SameTypeIt) ||
18353+
R.isDeleted(cast<Instruction>(*SameTypeIt)) ||
18354+
AreCompatible(*SameTypeIt, *It))) {
18355+
auto *I = dyn_cast<Instruction>(*SameTypeIt);
1821318356
++SameTypeIt;
18214-
unsigned NumElts = (SameTypeIt - It);
18215-
if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(It, NumElts),
18357+
if (I && !R.isDeleted(I))
18358+
VL.push_back(cast<T>(I));
18359+
}
18360+
unsigned NumElts = VL.size();
18361+
if (NumElts > 1 && TryToVectorizeHelper(ArrayRef(VL),
1821618362
/*MaxVFOnly=*/false))
1821718363
Changed = true;
1821818364
It = SameTypeIt;
@@ -18486,7 +18632,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
1848618632
}
1848718633
return false;
1848818634
};
18489-
auto AreCompatiblePHIs = [&PHIToOpcodes, this](Value *V1, Value *V2) {
18635+
auto AreCompatiblePHIs = [&PHIToOpcodes, this, &R](Value *V1, Value *V2) {
1849018636
if (V1 == V2)
1849118637
return true;
1849218638
if (V1->getType() != V2->getType())
@@ -18501,6 +18647,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
1850118647
continue;
1850218648
if (auto *I1 = dyn_cast<Instruction>(Opcodes1[I]))
1850318649
if (auto *I2 = dyn_cast<Instruction>(Opcodes2[I])) {
18650+
if (R.isDeleted(I1) || R.isDeleted(I2))
18651+
return false;
1850418652
if (I1->getParent() != I2->getParent())
1850518653
return false;
1850618654
InstructionsState S = getSameOpcode({I1, I2}, *TLI);
@@ -18721,8 +18869,13 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
1872118869
// are trying to vectorize the index computations, so the maximum number of
1872218870
// elements is based on the size of the index expression, rather than the
1872318871
// size of the GEP itself (the target's pointer size).
18872+
auto *It = find_if(Entry.second, [&](GetElementPtrInst *GEP) {
18873+
return !R.isDeleted(GEP);
18874+
});
18875+
if (It == Entry.second.end())
18876+
continue;
1872418877
unsigned MaxVecRegSize = R.getMaxVecRegSize();
18725-
unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
18878+
unsigned EltSize = R.getVectorElementSize(*(*It)->idx_begin());
1872618879
if (MaxVecRegSize < EltSize)
1872718880
continue;
1872818881

llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@
2424
;; Test that dbg.assigns linked to the scalar stores to quad get linked to
2525
;; the vector store that replaces them.
2626

27-
; CHECK: #dbg_assign(float undef, ![[VAR:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), ![[ID:[0-9]+]], ptr %arrayidx, !DIExpression(),
28-
; CHECK: #dbg_assign(float undef, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 4),
29-
; CHECK: #dbg_assign(float undef, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 64, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 8),
27+
; CHECK: #dbg_assign(float poison, ![[VAR:[0-9]+]], !DIExpression(DW_OP_LLVM_fragment, 0, 32), ![[ID:[0-9]+]], ptr %arrayidx, !DIExpression(),
28+
; CHECK: #dbg_assign(float poison, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 32, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 4),
29+
; CHECK: #dbg_assign(float poison, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 64, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 8),
3030
; CHECK: store <4 x float> {{.*}} !DIAssignID ![[ID]]
31-
; CHECK: #dbg_assign(float undef, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 96, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 12),
31+
; CHECK: #dbg_assign(float poison, ![[VAR]], !DIExpression(DW_OP_LLVM_fragment, 96, 32), ![[ID]], ptr %quad, !DIExpression(DW_OP_plus_uconst, 12),
3232

3333
target triple = "x86_64-unknown-unknown"
3434

llvm/test/Transforms/SLPVectorizer/AArch64/spillcost-di.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,8 @@ define void @patatino(i64 %n, i64 %i, ptr %p) !dbg !7 {
1313
; CHECK-NEXT: #dbg_value(i64 [[I:%.*]], [[META19:![0-9]+]], !DIExpression(), [[META24:![0-9]+]])
1414
; CHECK-NEXT: #dbg_value(ptr [[P:%.*]], [[META20:![0-9]+]], !DIExpression(), [[META25:![0-9]+]])
1515
; CHECK-NEXT: [[X1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P]], i64 [[N]], i32 0, !dbg [[DBG26:![0-9]+]]
16-
; CHECK-NEXT: #dbg_value(i64 undef, [[META21:![0-9]+]], !DIExpression(), [[META27:![0-9]+]])
17-
; CHECK-NEXT: #dbg_value(i64 undef, [[META22:![0-9]+]], !DIExpression(), [[META28:![0-9]+]])
16+
; CHECK-NEXT: #dbg_value(i64 poison, [[META21:![0-9]+]], !DIExpression(), [[META27:![0-9]+]])
17+
; CHECK-NEXT: #dbg_value(i64 poison, [[META22:![0-9]+]], !DIExpression(), [[META28:![0-9]+]])
1818
; CHECK-NEXT: [[X5:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 [[I]], i32 0, !dbg [[DBG29:![0-9]+]]
1919
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X1]], align 8, !dbg [[DBG26]], !tbaa [[TBAA30:![0-9]+]]
2020
; CHECK-NEXT: store <2 x i64> [[TMP0]], ptr [[X5]], align 8, !dbg [[DBG34:![0-9]+]], !tbaa [[TBAA30]]

0 commit comments

Comments
 (0)