Skip to content

Commit a4e171c

Browse files
author
git apple-llvm automerger
committed
Merge commit '0ab67ec19167' from llvm.org/main into next
2 parents 817a304 + 0ab67ec commit a4e171c

File tree

8 files changed

+676
-0
lines changed

8 files changed

+676
-0
lines changed
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
//===------ EVLIndVarSimplify.h - Optimize vectorized loops w/ EVL IV------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This pass optimizes a vectorized loop with canonical IV to use an EVL-based
10+
// IV if it was tail-folded by predicated EVL.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#ifndef LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H
15+
#define LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H
16+
17+
#include "llvm/Analysis/LoopAnalysisManager.h"
18+
#include "llvm/IR/PassManager.h"
19+
20+
namespace llvm {
21+
class Loop;
22+
class LPMUpdater;
23+
24+
/// Turn vectorized loops with canonical induction variables into loops that
25+
/// only use a single EVL-based induction variable.
26+
struct EVLIndVarSimplifyPass : public PassInfoMixin<EVLIndVarSimplifyPass> {
27+
/// Run the simplification on loop \p L using the standard loop analyses in
/// \p AR. Returns the set of analyses preserved by the run.
PreservedAnalyses run(Loop &L, LoopAnalysisManager &LAM,
28+
LoopStandardAnalysisResults &AR, LPMUpdater &U);
29+
};
30+
} // namespace llvm
31+
#endif

llvm/lib/Passes/PassBuilder.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,7 @@
373373
#include "llvm/Transforms/Utils/SymbolRewriter.h"
374374
#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
375375
#include "llvm/Transforms/Utils/UnifyLoopExits.h"
376+
#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
376377
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
377378
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
378379
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

llvm/lib/Passes/PassBuilderPipelines.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@
143143
#include "llvm/Transforms/Utils/NameAnonGlobals.h"
144144
#include "llvm/Transforms/Utils/RelLookupTableConverter.h"
145145
#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
146+
#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
146147
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
147148
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
148149
#include "llvm/Transforms/Vectorize/VectorCombine.h"

llvm/lib/Passes/PassRegistry.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -677,6 +677,7 @@ LOOP_ANALYSIS("should-run-extra-simple-loop-unswitch",
677677
#endif
678678
LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass())
679679
LOOP_PASS("dot-ddg", DDGDotPrinterPass())
680+
LOOP_PASS("evl-iv-simplify", EVLIndVarSimplifyPass())
680681
LOOP_PASS("guard-widening", GuardWideningPass())
681682
LOOP_PASS("extra-simple-loop-unswitch-passes",
682683
ExtraLoopPassManager<ShouldRunExtraSimpleLoopUnswitch>())

llvm/lib/Target/RISCV/RISCVTargetMachine.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include "llvm/Target/TargetOptions.h"
3838
#include "llvm/Transforms/IPO.h"
3939
#include "llvm/Transforms/Scalar.h"
40+
#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
4041
#include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h"
4142
#include <optional>
4243
using namespace llvm;
@@ -645,6 +646,12 @@ void RISCVTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
645646
OptimizationLevel Level) {
646647
LPM.addPass(LoopIdiomVectorizePass(LoopIdiomVectorizeStyle::Predicated));
647648
});
649+
650+
PB.registerVectorizerEndEPCallback(
651+
[](FunctionPassManager &FPM, OptimizationLevel Level) {
652+
if (Level.isOptimizingForSpeed())
653+
FPM.addPass(createFunctionToLoopPassAdaptor(EVLIndVarSimplifyPass()));
654+
});
648655
}
649656

650657
yaml::MachineFunctionInfo *

llvm/lib/Transforms/Vectorize/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
add_llvm_component_library(LLVMVectorize
2+
EVLIndVarSimplify.cpp
23
LoadStoreVectorizer.cpp
34
LoopIdiomVectorize.cpp
45
LoopVectorizationLegality.cpp
Lines changed: 301 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,301 @@
1+
//===---- EVLIndVarSimplify.cpp - Optimize vectorized loops w/ EVL IV------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This pass optimizes a vectorized loop with canonical IV to use an EVL-based
10+
// IV if it was tail-folded by predicated EVL.
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <string>
32+
33+
#define DEBUG_TYPE "evl-iv-simplify"
34+
35+
using namespace llvm;
36+
37+
STATISTIC(NumEliminatedCanonicalIV, "Number of canonical IVs we eliminated");
38+
39+
// Hidden command-line switch for this pass. It defaults to true; when it is
// set to false, EVLIndVarSimplifyImpl::run returns early without touching the
// loop.
static cl::opt<bool> EnableEVLIndVarSimplify(
40+
"enable-evl-indvar-simplify",
41+
cl::desc("Enable EVL-based induction variable simplify Pass"), cl::Hidden,
42+
cl::init(true));
43+
44+
namespace {
45+
struct EVLIndVarSimplifyImpl {
46+
ScalarEvolution &SE;
47+
OptimizationRemarkEmitter *ORE = nullptr;
48+
49+
EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR,
50+
OptimizationRemarkEmitter *ORE)
51+
: SE(LAR.SE), ORE(ORE) {}
52+
53+
/// Returns true if modify the loop.
54+
bool run(Loop &L);
55+
};
56+
} // anonymous namespace
57+
58+
/// Returns the constant part of vectorization factor from the induction
59+
/// variable's step value SCEV expression.
60+
static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) {
61+
if (!Step)
62+
return 0U;
63+
64+
// Looking for loops with IV step value in the form of `(<constant VF> x
65+
// vscale)`.
66+
if (const auto *Mul = dyn_cast<SCEVMulExpr>(Step)) {
67+
if (Mul->getNumOperands() == 2) {
68+
const SCEV *LHS = Mul->getOperand(0);
69+
const SCEV *RHS = Mul->getOperand(1);
70+
if (const auto *Const = dyn_cast<SCEVConstant>(LHS);
71+
Const && isa<SCEVVScale>(RHS)) {
72+
uint64_t V = Const->getAPInt().getLimitedValue();
73+
if (llvm::isUInt<32>(V))
74+
return V;
75+
}
76+
}
77+
}
78+
79+
// If not, see if the vscale_range of the parent function is a fixed value,
80+
// which makes the step value to be replaced by a constant.
81+
if (F.hasFnAttribute(Attribute::VScaleRange))
82+
if (const auto *ConstStep = dyn_cast<SCEVConstant>(Step)) {
83+
APInt V = ConstStep->getAPInt().abs();
84+
ConstantRange CR = llvm::getVScaleRange(&F, 64);
85+
if (const APInt *Fixed = CR.getSingleElement()) {
86+
V = V.zextOrTrunc(Fixed->getBitWidth());
87+
uint64_t VF = V.udiv(*Fixed).getLimitedValue();
88+
if (VF && llvm::isUInt<32>(VF) &&
89+
// Make sure step is divisible by vscale.
90+
V.urem(*Fixed).isZero())
91+
return VF;
92+
}
93+
}
94+
95+
return 0U;
96+
}
97+
98+
/// Rewrite the latch condition of \p L to compare the EVL-based induction
/// variable against the trip count, then delete the canonical induction
/// variable if that leaves it dead.
///
/// The loop is only considered when the pass is enabled, the loop carries the
/// `llvm.loop.isvectorized` attribute, and its
/// `llvm.loop.isvectorized.tailfoldingstyle` metadata equals "evl". Missed
/// cases are reported through ORE when an emitter is available.
///
/// \returns true if the loop was modified.
bool EVLIndVarSimplifyImpl::run(Loop &L) {
  if (!EnableEVLIndVarSimplify)
    return false;

  if (!getBooleanLoopAttribute(&L, "llvm.loop.isvectorized"))
    return false;
  const MDOperand *EVLMD =
      findStringMetadataForLoop(&L, "llvm.loop.isvectorized.tailfoldingstyle")
          .value_or(nullptr);
  if (!EVLMD || !EVLMD->equalsStr("evl"))
    return false;

  BasicBlock *LatchBlock = L.getLoopLatch();
  ICmpInst *OrigLatchCmp = L.getLatchCmpInst();
  if (!LatchBlock || !OrigLatchCmp)
    return false;

  InductionDescriptor IVD;
  PHINode *IndVar = L.getInductionVariable(SE);
  if (!IndVar || !L.getInductionDescriptor(SE, IVD)) {
    const char *Reason = (IndVar ? "induction descriptor is not available"
                                 : "cannot recognize induction variable");
    LLVM_DEBUG(dbgs() << "Cannot retrieve IV from loop " << L.getName()
                      << " because " << Reason << "\n");
    if (ORE) {
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar",
                                        L.getStartLoc(), L.getHeader())
               << "Cannot retrieve IV because " << ore::NV("Reason", Reason);
      });
    }
    return false;
  }

  BasicBlock *InitBlock, *BackEdgeBlock;
  if (!L.getIncomingAndBackEdge(InitBlock, BackEdgeBlock)) {
    LLVM_DEBUG(dbgs() << "Expect unique incoming and backedge in "
                      << L.getName() << "\n");
    if (ORE) {
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure",
                                        L.getStartLoc(), L.getHeader())
               << "Does not have a unique incoming and backedge";
      });
    }
    return false;
  }

  // Retrieve the loop bounds.
  std::optional<Loop::LoopBounds> Bounds = L.getBounds(SE);
  if (!Bounds) {
    LLVM_DEBUG(dbgs() << "Could not obtain the bounds for loop " << L.getName()
                      << "\n");
    if (ORE) {
      ORE->emit([&]() {
        return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure",
                                        L.getStartLoc(), L.getHeader())
               << "Could not obtain the loop bounds";
      });
    }
    return false;
  }
  Value *CanonicalIVInit = &Bounds->getInitialIVValue();
  Value *CanonicalIVFinal = &Bounds->getFinalIVValue();

  const SCEV *StepV = IVD.getStep();
  uint32_t VF = getVFFromIndVar(StepV, *L.getHeader()->getParent());
  if (!VF) {
    LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV
                      << "'\n");
    if (ORE) {
      ORE->emit([&]() {
        // Render the step expression to text ourselves so that the remark
        // carries the expression rather than a converted pointer value.
        std::string StepStr;
        raw_string_ostream StepOS(StepStr);
        StepOS << *StepV;
        StepOS.flush();
        return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar",
                                        L.getStartLoc(), L.getHeader())
               << "Could not infer VF from IndVar step "
               << ore::NV("Step", StepStr);
      });
    }
    return false;
  }
  LLVM_DEBUG(dbgs() << "Using VF=" << VF << " for loop " << L.getName()
                    << "\n");

  // Try to find the EVL-based induction variable.
  using namespace PatternMatch;
  BasicBlock *BB = IndVar->getParent();

  Value *EVLIndVar = nullptr;
  Value *RemTC = nullptr;
  Value *TC = nullptr;
  auto IntrinsicMatch = m_Intrinsic<Intrinsic::experimental_get_vector_length>(
      m_Value(RemTC), m_SpecificInt(VF),
      /*Scalable=*/m_SpecificInt(1));
  for (PHINode &PN : BB->phis()) {
    if (&PN == IndVar)
      continue;

    // Check 1: it has to contain both incoming (init) & backedge blocks
    // from IndVar.
    if (PN.getBasicBlockIndex(InitBlock) < 0 ||
        PN.getBasicBlockIndex(BackEdgeBlock) < 0)
      continue;
    // Check 2: EVL index is always increasing, thus its initial value has to
    // be equal to either the initial IV value (when the canonical IV is also
    // increasing) or the last IV value (when canonical IV is decreasing).
    Value *Init = PN.getIncomingValueForBlock(InitBlock);
    using Direction = Loop::LoopBounds::Direction;
    switch (Bounds->getDirection()) {
    case Direction::Increasing:
      if (Init != CanonicalIVInit)
        continue;
      break;
    case Direction::Decreasing:
      if (Init != CanonicalIVFinal)
        continue;
      break;
    case Direction::Unknown:
      // Be more permissive and accept PN if its init value matches either the
      // initial or the final IV value.
      if (Init != CanonicalIVInit && Init != CanonicalIVFinal)
        continue;
      break;
    }
    Value *RecValue = PN.getIncomingValueForBlock(BackEdgeBlock);
    assert(RecValue && "expect recurrent IndVar value");

    LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN
                      << "\n");

    // Check 3: Pattern match to find the EVL-based index and total trip count
    // (TC).
    if (match(RecValue,
              m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) &&
        match(RemTC, m_Sub(m_Value(TC), m_Specific(&PN)))) {
      EVLIndVar = RecValue;
      break;
    }
  }

  if (!EVLIndVar || !TC)
    return false;

  LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n");
  if (ORE) {
    ORE->emit([&]() {
      DebugLoc DL;
      BasicBlock *Region = nullptr;
      if (auto *I = dyn_cast<Instruction>(EVLIndVar)) {
        DL = I->getDebugLoc();
        Region = I->getParent();
      } else {
        DL = L.getStartLoc();
        Region = L.getHeader();
      }
      return OptimizationRemark(DEBUG_TYPE, "UseEVLIndVar", DL, Region)
             << "Using " << ore::NV("EVLIndVar", EVLIndVar)
             << " for EVL-based IndVar";
    });
  }

  // Create an EVL-based comparison and replace the branch to use it as
  // predicate.

  // Loop::getLatchCmpInst check at the beginning of this function has ensured
  // that latch block ends in a conditional branch.
  auto *LatchBranch = cast<BranchInst>(LatchBlock->getTerminator());
  assert(LatchBranch->isConditional() &&
         "expect the loop latch to be ended with a conditional branch");
  // Keep looping while the EVL-based IV differs from TC: the polarity of the
  // compare depends on which successor leads back to the header.
  ICmpInst::Predicate Pred;
  if (LatchBranch->getSuccessor(0) == L.getHeader())
    Pred = ICmpInst::ICMP_NE;
  else
    Pred = ICmpInst::ICMP_EQ;

  IRBuilder<> Builder(OrigLatchCmp);
  auto *NewLatchCmp = Builder.CreateICmp(Pred, EVLIndVar, TC);
  OrigLatchCmp->replaceAllUsesWith(NewLatchCmp);

  // llvm::RecursivelyDeleteDeadPHINode only deletes cycles whose values are
  // not used outside the cycles. However, in this case the now-RAUW-ed
  // OrigLatchCmp will be considered a use outside the cycle while in reality
  // it's practically dead. Thus we need to remove it before calling
  // RecursivelyDeleteDeadPHINode.
  (void)RecursivelyDeleteTriviallyDeadInstructions(OrigLatchCmp);
  if (llvm::RecursivelyDeleteDeadPHINode(IndVar))
    LLVM_DEBUG(dbgs() << "Removed original IndVar\n");

  ++NumEliminatedCanonicalIV;

  return true;
}
289+
290+
/// New-pass-manager entry point: runs EVLIndVarSimplifyImpl on \p L.
///
/// The remark emitter is looked up with getCachedResult, so it is null (and
/// no remarks are emitted) unless the analysis was already computed for the
/// parent function. When the loop is modified only the CFG analyses set is
/// reported as preserved; otherwise everything is preserved.
PreservedAnalyses EVLIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &LAM,
                                             LoopStandardAnalysisResults &AR,
                                             LPMUpdater &U) {
  Function &ParentFn = *L.getHeader()->getParent();
  OptimizationRemarkEmitter *RemarkEmitter =
      LAM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR)
          .getCachedResult<OptimizationRemarkEmitterAnalysis>(ParentFn);

  EVLIndVarSimplifyImpl Impl(AR, RemarkEmitter);
  const bool Changed = Impl.run(L);
  if (!Changed)
    return PreservedAnalyses::all();
  return PreservedAnalyses::allInSet<CFGAnalyses>();
}

0 commit comments

Comments
 (0)