[LV] Use SCEV for uniformity analysis across VF
This patch uses SCEV to check if a value is uniform across a given VF.

The basic idea is to construct SCEVs in which the AddRecs of the loop are adjusted to reflect their version in the vectorized loop (the step multiplied by VF). We construct a SCEV for the value at vector lane 0 (offset 0) and compare it to the expressions for lanes 1 to the last vector lane (VF - 1). If they are all equal, the expression is considered uniform. While rewriting expressions, we also need to catch sub-expressions whose uniformity we cannot determine (e.g. SCEVUnknown).

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D148841
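As a sketch of the kind of case this enables (the loop below is an illustrative assumption, not taken from the patch): an address such as &A[i / 2] is not loop-invariant, yet for VF = 2 both lanes of each vector iteration read the same element, so it can now be recognized as uniform.

// Illustrative example (assumed, not from this commit): with VF = 2 the
// index i / 2 takes the values 0, 0, 1, 1, ..., so both lanes of a vector
// iteration load the same element of A.
void foo(int *A, int *B, unsigned N) {
  for (unsigned i = 0; i < N; i++)
    B[i] = A[i / 2]; // per-lane index SCEVs: ({0,+,2} /u 2) and ({1,+,2} /u 2);
                     // uniform for VF = 2 if SCEV folds both to {0,+,1}.
}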
1 parent 4369de7 commit 572cfa3

12 files changed (+285 −232 lines)

llvm/include/llvm/Analysis/LoopAccessAnalysis.h

Lines changed: 3 additions & 2 deletions
@@ -588,8 +588,9 @@ class LoopAccessInfo {
   static bool blockNeedsPredication(BasicBlock *BB, Loop *TheLoop,
                                     DominatorTree *DT);
 
-  /// Returns true if the value V is uniform within the loop.
-  bool isUniform(Value *V) const;
+  /// Returns true if value \p V is uniform across \p VF lanes, when \p VF is
+  /// provided, and otherwise if \p V is invariant across all loop iterations.
+  bool isUniform(Value *V, std::optional<ElementCount> VF = std::nullopt) const;
 
   uint64_t getMaxSafeDepDistBytes() const { return MaxSafeDepDistBytes; }
   unsigned getNumStores() const { return NumStores; }
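A minimal caller sketch (hypothetical usage; the helper is invented for illustration, not part of this commit): the defaulted parameter keeps existing call sites source-compatible, while new callers can ask about a concrete VF.

#include "llvm/Analysis/LoopAccessAnalysis.h"
using namespace llvm;

// Hypothetical helper showing both flavors of the query.
static void queryUniformity(const LoopAccessInfo &LAI, Value *Ptr) {
  // Default argument (std::nullopt): old semantics, invariant across all
  // loop iterations.
  bool Invariant = LAI.isUniform(Ptr);
  // With a VF: do lanes 0..3 of one vector iteration compute the same value?
  bool UniformAcrossLanes = LAI.isUniform(Ptr, ElementCount::getFixed(4));
  (void)Invariant;
  (void)UniformAcrossLanes;
}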

llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h

Lines changed: 7 additions & 4 deletions
@@ -347,12 +347,15 @@ class LoopVectorizationLegality {
   /// loop. Do not use after invoking 'createVectorizedLoopSkeleton' (PR34965).
   int isConsecutivePtr(Type *AccessTy, Value *Ptr) const;
 
-  /// Returns true if the value V is uniform within the loop.
-  bool isUniform(Value *V) const;
+  /// Returns true if value V is uniform across \p VF lanes, when \p VF is
+  /// provided, and otherwise if \p V is invariant across all loop iterations.
+  bool isUniform(Value *V, std::optional<ElementCount> VF = std::nullopt) const;
 
   /// A uniform memory op is a load or store which accesses the same memory
-  /// location on all lanes.
-  bool isUniformMemOp(Instruction &I) const;
+  /// location on all \p VF lanes, if \p VF is provided and otherwise if the
+  /// memory location is invariant.
+  bool isUniformMemOp(Instruction &I,
+                      std::optional<ElementCount> VF = std::nullopt) const;
 
   /// Returns the information that we collected about runtime memory check.
   const RuntimePointerChecking *getRuntimePointerChecking() const {
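For intuition about the relaxed definition (an assumed example, not from this commit): a load whose address advances only every VF iterations is not loop-invariant, but all VF lanes of one vector iteration access the same location, so the VF-aware query can classify it as uniform.

// Illustrative loop (assumed): A[i / 4] is not invariant, but for VF = 4 all
// four lanes of a vector iteration read the same element, so
// isUniformMemOp(Load, ElementCount::getFixed(4)) may now return true
// (provided the containing block needs no predication).
int sumQuads(const int *A, unsigned N) {
  int S = 0;
  for (unsigned i = 0; i < N; i++)
    S += A[i / 4];
  return S;
}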

llvm/lib/Analysis/LoopAccessAnalysis.cpp

Lines changed: 111 additions & 2 deletions
@@ -2532,15 +2532,124 @@ OptimizationRemarkAnalysis &LoopAccessInfo::recordAnalysis(StringRef RemarkName,
   return *Report;
 }
 
-bool LoopAccessInfo::isUniform(Value *V) const {
+namespace {
+/// A rewriter to build the SCEVs for each of the VF lanes in the expected
+/// vectorized loop, which can then be compared to detect their uniformity. This
+/// is done by replacing the AddRec SCEVs of the original scalar loop (TheLoop)
+/// with new AddRecs where the step is multiplied by StepMultiplier and Offset *
+/// Step is added. Also checks if all sub-expressions are analyzable w.r.t.
+/// uniformity.
+class SCEVAddRecForUniformityRewriter
+    : public SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter> {
+  /// Multiplier to be applied to the step of AddRecs in TheLoop.
+  unsigned StepMultiplier;
+
+  /// Offset to be added to the AddRecs in TheLoop.
+  unsigned Offset;
+
+  /// Loop for which to rewrite AddRecs.
+  Loop *TheLoop;
+
+  /// Is any sub-expression not analyzable w.r.t. uniformity?
+  bool CannotAnalyze = false;
+
+  bool canAnalyze() const { return !CannotAnalyze; }
+
+public:
+  SCEVAddRecForUniformityRewriter(ScalarEvolution &SE, unsigned StepMultiplier,
+                                  unsigned Offset, Loop *TheLoop)
+      : SCEVRewriteVisitor(SE), StepMultiplier(StepMultiplier), Offset(Offset),
+        TheLoop(TheLoop) {}
+
+  const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+    assert(Expr->getLoop() == TheLoop &&
+           "addrec outside of TheLoop must be invariant and should have been "
+           "handled earlier");
+    // Build a new AddRec by multiplying the step by StepMultiplier and
+    // incrementing the start by Offset * step.
+    Type *Ty = Expr->getType();
+    auto *Step = Expr->getStepRecurrence(SE);
+    auto *NewStep = SE.getMulExpr(Step, SE.getConstant(Ty, StepMultiplier));
+    auto *ScaledOffset = SE.getMulExpr(Step, SE.getConstant(Ty, Offset));
+    auto *NewStart = SE.getAddExpr(Expr->getStart(), ScaledOffset);
+    return SE.getAddRecExpr(NewStart, NewStep, TheLoop, SCEV::FlagAnyWrap);
+  }
+
+  const SCEV *visit(const SCEV *S) {
+    if (CannotAnalyze || SE.isLoopInvariant(S, TheLoop))
+      return S;
+    return SCEVRewriteVisitor<SCEVAddRecForUniformityRewriter>::visit(S);
+  }
+
+  const SCEV *visitUnknown(const SCEVUnknown *S) {
+    if (SE.isLoopInvariant(S, TheLoop))
+      return S;
+    // The value could vary across iterations.
+    CannotAnalyze = true;
+    return S;
+  }
+
+  const SCEV *visitCouldNotCompute(const SCEVCouldNotCompute *S) {
+    // Could not analyze the expression.
+    CannotAnalyze = true;
+    return S;
+  }
+
+  static const SCEV *rewrite(const SCEV *S, ScalarEvolution &SE,
+                             unsigned StepMultiplier, unsigned Offset,
+                             Loop *TheLoop) {
+    // Bail out if the expression does not contain a UDiv expression. Uniform
+    // values which are not loop invariant require operations to strip out the
+    // lowest bits. For now just look for UDivs and use it to avoid re-writing
+    // UDIV-free expressions for other lanes to limit compile time.
+    if (!SCEVExprContains(S,
+                          [](const SCEV *S) { return isa<SCEVUDivExpr>(S); }))
+      return SE.getCouldNotCompute();
+
+    SCEVAddRecForUniformityRewriter Rewriter(SE, StepMultiplier, Offset,
+                                             TheLoop);
+    const SCEV *Result = Rewriter.visit(S);
+
+    if (Rewriter.canAnalyze())
+      return Result;
+    return SE.getCouldNotCompute();
+  }
+};
+
+} // namespace
+
+bool LoopAccessInfo::isUniform(Value *V, std::optional<ElementCount> VF) const {
   auto *SE = PSE->getSE();
   // Since we rely on SCEV for uniformity, if the type is not SCEVable, it is
   // never considered uniform.
   // TODO: Is this really what we want? Even without FP SCEV, we may want some
   // trivially loop-invariant FP values to be considered uniform.
   if (!SE->isSCEVable(V->getType()))
     return false;
-  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
+  const SCEV *S = SE->getSCEV(V);
+  if (SE->isLoopInvariant(S, TheLoop))
+    return true;
+  if (!VF || VF->isScalable())
+    return false;
+  if (VF->isScalar())
+    return true;
+
+  // Rewrite AddRecs in TheLoop to step by VF and check if the expression for
+  // lane 0 matches the expressions for all other lanes.
+  unsigned FixedVF = VF->getKnownMinValue();
+  const SCEV *FirstLaneExpr =
+      SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, 0, TheLoop);
+  if (isa<SCEVCouldNotCompute>(FirstLaneExpr))
+    return false;
+
+  // Make sure the expressions for lanes FixedVF-1..1 match the expression for
+  // lane 0. We check lanes in reverse order for compile-time, as frequently
+  // checking the last lane is sufficient to rule out uniformity.
+  return all_of(reverse(seq<unsigned>(1, FixedVF)), [&](unsigned I) {
+    const SCEV *IthLaneExpr =
+        SCEVAddRecForUniformityRewriter::rewrite(S, *SE, FixedVF, I, TheLoop);
+    return FirstLaneExpr == IthLaneExpr;
+  });
 }
 
 /// Find the operand of the GEP that should be checked for consecutive
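A worked trace of the lane rewriting (the concrete expression and VF are assumed for illustration): for S = ({0,+,1} /u 4) in TheLoop and VF = 4, visitAddRecExpr builds, for lane Offset, the AddRec {Start + Offset * Step,+,Step * VF}:

// Worked example (values assumed for illustration), S = ({0,+,1} /u 4), VF = 4:
//   lane 0: rewrite(S, SE, /*StepMultiplier=*/4, /*Offset=*/0, TheLoop) -> ({0,+,4} /u 4)
//   lane 1: rewrite(S, SE, /*StepMultiplier=*/4, /*Offset=*/1, TheLoop) -> ({1,+,4} /u 4)
//   lane 3: rewrite(S, SE, /*StepMultiplier=*/4, /*Offset=*/3, TheLoop) -> ({3,+,4} /u 4)
// Semantically every lane evaluates to k in vector iteration k, i.e. the value
// is uniform; the final check compares uniqued SCEV pointers, so this is only
// detected if SCEV canonicalizes all lane expressions to the same node.

Note that the lanes are compared in reverse order (FixedVF-1 down to 1) because a non-uniform value usually already differs at the last lane, letting the all_of short-circuit early.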

llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp

Lines changed: 6 additions & 4 deletions
@@ -471,19 +471,21 @@ int LoopVectorizationLegality::isConsecutivePtr(Type *AccessTy,
   return 0;
 }
 
-bool LoopVectorizationLegality::isUniform(Value *V) const {
-  return LAI->isUniform(V);
+bool LoopVectorizationLegality::isUniform(
+    Value *V, std::optional<ElementCount> VF) const {
+  return LAI->isUniform(V, VF);
 }
 
-bool LoopVectorizationLegality::isUniformMemOp(Instruction &I) const {
+bool LoopVectorizationLegality::isUniformMemOp(
+    Instruction &I, std::optional<ElementCount> VF) const {
   Value *Ptr = getLoadStorePointerOperand(&I);
   if (!Ptr)
     return false;
   // Note: There's nothing inherent which prevents predicated loads and
   // stores from being uniform. The current lowering simply doesn't handle
   // it; in particular, the cost model distinguishes scatter/gather from
   // scalar w/predication, and we currently rely on the scalar path.
-  return isUniform(Ptr) && !blockNeedsPredication(I.getParent());
+  return isUniform(Ptr, VF) && !blockNeedsPredication(I.getParent());
 }
 
 bool LoopVectorizationLegality::canVectorizeOuterLoop() {

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 3 additions & 3 deletions
@@ -4674,7 +4674,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
   // Return true if all lanes perform the same memory operation, and we can
   // thus choose to execute only one.
   auto isUniformMemOpUse = [&](Instruction *I) {
-    if (!Legal->isUniformMemOp(*I))
+    if (!Legal->isUniformMemOp(*I, VF))
      return false;
    if (isa<LoadInst>(I))
      // Loading the same address always produces the same result - at least
@@ -6496,7 +6496,7 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
 InstructionCost
 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
                                                 ElementCount VF) {
-  assert(Legal->isUniformMemOp(*I));
+  assert(Legal->isUniformMemOp(*I, VF));
 
   Type *ValTy = getLoadStoreType(I);
   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
@@ -6872,7 +6872,7 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
        NumPredStores++;
 
-      if (Legal->isUniformMemOp(I)) {
+      if (Legal->isUniformMemOp(I, VF)) {
        auto isLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
