Skip to content

[LoopVectorize] Perform loop versioning for some early exit loops #120603

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions llvm/include/llvm/Analysis/Loads.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,12 +88,6 @@ bool isDereferenceableAndAlignedInLoop(
AssumptionCache *AC = nullptr,
SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);

/// Return true if the loop \p L cannot fault on any iteration and only
/// contains read-only memory accesses.
bool isDereferenceableReadOnlyLoop(
Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
SmallVectorImpl<const SCEVPredicate *> *Predicates = nullptr);

/// Return true if we know that executing a load from this value cannot trap.
///
/// If DT and ScanFrom are specified this method performs context-sensitive
Expand Down
33 changes: 31 additions & 2 deletions llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,11 +382,18 @@ class LoopVectorizationLegality {
const LoopAccessInfo *getLAI() const { return LAI; }

bool isSafeForAnyVectorWidth() const {
return LAI->getDepChecker().isSafeForAnyVectorWidth();
return LAI->getDepChecker().isSafeForAnyVectorWidth() &&
(!hasUncountableEarlyExit() || !getNumPotentiallyFaultingPointers());
}

uint64_t getMaxSafeVectorWidthInBits() const {
return LAI->getDepChecker().getMaxSafeVectorWidthInBits();
uint64_t MaxSafeVectorWidth =
LAI->getDepChecker().getMaxSafeVectorWidthInBits();
// The legalizer bails out if getMinPageSize does not return a value.
if (hasUncountableEarlyExit() && getNumPotentiallyFaultingPointers())
MaxSafeVectorWidth =
std::min(MaxSafeVectorWidth, uint64_t(*TTI->getMinPageSize()) * 8);
return MaxSafeVectorWidth;
}

/// Returns true if the loop has exactly one uncountable early exit, i.e. an
Expand Down Expand Up @@ -419,6 +426,19 @@ class LoopVectorizationLegality {
unsigned getNumStores() const { return LAI->getNumStores(); }
unsigned getNumLoads() const { return LAI->getNumLoads(); }

/// Return the number of pointers in the loop that could potentially fault in
/// a loop with uncountable early exits.
///
/// This is the current size of PotentiallyFaultingPtrs. A non-zero value
/// indicates that legality recorded at least one load whose start address
/// must be guarded by a runtime pointer alignment check before the vector
/// loop may execute.
unsigned getNumPotentiallyFaultingPointers() const {
return PotentiallyFaultingPtrs.size();
}

/// Return a vector of all potentially faulting pointers in a loop with
/// uncountable early exits.
///
/// Each entry pairs the SCEV for the start address of a load's pointer
/// recurrence with the type loaded from it. The returned ArrayRef refers to
/// the PotentiallyFaultingPtrs member rather than a copy of it.
ArrayRef<std::pair<const SCEV *, Type *>>
getPotentiallyFaultingPointers() const {
return PotentiallyFaultingPtrs;
}

/// Returns a HistogramInfo* for the given instruction if it was determined
/// to be part of a load -> update -> store sequence where multiple lanes
/// may be working on the same memory address.
Expand Down Expand Up @@ -524,6 +544,11 @@ class LoopVectorizationLegality {
/// additional cases safely.
bool isVectorizableEarlyExitLoop();

/// Returns true if all loads in the loop contained in \p Loads can be
/// analyzed as potentially faulting. Any loads that may fault are added to
/// the member variable PotentiallyFaultingPtrs.
///
/// A load is accepted only if it reads from address space 0 and its pointer
/// is an affine add-recurrence in this loop whose constant step equals the
/// store size of the loaded type, where that size is a power of 2. For each
/// accepted load the start of the recurrence and the loaded type are
/// recorded. Analysis stops at the first rejected load and returns false;
/// entries recorded for earlier loads are not removed by this function.
bool analyzePotentiallyFaultingLoads(SmallVectorImpl<LoadInst *> &Loads);

/// Return true if all of the instructions in the block can be speculatively
/// executed, and record the loads/stores that require masking.
/// \p SafePtrs is a list of addresses that are known to be legal and we know
Expand Down Expand Up @@ -642,6 +667,10 @@ class LoopVectorizationLegality {
/// Keep track of the loop edge to an uncountable exit, comprising a pair
/// of (Exiting, Exit) blocks, if there is exactly one early exit.
std::optional<std::pair<BasicBlock *, BasicBlock *>> UncountableEdge;

/// Keep a record of all potentially faulting pointers in loops with
/// uncountable early exits.
SmallVector<std::pair<const SCEV *, Type *>, 4> PotentiallyFaultingPtrs;
};

} // namespace llvm
Expand Down
15 changes: 0 additions & 15 deletions llvm/lib/Analysis/Loads.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -816,18 +816,3 @@ bool llvm::canReplacePointersIfEqual(const Value *From, const Value *To,

return isPointerAlwaysReplaceable(From, To, DL);
}

/// Walk every instruction of \p L and return true only when each load is
/// dereferenceable and aligned in the loop and no other instruction may read
/// memory, write memory or throw. \p Predicates is forwarded unchanged to
/// isDereferenceableAndAlignedInLoop.
bool llvm::isDereferenceableReadOnlyLoop(
    Loop *L, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
    SmallVectorImpl<const SCEVPredicate *> *Predicates) {
  for (BasicBlock *Block : L->blocks()) {
    for (Instruction &Inst : *Block) {
      auto *Load = dyn_cast<LoadInst>(&Inst);

      // Anything other than a plain load must not touch memory or throw.
      if (!Load) {
        if (Inst.mayReadFromMemory() || Inst.mayWriteToMemory() ||
            Inst.mayThrow())
          return false;
        continue;
      }

      // Loads are acceptable only when provably safe on every iteration.
      if (!isDereferenceableAndAlignedInLoop(Load, L, *SE, *DT, AC,
                                             Predicates))
        return false;
    }
  }
  return true;
}
88 changes: 76 additions & 12 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1602,6 +1602,46 @@ bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
return Result;
}

bool LoopVectorizationLegality::analyzePotentiallyFaultingLoads(
SmallVectorImpl<LoadInst *> &Loads) {
LLVM_DEBUG(dbgs() << "LV: Looking for potentially faulting loads in loop "
"with uncountable early exit:\n");
for (LoadInst *LI : Loads) {
LLVM_DEBUG(dbgs() << "LV: Load: " << *LI << '\n');
if (LI->getPointerAddressSpace())
return false;

Value *Ptr = LI->getPointerOperand();
const SCEV *PtrExpr = PSE.getSCEV(Ptr);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrExpr);
// TODO: Deal with loop invariant pointers.
// NOTE: The reasoning below is only safe if the load executes at least
// once.
if (!AR || AR->getLoop() != TheLoop || !AR->isAffine())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be good to explain here that we check for add-recs because the reasoning is only safe if the load executes at least once?

We should probably also restrict this just for loads from the default address space (0) for now

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done

return false;
auto Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*PSE.getSE()));
if (!Step)
return false;
const SCEV *Start = AR->getStart();

// Make sure the step is positive and matches the object size in memory.
// TODO: Extend this to cover more cases.
auto &DL = LI->getDataLayout();
APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
DL.getTypeStoreSize(LI->getType()).getFixedValue());

// Also discard element sizes that are not a power of 2, since the loop
// vectorizer can only perform loop versioning with pointer alignment
// checks for vector loads that are power-of-2 in size.
if (EltSize != Step->getAPInt() || !EltSize.isPowerOf2())
return false;

LLVM_DEBUG(dbgs() << "LV: SCEV for Load Ptr: " << *Start << '\n');
PotentiallyFaultingPtrs.push_back({Start, LI->getType()});
}
return true;
}

bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
BasicBlock *LatchBB = TheLoop->getLoopLatch();
if (!LatchBB) {
Expand Down Expand Up @@ -1706,6 +1746,8 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
}
};

Predicates.clear();
SmallVector<LoadInst *, 4> NonDerefLoads;
for (auto *BB : TheLoop->blocks())
for (auto &I : *BB) {
if (I.mayWriteToMemory()) {
Expand All @@ -1715,30 +1757,52 @@ bool LoopVectorizationLegality::isVectorizableEarlyExitLoop() {
"Cannot vectorize early exit loop with writes to memory",
"WritesInEarlyExitLoop", ORE, TheLoop);
return false;
} else if (!IsSafeOperation(&I)) {
} else if (I.mayThrow() || !IsSafeOperation(&I)) {
reportVectorizationFailure("Early exit loop contains operations that "
"cannot be speculatively executed",
"UnsafeOperationsEarlyExitLoop", ORE,
TheLoop);
return false;
} else if (I.mayReadFromMemory()) {
auto *LI = dyn_cast<LoadInst>(&I);
bool UnsafeRead = false;
if (!LI)
UnsafeRead = true;
else if (!isDereferenceableAndAlignedInLoop(LI, TheLoop, *PSE.getSE(),
*DT, AC, &Predicates)) {
if (LI->getParent() != TheLoop->getHeader())
UnsafeRead = true;
else
NonDerefLoads.push_back(LI);
}

if (UnsafeRead) {
reportVectorizationFailure(
"Loop may fault",
"Cannot vectorize potentially faulting early exit loop",
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
return false;
}
}
}

if (!NonDerefLoads.empty()) {
if (!TTI->getMinPageSize() ||
!analyzePotentiallyFaultingLoads(NonDerefLoads)) {
PotentiallyFaultingPtrs.clear();
reportVectorizationFailure(
"Loop may fault",
"Cannot vectorize potentially faulting early exit loop",
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
return false;
}
LLVM_DEBUG(dbgs() << "We can vectorize the loop with runtime checks.\n");
}

// The vectoriser cannot handle loads that occur after the early exit block.
assert(LatchBB->getUniquePredecessor() == SingleUncountableEdge->first &&
"Expected latch predecessor to be the early exiting block");

// TODO: Handle loops that may fault.
Predicates.clear();
if (!isDereferenceableReadOnlyLoop(TheLoop, PSE.getSE(), DT, AC,
&Predicates)) {
reportVectorizationFailure(
"Loop may fault",
"Cannot vectorize potentially faulting early exit loop",
"PotentiallyFaultingEarlyExitLoop", ORE, TheLoop);
return false;
}

[[maybe_unused]] const SCEV *SymbolicMaxBTC =
PSE.getSymbolicMaxBackedgeTakenCount();
// Since we have an exact exit count for the latch and the early exit
Expand Down
71 changes: 65 additions & 6 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,12 @@ static cl::opt<bool> EnableEarlyExitVectorization(
cl::desc(
"Enable vectorization of early exit loops with uncountable exits."));

// Upper bound on the number of potentially faulting pointers tolerated when
// vectorizing a loop with an uncountable early exit. The loop is rejected
// when the number reported by legality is greater than this value, so the
// default of 0 rejects every loop that has at least one such pointer.
static cl::opt<unsigned> MaxNumPotentiallyFaultingPointers(
"max-num-faulting-pointers", cl::init(0), cl::Hidden,
cl::desc(
"The maximum number of potentially faulting pointers we permit when "
"vectorizing loops with uncountable exits."));

// Likelihood of bypassing the vectorized loop because assumptions about SCEV
// variables not overflowing do not hold. See `emitSCEVChecks`.
static constexpr uint32_t SCEVCheckBypassWeights[] = {1, 127};
Expand Down Expand Up @@ -2163,6 +2169,28 @@ class GeneratedRTChecks {
};
} // namespace

/// For every entry in \p Ptrs, add an equality predicate to \p PSE requiring
/// that the integer value of the start address is a multiple of
/// (VF * IC) * store-size-of-the-accessed-type.
///
/// Each entry of \p Ptrs pairs the SCEV of a start address with the type
/// accessed through it. \p F and \p TTI are accepted but not used here.
static void
addPointerAlignmentChecks(ArrayRef<std::pair<const SCEV *, Type *>> Ptrs,
                          Function *F, PredicatedScalarEvolution &PSE,
                          TargetTransformInfo *TTI, ElementCount VF,
                          unsigned IC) {
  ScalarEvolution &SE = *PSE.getSE();
  const DataLayout &DL = SE.getDataLayout();
  const auto NoWrap =
      static_cast<SCEV::NoWrapFlags>(SCEV::FlagNSW | SCEV::FlagNUW);

  for (const auto &[StartPtr, AccessTy] : Ptrs) {
    Type *IntPtrTy = DL.getIntPtrType(StartPtr->getType());
    const APInt StoreBytes(IntPtrTy->getScalarSizeInBits(),
                           DL.getTypeStoreSize(AccessTy).getFixedValue());

    // Start address as an integer, and the number of bytes covered by one
    // vector iteration (element count times the per-element store size).
    const SCEV *StartAddr = SE.getPtrToIntExpr(StartPtr, IntPtrTy);
    const SCEV *LanesPerIter = SE.getElementCount(IntPtrTy, VF * IC);
    const SCEV *BytesPerIter =
        SE.getMulExpr(LanesPerIter, SE.getConstant(StoreBytes), NoWrap);

    // Require StartAddr % BytesPerIter == 0.
    const SCEV *Misalignment = SE.getURemExpr(StartAddr, BytesPerIter);
    PSE.addPredicate(
        *SE.getEqualPredicate(Misalignment, SE.getZero(IntPtrTy)));
  }
}

static bool useActiveLaneMask(TailFoldingStyle Style) {
return Style == TailFoldingStyle::Data ||
Style == TailFoldingStyle::DataAndControlFlow ||
Expand Down Expand Up @@ -3842,6 +3870,15 @@ bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
return false;
}

if (Legal->hasUncountableEarlyExit() &&
Legal->getNumPotentiallyFaultingPointers() &&
!TTI.isVScaleKnownToBeAPowerOfTwo()) {
reportVectorizationInfo("Cannot vectorize potentially faulting early exit "
"loop with scalable vectors.",
"ScalableVFUnfeasible", ORE, TheLoop);
return false;
}

IsScalableVectorizationAllowed = true;
return true;
}
Expand Down Expand Up @@ -10508,11 +10545,25 @@ bool LoopVectorizePass::processLoop(Loop *L) {
return false;
}

if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
"early exit is not enabled",
"UncountableEarlyExitLoopsDisabled", ORE, L);
return false;
if (LVL.hasUncountableEarlyExit()) {
if (!EnableEarlyExitVectorization) {
reportVectorizationFailure("Auto-vectorization of loops with uncountable "
"early exit is not enabled",
"UncountableEarlyExitLoopsDisabled", ORE, L);
return false;
}

unsigned NumPotentiallyFaultingPointers =
LVL.getNumPotentiallyFaultingPointers();
if (NumPotentiallyFaultingPointers > MaxNumPotentiallyFaultingPointers) {
reportVectorizationFailure("Not worth vectorizing loop with uncountable "
"early exit, due to number of potentially "
"faulting loads",
"UncountableEarlyExitMayFault", ORE, L);
return false;
} else if (NumPotentiallyFaultingPointers)
LLVM_DEBUG(dbgs() << "LV: Need to version early-exit vector loop with "
<< "pointer alignment checks.\n");
}

// Entrance to the VPlan-native vectorization path. Outer loops are processed
Expand Down Expand Up @@ -10663,8 +10714,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
unsigned SelectedIC = std::max(IC, UserIC);
// Optimistically generate runtime checks if they are needed. Drop them if
// they turn out to not be profitable.
if (VF.Width.isVector() || SelectedIC > 1)
if (VF.Width.isVector() || SelectedIC > 1) {
if (LVL.getNumPotentiallyFaultingPointers()) {
assert(!CM.foldTailWithEVL() &&
"Explicit vector length unsupported for early exit loops and "
"potentially faulting loads");
addPointerAlignmentChecks(LVL.getPotentiallyFaultingPointers(), F, PSE,
TTI, VF.Width, SelectedIC);
}
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
}

// Check if it is profitable to vectorize with runtime checks.
bool ForceVectorization =
Expand Down
Loading
Loading