Skip to content

[LV]Enable non-power-of-2 store-load forwarding distance in predicated DataWithEVL vectorization mode #100755

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 31 additions & 11 deletions llvm/include/llvm/Analysis/LoopAccessAnalysis.h
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,10 @@ class MemoryDepChecker {

MemoryDepChecker(PredicatedScalarEvolution &PSE, const Loop *L,
const DenseMap<Value *, const SCEV *> &SymbolicStrides,
unsigned MaxTargetVectorWidthInBits)
unsigned MaxTargetVectorWidthInBits, bool AllowNonPow2Deps)
: PSE(PSE), InnermostLoop(L), SymbolicStrides(SymbolicStrides),
MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits) {}
MaxTargetVectorWidthInBits(MaxTargetVectorWidthInBits),
AllowNonPow2Deps(AllowNonPow2Deps) {}

/// Register the location (instructions are given increasing numbers)
/// of a write access.
Expand Down Expand Up @@ -218,17 +219,28 @@ class MemoryDepChecker {

/// Return true if there are no store-load forwarding dependencies.
bool isSafeForAnyStoreLoadForwardDistances() const {
return MaxStoreLoadForwardSafeDistanceInBits ==
std::numeric_limits<uint64_t>::max();
return MaxPowerOf2StoreLoadForwardSafeDistanceInBits ==
std::numeric_limits<uint64_t>::max() &&
MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits ==
std::numeric_limits<uint64_t>::max();
}

/// Return safe power-of-2 number of elements, which do not prevent store-load
/// forwarding, multiplied by the size of the elements in bits.
uint64_t getStoreLoadForwardSafeDistanceInBits() const {
/// Return safe number of elements, which do not prevent store-load
/// forwarding, multiplied by the size of the elements in bits (power-of-2).
uint64_t getPowerOf2StoreLoadForwardSafeDistanceInBits() const {
assert(!isSafeForAnyStoreLoadForwardDistances() &&
"Expected the distance, that prevent store-load forwarding, to be "
"set.");
return MaxStoreLoadForwardSafeDistanceInBits;
return MaxPowerOf2StoreLoadForwardSafeDistanceInBits;
}

/// Return safe number of elements, which do not prevent store-load
/// forwarding, multiplied by the size of the elements in bits
/// (non-power-of-2), i.e. the current value of
/// MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits.
///
/// Must only be called when isSafeForAnyStoreLoadForwardDistances() is false.
/// NOTE(review): that predicate is false as soon as either the power-of-2 or
/// the non-power-of-2 distance has been restricted, so the value returned
/// here can still be std::numeric_limits<uint64_t>::max() -- callers should
/// check for that sentinel.
uint64_t getNonPowerOf2StoreLoadForwardSafeDistanceInBits() const {
// Fires when no limiting store-load forwarding distance has been recorded.
assert(!isSafeForAnyStoreLoadForwardDistances() &&
"Expected the distance, that prevent store-load forwarding, to be "
"set.");
return MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits;
}

/// In some cases when the dependency check fails we can still
Expand Down Expand Up @@ -319,9 +331,14 @@ class MemoryDepChecker {
/// restrictive.
uint64_t MaxSafeVectorWidthInBits = -1U;

/// Maximum power-of-2 number of elements, which do not prevent store-load
/// forwarding, multiplied by the size of the elements in bits.
uint64_t MaxStoreLoadForwardSafeDistanceInBits =
/// Maximum number of elements, which do not prevent store-load forwarding,
/// multiplied by the size of the elements in bits (power-of-2).
uint64_t MaxPowerOf2StoreLoadForwardSafeDistanceInBits =
std::numeric_limits<uint64_t>::max();

/// Maximum number of elements, which do not prevent store-load forwarding,
/// multiplied by the size of the elements in bits (non-power-of-2).
uint64_t MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits =
std::numeric_limits<uint64_t>::max();

/// If we see a non-constant dependence distance we can still try to
Expand All @@ -348,6 +365,9 @@ class MemoryDepChecker {
/// backwards-vectorizable or unknown (triggering a runtime check).
unsigned MaxTargetVectorWidthInBits = 0;

/// True if current target supports non-power-of-2 dependence distances.
bool AllowNonPow2Deps = false;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is confusing: in general, doesn't any target support non-power-of-2 dependence distances, as long as they are >= VF?


/// Mapping of SCEV expressions to their expanded pointer bounds (pair of
/// start and end pointer expressions).
DenseMap<std::pair<const SCEV *, Type *>,
Expand Down
14 changes: 10 additions & 4 deletions llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,8 +382,7 @@ class LoopVectorizationLegality {
const LoopAccessInfo *getLAI() const { return LAI; }

bool isSafeForAnyVectorWidth() const {
return LAI->getDepChecker().isSafeForAnyVectorWidth() &&
LAI->getDepChecker().isSafeForAnyStoreLoadForwardDistances();
return LAI->getDepChecker().isSafeForAnyVectorWidth();
}

uint64_t getMaxSafeVectorWidthInBits() const {
Expand Down Expand Up @@ -414,8 +413,15 @@ class LoopVectorizationLegality {

/// Return safe power-of-2 number of elements, which do not prevent store-load
/// forwarding and are safe to operate on simultaneously.
uint64_t getMaxStoreLoadForwardSafeDistanceInBits() const {
return LAI->getDepChecker().getStoreLoadForwardSafeDistanceInBits();
uint64_t getPowerOf2MaxStoreLoadForwardSafeDistanceInBits() const {
return LAI->getDepChecker().getPowerOf2StoreLoadForwardSafeDistanceInBits();
}

/// Return the safe number of elements (not restricted to a power of 2),
/// which do not prevent store-load forwarding and are safe to operate on
/// simultaneously. The value is expressed in bits, as indicated by the
/// accessor name.
///
/// Thin wrapper that forwards to the dependence checker's
/// getNonPowerOf2StoreLoadForwardSafeDistanceInBits() obtained through LAI.
uint64_t getNonPowerOf2MaxStoreLoadForwardSafeDistanceInBits() const {
return LAI->getDepChecker()
.getNonPowerOf2StoreLoadForwardSafeDistanceInBits();
}

/// Returns true if vector representation of the instruction \p I
Expand Down
60 changes: 51 additions & 9 deletions llvm/lib/Analysis/LoopAccessAnalysis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1756,7 +1756,8 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
// Maximum vector factor.
uint64_t MaxVFWithoutSLForwardIssuesPowerOf2 =
std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
MaxStoreLoadForwardSafeDistanceInBits);
MaxPowerOf2StoreLoadForwardSafeDistanceInBits);
uint64_t MaxVFWithoutSLForwardIssuesNonPowerOf2 = 0;

// Compute the smallest VF at which the store and load would be misaligned.
for (uint64_t VF = 2 * TypeByteSize;
Expand All @@ -1768,24 +1769,61 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
break;
}
}
// RISCV VLA supports non-power-of-2 vector factors. So, we iterate in a
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this needed for correctness for RISCV? If not, can be done separately as this adds some extra complexity.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially, we can support only power-of-2. I can split this patch into 2 sub-patches.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes please, this would help to make the patch simpler hopefully.

Also MaxStoreLoadForwardSafeVF is not really a legality constraint but a cost constraint (to prevent cases where the HW support Store to load forwarding, which may be faster than a vector loop not allowing for store->load forwarding).

Is this relevant for cores supporting EVL?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The initial patch with only power-of-2 support has already been committed; this one adds non-power-of-2 support.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

> Also MaxStoreLoadForwardSafeVF is not really a legality constraint but a cost constraint (to prevent cases where the HW support Store to load forwarding, which may be faster than a vector loop not allowing for store->load forwarding).

Right.

> Is this relevant for cores supporting EVL?

Yes, but it supports non-power-of-2 sizes due to its nature.

// backward order to find the largest VF that either allows aligned
// store-loads or keeps the number of iterations between conflicting memory
// addresses no less than 8 (NumItersForStoreLoadThroughMemory).
if (AllowNonPow2Deps) {
MaxVFWithoutSLForwardIssuesNonPowerOf2 =
std::min(VectorizerParams::MaxVectorWidth * TypeByteSize,
MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits);

for (uint64_t VF = MaxVFWithoutSLForwardIssuesNonPowerOf2;
VF > MaxVFWithoutSLForwardIssuesPowerOf2; VF -= TypeByteSize) {
if (Distance % VF == 0 ||
Distance / VF >= NumItersForStoreLoadThroughMemory) {
uint64_t GCD =
isSafeForAnyStoreLoadForwardDistances()
? VF
: std::gcd(MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits,
VF);
MaxVFWithoutSLForwardIssuesNonPowerOf2 = GCD;
break;
}
}
}

if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize) {
if (MaxVFWithoutSLForwardIssuesPowerOf2 < 2 * TypeByteSize &&
MaxVFWithoutSLForwardIssuesNonPowerOf2 < 2 * TypeByteSize) {
LLVM_DEBUG(
dbgs() << "LAA: Distance " << Distance
<< " that could cause a store-load forwarding conflict\n");
return true;
}

// Handle the non-power-of-2 store-load forwarding distance; the power-of-2
// distance is calculated separately below.
if (AllowNonPow2Deps && CommonStride &&
MaxVFWithoutSLForwardIssuesNonPowerOf2 <
MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits &&
MaxVFWithoutSLForwardIssuesNonPowerOf2 !=
VectorizerParams::MaxVectorWidth * TypeByteSize) {
uint64_t MaxVF = MaxVFWithoutSLForwardIssuesNonPowerOf2 / CommonStride;
uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits =
std::min(MaxNonPowerOf2StoreLoadForwardSafeDistanceInBits, MaxVFInBits);
}

if (CommonStride &&
MaxVFWithoutSLForwardIssuesPowerOf2 <
MaxStoreLoadForwardSafeDistanceInBits &&
MaxPowerOf2StoreLoadForwardSafeDistanceInBits &&
MaxVFWithoutSLForwardIssuesPowerOf2 !=
VectorizerParams::MaxVectorWidth * TypeByteSize) {
uint64_t MaxVF =
bit_floor(MaxVFWithoutSLForwardIssuesPowerOf2 / CommonStride);
uint64_t MaxVFInBits = MaxVF * TypeByteSize * 8;
MaxStoreLoadForwardSafeDistanceInBits =
std::min(MaxStoreLoadForwardSafeDistanceInBits, MaxVFInBits);
MaxPowerOf2StoreLoadForwardSafeDistanceInBits =
std::min(MaxPowerOf2StoreLoadForwardSafeDistanceInBits, MaxVFInBits);
}
return false;
}
Expand Down Expand Up @@ -2250,7 +2288,8 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
return Dependence::Unknown;
}

MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
if (!AllowNonPow2Deps)
MaxSafeVectorWidthInBits = std::min(MaxSafeVectorWidthInBits, MaxVFInBits);
return Dependence::BackwardVectorizable;
}

Expand Down Expand Up @@ -2984,8 +3023,9 @@ LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
MaxTargetVectorWidthInBits =
TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) * 2;

DepChecker = std::make_unique<MemoryDepChecker>(*PSE, L, SymbolicStrides,
MaxTargetVectorWidthInBits);
DepChecker = std::make_unique<MemoryDepChecker>(
*PSE, L, SymbolicStrides, MaxTargetVectorWidthInBits,
TTI && TTI->hasActiveVectorLength(0, nullptr, Align()));
PtrRtChecking = std::make_unique<RuntimePointerChecking>(*DepChecker, SE);
if (canAnalyzeLoop())
CanVecMem = analyzeLoop(AA, LI, TLI, DT);
Expand All @@ -2999,7 +3039,9 @@ void LoopAccessInfo::print(raw_ostream &OS, unsigned Depth) const {
OS << " with a maximum safe vector width of "
<< DC.getMaxSafeVectorWidthInBits() << " bits";
if (!DC.isSafeForAnyStoreLoadForwardDistances()) {
uint64_t SLDist = DC.getStoreLoadForwardSafeDistanceInBits();
uint64_t SLDist = DC.getNonPowerOf2StoreLoadForwardSafeDistanceInBits();
if (SLDist == std::numeric_limits<uint64_t>::max())
SLDist = DC.getPowerOf2StoreLoadForwardSafeDistanceInBits();
OS << ", with a maximum safe store-load forward width of " << SLDist
<< " bits";
}
Expand Down
74 changes: 54 additions & 20 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1559,9 +1559,10 @@ class LoopVectorizationCostModel {
/// elements is a power-of-2 larger than zero. If scalable vectorization is
/// disabled or unsupported, then the scalable part will be equal to
/// ElementCount::getScalable(0).
FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
ElementCount UserVF,
bool FoldTailByMasking);
FixedScalableVFPair
computeFeasibleMaxVF(unsigned MaxTripCount, ElementCount UserVF,
bool FoldTailByMasking,
bool AllowNonPowerOf2SafeDist = false);

/// \return the maximized element count based on the targets vector
/// registers and the loop trip-count, but limited to a maximum safe VF.
Expand Down Expand Up @@ -3751,7 +3752,9 @@ bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
return false;
}

if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(*TheFunction, TTI)) {
if ((!Legal->isSafeForAnyVectorWidth() ||
!Legal->isSafeForAnyStoreLoadForwardDistances()) &&
!getMaxVScale(*TheFunction, TTI)) {
reportVectorizationInfo("The target does not provide maximum vscale value "
"for safe distance analysis.",
"ScalableVFUnfeasible", ORE, TheLoop);
Expand All @@ -3769,7 +3772,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {

auto MaxScalableVF = ElementCount::getScalable(
std::numeric_limits<ElementCount::ScalarTy>::max());
if (Legal->isSafeForAnyVectorWidth())
if (Legal->isSafeForAnyVectorWidth() &&
Legal->isSafeForAnyStoreLoadForwardDistances())
return MaxScalableVF;

std::optional<unsigned> MaxVScale = getMaxVScale(*TheFunction, TTI);
Expand All @@ -3786,7 +3790,8 @@ LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
}

FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking,
bool AllowNonPowerOf2SafeDist) {
MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
unsigned SmallestType, WidestType;
std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
Expand All @@ -3795,18 +3800,32 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
// It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
// the memory access that is most restrictive (involved in the smallest
// dependence distance).
unsigned MaxSafeElementsPowerOf2 =
bit_floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
unsigned MaxSafeElementsNonPowerOf2 =
Legal->getMaxSafeVectorWidthInBits() / WidestType;
unsigned MaxSafeElementsPowerOf2 = bit_floor(MaxSafeElementsNonPowerOf2);
if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
MaxSafeElementsPowerOf2 =
std::min(MaxSafeElementsPowerOf2, SLDist / WidestType);
uint64_t SLDist = Legal->getPowerOf2MaxStoreLoadForwardSafeDistanceInBits();
if (SLDist != std::numeric_limits<uint64_t>::max()) {
unsigned SLVF = SLDist / WidestType;
MaxSafeElementsPowerOf2 = std::min(MaxSafeElementsPowerOf2, SLVF);
}
if (FoldTailByMasking && AllowNonPowerOf2SafeDist) {
uint64_t SLDist =
Legal->getNonPowerOf2MaxStoreLoadForwardSafeDistanceInBits();
if (SLDist != std::numeric_limits<uint64_t>::max()) {
unsigned SLVF = SLDist / WidestType;
MaxSafeElements = Legal->isSafeForAnyVectorWidth()
? SLVF
: std::gcd(MaxSafeElementsNonPowerOf2, SLVF);
}
} else {
MaxSafeElements = MaxSafeElementsPowerOf2;
}
}
auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElementsPowerOf2);
auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElementsPowerOf2);

if (!Legal->isSafeForAnyVectorWidth())
this->MaxSafeElements = MaxSafeElementsPowerOf2;
auto MaxSafeScalableVF = getMaxLegalScalableVF(
FoldTailByMasking && AllowNonPowerOf2SafeDist ? MaxSafeElementsNonPowerOf2
: MaxSafeElementsPowerOf2);

LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
<< ".\n");
Expand Down Expand Up @@ -4018,7 +4037,12 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return Rem->isZero();
};

if (MaxPowerOf2RuntimeVF > 0u) {
FixedScalableVFPair FoldTailMaxFactors =
computeFeasibleMaxVF(MaxTC, UserVF, /*FoldTailByMasking=*/true,
/*AllowNonPowerOf2SafeDist=*/true);
if ((Legal->isSafeForAnyStoreLoadForwardDistances() ||
has_single_bit(*MaxSafeElements)) &&
MaxPowerOf2RuntimeVF) {
assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
"MaxFixedVF must be a power of 2");
if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
Expand All @@ -4030,7 +4054,14 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {

auto ExpectedTC = getSmallBestKnownTC(PSE, TheLoop);
if (ExpectedTC && ExpectedTC <= TTI.getMinTripCountTailFoldingThreshold()) {
if (MaxPowerOf2RuntimeVF > 0u) {
if (MaxPowerOf2RuntimeVF) {
assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
"MaxFixedVF must be a power of 2");
if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
// Accept MaxFixedVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
return MaxFactors;
}
// If we have a low-trip-count, and the fixed-width VF is known to divide
// the trip count but the scalable factor does not, use the fixed-width
// factor in preference to allow the generation of a non-predicated loop.
Expand All @@ -4054,10 +4085,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
// found modulo the vectorization factor is not zero, try to fold the tail
// by masking.
// FIXME: look for a smaller MaxVF that does divide TC rather than masking.
bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero() ||
FoldTailMaxFactors.ScalableVF.isNonZero();
setTailFoldingStyles(ContainsScalableVF, UserIC);
if (foldTailByMasking()) {
if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
if (foldTailWithEVL()) {
LLVM_DEBUG(
dbgs()
<< "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
Expand All @@ -4069,6 +4101,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
assert(ContainsScalableVF && "Expected scalable vector factor.");

MaxFactors.FixedVF = ElementCount::getFixed(1);
MaxFactors.ScalableVF = FoldTailMaxFactors.ScalableVF;
}
return MaxFactors;
}
Expand Down Expand Up @@ -5137,7 +5170,8 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
}

// We used the distance for the interleave count.
if (!Legal->isSafeForAnyVectorWidth())
if (!Legal->isSafeForAnyVectorWidth() ||
!Legal->isSafeForAnyStoreLoadForwardDistances())
return 1;

// We don't attempt to perform interleaving for loops with uncountable early
Expand Down
Loading
Loading