-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[LV] Optimize VPWidenIntOrFpInductionRecipe for known TC #118828
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5862a87
f30fdd9
30b5939
135d7ab
e0cfe7c
503dd24
7b75327
cbe3613
5f5e6f3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,6 +20,7 @@ | |
#include "VPlanPatternMatch.h" | ||
#include "VPlanUtils.h" | ||
#include "VPlanVerifier.h" | ||
#include "llvm/ADT/APInt.h" | ||
#include "llvm/ADT/PostOrderIterator.h" | ||
#include "llvm/ADT/STLExtras.h" | ||
#include "llvm/ADT/SetVector.h" | ||
|
@@ -29,6 +30,8 @@ | |
#include "llvm/Analysis/VectorUtils.h" | ||
#include "llvm/IR/Intrinsics.h" | ||
#include "llvm/IR/PatternMatch.h" | ||
#include "llvm/Support/Casting.h" | ||
#include "llvm/Support/TypeSize.h" | ||
|
||
using namespace llvm; | ||
|
||
|
@@ -1086,11 +1089,84 @@ void VPlanTransforms::simplifyRecipes(VPlan &Plan, Type &CanonicalIVTy) { | |
} | ||
} | ||
|
||
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, | ||
unsigned BestUF, | ||
PredicatedScalarEvolution &PSE) { | ||
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); | ||
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); | ||
/// Optimize the width of vector induction variables in \p Plan based on a known | ||
/// constant Trip Count, \p BestVF and \p BestUF. | ||
static bool optimizeVectorInductionWidthForTCAndVFUF(VPlan &Plan, | ||
ElementCount BestVF, | ||
unsigned BestUF) { | ||
// Only proceed if we have not completely removed the vector region. | ||
if (!Plan.getVectorLoopRegion()) | ||
return false; | ||
|
||
if (!Plan.getTripCount()->isLiveIn()) | ||
return false; | ||
auto *TC = dyn_cast_if_present<ConstantInt>( | ||
Plan.getTripCount()->getUnderlyingValue()); | ||
if (!TC || !BestVF.isFixed()) | ||
return false; | ||
|
||
// Calculate the minimum power-of-2 bit width that can fit the known TC, VF | ||
// and UF. Returns at least 8. | ||
auto ComputeBitWidth = [](APInt TC, uint64_t Align) { | ||
APInt AlignedTC = | ||
Align * APIntOps::RoundingUDiv(TC, APInt(TC.getBitWidth(), Align), | ||
APInt::Rounding::UP); | ||
APInt MaxVal = AlignedTC - 1; | ||
return std::max<unsigned>(PowerOf2Ceil(MaxVal.getActiveBits()), 8); | ||
}; | ||
unsigned NewBitWidth = | ||
ComputeBitWidth(TC->getValue(), BestVF.getKnownMinValue() * BestUF); | ||
|
||
LLVMContext &Ctx = Plan.getCanonicalIV()->getScalarType()->getContext(); | ||
auto *NewIVTy = IntegerType::get(Ctx, NewBitWidth); | ||
|
||
bool MadeChange = false; | ||
|
||
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); | ||
for (VPRecipeBase &Phi : HeaderVPBB->phis()) { | ||
auto *WideIV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a comment saying why this is restricted to canonical IVs with a single user? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we also support VPWidenCanonicalIVPHIrecipe? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we could, but the logic for changing the type that gets produced when the recipe is executed is slightly different to |
||
|
||
// Currently only handle canonical IVs as it is trivial to replace the start | ||
// and stop values, and we currently only perform the optimization when the | ||
// IV has a single use. | ||
if (!WideIV || !WideIV->isCanonical() || | ||
WideIV->hasMoreThanOneUniqueUser() || | ||
NewIVTy == WideIV->getScalarType()) | ||
continue; | ||
|
||
// Currently only handle cases where the single user is a header-mask | ||
// comparison with the backedge-taken-count. | ||
using namespace VPlanPatternMatch; | ||
if (!match( | ||
*WideIV->user_begin(), | ||
m_Binary<Instruction::ICmp>( | ||
m_Specific(WideIV), | ||
m_Broadcast(m_Specific(Plan.getOrCreateBackedgeTakenCount()))))) | ||
continue; | ||
|
||
// Update IV operands and comparison bound to use new narrower type. | ||
auto *NewStart = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 0)); | ||
WideIV->setStartValue(NewStart); | ||
auto *NewStep = Plan.getOrAddLiveIn(ConstantInt::get(NewIVTy, 1)); | ||
WideIV->setStepValue(NewStep); | ||
|
||
auto *NewBTC = new VPWidenCastRecipe( | ||
Instruction::Trunc, Plan.getOrCreateBackedgeTakenCount(), NewIVTy); | ||
Plan.getVectorPreheader()->appendRecipe(NewBTC); | ||
auto *Cmp = cast<VPInstruction>(*WideIV->user_begin()); | ||
Cmp->setOperand(1, NewBTC); | ||
|
||
MadeChange = true; | ||
} | ||
|
||
return MadeChange; | ||
} | ||
|
||
/// Try to simplify the branch condition of \p Plan. This may restrict the | ||
/// resulting plan to \p BestVF and \p BestUF. | ||
static bool simplifyBranchConditionForVFAndUF(VPlan &Plan, ElementCount BestVF, | ||
unsigned BestUF, | ||
PredicatedScalarEvolution &PSE) { | ||
VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion(); | ||
VPBasicBlock *ExitingVPBB = VectorRegion->getExitingBasicBlock(); | ||
auto *Term = &ExitingVPBB->back(); | ||
|
@@ -1103,7 +1179,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, | |
if (!match(Term, m_BranchOnCount(m_VPValue(), m_VPValue())) && | ||
!match(Term, | ||
m_BranchOnCond(m_Not(m_ActiveLaneMask(m_VPValue(), m_VPValue()))))) | ||
return; | ||
return false; | ||
|
||
ScalarEvolution &SE = *PSE.getSE(); | ||
const SCEV *TripCount = | ||
|
@@ -1114,7 +1190,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, | |
const SCEV *C = SE.getElementCount(TripCount->getType(), NumElements); | ||
if (TripCount->isZero() || | ||
!SE.isKnownPredicate(CmpInst::ICMP_ULE, TripCount, C)) | ||
return; | ||
return false; | ||
|
||
// The vector loop region only executes once. If possible, completely remove | ||
// the region, otherwise replace the terminator controlling the latch with | ||
|
@@ -1140,7 +1216,7 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, | |
|
||
VPBlockUtils::connectBlocks(Preheader, Header); | ||
VPBlockUtils::connectBlocks(ExitingVPBB, Exit); | ||
simplifyRecipes(Plan, *CanIVTy); | ||
VPlanTransforms::simplifyRecipes(Plan, *CanIVTy); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. unrelated? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is actually necessary - because this logic has been moved into a (static) non-member function we now need the class scope when calling the |
||
} else { | ||
// The vector region contains header phis for which we cannot remove the | ||
// loop region yet. | ||
|
@@ -1153,8 +1229,23 @@ void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, | |
|
||
Term->eraseFromParent(); | ||
|
||
Plan.setVF(BestVF); | ||
assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF"); | ||
return true; | ||
} | ||
|
||
void VPlanTransforms::optimizeForVFAndUF(VPlan &Plan, ElementCount BestVF, | ||
unsigned BestUF, | ||
PredicatedScalarEvolution &PSE) { | ||
assert(Plan.hasVF(BestVF) && "BestVF is not available in Plan"); | ||
assert(Plan.hasUF(BestUF) && "BestUF is not available in Plan"); | ||
|
||
bool MadeChange = | ||
simplifyBranchConditionForVFAndUF(Plan, BestVF, BestUF, PSE); | ||
MadeChange |= optimizeVectorInductionWidthForTCAndVFUF(Plan, BestVF, BestUF); | ||
|
||
if (MadeChange) { | ||
Plan.setVF(BestVF); | ||
assert(Plan.getUF() == BestUF && "BestUF must match the Plan's UF"); | ||
} | ||
// TODO: Further simplifications are possible | ||
// 1. Replace inductions with constants. | ||
// 2. Replace vector loop region with VPBasicBlock. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The underlying value should only be used if TripCount is a live-in (currently guaranteed I think), can you update this to use
getLiveInIRValue
, which will assert if it isn't a live-in?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So if I refactor to use
getLiveInIRValue()
, we get quite a few lit test crashes where TripCount is not a live-in, so I'm not sure this is guaranteed?I've added a check above to make sure that the TC is a live-in, so that we are not calling
getUnderlyingValue()
on non live-ins.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah I see, the trip count is one of those special live-ins.