Skip to content

[LoopIdiom] Support 'shift until less-than' idiom #95002

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
291 changes: 255 additions & 36 deletions llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,12 +231,19 @@ class LoopIdiomRecognize {
bool recognizePopcount();
void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
PHINode *CntPhi, Value *Var);
bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
bool ZeroCheck, size_t CanonicalSize);
bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
Instruction *DefX, PHINode *CntPhi,
Instruction *CntInst);
bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
bool recognizeShiftUntilLessThan();
void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
Instruction *CntInst, PHINode *CntPhi,
Value *Var, Instruction *DefX,
const DebugLoc &DL, bool ZeroCheck,
bool IsCntPhiUsedOutsideLoop);
bool IsCntPhiUsedOutsideLoop,
bool InsertSub = false);

bool recognizeShiftUntilBitTest();
bool recognizeShiftUntilZero();
Expand Down Expand Up @@ -1482,7 +1489,8 @@ bool LoopIdiomRecognize::runOnNoncountableLoop() {
<< CurLoop->getHeader()->getName() << "\n");

return recognizePopcount() || recognizeAndInsertFFS() ||
recognizeShiftUntilBitTest() || recognizeShiftUntilZero();
recognizeShiftUntilBitTest() || recognizeShiftUntilZero() ||
recognizeShiftUntilLessThan();
}

/// Check if the given conditional branch is based on the comparison between
Expand Down Expand Up @@ -1517,6 +1525,34 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
return nullptr;
}

/// Check if the given conditional branch is based on an unsigned less-than
/// comparison between a variable and a constant, and if the comparison is false
/// the control yields to the loop entry. If the branch matches the behaviour,
/// the variable involved in the comparison is returned and \p Threshold is set
/// to the constant.
///
/// \param BI        Branch to inspect; may be null.
/// \param LoopEntry Block that the false edge of \p BI must target.
/// \param Threshold [out] Zero-extended value of the comparison constant.
///                  Written only on a successful match.
/// \returns The compared variable (operand 0 of the icmp) on a match, or
///          nullptr otherwise. Constants that do not fit in 64 bits are
///          treated as a non-match.
static Value *matchShiftULTCondition(BranchInst *BI, BasicBlock *LoopEntry,
                                     uint64_t &Threshold) {
  if (!BI || !BI->isConditional())
    return nullptr;

  auto *Cond = dyn_cast<ICmpInst>(BI->getCondition());
  if (!Cond || Cond->getPredicate() != ICmpInst::ICMP_ULT)
    return nullptr;

  auto *CmpConst = dyn_cast<ConstantInt>(Cond->getOperand(1));
  if (!CmpConst)
    return nullptr;

  // ConstantInt::getZExtValue() asserts when the value needs more than 64
  // bits (e.g. a large constant in an i128 compare). Reject such constants
  // up front instead of crashing; they cannot be reported through a uint64_t
  // threshold anyway.
  if (CmpConst->getValue().getActiveBits() > 64)
    return nullptr;

  // The target block must be entered when the comparison is false.
  if (BI->getSuccessor(1) != LoopEntry)
    return nullptr;

  Threshold = CmpConst->getZExtValue();
  return Cond->getOperand(0);
}

// Check if the recurrence variable `VarX` is in the right form to create
// the idiom. Returns the value coerced to a PHINode if so.
static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
Expand All @@ -1528,6 +1564,107 @@ static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
return nullptr;
}

/// Return true if the idiom is detected in the loop.
///
/// Additionally:
/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
/// or nullptr if there is no such.
/// 2) \p CntPhi is set to the corresponding phi node
/// or nullptr if there is no such.
/// 3) \p InitX is set to the value whose CTLZ could be used.
/// 4) \p DefX is set to the shift instruction computing x.next (x >> 1).
/// 5) \p Threshold is set to the constant involved in the unsigned less-than
/// comparison.
///
/// The core idiom we are trying to detect is:
/// \code
/// if (x0 < 2)
/// goto loop-exit // the precondition of the loop
/// cnt0 = init-val
/// do {
/// x = phi (x0, x.next); //PhiX
/// cnt = phi (cnt0, cnt.next)
///
/// cnt.next = cnt + 1;
/// ...
/// x.next = x >> 1; // DefX
/// } while (x >= 4)
/// loop-exit:
/// \endcode
static bool detectShiftUntilLessThanIdiom(Loop *CurLoop, const DataLayout &DL,
Intrinsic::ID &IntrinID,
Value *&InitX, Instruction *&CntInst,
PHINode *&CntPhi, Instruction *&DefX,
uint64_t &Threshold) {
BasicBlock *LoopEntry;

DefX = nullptr;
CntInst = nullptr;
CntPhi = nullptr;
LoopEntry = *(CurLoop->block_begin());

// step 1: Check if the loop-back branch is in desirable form.
if (Value *T = matchShiftULTCondition(
dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry,
Threshold))
DefX = dyn_cast<Instruction>(T);
else
return false;

// step 2: Check the recurrence of variable X
if (!DefX || !isa<PHINode>(DefX))
return false;

PHINode *VarPhi = cast<PHINode>(DefX);
int Idx = VarPhi->getBasicBlockIndex(LoopEntry);
if (Idx == -1)
return false;

DefX = dyn_cast<Instruction>(VarPhi->getIncomingValue(Idx));
if (!DefX || DefX->getNumOperands() == 0 || DefX->getOperand(0) != VarPhi)
return false;

// step 3: detect instructions corresponding to "x.next = x >> 1"
if (DefX->getOpcode() != Instruction::LShr)
return false;

IntrinID = Intrinsic::ctlz;
ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
if (!Shft || !Shft->isOne())
return false;

InitX = VarPhi->getIncomingValueForBlock(CurLoop->getLoopPreheader());

// step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
// or cnt.next = cnt + -1.
// TODO: We can skip the step. If loop trip count is known (CTLZ),
// then all uses of "cnt.next" could be optimized to the trip count
// plus "cnt0". Currently it is not optimized.
// This step could be used to detect POPCNT instruction:
// cnt.next = cnt + (x.next & 1)
for (Instruction &Inst : llvm::make_range(
LoopEntry->getFirstNonPHI()->getIterator(), LoopEntry->end())) {
if (Inst.getOpcode() != Instruction::Add)
continue;

ConstantInt *Inc = dyn_cast<ConstantInt>(Inst.getOperand(1));
if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have tests for the decrement case, i.e. Inc = -1? I think we should either try to write tests for it and make sure the compiler does something sensible, or I'm also happy for now if we remove support for decrements.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point - it seems to me we might as well (as the code for handling the decrement case already exists in transformLoopToCountable), so I've added some tests for this case.

continue;

PHINode *Phi = getRecurrenceVar(Inst.getOperand(0), &Inst, LoopEntry);
if (!Phi)
continue;

CntInst = &Inst;
CntPhi = Phi;
break;
}
if (!CntInst)
return false;

return true;
}

/// Return true iff the idiom is detected in the loop.
///
/// Additionally:
Expand Down Expand Up @@ -1756,27 +1893,35 @@ static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
return true;
}

/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
/// trip count returns true; otherwise, returns false.
bool LoopIdiomRecognize::recognizeAndInsertFFS() {
// Give up if the loop has multiple blocks or multiple backedges.
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
return false;
// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
// profitable if we delete the loop.
bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
Value *InitX, bool ZeroCheck,
size_t CanonicalSize) {
const Value *Args[] = {InitX,
ConstantInt::getBool(InitX->getContext(), ZeroCheck)};

Intrinsic::ID IntrinID;
Value *InitX;
Instruction *DefX = nullptr;
PHINode *CntPhi = nullptr;
Instruction *CntInst = nullptr;
// Help decide if transformation is profitable. For ShiftUntilZero idiom,
// this is always 6.
size_t IdiomCanonicalSize = 6;
// @llvm.dbg doesn't count as they have no semantic effect.
auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
uint32_t HeaderSize =
std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());

if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
CntInst, CntPhi, DefX))
IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
InstructionCost Cost = TTI->getIntrinsicInstrCost(
Attrs, TargetTransformInfo::TCK_SizeAndLatency);
if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
return false;

return true;
}

/// Convert CTLZ / CTTZ idiom loop into countable loop.
/// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise,
/// returns false.
bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
Value *InitX, Instruction *DefX,
PHINode *CntPhi,
Instruction *CntInst) {
bool IsCntPhiUsedOutsideLoop = false;
for (User *U : CntPhi->users())
if (!CurLoop->contains(cast<Instruction>(U))) {
Expand Down Expand Up @@ -1818,35 +1963,107 @@ bool LoopIdiomRecognize::recognizeAndInsertFFS() {
ZeroCheck = true;
}

// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
// profitable if we delete the loop.

// the loop has only 6 instructions:
// FFS idiom loop has only 6 instructions:
// %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
// %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
// %shr = ashr %n.addr.0, 1
// %tobool = icmp eq %shr, 0
// %inc = add nsw %i.0, 1
// br i1 %tobool
size_t IdiomCanonicalSize = 6;
if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
return false;

const Value *Args[] = {InitX,
ConstantInt::getBool(InitX->getContext(), ZeroCheck)};
transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
DefX->getDebugLoc(), ZeroCheck,
IsCntPhiUsedOutsideLoop);
return true;
}

// @llvm.dbg doesn't count as they have no semantic effect.
auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
uint32_t HeaderSize =
std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
/// trip count returns true; otherwise, returns false.
bool LoopIdiomRecognize::recognizeAndInsertFFS() {
// Give up if the loop has multiple blocks or multiple backedges.
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
return false;

IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
InstructionCost Cost =
TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
if (HeaderSize != IdiomCanonicalSize &&
Cost > TargetTransformInfo::TCC_Basic)
Intrinsic::ID IntrinID;
Value *InitX;
Instruction *DefX = nullptr;
PHINode *CntPhi = nullptr;
Instruction *CntInst = nullptr;

if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX, CntInst, CntPhi,
DefX))
return false;

return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
}

/// Recognize the 'shift until less-than' idiom in a single-block loop and, if
/// profitable, convert the loop to a countable one whose trip count is derived
/// from CTLZ. A loop threshold of 2 is handed to the regular FFS path; a loop
/// threshold of 4 guarded by a pre-loop `x u< 2` check is handled here as the
/// floor-log2 form. Returns true if the loop was transformed; otherwise,
/// returns false.
bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
// Give up if the loop has multiple blocks or multiple backedges.
if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
return false;

// Outputs filled in by the detector below.
Intrinsic::ID IntrinID;
Value *InitX;
Instruction *DefX = nullptr;
PHINode *CntPhi = nullptr;
Instruction *CntInst = nullptr;

// Constant operand of the loop's `icmp ult` exit test.
uint64_t LoopThreshold;
if (!detectShiftUntilLessThanIdiom(CurLoop, *DL, IntrinID, InitX, CntInst,
CntPhi, DefX, LoopThreshold))
return false;

if (LoopThreshold == 2) {
// Treat as regular FFS.
return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
}

// Look for Floor Log2 Idiom.
// Only a loop threshold of 4 is supported beyond this point.
if (LoopThreshold != 4)
return false;

// Abort if CntPhi is used outside of the loop. This is what allows the
// transform below to be called with IsCntPhiUsedOutsideLoop=false.
for (User *U : CntPhi->users())
if (!CurLoop->contains(cast<Instruction>(U)))
return false;

// It is safe to assume Preheader exist as it was checked in
// parent function RunOnLoop.
BasicBlock *PH = CurLoop->getLoopPreheader();
auto *PreCondBB = PH->getSinglePredecessor();
if (!PreCondBB)
return false;
auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
if (!PreCondBI)
return false;

// The block before the preheader must branch on `InitX u< 2`, with the false
// edge (InitX >= 2) leading into the preheader.
uint64_t PreLoopThreshold;
if (matchShiftULTCondition(PreCondBI, PH, PreLoopThreshold) != InitX ||
PreLoopThreshold != 2)
return false;

// Forwarded as the boolean second argument of the intrinsic.
// NOTE(review): presumably valid because the precondition matched above
// guarantees InitX >= 2, i.e. non-zero, on entry to the loop - confirm.
bool ZeroCheck = true;

// The canonical idiom loop has only 6 instructions:
// %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
// %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
// %shr = lshr %n.addr.0, 1
// %tobool = icmp ult %n.addr.0, C
// %inc = add nsw %i.0, 1
// br i1 %tobool
size_t IdiomCanonicalSize = 6;
if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, IdiomCanonicalSize))
return false;

// log2(x) = w - 1 - clz(x)
// InsertSub=true requests the extra "- 1" on top of the regular count.
transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
DefX->getDebugLoc(), ZeroCheck,
// NOTE(review): the next line looks like the final argument of the
// pre-change call left over from the diff view - confirm before building.
IsCntPhiUsedOutsideLoop);
/*IsCntPhiUsedOutsideLoop=*/false,
/*InsertSub=*/true);
return true;
}

Expand Down Expand Up @@ -1961,7 +2178,7 @@ static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
void LoopIdiomRecognize::transformLoopToCountable(
Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());

// Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
Expand Down Expand Up @@ -1991,6 +2208,8 @@ void LoopIdiomRecognize::transformLoopToCountable(
Type *CountTy = Count->getType();
Count = Builder.CreateSub(
ConstantInt::get(CountTy, CountTy->getIntegerBitWidth()), Count);
if (InsertSub)
Count = Builder.CreateSub(Count, ConstantInt::get(CountTy, 1));
Value *NewCount = Count;
if (IsCntPhiUsedOutsideLoop)
Count = Builder.CreateAdd(Count, ConstantInt::get(CountTy, 1));
Expand Down
Loading
Loading