Skip to content

Commit 1146fe6

Browse files
traoux1gfxbot
authored andcommitted
Prevent loop splitting in the LLVM LoopSimplification pass. When
barriers are present and the LLVM pass decides to split a loop containing a barrier, we may end up with divergent barriers, causing functional issues. In terms of performance, it can also cause low SIMD utilization. Add a pass to canonicalize loops in our preferred way before running the loop passes. LLVM shouldn't break loops containing convergent instructions; the simplification pass should be fixed to prevent this kind of issue. I'll try to prepare a patch to see if this can be prevented. Change-Id: I2881aab5e85ea53763b5b788260fc4c523c2055c
1 parent f062af3 commit 1146fe6

File tree

3 files changed

+257
-1
lines changed

3 files changed

+257
-1
lines changed

IGC/Compiler/CISACodeGen/ShaderCodeGen.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,7 @@ inline void AddLegalizationPasses(CodeGenContext &ctx, const CShaderProgram::Ker
370370

371371
if (ctx.m_threadCombiningOptDone)
372372
{
373+
mpm.add(createLoopCanonicalization());
373374
mpm.add(llvm::createLoopDeletionPass());
374375
mpm.add(llvm::createBreakCriticalEdgesPass());
375376
mpm.add(llvm::createLoopRotatePass(LOOP_ROTATION_HEADER_INST_THRESHOLD));
@@ -450,6 +451,7 @@ inline void AddLegalizationPasses(CodeGenContext &ctx, const CShaderProgram::Ker
450451
if(ctx.m_instrTypes.hasLoop)
451452
{
452453
// need to run loop simplify to canonicalize loop and merge latches
454+
mpm.add(createLoopCanonicalization());
453455
mpm.add(createLoopSimplifyPass());
454456
}
455457
if (ctx.m_enableSubroutine)
@@ -1164,6 +1166,7 @@ void OptimizeIR(CodeGenContext* pContext)
11641166
if( pContext->m_instrTypes.hasLoop )
11651167
{
11661168
mpm.add(createLoopDeadCodeEliminationPass());
1169+
mpm.add(createLoopCanonicalization());
11671170
mpm.add(llvm::createLoopDeletionPass());
11681171
mpm.add(llvm::createBreakCriticalEdgesPass());
11691172
mpm.add(llvm::createLoopRotatePass(LOOP_ROTATION_HEADER_INST_THRESHOLD));

IGC/Compiler/CustomLoopOpt.cpp

Lines changed: 247 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -675,3 +675,250 @@ void CustomLoopVersioning::addPhiNodes(
675675
phi->addIncoming(Inst, origLoop->getExitingBlock());
676676
}
677677
}
678+
679+
680+
// This pass is mostly forked from LoopSimplification pass
681+
class LoopCanonicalization : public llvm::FunctionPass
682+
{
683+
public:
684+
static char ID;
685+
686+
LoopCanonicalization();
687+
688+
void getAnalysisUsage(llvm::AnalysisUsage& AU) const
689+
{
690+
AU.addRequired<llvm::LoopInfoWrapperPass>();
691+
AU.addRequired<llvm::DominatorTreeWrapperPass>();
692+
AU.addRequiredID(llvm::LCSSAID);
693+
AU.addPreservedID(LCSSAID);
694+
}
695+
696+
bool runOnFunction(Function& F);
697+
bool processLoop(Loop* loop, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA);
698+
bool processOneLoop(Loop* loop, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA);
699+
700+
701+
llvm::StringRef getPassName() const
702+
{
703+
return "IGC loop canonicalization";
704+
}
705+
706+
private:
707+
CodeGenContext * m_cgCtx;
708+
llvm::LoopInfo* m_LI;
709+
llvm::DominatorTree* m_DT;
710+
llvm::Function* m_function;
711+
};
712+
// Pass registration boilerplate for LoopCanonicalization.
// The PASS_* helper macros are first undefined (so that any earlier definition
// in this file does not leak in) and then redefined for this pass.
// The IGC_INITIALIZE_PASS_* macros list the analyses this pass depends on:
// loop info, the dominator tree and LCSSA.
// NOTE(review): these macros presumably generate the
// initializeLoopCanonicalizationPass() function called by the constructor —
// confirm against the macro definitions.
#undef PASS_FLAG
713+
#undef PASS_DESC
714+
#undef PASS_CFG_ONLY
715+
#undef PASS_ANALYSIS
716+
#define PASS_FLAG "igc-loop-canonicalization"
717+
#define PASS_DESC "IGC Loop canonicalization"
718+
#define PASS_CFG_ONLY false
719+
#define PASS_ANALYSIS false
720+
IGC_INITIALIZE_PASS_BEGIN(LoopCanonicalization, PASS_FLAG, PASS_DESC, PASS_CFG_ONLY, PASS_ANALYSIS)
721+
IGC_INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass);
722+
IGC_INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
723+
IGC_INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
724+
IGC_INITIALIZE_PASS_END(LoopCanonicalization, PASS_FLAG, PASS_DESC, PASS_CFG_ONLY, PASS_ANALYSIS)
725+
726+
727+
// Definition of the pass's static ID member; it is passed to the FunctionPass
// base-class constructor below.
// NOTE(review): LLVM conventionally identifies a pass by the address of this
// variable rather than by its value — confirm.
char LoopCanonicalization::ID = 0;
728+
729+
// Constructs the pass and runs its initialization routine against the global
// pass registry.
LoopCanonicalization::LoopCanonicalization() : FunctionPass(ID)
730+
{
731+
initializeLoopCanonicalizationPass(*PassRegistry::getPassRegistry());
732+
}
733+
/// \brief This method is called when the specified loop has more than one
734+
/// backedge in it.
735+
///
736+
/// If this occurs, revector all of these backedges to target a new basic block
737+
/// and have that block branch to the loop header. This ensures that loops
738+
/// have exactly one backedge.
739+
static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
740+
DominatorTree *DT, LoopInfo *LI) {
741+
assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!");
742+
743+
// Get information about the loop
744+
BasicBlock *Header = L->getHeader();
745+
Function *F = Header->getParent();
746+
747+
// Unique backedge insertion currently depends on having a preheader.
748+
if(!Preheader)
749+
return nullptr;
750+
751+
// The header is not an EH pad; preheader insertion should ensure this.
752+
assert(!Header->isEHPad() && "Can't insert backedge to EH pad");
753+
754+
// Figure out which basic blocks contain back-edges to the loop header.
755+
std::vector<BasicBlock*> BackedgeBlocks;
756+
for(pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I) {
757+
BasicBlock *P = *I;
758+
759+
// Indirectbr edges cannot be split, so we must fail if we find one.
760+
if(isa<IndirectBrInst>(P->getTerminator()))
761+
return nullptr;
762+
763+
if(P != Preheader) BackedgeBlocks.push_back(P);
764+
}
765+
766+
// Create and insert the new backedge block...
767+
BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(),
768+
Header->getName() + ".backedge", F);
769+
BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
770+
BETerminator->setDebugLoc(Header->getFirstNonPHI()->getDebugLoc());
771+
772+
// Move the new backedge block to right after the last backedge block.
773+
Function::iterator InsertPos = ++BackedgeBlocks.back()->getIterator();
774+
F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock);
775+
776+
// Now that the block has been inserted into the function, create PHI nodes in
777+
// the backedge block which correspond to any PHI nodes in the header block.
778+
for(BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
779+
PHINode *PN = cast<PHINode>(I);
780+
PHINode *NewPN = PHINode::Create(PN->getType(), BackedgeBlocks.size(),
781+
PN->getName() + ".be", BETerminator);
782+
783+
// Loop over the PHI node, moving all entries except the one for the
784+
// preheader over to the new PHI node.
785+
unsigned PreheaderIdx = ~0U;
786+
bool HasUniqueIncomingValue = true;
787+
Value *UniqueValue = nullptr;
788+
for(unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
789+
BasicBlock *IBB = PN->getIncomingBlock(i);
790+
Value *IV = PN->getIncomingValue(i);
791+
if(IBB == Preheader) {
792+
PreheaderIdx = i;
793+
}
794+
else {
795+
NewPN->addIncoming(IV, IBB);
796+
if(HasUniqueIncomingValue) {
797+
if(!UniqueValue)
798+
UniqueValue = IV;
799+
else if(UniqueValue != IV)
800+
HasUniqueIncomingValue = false;
801+
}
802+
}
803+
}
804+
805+
// Delete all of the incoming values from the old PN except the preheader's
806+
assert(PreheaderIdx != ~0U && "PHI has no preheader entry??");
807+
if(PreheaderIdx != 0) {
808+
PN->setIncomingValue(0, PN->getIncomingValue(PreheaderIdx));
809+
PN->setIncomingBlock(0, PN->getIncomingBlock(PreheaderIdx));
810+
}
811+
// Nuke all entries except the zero'th.
812+
for(unsigned i = 0, e = PN->getNumIncomingValues() - 1; i != e; ++i)
813+
PN->removeIncomingValue(e - i, false);
814+
815+
// Finally, add the newly constructed PHI node as the entry for the BEBlock.
816+
PN->addIncoming(NewPN, BEBlock);
817+
818+
// As an optimization, if all incoming values in the new PhiNode (which is a
819+
// subset of the incoming values of the old PHI node) have the same value,
820+
// eliminate the PHI Node.
821+
if(HasUniqueIncomingValue) {
822+
NewPN->replaceAllUsesWith(UniqueValue);
823+
BEBlock->getInstList().erase(NewPN);
824+
}
825+
}
826+
827+
// Now that all of the PHI nodes have been inserted and adjusted, modify the
828+
// backedge blocks to jump to the BEBlock instead of the header.
829+
// If one of the backedges has llvm.loop metadata attached, we remove
830+
// it from the backedge and add it to BEBlock.
831+
unsigned LoopMDKind = BEBlock->getContext().getMDKindID("llvm.loop");
832+
MDNode *LoopMD = nullptr;
833+
for(unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) {
834+
TerminatorInst *TI = BackedgeBlocks[i]->getTerminator();
835+
if(!LoopMD)
836+
LoopMD = TI->getMetadata(LoopMDKind);
837+
TI->setMetadata(LoopMDKind, nullptr);
838+
for(unsigned Op = 0, e = TI->getNumSuccessors(); Op != e; ++Op)
839+
if(TI->getSuccessor(Op) == Header)
840+
TI->setSuccessor(Op, BEBlock);
841+
}
842+
BEBlock->getTerminator()->setMetadata(LoopMDKind, LoopMD);
843+
844+
//===--- Update all analyses which we must preserve now -----------------===//
845+
846+
// Update Loop Information - we know that this block is now in the current
847+
// loop and all parent loops.
848+
L->addBasicBlockToLoop(BEBlock, *LI);
849+
850+
// Update dominator information
851+
DT->splitBlock(BEBlock);
852+
853+
return BEBlock;
854+
}
855+
856+
// Entry point of the pass: fetches the loop and dominator analyses and
// canonicalizes each top-level loop nest of the function.
// Returns true if any loop was modified.
bool LoopCanonicalization::runOnFunction(llvm::Function& F)
{
    LoopInfo *loopInfo = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
    DominatorTree *domTree = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
    const bool keepLCSSA = mustPreserveAnalysisID(LCSSAID);

    // Visit every top-level loop; nested loops are handled by processLoop.
    bool anyLoopChanged = false;
    for(Loop *topLevelLoop : *loopInfo)
    {
        if(processLoop(topLevelLoop, domTree, loopInfo, keepLCSSA))
            anyLoopChanged = true;
    }
    return anyLoopChanged;
}
868+
869+
// Canonicalizes the loop nest rooted at L. Returns true if any loop in the
// nest was modified.
bool LoopCanonicalization::processLoop(llvm::Loop* L, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA)
{
    // Flatten the loop tree into a list, parents before their sub-loops.
    // The list grows while it is being scanned, hence the index-based loop.
    SmallVector<Loop *, 4> pending;
    pending.push_back(L);
    for(size_t next = 0; next < pending.size(); ++next)
    {
        Loop *current = pending[next];
        pending.append(current->begin(), current->end());
    }

    // Consume the list from the back so that sub-loops are processed before
    // the loops that contain them.
    bool anyChange = false;
    while(!pending.empty())
    {
        Loop *candidate = pending.pop_back_val();
        if(processOneLoop(candidate, DT, LI, PreserveLCSSA))
            anyChange = true;
    }
    return anyChange;
}
888+
889+
// Do basic loop canonicalization to ensure correctness. We need a single header and single latch
890+
bool LoopCanonicalization::processOneLoop(Loop* L, DominatorTree *DT, LoopInfo *LI, bool PreserveLCSSA)
891+
{
892+
bool changed = false;
893+
// Does the loop already have a preheader? If so, don't insert one.
894+
BasicBlock *Preheader = L->getLoopPreheader();
895+
if(!Preheader) {
896+
Preheader = InsertPreheaderForLoop(L, DT, LI, PreserveLCSSA);
897+
if(Preheader) {
898+
changed = true;
899+
}
900+
}
901+
902+
// If the header has more than two predecessors at this point (from the
903+
// preheader and from multiple backedges), we must adjust the loop.
904+
BasicBlock *LoopLatch = L->getLoopLatch();
905+
if(!LoopLatch) {
906+
// If we either couldn't, or didn't want to, identify nesting of the loops,
907+
// insert a new block that all backedges target, then make it jump to the
908+
// loop header.
909+
LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI);
910+
if(LoopLatch) {
911+
changed = true;
912+
}
913+
}
914+
return changed;
915+
}
916+
917+
// Factory for the LoopCanonicalization pass: returns a newly allocated pass
// instance. The call sites visible in ShaderCodeGen.cpp hand the result
// directly to mpm.add(...).
// NOTE(review): ownership presumably transfers to the pass manager — confirm.
namespace IGC
918+
{
919+
FunctionPass* createLoopCanonicalization()
920+
{
921+
return new LoopCanonicalization();
922+
}
923+
}
924+

IGC/Compiler/CustomLoopOpt.hpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,13 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
3939

4040
namespace IGC
4141
{
42-
42+
///////////////////////////////////////////////////////////////////////////
43+
/// Enforce a single latch for every loop header. This needs to be run before
44+
/// the LLVM loop canonicalization pass, as the LLVM loop simplification pass sometimes
45+
/// decides to split the loop. Splitting the loop may cause functional issues
46+
/// when barriers are used, and it may cause extra SIMD divergence, causing
47+
/// performance degradation.
48+
llvm::FunctionPass* createLoopCanonicalization();
4349
/**
4450
* Custom loop versioning.
4551
* Break loop into segments to expose loop invariants.

0 commit comments

Comments
 (0)