Commit 5efb955

[AMDGPU] Add IR LiveReg type-based optimization
Change-Id: Ide8a46cdaf1d2d82cbd5296c998a5c8fd41fce80
1 parent 678f19f commit 5efb955

3 files changed: +546 additions, -1802 deletions

llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Lines changed: 343 additions & 0 deletions
@@ -107,6 +107,7 @@ class AMDGPUCodeGenPrepareImpl
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
   bool HasUnsafeFPMath = false;
+  bool UsesGlobalISel = false;
   bool HasFP32DenormalFlush = false;
   bool FlowChanged = false;
   mutable Function *SqrtF32 = nullptr;
@@ -343,6 +344,85 @@ class AMDGPUCodeGenPrepare : public FunctionPass {
   StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
 };
 
+class LiveRegConversion {
+private:
+  // The instruction which defined the original virtual register used across
+  // blocks
+  Instruction *LiveRegDef;
+  // The original type
+  Type *OriginalType;
+  // The desired type
+  Type *NewType;
+  // The instruction sequence that converts the virtual register, to be used
+  // instead of the original
+  std::optional<Instruction *> Converted;
+  // The builder used to build the conversion instruction
+  IRBuilder<> ConvertBuilder;
+
+public:
+  // The instruction which defined the original virtual register used across
+  // blocks
+  Instruction *getLiveRegDef() { return LiveRegDef; }
+  // The original type
+  Type *getOriginalType() { return OriginalType; }
+  // The desired type
+  Type *getNewType() { return NewType; }
+  void setNewType(Type *NewType) { this->NewType = NewType; }
+  // The instruction that converts the virtual register, to be used instead of
+  // the original
+  std::optional<Instruction *> &getConverted() { return Converted; }
+  void setConverted(Instruction *Converted) { this->Converted = Converted; }
+  // The builder used to build the conversion instruction
+  IRBuilder<> &getConvertBuilder() { return ConvertBuilder; }
+  // Do we have an instruction sequence which converts the original virtual
+  // register?
+  bool hasConverted() { return Converted.has_value(); }
+
+  LiveRegConversion(Instruction *LiveRegDef, BasicBlock *InsertBlock,
+                    BasicBlock::iterator InsertPt)
+      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+        ConvertBuilder(InsertBlock, InsertPt) {}
+  LiveRegConversion(Instruction *LiveRegDef, Type *NewType,
+                    BasicBlock *InsertBlock, BasicBlock::iterator InsertPt)
+      : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+        NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
+};
+
+class LiveRegOptimizer {
+private:
+  Module *Mod = nullptr;
+  // The scalar type to convert to
+  Type *ConvertToScalar;
+  // Holds the collection of PHIs with their pending new operands
+  SmallVector<std::pair<Instruction *,
+                        SmallVector<std::pair<Instruction *, BasicBlock *>, 4>>,
+              4>
+      PHIUpdater;
+
+public:
+  // Should the def of the instruction be converted if it is live across blocks?
+  bool shouldReplaceUses(const Instruction &I);
+  // Convert the virtual register to a compatible vector of legal type
+  void convertToOptType(LiveRegConversion &LR);
+  // Convert the virtual register back to the original type, stripping away
+  // the MSBs in cases where there was an imperfect fit (e.g. v2i32 -> v7i8)
+  void convertFromOptType(LiveRegConversion &LR);
+  // Get a vector of the desired scalar type that is compatible with the
+  // original vector. In cases where there is no bitsize equivalent using a
+  // legal vector type, we pad the MSBs (e.g. v7i8 -> v2i32)
+  Type *getCompatibleType(Instruction *InstToConvert);
+  // Find and replace uses of the virtual register in different blocks with a
+  // newly produced virtual register of legal type
+  bool replaceUses(Instruction &I);
+  // Replace the collected PHIs with newly produced incoming values. Replacement
+  // is only done if we have a replacement for each original incoming value.
+  bool replacePHIs();
+
+  LiveRegOptimizer(Module *Mod) : Mod(Mod) {
+    ConvertToScalar = Type::getInt32Ty(Mod->getContext());
+  }
+};
+
 } // end anonymous namespace
 
 bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
@@ -371,9 +452,269 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
       }
     }
   }
+
+  // GlobalISel should directly use the values, and does not need to emit
+  // CopyTo/CopyFrom Regs across blocks
+  if (UsesGlobalISel)
+    return MadeChange;
+
+  // "Optimize" the virtual regs that cross basic block boundaries. In such
+  // cases, vectors of illegal types will be scalarized and widened, with each
+  // scalar living in its own physical register. The optimization converts the
+  // vectors to equivalent vectors of legal type (which are converted back
+  // before uses in subsequent blocks), to pack the bits into fewer physical
+  // registers (used in CopyToReg/CopyFromReg pairs).
+  LiveRegOptimizer LRO(Mod);
+  for (auto &BB : F) {
+    for (auto &I : BB) {
+      if (!LRO.shouldReplaceUses(I))
+        continue;
+      MadeChange |= LRO.replaceUses(I);
+    }
+  }
+
+  MadeChange |= LRO.replacePHIs();
+  return MadeChange;
+}
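For intuition about the packing the comment above describes: a <7 x i8> def that is live out of its block would occupy seven 32-bit registers once scalarized and widened, but only two when packed into i32 lanes. A minimal back-of-envelope sketch of that arithmetic, not part of the patch, assuming the one-register-per-widened-scalar model stated in the comment:

#include <cstdio>

// Registers needed to carry <7 x i8> across a block boundary:
// scalarized-and-widened vs. packed into i32 lanes.
int main() {
  unsigned Elts = 7, EltBits = 8, RegBits = 32;
  unsigned Scalarized = Elts;                                 // one reg per element
  unsigned Packed = (Elts * EltBits + RegBits - 1) / RegBits; // ceil(56/32) = 2
  std::printf("scalarized=%u packed=%u\n", Scalarized, Packed);
  return 0;
}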
+
+bool LiveRegOptimizer::replaceUses(Instruction &I) {
+  bool MadeChange = false;
+
+  struct ConvertUseInfo {
+    Instruction *Converted;
+    SmallVector<Instruction *, 4> Users;
+  };
+  DenseMap<BasicBlock *, ConvertUseInfo> UseConvertTracker;
+
+  LiveRegConversion FromLRC(
+      &I, I.getParent(),
+      static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
+  FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
+  for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
+    if (auto UserInst = dyn_cast<Instruction>(*IUser)) {
+      if (UserInst->getParent() != I.getParent()) {
+        LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
+                          << *FromLRC.getOriginalType()
+                          << " from previous block. Needs conversion\n");
+        convertToOptType(FromLRC);
+        if (!FromLRC.hasConverted())
+          continue;
+        // If it is a PHI node, just create and collect the new operand. We can
+        // only replace the PHI node once we have converted all the operands
+        if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
+          for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
+            auto IncVal = PhiInst->getIncomingValue(Idx);
+            if (&I == dyn_cast<Instruction>(IncVal)) {
+              auto IncBlock = PhiInst->getIncomingBlock(Idx);
+              auto PHIOps = find_if(
+                  PHIUpdater,
+                  [&UserInst](
+                      std::pair<Instruction *,
+                                SmallVector<
+                                    std::pair<Instruction *, BasicBlock *>, 4>>
+                          &Entry) { return Entry.first == UserInst; });
+
+              if (PHIOps == PHIUpdater.end())
+                PHIUpdater.push_back(
+                    {UserInst, {{*FromLRC.getConverted(), IncBlock}}});
+              else
+                PHIOps->second.push_back({*FromLRC.getConverted(), IncBlock});
+
+              break;
+            }
+          }
+          continue;
+        }
+
+        // Do not create multiple conversion sequences if there are multiple
+        // uses in the same block
+        if (UseConvertTracker.contains(UserInst->getParent())) {
+          UseConvertTracker[UserInst->getParent()].Users.push_back(UserInst);
+          LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
+          continue;
+        }
+
+        LiveRegConversion ToLRC(*FromLRC.getConverted(), I.getType(),
+                                UserInst->getParent(),
+                                static_cast<BasicBlock::iterator>(
+                                    UserInst->getParent()->getFirstNonPHIIt()));
+        convertFromOptType(ToLRC);
+        assert(ToLRC.hasConverted());
+        UseConvertTracker[UserInst->getParent()] = {*ToLRC.getConverted(),
+                                                    {UserInst}};
+      }
+    }
+  }
+
+  // Replace the uses in a separate loop that does not depend on the state of
+  // the uses
+  for (auto &Entry : UseConvertTracker) {
+    for (auto &UserInst : Entry.second.Users) {
+      LLVM_DEBUG(dbgs() << *UserInst
+                        << "\n\tNow uses: " << *Entry.second.Converted << "\n");
+      UserInst->replaceUsesOfWith(&I, Entry.second.Converted);
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+bool LiveRegOptimizer::replacePHIs() {
+  bool MadeChange = false;
+  for (auto Ele : PHIUpdater) {
+    auto ThePHINode = dyn_cast<PHINode>(Ele.first);
+    assert(ThePHINode);
+    auto NewPHINodeOps = Ele.second;
+    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHINode << "\n");
+    // If we have converted all the required operands, then do the replacement
+    if (ThePHINode->getNumIncomingValues() == NewPHINodeOps.size()) {
+      IRBuilder<> Builder(Ele.first);
+      auto NPHI = Builder.CreatePHI(NewPHINodeOps[0].first->getType(),
+                                    NewPHINodeOps.size());
+      for (auto IncVals : NewPHINodeOps) {
+        NPHI->addIncoming(IncVals.first, IncVals.second);
+        LLVM_DEBUG(dbgs() << " Using: " << *IncVals.first
+                          << " For: " << IncVals.second->getName() << "\n");
+      }
+      LLVM_DEBUG(dbgs() << "Successfully replaced with " << *NPHI << "\n");
+      LiveRegConversion ToLRC(NPHI, ThePHINode->getType(),
+                              ThePHINode->getParent(),
+                              static_cast<BasicBlock::iterator>(
+                                  ThePHINode->getParent()->getFirstNonPHIIt()));
+      convertFromOptType(ToLRC);
+      assert(ToLRC.hasConverted());
+      Ele.first->replaceAllUsesWith(*ToLRC.getConverted());
+      // The old PHI is no longer used
+      ThePHINode->eraseFromParent();
+      MadeChange = true;
+    }
+  }
   return MadeChange;
 }
 
+Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
+  auto OriginalType = InstToConvert->getType();
+  assert(OriginalType->getScalarSizeInBits() <=
+         ConvertToScalar->getScalarSizeInBits());
+  auto VTy = dyn_cast<VectorType>(OriginalType);
+  if (!VTy)
+    return ConvertToScalar;
+
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
+  auto ConvertEltCount =
+      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
+
+  return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
+                         llvm::ElementCount::getFixed(ConvertEltCount));
+}
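The element-count computation above is a ceiling division: the original bit count is rounded up to whole conversion scalars. A standalone sketch of the same arithmetic, with compatibleEltCount as a hypothetical stand-in, assuming the i32 conversion scalar set in the LiveRegOptimizer constructor:

#include <cassert>

// Mirrors getCompatibleType's element-count arithmetic (ceiling division).
static unsigned compatibleEltCount(unsigned ScalarBits, unsigned NumElts,
                                   unsigned ConvertBits = 32) {
  return (ScalarBits * NumElts + ConvertBits - 1) / ConvertBits;
}

int main() {
  assert(compatibleEltCount(8, 7) == 2); // v7i8 (56 bits) -> v2i32, 8 pad bits
  assert(compatibleEltCount(8, 4) == 1); // v4i8 (32 bits) -> v1i32, exact fit
  assert(compatibleEltCount(8, 3) == 1); // v3i8 (24 bits) -> v1i32, 8 pad bits
  return 0;
}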
+
+void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
+  if (LR.hasConverted()) {
+    LLVM_DEBUG(dbgs() << "\tAlready has converted def\n");
+    return;
+  }
+
+  auto VTy = dyn_cast<VectorType>(LR.getOriginalType());
+  assert(VTy);
+  auto NewVTy = dyn_cast<VectorType>(LR.getNewType());
+  assert(NewVTy);
+
+  auto V = static_cast<Value *>(LR.getLiveRegDef());
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto NewSize =
+      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+
+  auto &Builder = LR.getConvertBuilder();
+
+  // If there is a bitsize match, we can fit the old vector into a new vector of
+  // the desired type
+  if (OriginalSize == NewSize) {
+    LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+    LLVM_DEBUG(dbgs() << "\tConverted def to "
+                      << *(*LR.getConverted())->getType() << "\n");
+    return;
+  }
+
+  // If there is a bitsize mismatch, we must use a wider vector
+  assert(NewSize > OriginalSize);
+  auto ExpandedVecElementCount =
+      llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
+
+  SmallVector<int, 8> ShuffleMask;
+  for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
+    ShuffleMask.push_back(I);
+
+  for (uint64_t I = VTy->getElementCount().getFixedValue();
+       I < ExpandedVecElementCount.getFixedValue(); I++)
+    ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
+
+  auto ExpandedVec =
+      dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
+  LR.setConverted(
+      dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewVTy)));
+  LLVM_DEBUG(dbgs() << "\tConverted def to " << *(*LR.getConverted())->getType()
+                    << "\n");
+}
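In the mismatch path above, the shuffle mask passes every original lane through and pads the remainder with the index one past the original vector, which selects a don't-care lane from the shuffle's implicit second operand. A standalone sketch of the mask for the running v7i8 example (<7 x i8> widened to <8 x i8> before the bitcast to <2 x i32>); the driver is hypothetical:

#include <cstdio>
#include <vector>

// Widening shuffle mask for <7 x i8> -> <8 x i8>, as built above.
int main() {
  unsigned OrigElts = 7, ExpandedElts = 8;
  std::vector<int> Mask;
  for (unsigned I = 0; I < OrigElts; I++)
    Mask.push_back(I);        // original lanes 0..6 pass through
  for (unsigned I = OrigElts; I < ExpandedElts; I++)
    Mask.push_back(OrigElts); // pad lane: index 7, a don't-care lane
  for (int M : Mask)
    std::printf("%d ", M);    // prints: 0 1 2 3 4 5 6 7
  std::printf("\n");
  return 0;
}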
+
+void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
+  auto VTy = dyn_cast<VectorType>(LRC.getOriginalType());
+  assert(VTy);
+  auto NewVTy = dyn_cast<VectorType>(LRC.getNewType());
+  assert(NewVTy);
+
+  auto V = static_cast<Value *>(LRC.getLiveRegDef());
+  auto OriginalSize =
+      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+  auto NewSize =
+      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+
+  auto &Builder = LRC.getConvertBuilder();
+
+  // If there is a bitsize match, we simply convert back to the original type
+  if (OriginalSize == NewSize) {
+    LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
+                      << "\n");
+    return;
+  }
+
+  // If there is a bitsize mismatch, we have used a wider vector and must strip
+  // the MSBs to convert back to the original type
+  assert(OriginalSize > NewSize);
+  auto ExpandedVecElementCount = llvm::ElementCount::getFixed(
+      OriginalSize / NewVTy->getScalarSizeInBits());
+  auto ExpandedVT = VectorType::get(
+      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
+      ExpandedVecElementCount);
+  auto Converted = dyn_cast<Instruction>(
+      Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
+
+  auto NarrowElementCount = NewVTy->getElementCount().getFixedValue();
+  SmallVector<int, 8> ShuffleMask;
+  for (uint64_t I = 0; I < NarrowElementCount; I++)
+    ShuffleMask.push_back(I);
+
+  auto NarrowVec = dyn_cast<Instruction>(
+      Builder.CreateShuffleVector(Converted, ShuffleMask));
+  LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
+  LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
+}
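This is the same v7i8 example run backwards: the value arrives as <2 x i32>, is bitcast to <8 x i8>, and a shuffle keeping lanes 0..6 strips the single padded MSB lane. A small sketch of the size bookkeeping, with hypothetical names:

#include <cassert>

// Round-trip bookkeeping in convertFromOptType for <2 x i32> -> <7 x i8>.
int main() {
  unsigned OriginalSize = 2 * 32; // bits in the wide value crossing the edge
  unsigned NewScalarBits = 8;     // i8 scalar of the type the user expects
  unsigned NewElts = 7;           // <7 x i8>
  unsigned ExpandedElts = OriginalSize / NewScalarBits; // bitcast to <8 x i8>
  assert(ExpandedElts == 8);
  assert(ExpandedElts - NewElts == 1); // one padded MSB lane to strip
  return 0;
}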
+
+bool LiveRegOptimizer::shouldReplaceUses(const Instruction &I) {
+  // Vectors of illegal types are copied across blocks in an inefficient
+  // manner: they are scalarized and widened to legal scalars. In such cases,
+  // we can do better by using legal vector types
+  auto IType = I.getType();
+  return IType->isVectorTy() && IType->getScalarSizeInBits() < 16 &&
+         !IType->getScalarType()->isPointerTy();
+}
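The predicate reduces to three checks: vector type, scalar narrower than 16 bits, and no pointer scalars. A hypothetical mirror of the condition over those three properties, to make the candidate set concrete:

#include <cassert>

// Mirrors the shouldReplaceUses condition over (isVector, scalarBits, ptrScalar).
static bool qualifies(bool IsVector, unsigned ScalarBits, bool PtrScalar) {
  return IsVector && ScalarBits < 16 && !PtrScalar;
}

int main() {
  assert(qualifies(true, 8, false));   // <7 x i8>: candidate
  assert(!qualifies(true, 16, false)); // <4 x i16>: 16 is not < 16
  assert(!qualifies(true, 64, true));  // vector of pointers: excluded
  assert(!qualifies(false, 8, false)); // scalar i8: not a vector
  return 0;
}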
+
 unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
 
@@ -2275,6 +2616,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
   Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
   Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+  Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
   auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
   Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
@@ -2297,6 +2639,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
   Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
   Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
   SIModeRegisterDefaults Mode(F, *Impl.ST);
+  Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
   Impl.HasFP32DenormalFlush =
       Mode.FP32Denormals == DenormalMode::getPreserveSign();
   PreservedAnalyses PA = PreservedAnalyses::none();
