@@ -107,6 +107,7 @@ class AMDGPUCodeGenPrepareImpl
107
107
Module *Mod = nullptr ;
108
108
const DataLayout *DL = nullptr ;
109
109
bool HasUnsafeFPMath = false ;
110
+ bool UsesGlobalISel = false ;
110
111
bool HasFP32DenormalFlush = false ;
111
112
bool FlowChanged = false ;
112
113
mutable Function *SqrtF32 = nullptr ;
@@ -343,6 +344,85 @@ class AMDGPUCodeGenPrepare : public FunctionPass {
343
344
StringRef getPassName () const override { return " AMDGPU IR optimizations" ; }
344
345
};
345
346
347
+ class LiveRegConversion {
348
+ private:
349
+ // The instruction which defined the original virtual register used across
350
+ // blocks
351
+ Instruction *LiveRegDef;
352
+ // The original type
353
+ Type *OriginalType;
354
+ // The desired type
355
+ Type *NewType;
356
+ // The instruction sequence that converts the virtual register, to be used
357
+ // instead of the original
358
+ std::optional<Instruction *> Converted;
359
+ // The builder used to build the conversion instruction
360
+ IRBuilder<> ConvertBuilder;
361
+
362
+ public:
363
+ // The instruction which defined the original virtual register used across
364
+ // blocks
365
+ Instruction *getLiveRegDef () { return LiveRegDef; }
366
+ // The original type
367
+ Type *getOriginalType () { return OriginalType; }
368
+ // The desired type
369
+ Type *getNewType () { return NewType; }
370
+ void setNewType (Type *NewType) { this ->NewType = NewType; }
371
+ // The instruction that conerts the virtual register, to be used instead of
372
+ // the original
373
+ std::optional<Instruction *> &getConverted () { return Converted; }
374
+ void setConverted (Instruction *Converted) { this ->Converted = Converted; }
375
+ // The builder used to build the conversion instruction
376
+ IRBuilder<> &getConverBuilder () { return ConvertBuilder; }
377
+ // Do we have a instruction sequence which convert the original virtual
378
+ // register
379
+ bool hasConverted () { return Converted.has_value (); }
380
+
381
+ LiveRegConversion (Instruction *LiveRegDef, BasicBlock *InsertBlock,
382
+ BasicBlock::iterator InsertPt)
383
+ : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType ()),
384
+ ConvertBuilder(InsertBlock, InsertPt) {}
385
+ LiveRegConversion (Instruction *LiveRegDef, Type *NewType,
386
+ BasicBlock *InsertBlock, BasicBlock::iterator InsertPt)
387
+ : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType ()),
388
+ NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
389
+ };
390
+
391
// Driver that rewrites cross-block uses of defs with illegal vector types.
// Such vectors are scalarized and widened when copied between blocks; this
// class instead packs them into bitwise-equivalent vectors of a legal scalar
// type (i32, chosen in the constructor) so fewer physical registers are used
// for the CopyToReg/CopyFromReg pairs, and unpacks them again before their
// uses in subsequent blocks.
class LiveRegOptimizer {
private:
  Module *Mod = nullptr;
  // The scalar type to convert to (set to i32 in the constructor).
  Type *ConvertToScalar;
  // Holds the collection of PHIs with their pending new operands: for each
  // PHI-node user, the (converted incoming value, incoming block) pairs
  // accumulated so far by replaceUses(); consumed by replacePHIs().
  SmallVector<std::pair<Instruction *,
                        SmallVector<std::pair<Instruction *, BasicBlock *>, 4>>,
              4>
      PHIUpdater;

public:
  // Should the def of the instruction be converted if it is live across blocks
  bool shouldReplaceUses(const Instruction &I);
  // Convert the virtual register to the compatible vector of legal type
  void convertToOptType(LiveRegConversion &LR);
  // Convert the virtual register back to the original type, stripping away
  // the MSBs in cases where there was an imperfect fit (e.g. v2i32 -> v7i8)
  void convertFromOptType(LiveRegConversion &LR);
  // Get a vector of desired scalar type that is compatible with the original
  // vector. In cases where there is no bitsize equivalent using a legal vector
  // type, we pad the MSBs (e.g. v7i8 -> v2i32)
  Type *getCompatibleType(Instruction *InstToConvert);
  // Find and replace uses of the virtual register in different block with a
  // newly produced virtual register of legal type
  bool replaceUses(Instruction &I);
  // Replace the collected PHIs with newly produced incoming values. Replacement
  // is only done if we have a replacement for each original incoming value.
  bool replacePHIs();

  LiveRegOptimizer(Module *Mod) : Mod(Mod) {
    ConvertToScalar = Type::getInt32Ty(Mod->getContext());
  }
};
425
+
346
426
} // end anonymous namespace
347
427
348
428
bool AMDGPUCodeGenPrepareImpl::run (Function &F) {
@@ -360,6 +440,7 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
360
440
Next = std::next (I);
361
441
362
442
MadeChange |= visit (*I);
443
+ I->getType ();
363
444
364
445
if (Next != E) { // Control flow changed
365
446
BasicBlock *NextInstBB = Next->getParent ();
@@ -371,9 +452,269 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
371
452
}
372
453
}
373
454
}
455
+
456
+ // GlobalISel should directly use the values, and do not need to emit
457
+ // CopyTo/CopyFrom Regs across blocks
458
+ if (UsesGlobalISel)
459
+ return MadeChange;
460
+
461
+ // "Optimize" the virtual regs that cross basic block boundaries. In such
462
+ // cases, vectors of illegal types will be scalarized and widened, with each
463
+ // scalar living in its own physical register. The optimization converts the
464
+ // vectors to equivalent vectors of legal type (which are convereted back
465
+ // before uses in subsequenmt blocks), to pack the bits into fewer physical
466
+ // registers (used in CopyToReg/CopyFromReg pairs).
467
+ LiveRegOptimizer LRO (Mod);
468
+ for (auto &BB : F) {
469
+ for (auto &I : BB) {
470
+ if (!LRO.shouldReplaceUses (I))
471
+ continue ;
472
+ MadeChange |= LRO.replaceUses (I);
473
+ }
474
+ }
475
+
476
+ MadeChange |= LRO.replacePHIs ();
477
+ return MadeChange;
478
+ }
479
+
480
// Rewrite the uses of \p I's def that live in other basic blocks: emit one
// conversion to the legal type immediately after the def, then, for each
// using block, one conversion back to the original type (shared by every
// user in that block). PHI users are not rewritten here — their new incoming
// operands are collected in PHIUpdater and replaced later by replacePHIs(),
// once all incoming values have been converted. Returns true if any use was
// actually rewritten.
bool LiveRegOptimizer::replaceUses(Instruction &I) {
  bool MadeChange = false;

  // Per using block: the conversion-back instruction and the users that
  // should be repointed at it.
  struct ConvertUseInfo {
    Instruction *Converted;
    SmallVector<Instruction *, 4> Users;
  };
  DenseMap<BasicBlock *, ConvertUseInfo> UseConvertTracker;

  // Conversion of the def itself, inserted right after the def so it
  // dominates all cross-block users.
  LiveRegConversion FromLRC(
      &I, I.getParent(),
      static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
  FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
  for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {

    if (auto UserInst = dyn_cast<Instruction>(*IUser)) {
      // Only uses in a different block need the pack/unpack dance.
      if (UserInst->getParent() != I.getParent()) {
        LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
                          << *FromLRC.getOriginalType()
                          << " from previous block. Needs conversion\n");
        // Lazily emits the conversion on first demand; subsequent calls reuse
        // the cached sequence.
        convertToOptType(FromLRC);
        if (!FromLRC.hasConverted())
          continue;
        // If it is a PHI node, just create and collect the new operand. We can
        // only replace the PHI node once we have converted all the operands
        if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
          for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
            auto IncVal = PhiInst->getIncomingValue(Idx);
            if (&I == dyn_cast<Instruction>(IncVal)) {
              auto IncBlock = PhiInst->getIncomingBlock(Idx);
              // Append to an existing PHIUpdater entry for this PHI, or start
              // a new one.
              auto PHIOps = find_if(
                  PHIUpdater,
                  [&UserInst](
                      std::pair<Instruction *,
                                SmallVector<
                                    std::pair<Instruction *, BasicBlock *>, 4>>
                          &Entry) { return Entry.first == UserInst; });

              if (PHIOps == PHIUpdater.end())
                PHIUpdater.push_back(
                    {UserInst, {{*FromLRC.getConverted(), IncBlock}}});
              else
                PHIOps->second.push_back({*FromLRC.getConverted(), IncBlock});

              break;
            }
          }
          continue;
        }

        // Do not create multiple conversion sequences if there are multiple
        // uses in the same block
        if (UseConvertTracker.contains(UserInst->getParent())) {
          UseConvertTracker[UserInst->getParent()].Users.push_back(UserInst);
          LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
          continue;
        }

        // Convert back to the original type at the top of the using block
        // (after its PHIs) so every user in that block can share it.
        LiveRegConversion ToLRC(*FromLRC.getConverted(), I.getType(),
                                UserInst->getParent(),
                                static_cast<BasicBlock::iterator>(
                                    UserInst->getParent()->getFirstNonPHIIt()));
        convertFromOptType(ToLRC);
        assert(ToLRC.hasConverted());
        UseConvertTracker[UserInst->getParent()] = {*ToLRC.getConverted(),
                                                    {UserInst}};
      }
    }
  }

  // Replace uses of with in a separate loop that is not dependent upon the
  // state of the uses
  for (auto &Entry : UseConvertTracker) {
    for (auto &UserInst : Entry.second.Users) {
      LLVM_DEBUG(dbgs() << *UserInst
                        << "\n\tNow uses: " << *Entry.second.Converted << "\n");
      UserInst->replaceUsesOfWith(&I, Entry.second.Converted);
      MadeChange = true;
    }
  }
  return MadeChange;
}
562
+
563
// Replace the PHI nodes collected by replaceUses() with new PHIs over the
// converted incoming values. A PHI is replaced only when every one of its
// original incoming values has a converted counterpart; the new PHI's result
// is converted back to the original type after the block's PHIs, all other
// uses are repointed at that conversion, and the old PHI is erased.
// Returns true if any PHI was replaced.
bool LiveRegOptimizer::replacePHIs() {
  bool MadeChange = false;
  for (auto Ele : PHIUpdater) {
    // Only PHI users are ever pushed into PHIUpdater (see replaceUses), so
    // the cast must succeed.
    auto ThePHINode = dyn_cast<PHINode>(Ele.first);
    assert(ThePHINode);
    auto NewPHINodeOps = Ele.second;
    LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHINode << "\n");
    // If we have converted all the required operands, then do the replacement
    if (ThePHINode->getNumIncomingValues() == NewPHINodeOps.size()) {
      // Insert the new PHI right before the old one so it stays in the
      // block's PHI section.
      IRBuilder<> Builder(Ele.first);
      auto NPHI = Builder.CreatePHI(NewPHINodeOps[0].first->getType(),
                                    NewPHINodeOps.size());
      for (auto IncVals : NewPHINodeOps) {
        NPHI->addIncoming(IncVals.first, IncVals.second);
        LLVM_DEBUG(dbgs() << "  Using: " << *IncVals.first
                          << "  For: " << IncVals.second->getName() << "\n");
      }
      LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << "\n");
      // Unpack the legal-typed PHI result back to the original type at the
      // first non-PHI position of the block.
      LiveRegConversion ToLRC(NPHI, ThePHINode->getType(),
                              ThePHINode->getParent(),
                              static_cast<BasicBlock::iterator>(
                                  ThePHINode->getParent()->getFirstNonPHIIt()));
      convertFromOptType(ToLRC);
      assert(ToLRC.hasConverted());
      Ele.first->replaceAllUsesWith(*ToLRC.getConverted());
      // The old PHI is no longer used
      ThePHINode->eraseFromParent();
      MadeChange = true;
    }
  }
  return MadeChange;
}
376
595
596
+ Type *LiveRegOptimizer::getCompatibleType (Instruction *InstToConvert) {
597
+ auto OriginalType = InstToConvert->getType ();
598
+ assert (OriginalType->getScalarSizeInBits () <=
599
+ ConvertToScalar->getScalarSizeInBits ());
600
+ auto VTy = dyn_cast<VectorType>(OriginalType);
601
+ if (!VTy)
602
+ return ConvertToScalar;
603
+
604
+ auto OriginalSize =
605
+ VTy->getScalarSizeInBits () * VTy->getElementCount ().getFixedValue ();
606
+ auto ConvertScalarSize = ConvertToScalar->getScalarSizeInBits ();
607
+ auto ConvertEltCount =
608
+ (OriginalSize + ConvertScalarSize - 1 ) / ConvertScalarSize;
609
+
610
+ return VectorType::get (Type::getIntNTy (Mod->getContext (), ConvertScalarSize),
611
+ llvm::ElementCount::getFixed (ConvertEltCount));
612
+ }
613
+
614
// Emit, at LR's builder position, the sequence that packs LR's (illegal)
// vector def into the legal vector type LR.getNewType(), recording the result
// via LR.setConverted(). When the bit sizes match this is a single bitcast;
// otherwise the source is first widened with a shufflevector so that the
// bitcast is size-preserving.
void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
  // A shared conversion may already have been emitted for another user of
  // the same def; reuse it.
  if (LR.hasConverted()) {
    LLVM_DEBUG(dbgs() << "\tAlready has converted def\n");
    return;
  }

  auto VTy = dyn_cast<VectorType>(LR.getOriginalType());
  assert(VTy);
  auto NewVTy = dyn_cast<VectorType>(LR.getNewType());
  assert(NewVTy);

  auto V = static_cast<Value *>(LR.getLiveRegDef());
  auto OriginalSize =
      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
  auto NewSize =
      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();

  auto &Builder = LR.getConverBuilder();

  // If there is a bitsize match, we can fit the old vector into a new vector of
  // desired type
  if (OriginalSize == NewSize) {
    LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
    LLVM_DEBUG(dbgs() << "\tConverted def to "
                      << *(*LR.getConverted())->getType() << "\n");
    return;
  }

  // If there is a bitsize mismatch, we must use a wider vector
  assert(NewSize > OriginalSize);
  auto ExpandedVecElementCount =
      llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());

  // Mask: keep the original lanes in place...
  SmallVector<int, 8> ShuffleMask;
  for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
    ShuffleMask.push_back(I);

  // ...then pad the tail with index == source element count, which selects
  // from the implicit second shuffle operand (presumably poison/undef lanes
  // here — the padding bits are never read back; see convertFromOptType).
  for (uint64_t I = VTy->getElementCount().getFixedValue();
       I < ExpandedVecElementCount.getFixedValue(); I++)
    ShuffleMask.push_back(VTy->getElementCount().getFixedValue());

  auto ExpandedVec =
      dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
  LR.setConverted(
      dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewVTy)));
  LLVM_DEBUG(dbgs() << "\tConverted def to " << *(*LR.getConverted())->getType()
                    << "\n");
  return;
}
663
+
664
// Emit, at LRC's builder position, the sequence that unpacks the converted
// (legal, possibly wider) vector def back into the type the user expects.
// NOTE: the LiveRegConversion here is oriented converted -> original, so
// getOriginalType() is the legal packed type and getNewType() is the user's
// type. When bit sizes match a single bitcast suffices; otherwise the value
// is bitcast to a wider vector of the target's element type and the padding
// MSB lanes are dropped with a shufflevector.
void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
  auto VTy = dyn_cast<VectorType>(LRC.getOriginalType());
  assert(VTy);
  auto NewVTy = dyn_cast<VectorType>(LRC.getNewType());
  assert(NewVTy);

  auto V = static_cast<Value *>(LRC.getLiveRegDef());
  auto OriginalSize =
      VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
  auto NewSize =
      NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();

  auto &Builder = LRC.getConverBuilder();

  // If there is a bitsize match, we simply convert back to the original type
  if (OriginalSize == NewSize) {
    LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
    LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
                      << "\n");
    return;
  }

  // If there is a bitsize mismatch, we have used a wider vector and must strip
  // the MSBs to convert back to the original type
  assert(OriginalSize > NewSize);
  // Reinterpret the packed value as a vector of the target's element width so
  // individual narrow lanes can be selected.
  auto ExpandedVecElementCount = llvm::ElementCount::getFixed(
      OriginalSize / NewVTy->getScalarSizeInBits());
  auto ExpandedVT = VectorType::get(
      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
      ExpandedVecElementCount);
  auto Converted = dyn_cast<Instruction>(
      Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));

  // Keep only the low NarrowElementCount lanes; the rest were padding.
  auto NarrowElementCount = NewVTy->getElementCount().getFixedValue();
  SmallVector<int, 8> ShuffleMask;
  for (uint64_t I = 0; I < NarrowElementCount; I++)
    ShuffleMask.push_back(I);

  auto NarrowVec = dyn_cast<Instruction>(
      Builder.CreateShuffleVector(Converted, ShuffleMask));
  LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
  LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
  return;
}
708
+
709
+ bool LiveRegOptimizer::shouldReplaceUses (const Instruction &I) {
710
+ // Vectors of illegal types are copied across blocks in an efficient manner.
711
+ // They are scalarized and widened to legal scalars. In such cases, we can do
712
+ // better by using legal vector types
713
+ auto IType = I.getType ();
714
+ return IType->isVectorTy () && IType->getScalarSizeInBits () < 16 &&
715
+ !I.getType ()->getScalarType ()->isPointerTy ();
716
+ }
717
+
377
718
unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth (const Type *T) const {
378
719
assert (needsPromotionToI32 (T) && " T does not need promotion to i32" );
379
720
@@ -2275,6 +2616,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2275
2616
Impl.ST = &TM.getSubtarget <GCNSubtarget>(F);
2276
2617
Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache (F);
2277
2618
Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo ();
2619
+ Impl.UsesGlobalISel = TM.Options .EnableGlobalISel ;
2278
2620
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2279
2621
Impl.DT = DTWP ? &DTWP->getDomTree () : nullptr ;
2280
2622
Impl.HasUnsafeFPMath = hasUnsafeFPMath (F);
@@ -2297,6 +2639,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2297
2639
Impl.DT = FAM.getCachedResult <DominatorTreeAnalysis>(F);
2298
2640
Impl.HasUnsafeFPMath = hasUnsafeFPMath (F);
2299
2641
SIModeRegisterDefaults Mode (F, *Impl.ST );
2642
+ Impl.UsesGlobalISel = TM.Options .EnableGlobalISel ;
2300
2643
Impl.HasFP32DenormalFlush =
2301
2644
Mode.FP32Denormals == DenormalMode::getPreserveSign ();
2302
2645
PreservedAnalyses PA = PreservedAnalyses::none ();
0 commit comments