 #include "AMDGPU.h"
 #include "GCNSubtarget.h"
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SetOperations.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Target/TargetMachine.h"
@@ -81,9 +82,9 @@ static cl::opt<bool> EnableM0Merge(
     cl::init(true));

 namespace {
-
 class SIFixSGPRCopies : public MachineFunctionPass {
   MachineDominatorTree *MDT;
+  unsigned NextVGPRToSGPRCopyID;

 public:
   static char ID;
@@ -92,9 +93,16 @@ class SIFixSGPRCopies : public MachineFunctionPass {
   const SIRegisterInfo *TRI;
   const SIInstrInfo *TII;

-  SIFixSGPRCopies() : MachineFunctionPass(ID) {}
+  SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {}

   bool runOnMachineFunction(MachineFunction &MF) override;
+  unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
+  void lowerVGPR2SGPRCopies(MachineFunction &MF);
+  // Handles copies whose source register is:
+  // 1. A physical register
+  // 2. An AGPR
+  // 3. Defined by an instruction that merely moves an immediate
+  bool lowerSpecialCase(MachineInstr &MI);

   MachineBasicBlock *processPHINode(MachineInstr &MI);

@@ -569,6 +577,14 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   TII = ST.getInstrInfo();
   MDT = &getAnalysis<MachineDominatorTree>();

+  // We have to lower VGPR to SGPR copies before the main loop
+  // because the REG_SEQUENCE and PHI lowering in the main loop
+  // converts the def-use chains to VALU and closes the opportunities
+  // for keeping them scalar.
+  // TODO: REG_SEQUENCE and PHIs are semantically copies. The next patch
+  // addresses their lowering and unifies the processing in one main loop.
+  lowerVGPR2SGPRCopies(MF);
+
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; ++BI) {
     MachineBasicBlock *MBB = &*BI;
@@ -640,42 +656,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
           continue;
         }

-        if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
-          Register SrcReg = MI.getOperand(1).getReg();
-          if (!SrcReg.isVirtual()) {
-            MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
-            if (NewBB && NewBB != MBB) {
-              MBB = NewBB;
-              E = MBB->end();
-              BI = MachineFunction::iterator(MBB);
-              BE = MF.end();
-            }
-            assert((!NewBB || NewBB == I->getParent()) &&
-                   "moveToVALU did not return the right basic block");
-            break;
-          }
-
-          MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
-          unsigned SMovOp;
-          int64_t Imm;
-          // If we are just copying an immediate, we can replace the copy with
-          // s_mov_b32.
-          if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
-            MI.getOperand(1).ChangeToImmediate(Imm);
-            MI.addImplicitDefUseOperands(MF);
-            MI.setDesc(TII->get(SMovOp));
-            break;
-          }
-          MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
-          if (NewBB && NewBB != MBB) {
-            MBB = NewBB;
-            E = MBB->end();
-            BI = MachineFunction::iterator(MBB);
-            BE = MF.end();
-          }
-          assert((!NewBB || NewBB == I->getParent()) &&
-                 "moveToVALU did not return the right basic block");
-        } else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
+        if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
           tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
         }
@@ -916,3 +897,269 @@ MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
   }
   return CreatedBB;
 }
+
+bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
+  MachineBasicBlock *MBB = MI.getParent();
+  const TargetRegisterClass *SrcRC, *DstRC;
+  std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
+
+  // We return true to indicate that no further processing is needed.
+  if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
+    return true;
+
+  Register SrcReg = MI.getOperand(1).getReg();
+  if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
+    TII->moveToVALU(MI, MDT);
+    return true;
+  }
+
+  unsigned SMovOp;
+  int64_t Imm;
+  // If we are just copying an immediate, we can replace the copy with
+  // s_mov_b32.
+  if (isSafeToFoldImmIntoCopy(&MI, MRI->getVRegDef(SrcReg), TII, SMovOp, Imm)) {
+    MI.getOperand(1).ChangeToImmediate(Imm);
+    MI.addImplicitDefUseOperands(*MBB->getParent());
+    MI.setDesc(TII->get(SMovOp));
+    return true;
+  }
+  return false;
+}
+
+class V2SCopyInfo {
+public:
+  // VGPR to SGPR copy being processed.
+  MachineInstr *Copy;
+  // All SALU instructions reachable from this copy in the SSA graph.
+  DenseSet<MachineInstr *> SChain;
+  // Number of SGPR to VGPR copies that are used to put the SALU computation
+  // results back to VALU.
+  unsigned NumSVCopies;
+
+  unsigned Score;
+  // Actual count of v_readfirstlane_b32 instructions which need to be
+  // inserted to keep the SChain SALU.
+  unsigned NumReadfirstlanes;
+  // Current score state. Used to speed up the selection of V2SCopyInfos for
+  // processing.
+  bool NeedToBeConvertedToVALU = false;
+  // Unique ID. Used as a key for mapping to keep permanent order.
+  unsigned ID;
+
+  // Count of other VGPR to SGPR copies that contribute to the
+  // current copy's SChain.
+  unsigned SiblingPenalty = 0;
+  SetVector<unsigned> Siblings;
+  V2SCopyInfo() : Copy(nullptr), ID(0){};
+  V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
+      : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){};
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  void dump() {
+    dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
+           << "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
+           << "\nScore: " << Score << "\n";
+  }
+#endif
+};
+
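+// Overall approach: for every VGPR to SGPR copy that is not handled by
+// lowerSpecialCase we build a V2SCopyInfo, walk the SSA use graph to collect
+// the SALU instructions the copy feeds (SChain), and compute a score. Copies
+// whose score falls below the threshold are sent to moveToVALU; the remaining
+// copies are lowered to v_readfirstlane_b32.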
+void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
+
+  DenseMap<unsigned, V2SCopyInfo> Copies;
+  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
+
+  // The main function that computes the VGPR to SGPR copy score and
+  // determines how the copy is lowered further: v_readfirstlane_b32 or
+  // moveToVALU.
+  auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool {
+    if (I->SChain.empty())
+      return true;
+    I->Siblings = SiblingPenalty[*std::max_element(
+        I->SChain.begin(), I->SChain.end(),
+        [&](MachineInstr *A, MachineInstr *B) -> bool {
+          return SiblingPenalty[A].size() < SiblingPenalty[B].size();
+        })];
+    I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
+    // The loop below computes the number of other VGPR to SGPR copies which
+    // contribute to the current copy's SALU chain. We assume that all the
+    // copies with the same source virtual register will be squashed into one
+    // by regalloc. We also take care of copies of different subregs of the
+    // same register.
+    SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
+    for (auto J : I->Siblings) {
+      auto InfoIt = Copies.find(J);
+      if (InfoIt != Copies.end()) {
+        MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
+        if (SiblingCopy->isImplicitDef())
+          // The COPY has already been moved to the VALU.
+          continue;
+
+        SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(),
+                                      SiblingCopy->getOperand(1).getSubReg()));
+      }
+    }
+    I->SiblingPenalty = SrcRegs.size();
+
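+    // The score is the net benefit of keeping the chain scalar: the number of
+    // SALU instructions preserved (Profit) minus the instructions the copy
+    // costs us (SGPR to VGPR copies, sibling copies of the same source, and
+    // the v_readfirstlane_b32 instructions themselves).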
+    unsigned Penalty =
+        I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes;
+    unsigned Profit = I->SChain.size();
+    I->Score = Penalty > Profit ? 0 : Profit - Penalty;
+    I->NeedToBeConvertedToVALU = I->Score < 3;
+    return I->NeedToBeConvertedToVALU;
+  };
+
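+  // Only plain copies and the WQM/WWM pseudos that behave like copies are
+  // candidates for this analysis.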
+  auto needProcessing = [](MachineInstr &MI) -> bool {
+    switch (MI.getOpcode()) {
+    case AMDGPU::COPY:
+    case AMDGPU::WQM:
+    case AMDGPU::STRICT_WQM:
+    case AMDGPU::SOFT_WQM:
+    case AMDGPU::STRICT_WWM:
+      return true;
+    default:
+      return false;
+    }
+  };
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+       ++BI) {
+    MachineBasicBlock *MBB = &*BI;
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+         ++I) {
+      MachineInstr &MI = *I;
+      if (!needProcessing(MI))
+        continue;
+      if (lowerSpecialCase(MI))
+        continue;
+
+      // Compute the COPY width to pass it to the V2SCopyInfo constructor.
+      Register DstReg = MI.getOperand(0).getReg();
+
+      const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg);
+
+      V2SCopyInfo In(getNextVGPRToSGPRCopyId(), &MI,
+                     TRI->getRegSizeInBits(*DstRC));
+
+      SmallVector<MachineInstr *, 8> AnalysisWorklist;
+      // Needed because the SSA is not a tree but a graph and may have forks
+      // and joins; we should not walk the same path twice.
+      DenseSet<MachineInstr *> Visited;
+      AnalysisWorklist.push_back(&MI);
+      while (!AnalysisWorklist.empty()) {
+
+        MachineInstr *Inst = AnalysisWorklist.pop_back_val();
+
+        if (!Visited.insert(Inst).second)
+          continue;
+
+        // Copies and REG_SEQUENCE do not contribute to the final assembly,
+        // so skip them but take care of the SGPR to VGPR copy bookkeeping.
+        if (Inst->isCopy() || Inst->isRegSequence()) {
+          if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
+            if (!Inst->isCopy() ||
+                !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
+              In.NumSVCopies++;
+              continue;
+            }
+          }
+        }
+
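+        // Remember that this copy reaches Inst; instructions reached by
+        // several copies let us detect siblings that share a SALU chain.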
+        SiblingPenalty[Inst].insert(In.ID);
+
+        SmallVector<MachineInstr *, 4> Users;
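+        // SCC is a physical register, so its users cannot be taken from the
+        // use lists; scan forward to the next SCC definition instead.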
+        if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
+            (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
+          auto I = Inst->getIterator();
+          auto E = Inst->getParent()->end();
+          while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) {
+            if (I->readsRegister(AMDGPU::SCC))
+              Users.push_back(&*I);
+          }
+        } else if (Inst->getNumExplicitDefs() != 0) {
+          Register Reg = Inst->getOperand(0).getReg();
+          if (TRI->isSGPRReg(*MRI, Reg))
+            for (auto &U : MRI->use_instructions(Reg))
+              Users.push_back(&U);
+        }
+        for (auto U : Users) {
+          if (TII->isSALU(*U))
+            In.SChain.insert(U);
+          AnalysisWorklist.push_back(U);
+        }
+      }
+      Copies[In.ID] = In;
+    }
+  }
+
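+  // Seed the lowering worklist with every copy whose initial score says it is
+  // not worth keeping scalar.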
+  SmallVector<unsigned, 8> LoweringWorklist;
+  for (auto &C : Copies) {
+    if (needToBeConvertedToVALU(&C.second))
+      LoweringWorklist.push_back(C.second.ID);
+  }
+
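+  // Converting a copy to VALU shrinks the scalar chains of its siblings, so
+  // re-score each affected sibling and queue it if it now falls below the
+  // threshold as well.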
+  while (!LoweringWorklist.empty()) {
+    unsigned CurID = LoweringWorklist.pop_back_val();
+    auto CurInfoIt = Copies.find(CurID);
+    if (CurInfoIt != Copies.end()) {
+      V2SCopyInfo C = CurInfoIt->getSecond();
+      LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
+      for (auto S : C.Siblings) {
+        auto SibInfoIt = Copies.find(S);
+        if (SibInfoIt != Copies.end()) {
+          V2SCopyInfo &SI = SibInfoIt->getSecond();
+          LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
+          if (!SI.NeedToBeConvertedToVALU) {
+            set_subtract(SI.SChain, C.SChain);
+            if (needToBeConvertedToVALU(&SI))
+              LoweringWorklist.push_back(SI.ID);
+          }
+          SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; });
+        }
+      }
+      LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
+                        << " is being turned to VALU\n");
+      Copies.erase(C.ID);
+      TII->moveToVALU(*C.Copy, MDT);
+    }
+  }
+
+  // Now do the actual lowering.
+  for (auto C : Copies) {
+    MachineInstr *MI = C.second.Copy;
+    MachineBasicBlock *MBB = MI->getParent();
+    // We have decided to turn this V2S copy into v_readfirstlane_b32; the
+    // copies chosen for moveToVALU were already erased from the map and from
+    // all sibling lists above.
+    LLVM_DEBUG(dbgs() << "V2S copy " << *MI
+                      << " is being turned to v_readfirstlane_b32"
+                      << " Score: " << C.second.Score << "\n");
+    Register DstReg = MI->getOperand(0).getReg();
+    Register SrcReg = MI->getOperand(1).getReg();
+    unsigned SubReg = MI->getOperand(1).getSubReg();
+    const TargetRegisterClass *SrcRC = TRI->getRegClassForReg(*MRI, SrcReg);
+    SrcRC = TRI->getSubRegClass(SrcRC, SubReg);
+    size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
+    if (SrcSize == 16) {
+      // HACK to handle a possible 16-bit VGPR source.
+      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
+      MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
+    } else if (SrcSize == 32) {
+      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
+                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
+      MIB.addReg(SrcReg, 0, SubReg);
+    } else {
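+      // Sources wider than 32 bits are split into 32-bit channels: each
+      // channel gets its own V_READFIRSTLANE_B32 and the scalar results are
+      // recombined with a REG_SEQUENCE.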
+      auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
+                            TII->get(AMDGPU::REG_SEQUENCE), DstReg);
+      int N = TRI->getRegSizeInBits(*SrcRC) / 32;
+      for (int i = 0; i < N; i++) {
+        Register PartialSrc = TII->buildExtractSubReg(
+            Result, *MRI, MI->getOperand(1), SrcRC,
+            TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
+        Register PartialDst =
+            MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+        BuildMI(*MBB, *Result, Result->getDebugLoc(),
+                TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
+            .addReg(PartialSrc);
+        Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
+      }
+    }
+    MI->eraseFromParent();
+  }
+}