Skip to content

Commit 2e29b01

Browse files
committed
[AMDGPU] Lowering VGPR to SGPR copies to v_readfirstlane_b32 if profitable.
Since the divergence-driven instruction selection has been enabled for AMDGPU, all the uniform instructions are expected to be selected to SALU form, except those not having one. VGPR to SGPR copies appear in MIR to connect value producers and consumers. This change implements an algorithm that strikes a reasonable tradeoff between the profit achieved from keeping the uniform instructions in SALU form and the overhead introduced by the data transfer between the VGPRs and SGPRs. Reviewed By: rampitec Differential Revision: https://reviews.llvm.org/D128252
1 parent 5b0788f commit 2e29b01

27 files changed

+3033
-2273
lines changed

llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Lines changed: 285 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@
6767
#include "AMDGPU.h"
6868
#include "GCNSubtarget.h"
6969
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
70+
#include "llvm/ADT/SetOperations.h"
7071
#include "llvm/CodeGen/MachineDominators.h"
7172
#include "llvm/InitializePasses.h"
7273
#include "llvm/Target/TargetMachine.h"
@@ -81,9 +82,9 @@ static cl::opt<bool> EnableM0Merge(
8182
cl::init(true));
8283

8384
namespace {
84-
8585
class SIFixSGPRCopies : public MachineFunctionPass {
8686
MachineDominatorTree *MDT;
87+
unsigned NextVGPRToSGPRCopyID;
8788

8889
public:
8990
static char ID;
@@ -92,9 +93,16 @@ class SIFixSGPRCopies : public MachineFunctionPass {
9293
const SIRegisterInfo *TRI;
9394
const SIInstrInfo *TII;
9495

95-
SIFixSGPRCopies() : MachineFunctionPass(ID) {}
96+
SIFixSGPRCopies() : MachineFunctionPass(ID), NextVGPRToSGPRCopyID(0) {}
9697

9798
bool runOnMachineFunction(MachineFunction &MF) override;
99+
unsigned getNextVGPRToSGPRCopyId() { return ++NextVGPRToSGPRCopyID; }
100+
void lowerVGPR2SGPRCopies(MachineFunction &MF);
101+
// Handles copies whose source register is:
102+
// 1. Physical register
103+
// 2. AGPR
104+
// 3. Defined by an instruction that merely moves an immediate
105+
bool lowerSpecialCase(MachineInstr &MI);
98106

99107
MachineBasicBlock *processPHINode(MachineInstr &MI);
100108

@@ -569,6 +577,14 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
569577
TII = ST.getInstrInfo();
570578
MDT = &getAnalysis<MachineDominatorTree>();
571579

580+
// We have to lower VGPR to SGPR copies before the main loop
581+
// because the REG_SEQUENCE and PHI lowering in main loop
582+
// convert the def-use chains to VALU and close the opportunities
583+
// for keeping them scalar.
584+
// TODO: REG_SEQUENCE and PHIs are semantically copies. The next patch
585+
// addresses their lowering and unifies the processing in one main loop.
586+
lowerVGPR2SGPRCopies(MF);
587+
572588
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
573589
BI != BE; ++BI) {
574590
MachineBasicBlock *MBB = &*BI;
@@ -640,42 +656,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
640656
continue;
641657
}
642658

643-
if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
644-
Register SrcReg = MI.getOperand(1).getReg();
645-
if (!SrcReg.isVirtual()) {
646-
MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
647-
if (NewBB && NewBB != MBB) {
648-
MBB = NewBB;
649-
E = MBB->end();
650-
BI = MachineFunction::iterator(MBB);
651-
BE = MF.end();
652-
}
653-
assert((!NewBB || NewBB == I->getParent()) &&
654-
"moveToVALU did not return the right basic block");
655-
break;
656-
}
657-
658-
MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
659-
unsigned SMovOp;
660-
int64_t Imm;
661-
// If we are just copying an immediate, we can replace the copy with
662-
// s_mov_b32.
663-
if (isSafeToFoldImmIntoCopy(&MI, DefMI, TII, SMovOp, Imm)) {
664-
MI.getOperand(1).ChangeToImmediate(Imm);
665-
MI.addImplicitDefUseOperands(MF);
666-
MI.setDesc(TII->get(SMovOp));
667-
break;
668-
}
669-
MachineBasicBlock *NewBB = TII->moveToVALU(MI, MDT);
670-
if (NewBB && NewBB != MBB) {
671-
MBB = NewBB;
672-
E = MBB->end();
673-
BI = MachineFunction::iterator(MBB);
674-
BE = MF.end();
675-
}
676-
assert((!NewBB || NewBB == I->getParent()) &&
677-
"moveToVALU did not return the right basic block");
678-
} else if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
659+
if (isSGPRToVGPRCopy(SrcRC, DstRC, *TRI)) {
679660
tryChangeVGPRtoSGPRinCopy(MI, TRI, TII);
680661
}
681662

@@ -916,3 +897,269 @@ MachineBasicBlock *SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
916897
}
917898
return CreatedBB;
918899
}
900+
901+
// Pre-filter copies that must not go through the scoring machinery:
// non-V2S copies, copies from physical or AGPR sources (legalized
// straight away via moveToVALU), and copies of a plain immediate
// (rewritten into s_mov_b32).
// Returns true when the copy requires no further processing.
bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI) {
  const TargetRegisterClass *SrcRC = nullptr;
  const TargetRegisterClass *DstRC = nullptr;
  std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);

  // Anything that is not a VGPR-to-SGPR copy is already done as far as
  // this pre-pass is concerned.
  if (!isVGPRToSGPRCopy(SrcRC, DstRC, *TRI))
    return true;

  const Register Src = MI.getOperand(1).getReg();
  if (!Src.isVirtual() || TRI->isAGPR(*MRI, Src)) {
    // Physical-register and AGPR sources cannot be scored — hand them to
    // the generic VALU legalization immediately.
    TII->moveToVALU(MI, MDT);
    return true;
  }

  // If the source merely materializes an immediate, fold it: turn this
  // copy into an s_mov_b32 of that immediate.
  unsigned SMovOp;
  int64_t Imm;
  MachineInstr *SrcDef = MRI->getVRegDef(Src);
  if (!isSafeToFoldImmIntoCopy(&MI, SrcDef, TII, SMovOp, Imm))
    return false; // Needs the full scoring path.

  MI.getOperand(1).ChangeToImmediate(Imm);
  MI.addImplicitDefUseOperands(*MI.getParent()->getParent());
  MI.setDesc(TII->get(SMovOp));
  return true;
}
928+
929+
class V2SCopyInfo {
930+
public:
931+
// VGPR to SGPR copy being processed
932+
MachineInstr *Copy;
933+
// All SALU instructions reachable from this copy in SSA graph
934+
DenseSet<MachineInstr *> SChain;
935+
// Number of SGPR to VGPR copies that are used to put the SALU computation
936+
// results back to VALU.
937+
unsigned NumSVCopies;
938+
939+
unsigned Score;
940+
// Actual count of v_readfirstlane_b32
941+
// which need to be inserted to keep SChain SALU
942+
unsigned NumReadfirstlanes;
943+
// Current score state. To speedup selection V2SCopyInfos for processing
944+
bool NeedToBeConvertedToVALU = false;
945+
// Unique ID. Used as a key for mapping to keep permanent order.
946+
unsigned ID;
947+
948+
// Count of another VGPR to SGPR copies that contribute to the
949+
// current copy SChain
950+
unsigned SiblingPenalty = 0;
951+
SetVector<unsigned> Siblings;
952+
V2SCopyInfo() : Copy(nullptr), ID(0){};
953+
V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width)
954+
: Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){};
955+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
956+
void dump() {
957+
dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size()
958+
<< "\n\tSV:" << NumSVCopies << "\n\tSP: " << SiblingPenalty
959+
<< "\nScore: " << Score << "\n";
960+
}
961+
#endif
962+
};
963+
964+
// For every VGPR-to-SGPR copy in MF, walk the SSA use graph to collect the
// SALU instructions fed by the copy (the "SChain"), score the copy (profit =
// chain size; penalty = SGPR->VGPR back-copies + sibling copies + number of
// v_readfirstlane_b32 needed), then either move the low-scoring copies to
// VALU or lower the surviving ones to v_readfirstlane_b32.
void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {

  // All still-live V2S copies, keyed by their unique ID (kept for stable,
  // deterministic processing order).
  DenseMap<unsigned, V2SCopyInfo> Copies;
  // For each instruction, the IDs of all V2S copies whose chains reach it.
  DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;

  // The main function that computes the VGPR to SGPR copy score
  // and determines copy further lowering way: v_readfirstlane_b32 or moveToVALU
  auto needToBeConvertedToVALU = [&](V2SCopyInfo *I) -> bool {
    // An empty chain means nothing scalar is kept alive by this copy.
    if (I->SChain.empty())
      return true;
    // Siblings come from the chain instruction reached by the most copies.
    I->Siblings = SiblingPenalty[*std::max_element(
        I->SChain.begin(), I->SChain.end(),
        [&](MachineInstr *A, MachineInstr *B) -> bool {
          return SiblingPenalty[A].size() < SiblingPenalty[B].size();
        })];
    I->Siblings.remove_if([&](unsigned ID) { return ID == I->ID; });
    // The loop below computes the number of other VGPR to SGPR copies
    // which contribute to the current copy SALU chain. We assume that all the
    // copies with the same source virtual register will be squashed to one by
    // regalloc. Also we take care of the copies of the different subregs of
    // the same register.
    SmallSet<std::pair<Register, unsigned>, 4> SrcRegs;
    for (auto J : I->Siblings) {
      auto InfoIt = Copies.find(J);
      if (InfoIt != Copies.end()) {
        MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
        if (SiblingCopy->isImplicitDef())
          // the COPY has already been MoveToVALUed
          continue;

        SrcRegs.insert(std::make_pair(SiblingCopy->getOperand(1).getReg(),
                                      SiblingCopy->getOperand(1).getSubReg()));
      }
    }
    I->SiblingPenalty = SrcRegs.size();

    unsigned Penalty =
        I->NumSVCopies + I->SiblingPenalty + I->NumReadfirstlanes;
    unsigned Profit = I->SChain.size();
    I->Score = Penalty > Profit ? 0 : Profit - Penalty;
    // Threshold of 3 chosen by the pass author — scores below it go to VALU.
    I->NeedToBeConvertedToVALU = I->Score < 3;
    return I->NeedToBeConvertedToVALU;
  };

  // Only plain copies and the WQM/WWM pseudo-copies are candidates.
  auto needProcessing = [](MachineInstr &MI) -> bool {
    switch (MI.getOpcode()) {
    case AMDGPU::COPY:
    case AMDGPU::WQM:
    case AMDGPU::STRICT_WQM:
    case AMDGPU::SOFT_WQM:
    case AMDGPU::STRICT_WWM:
      return true;
    default:
      return false;
    }
  };

  // Phase 1: analysis. Build a V2SCopyInfo (chain + counters) per copy.
  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
       ++BI) {
    MachineBasicBlock *MBB = &*BI;
    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
         ++I) {
      MachineInstr &MI = *I;
      if (!needProcessing(MI))
        continue;
      // Physical/AGPR sources and foldable immediates are handled here.
      if (lowerSpecialCase(MI))
        continue;

      // Compute the COPY width to pass it to V2SCopyInfo Ctor
      Register DstReg = MI.getOperand(0).getReg();

      const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg);

      V2SCopyInfo In(getNextVGPRToSGPRCopyId(), &MI,
                     TRI->getRegSizeInBits(*DstRC));

      SmallVector<MachineInstr *, 8> AnalysisWorklist;
      // Needed because the SSA is not a tree but a graph and may have
      // forks and joins. We should not then go same way twice.
      DenseSet<MachineInstr *> Visited;
      AnalysisWorklist.push_back(&MI);
      while (!AnalysisWorklist.empty()) {

        MachineInstr *Inst = AnalysisWorklist.pop_back_val();

        if (!Visited.insert(Inst).second)
          continue;

        // Copies and REG_SEQUENCE do not contribute to the final assembly
        // So, skip them but take care of the SGPR to VGPR copies bookkeeping.
        if (Inst->isCopy() || Inst->isRegSequence()) {
          if (TRI->isVGPR(*MRI, Inst->getOperand(0).getReg())) {
            // A copy back to VGPR is a penalty — unless it can itself be
            // flipped into an SGPR-dest copy.
            if (!Inst->isCopy() ||
                !tryChangeVGPRtoSGPRinCopy(*Inst, TRI, TII)) {
              In.NumSVCopies++;
              continue;
            }
          }
        }

        SiblingPenalty[Inst].insert(In.ID);

        SmallVector<MachineInstr *, 4> Users;
        if ((TII->isSALU(*Inst) && Inst->isCompare()) ||
            (Inst->isCopy() && Inst->getOperand(0).getReg() == AMDGPU::SCC)) {
          // SCC is not a virtual register: scan forward in the block for
          // readers until the next SCC definition clobbers it.
          auto I = Inst->getIterator();
          auto E = Inst->getParent()->end();
          while (++I != E && !I->findRegisterDefOperand(AMDGPU::SCC)) {
            if (I->readsRegister(AMDGPU::SCC))
              Users.push_back(&*I);
          }
        } else if (Inst->getNumExplicitDefs() != 0) {
          Register Reg = Inst->getOperand(0).getReg();
          if (TRI->isSGPRReg(*MRI, Reg))
            for (auto &U : MRI->use_instructions(Reg))
              Users.push_back(&U);
        }
        for (auto U : Users) {
          if (TII->isSALU(*U))
            In.SChain.insert(U);
          AnalysisWorklist.push_back(U);
        }
      }
      Copies[In.ID] = In;
    }
  }

  // Phase 2: seed the lowering worklist with every copy scoring below the
  // threshold.
  SmallVector<unsigned, 8> LoweringWorklist;
  for (auto &C : Copies) {
    if (needToBeConvertedToVALU(&C.second))
      LoweringWorklist.push_back(C.second.ID);
  }

  // Phase 3: move low-scoring copies to VALU. Removing a copy shrinks its
  // siblings' chains, which may push them under the threshold too — hence
  // the iterative worklist.
  while (!LoweringWorklist.empty()) {
    unsigned CurID = LoweringWorklist.pop_back_val();
    auto CurInfoIt = Copies.find(CurID);
    if (CurInfoIt != Copies.end()) {
      // Copy by value: the map entry is erased below before moveToVALU.
      V2SCopyInfo C = CurInfoIt->getSecond();
      LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
      for (auto S : C.Siblings) {
        auto SibInfoIt = Copies.find(S);
        if (SibInfoIt != Copies.end()) {
          V2SCopyInfo &SI = SibInfoIt->getSecond();
          LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
          if (!SI.NeedToBeConvertedToVALU) {
            // This copy's chain goes away — rescore the sibling without it.
            set_subtract(SI.SChain, C.SChain);
            if (needToBeConvertedToVALU(&SI))
              LoweringWorklist.push_back(SI.ID);
          }
          SI.Siblings.remove_if([&](unsigned ID) { return ID == C.ID; });
        }
      }
      LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
                        << " is being turned to VALU\n");
      Copies.erase(C.ID);
      TII->moveToVALU(*C.Copy, MDT);
    }
  }

  // Now do actual lowering: every surviving copy becomes
  // v_readfirstlane_b32 (one per 32-bit part, via REG_SEQUENCE for wide
  // registers).
  for (auto C : Copies) {
    MachineInstr *MI = C.second.Copy;
    MachineBasicBlock *MBB = MI->getParent();
    // We decided to turn this V2S copy into v_readfirstlane_b32;
    // it was already removed from the siblings of everything processed above.
    LLVM_DEBUG(dbgs() << "V2S copy " << *MI
                      << " is being turned to v_readfirstlane_b32"
                      << " Score: " << C.second.Score << "\n");
    Register DstReg = MI->getOperand(0).getReg();
    Register SrcReg = MI->getOperand(1).getReg();
    unsigned SubReg = MI->getOperand(1).getSubReg();
    const TargetRegisterClass *SrcRC = TRI->getRegClassForReg(*MRI, SrcReg);
    SrcRC = TRI->getSubRegClass(SrcRC, SubReg);
    size_t SrcSize = TRI->getRegSizeInBits(*SrcRC);
    if (SrcSize == 16) {
      // HACK to handle possible 16bit VGPR source
      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
      MIB.addReg(SrcReg, 0, AMDGPU::NoSubRegister);
    } else if (SrcSize == 32) {
      auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
                         TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg);
      MIB.addReg(SrcReg, 0, SubReg);
    } else {
      // Wide source: read each 32-bit channel individually and reassemble
      // the SGPR tuple with a REG_SEQUENCE.
      auto Result = BuildMI(*MBB, MI, MI->getDebugLoc(),
                            TII->get(AMDGPU::REG_SEQUENCE), DstReg);
      int N = TRI->getRegSizeInBits(*SrcRC) / 32;
      for (int i = 0; i < N; i++) {
        Register PartialSrc = TII->buildExtractSubReg(
            Result, *MRI, MI->getOperand(1), SrcRC,
            TRI->getSubRegFromChannel(i), &AMDGPU::VGPR_32RegClass);
        Register PartialDst =
            MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
        BuildMI(*MBB, *Result, Result->getDebugLoc(),
                TII->get(AMDGPU::V_READFIRSTLANE_B32), PartialDst)
            .addReg(PartialSrc);
        Result.addReg(PartialDst).addImm(TRI->getSubRegFromChannel(i));
      }
    }
    MI->eraseFromParent();
  }
}

llvm/test/CodeGen/AMDGPU/add3.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ define amdgpu_ps float @add3_uniform_vgpr(float inreg %a, float inreg %b, float
223223
; VI-NEXT: v_mov_b32_e32 v2, 0x40400000
224224
; VI-NEXT: v_add_f32_e32 v2, s4, v2
225225
; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0
226-
; VI-NEXT: v_add_u32_e32 v0, vcc, v2, v0
226+
; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2
227227
; VI-NEXT: ; return to shader part epilog
228228
;
229229
; GFX9-LABEL: add3_uniform_vgpr:

0 commit comments

Comments
 (0)