Skip to content

Commit df50c85

Browse files
AMDGPU/GlobalISel: StandaloneRegBankSelect
Assign register banks to virtual registers. This pass does not use the generic RegBankSelect. Defs and uses of G_ instructions end up with register banks exclusively; if they had a register class, the appropriate register bank is reassigned.

Register banks are assigned using machine uniformity analysis:
- SGPR - uniform values and some lane masks
- VGPR - divergent, non-S1 values
- VCC - divergent S1 values (lane masks)

StandaloneRegBankSelect does not consider which instructions are available, so in some cases G_ instructions with a given register bank assignment cannot be inst-selected. This is solved in RegBankLegalize.

Exceptions where uniformity analysis does not work:

S32/S64 lane masks:
- They need to end up with an SGPR register class after instruction selection.
- In most cases uniformity analysis declares them uniform (forced by tablegen), resulting in the sgpr S32/S64 register bank.
- When uniformity analysis declares them divergent (some phis), the intrinsic lane mask analyzer is used to still assign the sgpr register bank.

Temporal divergence copy:
- This is a COPY to vgpr with an implicit use of $exec inside of the cycle.
- The copy is declared uniform by uniformity analysis.
- Make sure that the assigned bank is vgpr.

Note: uniformity analysis does not consider registers with a vgpr def to be divergent (a vgpr can hold a uniform value).
- TODO: an implicit use of $exec could be implemented as an indicator that the instruction is divergent.
1 parent 0c40f68 commit df50c85

File tree

5 files changed

+996
-685
lines changed

5 files changed

+996
-685
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,13 +7,17 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "AMDGPUGlobalISelUtils.h"
10+
#include "AMDGPURegisterBankInfo.h"
1011
#include "GCNSubtarget.h"
1112
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
13+
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
1214
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
1315
#include "llvm/CodeGenTypes/LowLevelType.h"
1416
#include "llvm/IR/Constants.h"
17+
#include "llvm/IR/IntrinsicsAMDGPU.h"
1518

1619
using namespace llvm;
20+
using namespace AMDGPU;
1721
using namespace MIPatternMatch;
1822

1923
std::pair<Register, unsigned>
@@ -69,3 +73,37 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
6973

7074
return std::pair(Reg, 0);
7175
}
76+
77+
/// Binds the analyzer to the register info of \p MF and immediately scans the
/// function to populate the set of S32/S64 lane-mask registers.
IntrinsicLaneMaskAnalyzer::IntrinsicLaneMaskAnalyzer(MachineFunction &MF)
    : MRI(MF.getRegInfo()) {
  initLaneMaskIntrinsics(MF);
}
81+
82+
/// Returns true if \p Reg was recorded as an S32/S64 lane mask while the
/// function was scanned in the constructor.
bool IntrinsicLaneMaskAnalyzer::isS32S64LaneMask(Register Reg) {
  return S32S64LaneMask.count(Reg) != 0;
}
85+
86+
/// Walks every instruction of \p MF and records lane-mask registers:
///  - for an amdgcn_if_break intrinsic, operand 3 is recorded directly and
///    operand 0 is handed to findLCSSAPhi();
///  - for SI_IF and SI_ELSE, operand 0 is handed to findLCSSAPhi().
/// findLCSSAPhi() records the register itself plus the results of PHIs that
/// use it.
void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
  for (auto &Block : MF) {
    for (auto &Inst : Block) {
      const auto *Intrin = dyn_cast<GIntrinsic>(&Inst);
      if (Intrin && Intrin->getIntrinsicID() == Intrinsic::amdgcn_if_break) {
        S32S64LaneMask.insert(Inst.getOperand(3).getReg());
        findLCSSAPhi(Inst.getOperand(0).getReg());
      }

      const unsigned Opc = Inst.getOpcode();
      if (Opc == AMDGPU::SI_IF || Opc == AMDGPU::SI_ELSE)
        findLCSSAPhi(Inst.getOperand(0).getReg());
    }
  }
}
102+
103+
/// Records \p Reg as an S32/S64 lane mask, together with the result register
/// (operand 0) of every PHI instruction that uses \p Reg.
void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
  S32S64LaneMask.insert(Reg);

  for (const MachineInstr &UserMI : MRI.use_instructions(Reg)) {
    if (!UserMI.isPHI())
      continue;
    S32S64LaneMask.insert(UserMI.getOperand(0).getReg());
  }
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1010
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1111

12+
#include "llvm/ADT/DenseSet.h"
13+
#include "llvm/CodeGen/MachineFunction.h"
1214
#include "llvm/CodeGen/Register.h"
1315
#include <utility>
1416

@@ -26,6 +28,26 @@ std::pair<Register, unsigned>
2628
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
2729
GISelKnownBits *KnownBits = nullptr,
2830
bool CheckNUW = false);
31+
32+
// Currently finds S32/S64 lane masks that can be declared as divergent by
33+
// uniformity analysis (all are phis at the moment).
34+
// These are defined as i32/i64 in some IR intrinsics (not as i1).
35+
// Tablegen forces(via telling that lane mask IR intrinsics are uniform) most of
36+
// S32/S64 lane masks to be uniform, as this results in them ending up with sgpr
37+
// reg class after instruction-select don't search for all of them.
38+
class IntrinsicLaneMaskAnalyzer {
39+
DenseSet<Register> S32S64LaneMask;
40+
MachineRegisterInfo &MRI;
41+
42+
public:
43+
IntrinsicLaneMaskAnalyzer(MachineFunction &MF);
44+
bool isS32S64LaneMask(Register Reg);
45+
46+
private:
47+
void initLaneMaskIntrinsics(MachineFunction &MF);
48+
// This will not be needed when we turn off LCSSA for global-isel.
49+
void findLCSSAPhi(Register Reg);
50+
};
2951
}
3052
}
3153

llvm/lib/Target/AMDGPU/AMDGPUStandaloneRegBankSelect.cpp

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,13 @@
1616
//===----------------------------------------------------------------------===//
1717

1818
#include "AMDGPU.h"
19+
#include "AMDGPUGlobalISelUtils.h"
20+
#include "AMDGPURegisterBankInfo.h"
21+
#include "GCNSubtarget.h"
22+
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
23+
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
1924
#include "llvm/CodeGen/MachineFunctionPass.h"
25+
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
2026
#include "llvm/InitializePasses.h"
2127

2228
#define DEBUG_TYPE "amdgpu-standalone-regbankselect"
@@ -41,6 +47,7 @@ class AMDGPUStandaloneRegBankSelect : public MachineFunctionPass {
4147
}
4248

4349
/// Declares the analyses this pass depends on: machine uniformity analysis,
/// plus whatever the MachineFunctionPass base class requests.
void getAnalysisUsage(AnalysisUsage &AU) const override {
  AU.addRequired<MachineUniformityAnalysisPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}
4653

@@ -68,9 +75,220 @@ FunctionPass *llvm::createAMDGPUStandaloneRegBankSelectPass() {
6875
return new AMDGPUStandaloneRegBankSelect();
6976
}
7077

78+
class RegBankSelectHelper {
79+
MachineFunction &MF;
80+
MachineIRBuilder &B;
81+
MachineRegisterInfo &MRI;
82+
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA;
83+
const MachineUniformityInfo &MUI;
84+
const SIRegisterInfo &TRI;
85+
const RegisterBank *SgprRB;
86+
const RegisterBank *VgprRB;
87+
const RegisterBank *VccRB;
88+
89+
public:
90+
RegBankSelectHelper(MachineFunction &MF, MachineIRBuilder &B,
91+
MachineRegisterInfo &MRI,
92+
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA,
93+
const MachineUniformityInfo &MUI,
94+
const SIRegisterInfo &TRI, const RegisterBankInfo &RBI)
95+
: MF(MF), B(B), MRI(MRI), ILMA(ILMA), MUI(MUI), TRI(TRI),
96+
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
97+
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
98+
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
99+
100+
bool shouldRegBankSelect(MachineInstr &MI) {
101+
return MI.isPreISelOpcode() || MI.isCopy();
102+
}
103+
104+
void setRBDef(MachineInstr &MI, MachineOperand &DefOP,
105+
const RegisterBank *RB) {
106+
Register Reg = DefOP.getReg();
107+
// Register that already has Register class got it during pre-inst selection
108+
// of another instruction. Maybe cross bank copy was required so we insert a
109+
// copy that can be removed later. This simplifies post-rb-legalize artifact
110+
// combiner and avoids need to special case some patterns.
111+
if (MRI.getRegClassOrNull(Reg)) {
112+
LLT Ty = MRI.getType(Reg);
113+
Register NewReg = MRI.createVirtualRegister({RB, Ty});
114+
DefOP.setReg(NewReg);
115+
116+
auto &MBB = *MI.getParent();
117+
B.setInsertPt(MBB, MBB.SkipPHIsAndLabels(std::next(MI.getIterator())));
118+
B.buildCopy(Reg, NewReg);
119+
120+
// The problem was discovered for uniform S1 that was used as both
121+
// lane mask(vcc) and regular sgpr S1.
122+
// - lane-mask(vcc) use was by si_if, this use is divergent and requires
123+
// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets
124+
// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask.
125+
// - the regular sgpr S1(uniform) instruction is now broken since
126+
// it uses sreg_64_xexec(S1) which is divergent.
127+
128+
// "Clear" reg classes from uses on generic instructions and put register
129+
// banks instead.
130+
for (auto &UseMI : MRI.use_instructions(Reg)) {
131+
if (shouldRegBankSelect(UseMI)) {
132+
for (MachineOperand &Op : UseMI.operands()) {
133+
if (Op.isReg() && Op.getReg() == Reg)
134+
Op.setReg(NewReg);
135+
}
136+
}
137+
}
138+
139+
} else {
140+
MRI.setRegBank(Reg, *RB);
141+
}
142+
}
143+
144+
void constrainRBUse(MachineInstr &MI, MachineOperand &UseOP,
145+
const RegisterBank *RB) {
146+
Register Reg = UseOP.getReg();
147+
148+
LLT Ty = MRI.getType(Reg);
149+
Register NewReg = MRI.createVirtualRegister({RB, Ty});
150+
UseOP.setReg(NewReg);
151+
152+
if (MI.isPHI()) {
153+
auto DefMI = MRI.getVRegDef(Reg)->getIterator();
154+
MachineBasicBlock *DefMBB = DefMI->getParent();
155+
B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
156+
} else {
157+
B.setInstr(MI);
158+
}
159+
160+
B.buildCopy(NewReg, Reg);
161+
}
162+
163+
std::optional<Register> tryGetVReg(MachineOperand &Op) {
164+
if (!Op.isReg())
165+
return std::nullopt;
166+
167+
Register Reg = Op.getReg();
168+
if (!Reg.isVirtual())
169+
return std::nullopt;
170+
171+
return Reg;
172+
}
173+
174+
void assignBanksOnDefs() {
175+
for (MachineBasicBlock &MBB : MF) {
176+
for (MachineInstr &MI : MBB) {
177+
if (!shouldRegBankSelect(MI))
178+
continue;
179+
180+
for (MachineOperand &DefOP : MI.defs()) {
181+
auto MaybeDefReg = tryGetVReg(DefOP);
182+
if (!MaybeDefReg)
183+
continue;
184+
Register DefReg = *MaybeDefReg;
185+
186+
// Copies can have register class on def registers.
187+
if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) {
188+
continue;
189+
}
190+
191+
if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) {
192+
setRBDef(MI, DefOP, SgprRB);
193+
} else {
194+
if (MRI.getType(DefReg) == LLT::scalar(1))
195+
setRBDef(MI, DefOP, VccRB);
196+
else
197+
setRBDef(MI, DefOP, VgprRB);
198+
}
199+
}
200+
}
201+
}
202+
}
203+
204+
// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of
205+
// the cycle
206+
// Note: uniformity analysis does not consider that registers with vgpr def
207+
// are divergent (you can have uniform value in vgpr).
208+
// - TODO: implicit use of $exec could be implemented as indicator that
209+
// instruction is divergent
210+
bool isTemporalDivergenceCopy(Register Reg) {
211+
MachineInstr *MI = MRI.getVRegDef(Reg);
212+
if (!MI->isCopy())
213+
return false;
214+
215+
for (auto Op : MI->implicit_operands()) {
216+
if (!Op.isReg())
217+
continue;
218+
219+
if (Op.getReg() == TRI.getExec()) {
220+
return true;
221+
}
222+
}
223+
224+
return false;
225+
}
226+
227+
void constrainBanksOnUses() {
228+
for (MachineBasicBlock &MBB : MF) {
229+
for (MachineInstr &MI : MBB) {
230+
if (!shouldRegBankSelect(MI))
231+
continue;
232+
233+
// Copies can have register class on use registers.
234+
if (MI.isCopy())
235+
continue;
236+
237+
for (MachineOperand &UseOP : MI.uses()) {
238+
auto MaybeUseReg = tryGetVReg(UseOP);
239+
if (!MaybeUseReg)
240+
continue;
241+
Register UseReg = *MaybeUseReg;
242+
243+
// UseReg already has register bank.
244+
if (MRI.getRegBankOrNull(UseReg))
245+
continue;
246+
247+
if (!isTemporalDivergenceCopy(UseReg) &&
248+
(MUI.isUniform(UseReg) || ILMA.isS32S64LaneMask(UseReg))) {
249+
constrainRBUse(MI, UseOP, SgprRB);
250+
} else {
251+
if (MRI.getType(UseReg) == LLT::scalar(1))
252+
constrainRBUse(MI, UseOP, VccRB);
253+
else
254+
constrainRBUse(MI, UseOP, VgprRB);
255+
}
256+
}
257+
}
258+
}
259+
}
260+
};
261+
71262
/// Runs register bank selection on \p MF. Returns false without touching the
/// function when instruction selection has already failed, true otherwise.
bool AMDGPUStandaloneRegBankSelect::runOnMachineFunction(MachineFunction &MF) {
  const bool ISelFailed = MF.getProperties().hasProperty(
      MachineFunctionProperties::Property::FailedISel);
  if (ISelFailed)
    return false;

  // Gather the analyses and target information the helper works with.
  MachineUniformityInfo &Uniformity =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
  AMDGPU::IntrinsicLaneMaskAnalyzer LaneMasks(MF);
  MachineRegisterInfo &RegInfo = MF.getRegInfo();
  const SIRegisterInfo &SIRegInfo =
      *MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
  const RegisterBankInfo &BankInfo = *MF.getSubtarget().getRegBankInfo();

  MachineIRBuilder Builder(MF);
  RegBankSelectHelper Helper(MF, Builder, RegInfo, LaneMasks, Uniformity,
                             SIRegInfo, BankInfo);

  // Step 1: give a register bank to ALL def registers of G_ instructions, and
  // to the defs of copies that have neither a register bank nor a class.
  Helper.assignBanksOnDefs();

  // Every virtual register now has a register class or a register bank:
  // - defs of G_ instructions have register banks,
  // - defs and uses of inst-selected instructions have register classes,
  // - defs and uses of copies can have either,
  // - and, most notably, uses of G_ instructions can still have either.

  // Step 2: make the uses of G_ instructions carry register banks only.
  Helper.constrainBanksOnUses();

  // From here on defs and uses of G_ instructions have register banks
  // exclusively.
  return true;
}

0 commit comments

Comments
 (0)