Skip to content

Commit 3866424

Browse files
AMDGPU/GlobalISel: AMDGPURegBankSelect
Assign register banks to virtual registers. Does not use the generic RegBankSelect. After register bank selection, all register operands of G_ instructions have an LLT and a register bank exclusively. If they had a register class, the appropriate register bank is reassigned.

Register banks are assigned using machine uniformity analysis:
- Sgpr - uniform values and some lane masks
- Vgpr - divergent, non-S1 values
- Vcc - divergent S1 values (lane masks)

AMDGPURegBankSelect does not consider available instructions and, in some cases, G_ instructions with some register bank assignment can't be inst-selected. This is solved in RegBankLegalize.

Exceptions when uniformity analysis does not work:

S32/S64 lane masks:
- need to end up with an sgpr register class after instruction selection
- In most cases uniformity analysis declares them as uniform (forced by tablegen), resulting in an sgpr S32/S64 reg bank
- When uniformity analysis declares them as divergent (some phis), use the intrinsic lane mask analyzer to still assign the sgpr register bank

Temporal divergence copy:
- COPY to vgpr with implicit use of $exec inside of the cycle
- this copy is declared as uniform by uniformity analysis
- make sure that the assigned bank is vgpr

Note: uniformity analysis does not consider that registers with a vgpr def are divergent (you can have a uniform value in a vgpr).
- TODO: implicit use of $exec could be implemented as an indicator that the instruction is divergent
1 parent 1bcfb1d commit 3866424

File tree

5 files changed

+985
-661
lines changed

5 files changed

+985
-661
lines changed

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,14 @@
99
#include "AMDGPUGlobalISelUtils.h"
1010
#include "GCNSubtarget.h"
1111
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
12+
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
1213
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
1314
#include "llvm/CodeGenTypes/LowLevelType.h"
1415
#include "llvm/IR/Constants.h"
16+
#include "llvm/IR/IntrinsicsAMDGPU.h"
1517

1618
using namespace llvm;
19+
using namespace AMDGPU;
1720
using namespace MIPatternMatch;
1821

1922
std::pair<Register, unsigned>
@@ -69,3 +72,37 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
6972

7073
return std::pair(Reg, 0);
7174
}
75+
76+
// Binds the analyzer to MF's register info and immediately scans MF so that
// the set of S32/S64 lane-mask registers is populated before any query.
IntrinsicLaneMaskAnalyzer::IntrinsicLaneMaskAnalyzer(MachineFunction &MF)
    : MRI(MF.getRegInfo()) {
  initLaneMaskIntrinsics(MF);
}
80+
81+
// Returns true when Reg was recorded as an S32/S64 lane mask during the scan
// performed at construction time.
bool IntrinsicLaneMaskAnalyzer::isS32S64LaneMask(Register Reg) const {
  return S32S64LaneMask.count(Reg) != 0;
}
84+
85+
// Visits every instruction of MF and records lane-mask registers:
//  - amdgcn.if.break intrinsic: operand 3 is recorded directly, and operand 0
//    is recorded together with its phi users (see findLCSSAPhi);
//  - SI_IF / SI_ELSE: operand 0 is recorded together with its phi users.
void IntrinsicLaneMaskAnalyzer::initLaneMaskIntrinsics(MachineFunction &MF) {
  for (MachineBasicBlock &Block : MF) {
    for (MachineInstr &Inst : Block) {
      if (GIntrinsic *Intrin = dyn_cast<GIntrinsic>(&Inst)) {
        if (Intrin->is(Intrinsic::amdgcn_if_break)) {
          S32S64LaneMask.insert(Inst.getOperand(3).getReg());
          findLCSSAPhi(Inst.getOperand(0).getReg());
        }
      }

      const unsigned Opc = Inst.getOpcode();
      if (Opc == AMDGPU::SI_IF || Opc == AMDGPU::SI_ELSE)
        findLCSSAPhi(Inst.getOperand(0).getReg());
    }
  }
}
101+
102+
// Records Reg as a lane mask and, for every PHI that uses Reg, also records
// the register defined by that PHI (its operand 0).
void IntrinsicLaneMaskAnalyzer::findLCSSAPhi(Register Reg) {
  S32S64LaneMask.insert(Reg);

  for (const MachineInstr &User : MRI.use_instructions(Reg)) {
    if (!User.isPHI())
      continue;
    S32S64LaneMask.insert(User.getOperand(0).getReg());
  }
}

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1010
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
1111

12+
#include "llvm/ADT/DenseSet.h"
13+
#include "llvm/CodeGen/MachineFunction.h"
1214
#include "llvm/CodeGen/Register.h"
1315
#include <utility>
1416

@@ -26,6 +28,26 @@ std::pair<Register, unsigned>
2628
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
2729
GISelKnownBits *KnownBits = nullptr,
2830
bool CheckNUW = false);
31+
32+
// Currently finds S32/S64 lane masks that can be declared as divergent by
33+
// uniformity analysis (all are phis at the moment).
34+
// These are defined as i32/i64 in some IR intrinsics (not as i1).
35+
// Tablegen forces(via telling that lane mask IR intrinsics are uniform) most of
36+
// S32/S64 lane masks to be uniform, as this results in them ending up with sgpr
37+
// reg class after instruction-select don't search for all of them.
38+
class IntrinsicLaneMaskAnalyzer {
39+
SmallDenseSet<Register, 8> S32S64LaneMask;
40+
MachineRegisterInfo &MRI;
41+
42+
public:
43+
IntrinsicLaneMaskAnalyzer(MachineFunction &MF);
44+
bool isS32S64LaneMask(Register Reg) const;
45+
46+
private:
47+
void initLaneMaskIntrinsics(MachineFunction &MF);
48+
// This will not be needed when we turn off LCSSA for global-isel.
49+
void findLCSSAPhi(Register Reg);
50+
};
2951
}
3052
}
3153

llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp

Lines changed: 238 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,21 @@
1616
//===----------------------------------------------------------------------===//
1717

1818
#include "AMDGPU.h"
19+
#include "AMDGPUGlobalISelUtils.h"
20+
#include "GCNSubtarget.h"
21+
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
22+
#include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h"
23+
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
1924
#include "llvm/CodeGen/MachineFunctionPass.h"
25+
#include "llvm/CodeGen/MachineInstr.h"
26+
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
27+
#include "llvm/CodeGen/TargetPassConfig.h"
2028
#include "llvm/InitializePasses.h"
2129

2230
#define DEBUG_TYPE "amdgpu-regbankselect"
2331

2432
using namespace llvm;
33+
using namespace AMDGPU;
2534

2635
namespace {
2736

@@ -40,6 +49,9 @@ class AMDGPURegBankSelect : public MachineFunctionPass {
4049
}
4150

4251
// Declares the analyses this pass requires: the target pass configuration,
// the GlobalISel CSE analysis wrapper and machine uniformity analysis. The
// base-class requirements are appended afterwards.
void getAnalysisUsage(AnalysisUsage &AU) const override {
  AU.addRequired<TargetPassConfig>();
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addRequired<MachineUniformityAnalysisPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}
4557

@@ -55,6 +67,9 @@ class AMDGPURegBankSelect : public MachineFunctionPass {
5567

5668
// Pass registration. The dependency list mirrors the analyses requested in
// getAnalysisUsage().
INITIALIZE_PASS_BEGIN(AMDGPURegBankSelect, DEBUG_TYPE,
                      "AMDGPU Register Bank Select", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
INITIALIZE_PASS_END(AMDGPURegBankSelect, DEBUG_TYPE,
                    "AMDGPU Register Bank Select", false, false)
6075

@@ -66,9 +81,232 @@ FunctionPass *llvm::createAMDGPURegBankSelectPass() {
6681
return new AMDGPURegBankSelect();
6782
}
6883

84+
class RegBankSelectHelper {
85+
MachineIRBuilder &B;
86+
MachineRegisterInfo &MRI;
87+
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA;
88+
const MachineUniformityInfo &MUI;
89+
const SIRegisterInfo &TRI;
90+
const RegisterBank *SgprRB;
91+
const RegisterBank *VgprRB;
92+
const RegisterBank *VccRB;
93+
94+
public:
95+
RegBankSelectHelper(MachineIRBuilder &B,
96+
AMDGPU::IntrinsicLaneMaskAnalyzer &ILMA,
97+
const MachineUniformityInfo &MUI,
98+
const SIRegisterInfo &TRI, const RegisterBankInfo &RBI)
99+
: B(B), MRI(*B.getMRI()), ILMA(ILMA), MUI(MUI), TRI(TRI),
100+
SgprRB(&RBI.getRegBank(AMDGPU::SGPRRegBankID)),
101+
VgprRB(&RBI.getRegBank(AMDGPU::VGPRRegBankID)),
102+
VccRB(&RBI.getRegBank(AMDGPU::VCCRegBankID)) {}
103+
104+
bool shouldRegBankSelect(MachineInstr &MI) {
105+
return MI.isPreISelOpcode() || MI.isCopy();
106+
}
107+
108+
// Temporal divergence copy: COPY to vgpr with implicit use of $exec inside of
109+
// the cycle
110+
// Note: uniformity analysis does not consider that registers with vgpr def
111+
// are divergent (you can have uniform value in vgpr).
112+
// - TODO: implicit use of $exec could be implemented as indicator that
113+
// instruction is divergent
114+
bool isTemporalDivergenceCopy(Register Reg) {
115+
MachineInstr *MI = MRI.getVRegDef(Reg);
116+
if (!MI->isCopy())
117+
return false;
118+
119+
for (auto Op : MI->implicit_operands()) {
120+
if (!Op.isReg())
121+
continue;
122+
123+
if (Op.getReg() == TRI.getExec()) {
124+
return true;
125+
}
126+
}
127+
128+
return false;
129+
}
130+
131+
void setRBDef(MachineInstr &MI, MachineOperand &DefOP,
132+
const RegisterBank *RB) {
133+
Register Reg = DefOP.getReg();
134+
135+
if (!MRI.getRegClassOrNull(Reg)) {
136+
MRI.setRegBank(Reg, *RB);
137+
return;
138+
}
139+
140+
// Register that already has Register class got it during pre-inst selection
141+
// of another instruction. Maybe cross bank copy was required so we insert a
142+
// copy that can be removed later. This simplifies post regbanklegalize
143+
// combiner and avoids need to special case some patterns.
144+
LLT Ty = MRI.getType(Reg);
145+
Register NewReg = MRI.createVirtualRegister({RB, Ty});
146+
DefOP.setReg(NewReg);
147+
148+
auto &MBB = *MI.getParent();
149+
B.setInsertPt(MBB, MBB.SkipPHIsAndLabels(std::next(MI.getIterator())));
150+
B.buildCopy(Reg, NewReg);
151+
152+
// The problem was discovered for uniform S1 that was used as both
153+
// lane mask(vcc) and regular sgpr S1.
154+
// - lane-mask(vcc) use was by si_if, this use is divergent and requires
155+
// non-trivial sgpr-S1-to-vcc copy. But pre-inst-selection of si_if sets
156+
// sreg_64_xexec(S1) on def of uniform S1 making it lane-mask.
157+
// - the regular sgpr S1(uniform) instruction is now broken since
158+
// it uses sreg_64_xexec(S1) which is divergent.
159+
160+
// Replace virtual registers with register class on generic instructions
161+
// uses with virtual registers with register bank.
162+
SmallVector<MachineOperand *, 4> RegUsesOnGInstrs;
163+
for (auto &UseMI : MRI.use_instructions(Reg)) {
164+
if (shouldRegBankSelect(UseMI)) {
165+
for (MachineOperand &Op : UseMI.operands()) {
166+
if (Op.isReg() && Op.getReg() == Reg)
167+
RegUsesOnGInstrs.push_back(&Op);
168+
}
169+
}
170+
}
171+
for (MachineOperand *Op : RegUsesOnGInstrs) {
172+
Op->setReg(NewReg);
173+
}
174+
}
175+
176+
Register tryGetVReg(MachineOperand &Op) {
177+
if (!Op.isReg())
178+
return {};
179+
180+
Register Reg = Op.getReg();
181+
if (!Reg.isVirtual())
182+
return {};
183+
184+
return Reg;
185+
}
186+
187+
void assignBanksOnDefs(MachineInstr &MI) {
188+
if (!shouldRegBankSelect(MI))
189+
return;
190+
191+
for (MachineOperand &DefOP : MI.defs()) {
192+
Register DefReg = tryGetVReg(DefOP);
193+
if (!DefReg.isValid())
194+
continue;
195+
196+
// Copies can have register class on def registers.
197+
if (MI.isCopy() && MRI.getRegClassOrNull(DefReg)) {
198+
continue;
199+
}
200+
201+
if (MUI.isUniform(DefReg) || ILMA.isS32S64LaneMask(DefReg)) {
202+
setRBDef(MI, DefOP, SgprRB);
203+
} else {
204+
if (MRI.getType(DefReg) == LLT::scalar(1))
205+
setRBDef(MI, DefOP, VccRB);
206+
else
207+
setRBDef(MI, DefOP, VgprRB);
208+
}
209+
}
210+
}
211+
212+
void constrainRBUse(MachineInstr &MI, MachineOperand &UseOP,
213+
const RegisterBank *RB) {
214+
Register Reg = UseOP.getReg();
215+
216+
LLT Ty = MRI.getType(Reg);
217+
Register NewReg = MRI.createVirtualRegister({RB, Ty});
218+
UseOP.setReg(NewReg);
219+
220+
if (MI.isPHI()) {
221+
auto DefMI = MRI.getVRegDef(Reg)->getIterator();
222+
MachineBasicBlock *DefMBB = DefMI->getParent();
223+
B.setInsertPt(*DefMBB, DefMBB->SkipPHIsAndLabels(std::next(DefMI)));
224+
} else {
225+
B.setInstr(MI);
226+
}
227+
228+
B.buildCopy(NewReg, Reg);
229+
}
230+
231+
void constrainBanksOnUses(MachineInstr &MI) {
232+
if (!shouldRegBankSelect(MI))
233+
return;
234+
235+
// Copies can have register class on use registers.
236+
if (MI.isCopy())
237+
return;
238+
239+
for (MachineOperand &UseOP : MI.uses()) {
240+
auto UseReg = tryGetVReg(UseOP);
241+
if (!UseReg.isValid())
242+
continue;
243+
244+
// UseReg already has register bank.
245+
if (MRI.getRegBankOrNull(UseReg))
246+
continue;
247+
248+
if (!isTemporalDivergenceCopy(UseReg) &&
249+
(MUI.isUniform(UseReg) || ILMA.isS32S64LaneMask(UseReg))) {
250+
constrainRBUse(MI, UseOP, SgprRB);
251+
} else {
252+
if (MRI.getType(UseReg) == LLT::scalar(1))
253+
constrainRBUse(MI, UseOP, VccRB);
254+
else
255+
constrainRBUse(MI, UseOP, VgprRB);
256+
}
257+
}
258+
}
259+
};
260+
69261
// Runs register bank selection on MF in two sweeps: first every def on a G_
// instruction (and on copies without bank or class on the def) receives a
// register bank, then every use on a G_ instruction is constrained to a
// register with a register bank. Returns false without touching MF when
// instruction selection has already failed, true otherwise.
bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
  const MachineFunctionProperties &Props = MF.getProperties();
  if (Props.hasProperty(MachineFunctionProperties::Property::FailedISel))
    return false;

  // Setup the instruction builder with CSE.
  const TargetPassConfig &PassConfig = getAnalysis<TargetPassConfig>();
  GISelCSEAnalysisWrapper &CSEWrapper =
      getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
  GISelCSEInfo &CSEInfo = CSEWrapper.get(PassConfig.getCSEConfig());
  GISelObserverWrapper Observer;
  Observer.addObserver(&CSEInfo);

  CSEMIRBuilder Builder(MF);
  Builder.setCSEInfo(&CSEInfo);
  Builder.setChangeObserver(Observer);

  // Keep the observer installed on MF for the duration of this function.
  RAIIDelegateInstaller DelegateInstaller(MF, &Observer);
  RAIIMFObserverInstaller MFObserverInstaller(MF, Observer);

  IntrinsicLaneMaskAnalyzer LaneMasks(MF);
  MachineUniformityInfo &Uniformity =
      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
  const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
  RegBankSelectHelper Helper(Builder, LaneMasks, Uniformity,
                             *Subtarget.getRegisterInfo(),
                             *Subtarget.getRegBankInfo());

  // Assign register banks to ALL def registers on G_ instructions.
  // Same for copies if they have no register bank or class on def.
  for (MachineBasicBlock &Block : MF)
    for (MachineInstr &Inst : Block)
      Helper.assignBanksOnDefs(Inst);

  // At this point all virtual registers have register class or bank
  // - Defs of G_ instructions have register banks.
  // - Defs and uses of inst-selected instructions have register class.
  // - Defs and uses of copies can have either register class or bank
  // and most notably:
  // - Uses of G_ instructions can have either register class or bank.

  // Reassign uses of G_ instructions to only have register banks.
  for (MachineBasicBlock &Block : MF)
    for (MachineInstr &Inst : Block)
      Helper.constrainBanksOnUses(Inst);

  // Defs and uses of G_ instructions have register banks exclusively.
  return true;
}

0 commit comments

Comments
 (0)