Skip to content

Commit 13fa684

Browse files
AMDGPU/GlobalISelDivergenceLowering: select divergent i1 phis
Implement PhiLoweringHelper for GlobalISel in DivergenceLoweringHelper. Use machine uniformity analysis to find divergent i1 phis and select them as lane mask phis in the same way SILowerI1Copies selects VReg_1 phis. Note that divergent i1 phis include phis created by LCSSA, and all cases of uses outside of a cycle are actually covered by "lowering LCSSA phis". GlobalISel lane masks are registers with an sgpr register class and S1 LLT. TODO: The general goal is that instructions created in this pass are fully instruction-selected, so that selection of lane mask phis is not split across multiple passes. Patch 3 from: #73337
1 parent 4ea1994 commit 13fa684

21 files changed

+831
-259
lines changed

llvm/include/llvm/CodeGen/MachineRegisterInfo.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -752,6 +752,17 @@ class MachineRegisterInfo {
752752
Register createVirtualRegister(const TargetRegisterClass *RegClass,
753753
StringRef Name = "");
754754

755+
/// All available attributes a virtual register can have.
756+
struct RegisterAttributes {
757+
/// Register class or register bank of the register. Held by pointer; the
/// createVirtualRegister overload taking RegisterAttributes dereferences it.
const RegClassOrRegBank *RCOrRB;
758+
/// Type passed to setType() for the newly created register.
LLT Ty;
759+
};
760+
761+
/// createVirtualRegister - Create and return a new virtual register in the
762+
/// function with the specified register attributes.
763+
Register createVirtualRegister(RegisterAttributes RegAttr,
764+
StringRef Name = "");
765+
755766
/// Create and return a new virtual register in the function with the same
756767
/// attributes as the given register.
757768
Register cloneVirtualRegister(Register VReg, StringRef Name = "");

llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,25 @@ MachineUniformityInfo computeMachineUniformityInfo(
3232
MachineFunction &F, const MachineCycleInfo &cycleInfo,
3333
const MachineDomTree &domTree, bool HasBranchDivergence);
3434

35+
/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
36+
class MachineUniformityAnalysisPass : public MachineFunctionPass {
37+
// Uniformity info stored in the pass and handed out by getUniformityInfo().
MachineUniformityInfo UI;
38+
39+
public:
40+
static char ID;
41+
42+
MachineUniformityAnalysisPass();
43+
44+
/// Mutable and const access to the stored MachineUniformityInfo.
MachineUniformityInfo &getUniformityInfo() { return UI; }
45+
const MachineUniformityInfo &getUniformityInfo() const { return UI; }
46+
47+
// MachineFunctionPass overrides.
bool runOnMachineFunction(MachineFunction &F) override;
48+
void getAnalysisUsage(AnalysisUsage &AU) const override;
49+
void print(raw_ostream &OS, const Module *M = nullptr) const override;
50+
51+
// TODO: verify analysis
52+
};
53+
3554
} // namespace llvm
3655

3756
#endif // LLVM_CODEGEN_MACHINEUNIFORMITYANALYSIS_H

llvm/lib/CodeGen/MachineRegisterInfo.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,17 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass,
167167
return Reg;
168168
}
169169

170+
/// createVirtualRegister - Create and return a new virtual register in the
171+
/// function with the specified register attributes.
///
/// \param RegAttr register class/bank (via RegAttr.RCOrRB) and type
///        (RegAttr.Ty) to give the new register.
/// \param Name forwarded to createIncompleteVirtualRegister.
/// \returns the newly created register.
///
/// NOTE(review): RegAttr.RCOrRB is dereferenced unconditionally below, so a
/// null pointer here is not handled -- confirm all callers pass a valid one.
172+
Register MachineRegisterInfo::createVirtualRegister(RegisterAttributes RegAttr,
173+
StringRef Name) {
174+
Register Reg = createIncompleteVirtualRegister(Name);
175+
// Copy the pointed-to register class / bank into the vreg's info slot.
VRegInfo[Reg].first = *RegAttr.RCOrRB;
176+
setType(Reg, RegAttr.Ty);
177+
// Class/bank and type are both set before noteNewVirtualRegister runs.
noteNewVirtualRegister(Reg);
178+
return Reg;
179+
}
180+
170181
Register MachineRegisterInfo::cloneVirtualRegister(Register VReg,
171182
StringRef Name) {
172183
Register Reg = createIncompleteVirtualRegister(Name);

llvm/lib/CodeGen/MachineUniformityAnalysis.cpp

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -165,25 +165,6 @@ MachineUniformityInfo llvm::computeMachineUniformityInfo(
165165

166166
namespace {
167167

168-
/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
169-
class MachineUniformityAnalysisPass : public MachineFunctionPass {
170-
MachineUniformityInfo UI;
171-
172-
public:
173-
static char ID;
174-
175-
MachineUniformityAnalysisPass();
176-
177-
MachineUniformityInfo &getUniformityInfo() { return UI; }
178-
const MachineUniformityInfo &getUniformityInfo() const { return UI; }
179-
180-
bool runOnMachineFunction(MachineFunction &F) override;
181-
void getAnalysisUsage(AnalysisUsage &AU) const override;
182-
void print(raw_ostream &OS, const Module *M = nullptr) const override;
183-
184-
// TODO: verify analysis
185-
};
186-
187168
class MachineUniformityInfoPrinterPass : public MachineFunctionPass {
188169
public:
189170
static char ID;

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp

Lines changed: 151 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@
1616
//===----------------------------------------------------------------------===//
1717

1818
#include "AMDGPU.h"
19+
#include "SILowerI1Copies.h"
1920
#include "llvm/CodeGen/MachineFunctionPass.h"
21+
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
22+
#include "llvm/InitializePasses.h"
2023

2124
#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"
2225

@@ -42,14 +45,152 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
4245

4346
void getAnalysisUsage(AnalysisUsage &AU) const override {
4447
AU.setPreservesCFG();
48+
AU.addRequired<MachineDominatorTree>();
49+
AU.addRequired<MachinePostDominatorTree>();
50+
AU.addRequired<MachineUniformityAnalysisPass>();
4551
MachineFunctionPass::getAnalysisUsage(AU);
4652
}
4753
};
4854

55+
// PhiLoweringHelper implementation used by the GlobalISel divergence lowering
// pass. It additionally carries a MachineUniformityInfo pointer, which
// getCandidatesForLowering queries to pick divergent s1 phis.
class DivergenceLoweringHelper : public PhiLoweringHelper {
56+
public:
57+
DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
58+
MachinePostDominatorTree *PDT,
59+
MachineUniformityInfo *MUI);
60+
61+
private:
62+
// Uniformity analysis result; set by the constructor, not owned here.
MachineUniformityInfo *MUI = nullptr;
63+
64+
public:
65+
// Overrides of the PhiLoweringHelper hooks; each is defined out of line.
void markAsLaneMask(Register DstReg) const override;
66+
void getCandidatesForLowering(
67+
SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
68+
void collectIncomingValuesFromPhi(
69+
const MachineInstr *MI,
70+
SmallVectorImpl<Incoming> &Incomings) const override;
71+
void replaceDstReg(Register NewReg, Register OldReg,
72+
MachineBasicBlock *MBB) override;
73+
void buildMergeLaneMasks(MachineBasicBlock &MBB,
74+
MachineBasicBlock::iterator I, const DebugLoc &DL,
75+
Register DstReg, Register PrevReg,
76+
Register CurReg) override;
77+
void constrainAsLaneMask(Incoming &In) override;
78+
};
79+
80+
// Forwards MF, DT and PDT to the PhiLoweringHelper base and stores MUI in the
// member of the same name.
DivergenceLoweringHelper::DivergenceLoweringHelper(
81+
MachineFunction *MF, MachineDominatorTree *DT,
82+
MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
83+
: PhiLoweringHelper(MF, DT, PDT), MUI(MUI) {}
84+
85+
// _(s1) -> SReg_32/64(s1)
//
// Gives DstReg the register class returned by ST->getBoolRC(). DstReg must
// already have type s1 (asserted).
86+
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
87+
assert(MRI->getType(DstReg) == LLT::scalar(1));
88+
89+
// DstReg already has a register class: constrain it rather than overwrite.
if (MRI->getRegClassOrNull(DstReg)) {
90+
MRI->constrainRegClass(DstReg, ST->getBoolRC());
91+
return;
92+
}
93+
94+
// No register class assigned yet: set it directly.
MRI->setRegClass(DstReg, ST->getBoolRC());
95+
}
96+
97+
// Appends to Vreg1Phis every phi in the function whose result (operand 0) has
// type s1 and is reported divergent by the uniformity info. Blocks are visited
// in function order and phis in block order; Vreg1Phis is only appended to.
void DivergenceLoweringHelper::getCandidatesForLowering(
98+
SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
99+
LLT S1 = LLT::scalar(1);
100+
101+
// Add divergent i1 phis to the list
102+
for (MachineBasicBlock &MBB : *MF) {
103+
for (MachineInstr &MI : MBB.phis()) {
104+
Register Dst = MI.getOperand(0).getReg();
105+
if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
106+
Vreg1Phis.push_back(&MI);
107+
}
108+
}
109+
}
110+
111+
// Appends one Incoming entry to Incomings per (register, block) operand pair
// of the phi MI.
void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
112+
const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
113+
// Operand 0 is skipped; from index 1 on, operand i is the incoming register
// and operand i + 1 the block it comes from. The third Incoming argument is
// a default-constructed Register.
for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
114+
Incomings.emplace_back(MI->getOperand(i).getReg(),
115+
MI->getOperand(i + 1).getMBB(), Register());
116+
}
117+
}
118+
119+
// Builds `OldReg = COPY NewReg` in MBB at the position returned by
// MBB->getFirstNonPHI(). `{}` is passed in the debug-location argument slot.
void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
120+
MachineBasicBlock *MBB) {
121+
BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
122+
.addReg(NewReg);
123+
}
124+
125+
// Get pointers to build instruction just after MI (skips phis if needed)
//
// Returns {parent block of MI, insertion iterator}. The iterator is obtained
// by taking the instruction after MI and passing it through
// SkipPHIsAndLabels. MI is dereferenced without a null check.
126+
static std::pair<MachineBasicBlock *, MachineBasicBlock::iterator>
127+
getInsertAfterPtrs(MachineInstr *MI) {
128+
MachineBasicBlock *InsertMBB = MI->getParent();
129+
return {InsertMBB,
130+
InsertMBB->SkipPHIsAndLabels(std::next(MI->getIterator()))};
131+
}
132+
133+
// bb.previous
134+
// %PrevReg = ...
135+
//
136+
// bb.current
137+
// %CurReg = ...
138+
//
139+
// %DstReg - not defined
140+
//
141+
// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
142+
//
143+
// bb.previous
144+
// %PrevReg = ...
145+
// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
146+
//
147+
// bb.current
148+
// %CurReg = ...
149+
// %CurRegCopy:sreg_32(s1) = COPY %CurReg
150+
// ...
151+
// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes to 0
152+
// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0
153+
// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg
154+
//
155+
// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg
156+
// Emits the merge of PrevReg and CurReg into DstReg. The two COPYs are placed
// right after the defining instructions of PrevReg / CurReg (in their blocks);
// the AndN2, And and Or instructions are inserted in MBB before I. All new
// instructions use DL as debug location.
//
// NOTE(review): the results of MRI->getVRegDef() are passed straight to
// getInsertAfterPtrs, which dereferences them -- presumably both registers
// always have a defining instruction here; confirm.
void DivergenceLoweringHelper::buildMergeLaneMasks(
157+
MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
158+
Register DstReg, Register PrevReg, Register CurReg) {
159+
// DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
160+
// TODO: check if inputs are constants or results of a compare.
161+
162+
// Copy PrevReg into a new lane-mask register just after its definition.
Register PrevRegCopy = createLaneMaskReg(MRI, LaneMaskRegAttrs);
163+
auto [PrevMBB, AfterPrevReg] = getInsertAfterPtrs(MRI->getVRegDef(PrevReg));
164+
BuildMI(*PrevMBB, AfterPrevReg, DL, TII->get(AMDGPU::COPY), PrevRegCopy)
165+
.addReg(PrevReg);
166+
// PrevMaskedReg = PrevRegCopy & !EXEC
Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
167+
BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
168+
.addReg(PrevRegCopy)
169+
.addReg(ExecReg);
170+
171+
// Same for CurReg: copy after its definition, then mask with EXEC.
Register CurRegCopy = createLaneMaskReg(MRI, LaneMaskRegAttrs);
172+
auto [CurMBB, AfterCurReg] = getInsertAfterPtrs(MRI->getVRegDef(CurReg));
173+
BuildMI(*CurMBB, AfterCurReg, DL, TII->get(AMDGPU::COPY), CurRegCopy)
174+
.addReg(CurReg);
175+
// CurMaskedReg = EXEC & CurRegCopy
Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
176+
BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
177+
.addReg(ExecReg)
178+
.addReg(CurRegCopy);
179+
180+
// DstReg = PrevMaskedReg | CurMaskedReg
BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
181+
.addReg(PrevMaskedReg)
182+
.addReg(CurMaskedReg);
183+
}
184+
185+
// No-op in this helper: In is not read or modified.
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { return; }
186+
49187
} // End anonymous namespace.
50188

51189
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
52190
"AMDGPU GlobalISel divergence lowering", false, false)
191+
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
192+
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
193+
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
53194
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
54195
"AMDGPU GlobalISel divergence lowering", false, false)
55196

@@ -64,5 +205,14 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
64205

65206
bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
66207
MachineFunction &MF) {
67-
return false;
208+
MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
209+
MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>();
210+
MachineUniformityInfo &MUI =
211+
getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
212+
213+
DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
214+
215+
bool Changed = false;
216+
Changed |= Helper.lowerPhis();
217+
return Changed;
68218
}

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
210210
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
211211
const Register DefReg = I.getOperand(0).getReg();
212212
const LLT DefTy = MRI->getType(DefReg);
213+
213214
if (DefTy == LLT::scalar(1)) {
214215
if (!AllowRiskySelect) {
215216
LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
@@ -3552,8 +3553,6 @@ bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
35523553
}
35533554

35543555
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3555-
if (I.isPHI())
3556-
return selectPHI(I);
35573556

35583557
if (!I.isPreISelOpcode()) {
35593558
if (I.isCopy())
@@ -3696,6 +3695,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
36963695
return selectWaveAddress(I);
36973696
case AMDGPU::G_STACKRESTORE:
36983697
return selectStackRestore(I);
3698+
case AMDGPU::G_PHI:
3699+
return selectPHI(I);
36993700
default:
37003701
return selectImpl(I, *CoverageInfo);
37013702
}

llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@
3131

3232
using namespace llvm;
3333

34-
static Register insertUndefLaneMask(MachineBasicBlock *MBB,
35-
MachineRegisterInfo *MRI,
36-
Register LaneMaskRegAttrs);
34+
static Register
35+
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
36+
MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs);
3737

3838
namespace {
3939

@@ -78,7 +78,7 @@ class Vreg1LoweringHelper : public PhiLoweringHelper {
7878
MachineBasicBlock::iterator I, const DebugLoc &DL,
7979
Register DstReg, Register PrevReg,
8080
Register CurReg) override;
81-
void constrainIncomingRegisterTakenAsIs(Incoming &In) override;
81+
void constrainAsLaneMask(Incoming &In) override;
8282

8383
bool lowerCopiesFromI1();
8484
bool lowerCopiesToI1();
@@ -304,7 +304,8 @@ class LoopFinder {
304304
/// blocks, so that the SSA updater doesn't have to search all the way to the
305305
/// function entry.
306306
void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
307-
MachineRegisterInfo &MRI, Register LaneMaskRegAttrs,
307+
MachineRegisterInfo &MRI,
308+
MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs,
308309
ArrayRef<Incoming> Incomings = {}) {
309310
assert(LoopLevel < CommonDominators.size());
310311

@@ -411,14 +412,15 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
411412
return new SILowerI1Copies();
412413
}
413414

414-
Register llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
415-
Register LaneMaskRegAttrs) {
416-
return MRI->cloneVirtualRegister(LaneMaskRegAttrs);
415+
Register llvm::createLaneMaskReg(
416+
MachineRegisterInfo *MRI,
417+
MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs) {
418+
return MRI->createVirtualRegister(LaneMaskRegAttrs);
417419
}
418420

419-
static Register insertUndefLaneMask(MachineBasicBlock *MBB,
420-
MachineRegisterInfo *MRI,
421-
Register LaneMaskRegAttrs) {
421+
static Register
422+
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
423+
MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs) {
422424
MachineFunction &MF = *MBB->getParent();
423425
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
424426
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -619,7 +621,7 @@ bool PhiLoweringHelper::lowerPhis() {
619621
for (auto &Incoming : Incomings) {
620622
MachineBasicBlock &IMBB = *Incoming.Block;
621623
if (PIA.isSource(IMBB)) {
622-
constrainIncomingRegisterTakenAsIs(Incoming);
624+
constrainAsLaneMask(Incoming);
623625
SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
624626
} else {
625627
Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
@@ -911,6 +913,4 @@ void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
911913
}
912914
}
913915

914-
void Vreg1LoweringHelper::constrainIncomingRegisterTakenAsIs(Incoming &In) {
915-
return;
916-
}
916+
void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) { return; }

0 commit comments

Comments
 (0)