Skip to content

Commit 91ddcba

Browse files
AMDGPU/GlobalISelDivergenceLowering: select divergent i1 phis (#78482)
Implement PhiLoweringHelper for GlobalISel in DivergenceLoweringHelper. Use machine uniformity analysis to find divergent i1 phis and select them as lane mask phis in the same way SILowerI1Copies selects VReg_1 phis. Note that divergent i1 phis include phis created by LCSSA, and all cases of uses outside of a cycle are actually covered by "lowering LCSSA phis". GlobalISel lane masks are registers with an sgpr register class and S1 LLT. TODO: The general goal is that instructions created in this pass are fully instruction-selected, so that selection of lane mask phis is not split across multiple passes. Patch 3 from: #73337
1 parent 383d488 commit 91ddcba

21 files changed

+824
-259
lines changed

llvm/include/llvm/CodeGen/MachineRegisterInfo.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -752,6 +752,17 @@ class MachineRegisterInfo {
752752
Register createVirtualRegister(const TargetRegisterClass *RegClass,
753753
StringRef Name = "");
754754

755+
/// All available attributes a virtual register can have.
756+
struct RegisterAttributes {
757+
const RegClassOrRegBank *RCOrRB;
758+
LLT Ty;
759+
};
760+
761+
/// createVirtualRegister - Create and return a new virtual register in the
762+
/// function with the specified register attributes.
763+
Register createVirtualRegister(RegisterAttributes RegAttr,
764+
StringRef Name = "");
765+
755766
/// Create and return a new virtual register in the function with the same
756767
/// attributes as the given register.
757768
Register cloneVirtualRegister(Register VReg, StringRef Name = "");

llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,25 @@ MachineUniformityInfo computeMachineUniformityInfo(
3232
MachineFunction &F, const MachineCycleInfo &cycleInfo,
3333
const MachineDomTree &domTree, bool HasBranchDivergence);
3434

35+
/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
36+
class MachineUniformityAnalysisPass : public MachineFunctionPass {
37+
MachineUniformityInfo UI;
38+
39+
public:
40+
static char ID;
41+
42+
MachineUniformityAnalysisPass();
43+
44+
MachineUniformityInfo &getUniformityInfo() { return UI; }
45+
const MachineUniformityInfo &getUniformityInfo() const { return UI; }
46+
47+
bool runOnMachineFunction(MachineFunction &F) override;
48+
void getAnalysisUsage(AnalysisUsage &AU) const override;
49+
void print(raw_ostream &OS, const Module *M = nullptr) const override;
50+
51+
// TODO: verify analysis
52+
};
53+
3554
} // namespace llvm
3655

3756
#endif // LLVM_CODEGEN_MACHINEUNIFORMITYANALYSIS_H

llvm/lib/CodeGen/MachineRegisterInfo.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,17 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass,
167167
return Reg;
168168
}
169169

170+
/// createVirtualRegister - Create and return a new virtual register in the
171+
/// function with the specified register attributes.
172+
Register MachineRegisterInfo::createVirtualRegister(RegisterAttributes RegAttr,
173+
StringRef Name) {
174+
Register Reg = createIncompleteVirtualRegister(Name);
175+
VRegInfo[Reg].first = *RegAttr.RCOrRB;
176+
setType(Reg, RegAttr.Ty);
177+
noteNewVirtualRegister(Reg);
178+
return Reg;
179+
}
180+
170181
Register MachineRegisterInfo::cloneVirtualRegister(Register VReg,
171182
StringRef Name) {
172183
Register Reg = createIncompleteVirtualRegister(Name);

llvm/lib/CodeGen/MachineUniformityAnalysis.cpp

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -165,25 +165,6 @@ MachineUniformityInfo llvm::computeMachineUniformityInfo(
165165

166166
namespace {
167167

168-
/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
169-
class MachineUniformityAnalysisPass : public MachineFunctionPass {
170-
MachineUniformityInfo UI;
171-
172-
public:
173-
static char ID;
174-
175-
MachineUniformityAnalysisPass();
176-
177-
MachineUniformityInfo &getUniformityInfo() { return UI; }
178-
const MachineUniformityInfo &getUniformityInfo() const { return UI; }
179-
180-
bool runOnMachineFunction(MachineFunction &F) override;
181-
void getAnalysisUsage(AnalysisUsage &AU) const override;
182-
void print(raw_ostream &OS, const Module *M = nullptr) const override;
183-
184-
// TODO: verify analysis
185-
};
186-
187168
class MachineUniformityInfoPrinterPass : public MachineFunctionPass {
188169
public:
189170
static char ID;

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp

Lines changed: 144 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,11 @@
1616
//===----------------------------------------------------------------------===//
1717

1818
#include "AMDGPU.h"
19+
#include "SILowerI1Copies.h"
20+
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
1921
#include "llvm/CodeGen/MachineFunctionPass.h"
22+
#include "llvm/CodeGen/MachineUniformityAnalysis.h"
23+
#include "llvm/InitializePasses.h"
2024

2125
#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"
2226

@@ -42,14 +46,146 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
4246

4347
void getAnalysisUsage(AnalysisUsage &AU) const override {
4448
AU.setPreservesCFG();
49+
AU.addRequired<MachineDominatorTree>();
50+
AU.addRequired<MachinePostDominatorTree>();
51+
AU.addRequired<MachineUniformityAnalysisPass>();
4552
MachineFunctionPass::getAnalysisUsage(AU);
4653
}
4754
};
4855

56+
class DivergenceLoweringHelper : public PhiLoweringHelper {
57+
public:
58+
DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
59+
MachinePostDominatorTree *PDT,
60+
MachineUniformityInfo *MUI);
61+
62+
private:
63+
MachineUniformityInfo *MUI = nullptr;
64+
MachineIRBuilder B;
65+
Register buildRegCopyToLaneMask(Register Reg);
66+
67+
public:
68+
void markAsLaneMask(Register DstReg) const override;
69+
void getCandidatesForLowering(
70+
SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
71+
void collectIncomingValuesFromPhi(
72+
const MachineInstr *MI,
73+
SmallVectorImpl<Incoming> &Incomings) const override;
74+
void replaceDstReg(Register NewReg, Register OldReg,
75+
MachineBasicBlock *MBB) override;
76+
void buildMergeLaneMasks(MachineBasicBlock &MBB,
77+
MachineBasicBlock::iterator I, const DebugLoc &DL,
78+
Register DstReg, Register PrevReg,
79+
Register CurReg) override;
80+
void constrainAsLaneMask(Incoming &In) override;
81+
};
82+
83+
/// Forward the function and both dominator trees to PhiLoweringHelper, keep
/// the uniformity info pointer, and construct the builder for \p MF.
DivergenceLoweringHelper::DivergenceLoweringHelper(
    MachineFunction *MF, MachineDominatorTree *DT,
    MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
    : PhiLoweringHelper(MF, DT, PDT),
      MUI(MUI),
      B(*MF) {}
87+
88+
// _(s1) -> SReg_32/64(s1)
89+
void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
90+
assert(MRI->getType(DstReg) == LLT::scalar(1));
91+
92+
if (MRI->getRegClassOrNull(DstReg)) {
93+
if (MRI->constrainRegClass(DstReg, ST->getBoolRC()))
94+
return;
95+
llvm_unreachable("Failed to constrain register class");
96+
}
97+
98+
MRI->setRegClass(DstReg, ST->getBoolRC());
99+
}
100+
101+
/// Append every phi in the function whose result is an s1 value that
/// uniformity analysis reports as divergent to \p Vreg1Phis.
void DivergenceLoweringHelper::getCandidatesForLowering(
    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
  const LLT BoolTy = LLT::scalar(1);

  for (MachineBasicBlock &Block : *MF) {
    for (MachineInstr &Phi : Block.phis()) {
      const Register PhiDst = Phi.getOperand(0).getReg();

      // Only s1 results are of interest ...
      if (MRI->getType(PhiDst) != BoolTy)
        continue;
      // ... and only those that are divergent.
      if (!MUI->isDivergent(PhiDst))
        continue;

      Vreg1Phis.push_back(&Phi);
    }
  }
}
114+
115+
void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
116+
const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
117+
for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
118+
Incomings.emplace_back(MI->getOperand(i).getReg(),
119+
MI->getOperand(i + 1).getMBB(), Register());
120+
}
121+
}
122+
123+
/// Define \p OldReg as a COPY of \p NewReg, inserted in \p MBB in front of its
/// first non-phi instruction.
void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
                                             MachineBasicBlock *MBB) {
  MachineBasicBlock::iterator InsertPt = MBB->getFirstNonPHI();
  BuildMI(*MBB, InsertPt, {}, TII->get(AMDGPU::COPY), OldReg).addReg(NewReg);
}
128+
129+
// Copy Reg to new lane mask register, insert a copy after instruction that
130+
// defines Reg while skipping phis if needed.
131+
Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) {
132+
Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs);
133+
MachineInstr *Instr = MRI->getVRegDef(Reg);
134+
MachineBasicBlock *MBB = Instr->getParent();
135+
B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator())));
136+
B.buildCopy(LaneMask, Reg);
137+
return LaneMask;
138+
}
139+
140+
// bb.previous
141+
// %PrevReg = ...
142+
//
143+
// bb.current
144+
// %CurReg = ...
145+
//
146+
// %DstReg - not defined
147+
//
148+
// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
149+
//
150+
// bb.previous
151+
// %PrevReg = ...
152+
// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
153+
//
154+
// bb.current
155+
// %CurReg = ...
156+
// %CurRegCopy:sreg_32(s1) = COPY %CurReg
157+
// ...
158+
// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0
159+
// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0
160+
// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg
161+
//
162+
// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg
163+
void DivergenceLoweringHelper::buildMergeLaneMasks(
164+
MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
165+
Register DstReg, Register PrevReg, Register CurReg) {
166+
// DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
167+
// TODO: check if inputs are constants or results of a compare.
168+
169+
Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg);
170+
Register CurRegCopy = buildRegCopyToLaneMask(CurReg);
171+
Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
172+
Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
173+
174+
B.setInsertPt(MBB, I);
175+
B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
176+
B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
177+
B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
178+
}
179+
180+
// No-op: this helper does nothing to the incoming value.
void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {}
181+
49182
} // End anonymous namespace.
50183

51184
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
52185
"AMDGPU GlobalISel divergence lowering", false, false)
186+
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
187+
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
188+
INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
53189
INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
54190
"AMDGPU GlobalISel divergence lowering", false, false)
55191

@@ -64,5 +200,12 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
64200

65201
bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
66202
MachineFunction &MF) {
67-
return false;
203+
MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
204+
MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>();
205+
MachineUniformityInfo &MUI =
206+
getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
207+
208+
DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
209+
210+
return Helper.lowerPhis();
68211
}

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,7 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
210210
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
211211
const Register DefReg = I.getOperand(0).getReg();
212212
const LLT DefTy = MRI->getType(DefReg);
213+
213214
if (DefTy == LLT::scalar(1)) {
214215
if (!AllowRiskySelect) {
215216
LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
@@ -3552,8 +3553,6 @@ bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
35523553
}
35533554

35543555
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
3555-
if (I.isPHI())
3556-
return selectPHI(I);
35573556

35583557
if (!I.isPreISelOpcode()) {
35593558
if (I.isCopy())
@@ -3696,6 +3695,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
36963695
return selectWaveAddress(I);
36973696
case AMDGPU::G_STACKRESTORE:
36983697
return selectStackRestore(I);
3698+
case AMDGPU::G_PHI:
3699+
return selectPHI(I);
36993700
default:
37003701
return selectImpl(I, *CoverageInfo);
37013702
}

llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@
3131

3232
using namespace llvm;
3333

34-
static Register insertUndefLaneMask(MachineBasicBlock *MBB,
35-
MachineRegisterInfo *MRI,
36-
Register LaneMaskRegAttrs);
34+
static Register
35+
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
36+
MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs);
3737

3838
namespace {
3939

@@ -78,7 +78,7 @@ class Vreg1LoweringHelper : public PhiLoweringHelper {
7878
MachineBasicBlock::iterator I, const DebugLoc &DL,
7979
Register DstReg, Register PrevReg,
8080
Register CurReg) override;
81-
void constrainIncomingRegisterTakenAsIs(Incoming &In) override;
81+
void constrainAsLaneMask(Incoming &In) override;
8282

8383
bool lowerCopiesFromI1();
8484
bool lowerCopiesToI1();
@@ -304,7 +304,8 @@ class LoopFinder {
304304
/// blocks, so that the SSA updater doesn't have to search all the way to the
305305
/// function entry.
306306
void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
307-
MachineRegisterInfo &MRI, Register LaneMaskRegAttrs,
307+
MachineRegisterInfo &MRI,
308+
MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs,
308309
ArrayRef<Incoming> Incomings = {}) {
309310
assert(LoopLevel < CommonDominators.size());
310311

@@ -411,14 +412,15 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
411412
return new SILowerI1Copies();
412413
}
413414

414-
Register llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
415-
Register LaneMaskRegAttrs) {
416-
return MRI->cloneVirtualRegister(LaneMaskRegAttrs);
415+
/// Create a new virtual register in \p MRI using the attributes in
/// \p LaneMaskRegAttrs and return it.
Register llvm::createLaneMaskReg(
    MachineRegisterInfo *MRI,
    MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs) {
  Register NewLaneMask = MRI->createVirtualRegister(LaneMaskRegAttrs);
  return NewLaneMask;
}
}
418420

419-
static Register insertUndefLaneMask(MachineBasicBlock *MBB,
420-
MachineRegisterInfo *MRI,
421-
Register LaneMaskRegAttrs) {
421+
static Register
422+
insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI,
423+
MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs) {
422424
MachineFunction &MF = *MBB->getParent();
423425
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
424426
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -619,7 +621,7 @@ bool PhiLoweringHelper::lowerPhis() {
619621
for (auto &Incoming : Incomings) {
620622
MachineBasicBlock &IMBB = *Incoming.Block;
621623
if (PIA.isSource(IMBB)) {
622-
constrainIncomingRegisterTakenAsIs(Incoming);
624+
constrainAsLaneMask(Incoming);
623625
SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
624626
} else {
625627
Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
@@ -911,6 +913,4 @@ void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
911913
}
912914
}
913915

914-
void Vreg1LoweringHelper::constrainIncomingRegisterTakenAsIs(Incoming &In) {
915-
return;
916-
}
916+
void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {}

llvm/lib/Target/AMDGPU/SILowerI1Copies.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,9 @@ struct Incoming {
3131
: Reg(Reg), Block(Block), UpdatedReg(UpdatedReg) {}
3232
};
3333

34-
Register createLaneMaskReg(MachineRegisterInfo *MRI, Register LaneMaskRegAttrs);
34+
Register
35+
createLaneMaskReg(MachineRegisterInfo *MRI,
36+
MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs);
3537

3638
class PhiLoweringHelper {
3739
public:
@@ -47,7 +49,7 @@ class PhiLoweringHelper {
4749
MachineRegisterInfo *MRI = nullptr;
4850
const GCNSubtarget *ST = nullptr;
4951
const SIInstrInfo *TII = nullptr;
50-
Register LaneMaskRegAttrs;
52+
MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs;
5153

5254
#ifndef NDEBUG
5355
DenseSet<Register> PhiRegisters;
@@ -68,7 +70,8 @@ class PhiLoweringHelper {
6870
getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
6971

7072
void initializeLaneMaskRegisterAttributes(Register LaneMask) {
71-
LaneMaskRegAttrs = LaneMask;
73+
LaneMaskRegAttrs.RCOrRB = &MRI->getRegClassOrRegBank(LaneMask);
74+
LaneMaskRegAttrs.Ty = MRI->getType(LaneMask);
7275
}
7376

7477
bool isLaneMaskReg(Register Reg) const {
@@ -91,7 +94,7 @@ class PhiLoweringHelper {
9194
MachineBasicBlock::iterator I,
9295
const DebugLoc &DL, Register DstReg,
9396
Register PrevReg, Register CurReg) = 0;
94-
virtual void constrainIncomingRegisterTakenAsIs(Incoming &In) = 0;
97+
virtual void constrainAsLaneMask(Incoming &In) = 0;
9598
};
9699

97100
} // end namespace llvm

0 commit comments

Comments
 (0)