
Commit 376ef1c

AMDGPU/GlobalISelDivergenceLowering: select divergent i1 phis
Implement PhiLoweringHelper for GlobalISel in DivergenceLoweringHelper. Use machine uniformity analysis to find divergent i1 phis and select them as lane mask phis, in the same way SILowerI1Copies selects VReg_1 phis. Note that divergent i1 phis include phis created by LCSSA, and all cases of use outside the cycle are covered by "lowering LCSSA phis". GlobalISel lane masks are registers with an SGPR register class and S1 LLT.

TODO: The general goal is that instructions created in this pass are fully instruction-selected, so that selection of lane mask phis is not split across multiple passes.

patch 3 from: #73337
1 parent 4ea1994 commit 376ef1c

19 files changed: +796, -244 lines
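
For context on the "lane mask phis" mentioned in the commit message: on AMDGPU a divergent i1 value is kept as a wave-wide bitmask in a scalar register, one bit per lane, so lowering a divergent i1 phi means rewriting only the bits of the currently active lanes at the merge point. A minimal standalone C++ model of that semantics follows; all names and values are made up for illustration and none of this code is part of the commit.

// Standalone model of merging a divergent i1 phi as a wave32 lane mask.
// Illustrative only; see buildMergeLaneMasks in the diff below for the
// actual ANDN2/AND/OR instruction sequence.
#include <cstdint>
#include <cstdio>

// One bit per lane: bit N holds the i1 value of lane N.
using LaneMask32 = uint32_t;

// DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC)
LaneMask32 mergeLaneMasks(LaneMask32 Prev, LaneMask32 Cur, LaneMask32 Exec) {
  return (Prev & ~Exec) | (Cur & Exec);
}

int main() {
  LaneMask32 Exec = 0x0000000F; // lanes 0-3 are active at the merge point
  LaneMask32 Prev = 0x000000F0; // value the phi carried before the merge
  LaneMask32 Cur  = 0x00000005; // value just produced by the active lanes

  // Active lanes take their bit from Cur; inactive lanes keep Prev's bit.
  std::printf("merged = 0x%08x\n", mergeLaneMasks(Prev, Cur, Exec)); // 0xf5
  return 0;
}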

llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h

Lines changed: 19 additions & 0 deletions
@@ -32,6 +32,25 @@ MachineUniformityInfo computeMachineUniformityInfo(
     MachineFunction &F, const MachineCycleInfo &cycleInfo,
     const MachineDomTree &domTree, bool HasBranchDivergence);
 
+/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
+class MachineUniformityAnalysisPass : public MachineFunctionPass {
+  MachineUniformityInfo UI;
+
+public:
+  static char ID;
+
+  MachineUniformityAnalysisPass();
+
+  MachineUniformityInfo &getUniformityInfo() { return UI; }
+  const MachineUniformityInfo &getUniformityInfo() const { return UI; }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  void print(raw_ostream &OS, const Module *M = nullptr) const override;
+
+  // TODO: verify analysis
+};
+
 } // namespace llvm
 
 #endif // LLVM_CODEGEN_MACHINEUNIFORMITYANALYSIS_H
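
Moving this class from the .cpp into the header is what lets other legacy passes declare a dependency on the uniformity analysis and query its result, exactly as the new divergence-lowering pass does further down in this commit. A rough usage sketch, where MyMachinePass is a hypothetical pass and not part of the commit:

// Hedged sketch of consuming the legacy analysis now that the class is
// visible from MachineUniformityAnalysis.h; mirrors the pattern used by
// AMDGPUGlobalISelDivergenceLowering below.
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineUniformityAnalysis.h"

using namespace llvm;

namespace {
class MyMachinePass : public MachineFunctionPass {
public:
  static char ID;
  MyMachinePass() : MachineFunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineUniformityAnalysisPass>(); // run the analysis first
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    MachineUniformityInfo &MUI =
        getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
    // Example query: MUI.isDivergent(SomeReg) for a virtual register SomeReg.
    (void)MUI;
    return false;
  }
};
} // end anonymous namespace

char MyMachinePass::ID = 0;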

llvm/lib/CodeGen/MachineUniformityAnalysis.cpp

Lines changed: 0 additions & 19 deletions
@@ -165,25 +165,6 @@ MachineUniformityInfo llvm::computeMachineUniformityInfo(
 
 namespace {
 
-/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
-class MachineUniformityAnalysisPass : public MachineFunctionPass {
-  MachineUniformityInfo UI;
-
-public:
-  static char ID;
-
-  MachineUniformityAnalysisPass();
-
-  MachineUniformityInfo &getUniformityInfo() { return UI; }
-  const MachineUniformityInfo &getUniformityInfo() const { return UI; }
-
-  bool runOnMachineFunction(MachineFunction &F) override;
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-  void print(raw_ostream &OS, const Module *M = nullptr) const override;
-
-  // TODO: verify analysis
-};
-
 class MachineUniformityInfoPrinterPass : public MachineFunctionPass {
 public:
   static char ID;

llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp

Lines changed: 151 additions & 1 deletion
@@ -16,7 +16,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "SILowerI1Copies.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineUniformityAnalysis.h"
+#include "llvm/InitializePasses.h"
 
 #define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"
 
@@ -42,14 +45,152 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
+    AU.addRequired<MachineUniformityAnalysisPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
 
+class DivergenceLoweringHelper : public PhiLoweringHelper {
+public:
+  DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
+                           MachinePostDominatorTree *PDT,
+                           MachineUniformityInfo *MUI);
+
+private:
+  MachineUniformityInfo *MUI = nullptr;
+
+public:
+  void markAsLaneMask(Register DstReg) const override;
+  void getCandidatesForLowering(
+      SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
+  void collectIncomingValuesFromPhi(
+      const MachineInstr *MI,
+      SmallVectorImpl<Incoming> &Incomings) const override;
+  void replaceDstReg(Register NewReg, Register OldReg,
+                     MachineBasicBlock *MBB) override;
+  void buildMergeLaneMasks(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I, const DebugLoc &DL,
+                           Register DstReg, Register PrevReg,
+                           Register CurReg) override;
+  void constrainAsLaneMask(Incoming &In) override;
+};
+
+DivergenceLoweringHelper::DivergenceLoweringHelper(
+    MachineFunction *MF, MachineDominatorTree *DT,
+    MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
+    : PhiLoweringHelper(MF, DT, PDT), MUI(MUI) {}
+
+// _(s1) -> SReg_32/64(s1)
+void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const {
+  assert(MRI->getType(DstReg) == LLT::scalar(1));
+
+  if (MRI->getRegClassOrNull(DstReg)) {
+    MRI->constrainRegClass(DstReg, ST->getBoolRC());
+    return;
+  }
+
+  MRI->setRegClass(DstReg, ST->getBoolRC());
+}
+
+void DivergenceLoweringHelper::getCandidatesForLowering(
+    SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
+  LLT S1 = LLT::scalar(1);
+
+  // Add divergent i1 phis to the list
+  for (MachineBasicBlock &MBB : *MF) {
+    for (MachineInstr &MI : MBB.phis()) {
+      Register Dst = MI.getOperand(0).getReg();
+      if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst))
+        Vreg1Phis.push_back(&MI);
+    }
+  }
+}
+
+void DivergenceLoweringHelper::collectIncomingValuesFromPhi(
+    const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
+  for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
+    Incomings.emplace_back(MI->getOperand(i).getReg(),
+                           MI->getOperand(i + 1).getMBB(), Register());
+  }
+}
+
+void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
+                                             MachineBasicBlock *MBB) {
+  BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg)
+      .addReg(NewReg);
+}
+
+// Get pointers to build instruction just after MI (skips phis if needed)
+static std::pair<MachineBasicBlock *, MachineBasicBlock::iterator>
+getInsertAfterPtrs(MachineInstr *MI) {
+  MachineBasicBlock *InsertMBB = MI->getParent();
+  return {InsertMBB,
+          InsertMBB->SkipPHIsAndLabels(std::next(MI->getIterator()))};
+}
+
+// bb.previous
+//   %PrevReg = ...
+//
+// bb.current
+//   %CurReg = ...
+//
+//   %DstReg - not defined
+//
+// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
+//
+// bb.previous
+//   %PrevReg = ...
+//   %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
+//
+// bb.current
+//   %CurReg = ...
+//   %CurRegCopy:sreg_32(s1) = COPY %CurReg
+//   ...
+//   %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0
+//   %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0
+//   %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg
+//
+// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg
+void DivergenceLoweringHelper::buildMergeLaneMasks(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
+    Register DstReg, Register PrevReg, Register CurReg) {
+  // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
+  // TODO: check if inputs are constants or results of a compare.
+
+  Register PrevRegCopy = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+  auto [PrevMBB, AfterPrevReg] = getInsertAfterPtrs(MRI->getVRegDef(PrevReg));
+  BuildMI(*PrevMBB, AfterPrevReg, DL, TII->get(AMDGPU::COPY), PrevRegCopy)
+      .addReg(PrevReg);
+  Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+  BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
+      .addReg(PrevRegCopy)
+      .addReg(ExecReg);
+
+  Register CurRegCopy = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+  auto [CurMBB, AfterCurReg] = getInsertAfterPtrs(MRI->getVRegDef(CurReg));
+  BuildMI(*CurMBB, AfterCurReg, DL, TII->get(AMDGPU::COPY), CurRegCopy)
+      .addReg(CurReg);
+  Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+  BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
+      .addReg(ExecReg)
+      .addReg(CurRegCopy);
+
+  BuildMI(MBB, I, DL, TII->get(OrOp), DstReg)
+      .addReg(PrevMaskedReg)
+      .addReg(CurMaskedReg);
+}
+
+void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { return; }
+
 } // End anonymous namespace.
 
 INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                       "AMDGPU GlobalISel divergence lowering", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
 INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                     "AMDGPU GlobalISel divergence lowering", false, false)

@@ -64,5 +205,14 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
 
 bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
     MachineFunction &MF) {
-  return false;
+  MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
+  MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>();
+  MachineUniformityInfo &MUI =
+      getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
+
+  DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI);
+
+  bool Changed = false;
+  Changed |= Helper.lowerPhis();
+  return Changed;
 }
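
The AndN2Op, AndOp, OrOp, and ExecReg fields used in buildMergeLaneMasks come from the shared PhiLoweringHelper base in SILowerI1Copies and are resolved according to wave size. To the best of my reading (this mapping is an assumption about the existing helper, not something introduced by this commit), it amounts to roughly the following:

// Assumed wave-size mapping for the lane-mask opcode fields; a sketch of my
// understanding of PhiLoweringHelper's setup, not a copy of its constructor.
// Relies on the AMDGPU backend's generated opcode/register enums.
struct LaneMaskOps {
  unsigned AndOp;
  unsigned OrOp;
  unsigned AndN2Op;
  unsigned ExecReg;
};

static LaneMaskOps pickLaneMaskOps(bool IsWave32) {
  if (IsWave32)
    return {AMDGPU::S_AND_B32, AMDGPU::S_OR_B32, AMDGPU::S_ANDN2_B32,
            AMDGPU::EXEC_LO};
  return {AMDGPU::S_AND_B64, AMDGPU::S_OR_B64, AMDGPU::S_ANDN2_B64,
          AMDGPU::EXEC};
}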

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 8 additions & 0 deletions
@@ -210,6 +210,14 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
 bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
   const Register DefReg = I.getOperand(0).getReg();
   const LLT DefTy = MRI->getType(DefReg);
+  // Lane mask PHIs, PHI where all register operands have sgpr register class
+  // with S1 LLT, are already selected in divergence lowering pass.
+  if (I.getOpcode() == AMDGPU::PHI) {
+    assert(MRI->getType(DefReg) == LLT::scalar(1));
+    assert(TRI.isSGPRClass(MRI->getRegClass(DefReg)));
+    return true;
+  }
+
   if (DefTy == LLT::scalar(1)) {
     if (!AllowRiskySelect) {
       LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");

llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp

Lines changed: 3 additions & 5 deletions
@@ -78,7 +78,7 @@ class Vreg1LoweringHelper : public PhiLoweringHelper {
                            MachineBasicBlock::iterator I, const DebugLoc &DL,
                            Register DstReg, Register PrevReg,
                            Register CurReg) override;
-  void constrainIncomingRegisterTakenAsIs(Incoming &In) override;
+  void constrainAsLaneMask(Incoming &In) override;
 
   bool lowerCopiesFromI1();
   bool lowerCopiesToI1();

@@ -619,7 +619,7 @@ bool PhiLoweringHelper::lowerPhis() {
     for (auto &Incoming : Incomings) {
       MachineBasicBlock &IMBB = *Incoming.Block;
       if (PIA.isSource(IMBB)) {
-        constrainIncomingRegisterTakenAsIs(Incoming);
+        constrainAsLaneMask(Incoming);
         SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
       } else {
         Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);

@@ -911,6 +911,4 @@ void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
   }
 }
 
-void Vreg1LoweringHelper::constrainIncomingRegisterTakenAsIs(Incoming &In) {
-  return;
-}
+void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) { return; }

llvm/lib/Target/AMDGPU/SILowerI1Copies.h

Lines changed: 1 addition & 1 deletion
@@ -91,7 +91,7 @@ class PhiLoweringHelper {
                                    MachineBasicBlock::iterator I,
                                    const DebugLoc &DL, Register DstReg,
                                    Register PrevReg, Register CurReg) = 0;
-  virtual void constrainIncomingRegisterTakenAsIs(Incoming &In) = 0;
+  virtual void constrainAsLaneMask(Incoming &In) = 0;
 };
 
 } // end namespace llvm

llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll

Lines changed: 2 additions & 1 deletion
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; REQUIRES: do-not-run-me
 
 ; Divergent phis that don't require lowering using lane mask merging
