Skip to content

Commit 5b51a3c

Browse files
committed
[AMDGPU][AMDGPUIfConverter] First version of AMDGPUIfConversion
1 parent ca6302a commit 5b51a3c

File tree

2 files changed

+198
-11
lines changed

2 files changed

+198
-11
lines changed

llvm/lib/Target/AMDGPU/AMDGPUIfConverter.cpp

Lines changed: 196 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
#include <llvm/InitializePasses.h>
1212

1313
#include "AMDGPU.h"
14+
#include "GCNSubtarget.h"
15+
#include "SIInstrInfo.h"
16+
#include "SIMachineFunctionInfo.h"
1417

1518
using namespace llvm;
1619

@@ -19,10 +22,8 @@ namespace {
1922
const char PassName[] = "AMDGPU if conversion";
2023

2124
class AMDGPUIfConverter : public MachineFunctionPass {
22-
const TargetInstrInfo *TII = nullptr;
23-
const TargetRegisterInfo *TRI = nullptr;
25+
const SIInstrInfo *TII = nullptr;
2426
TargetSchedModel SchedModel;
25-
MachineRegisterInfo *MRI = nullptr;
2627
MachineDominatorTree *DomTree = nullptr;
2728
MachineBranchProbabilityInfo *MBPI = nullptr;
2829
MachineLoopInfo *Loops = nullptr;
@@ -41,6 +42,7 @@ class AMDGPUIfConverter : public MachineFunctionPass {
4142
void getAnalysisUsage(AnalysisUsage &AU) const override;
4243

4344
bool tryConvertIf(MachineBasicBlock *);
45+
bool shouldConvertIf();
4446

4547
StringRef getPassName() const override { return PassName; }
4648
};
@@ -60,10 +62,11 @@ bool AMDGPUIfConverter::runOnMachineFunction(MachineFunction &MF) {
6062
if (skipFunction(MF.getFunction()))
6163
return false;
6264

63-
const TargetSubtargetInfo &STI = MF.getSubtarget();
65+
const auto &STI = MF.getSubtarget<GCNSubtarget>();
66+
if (!STI.hasGFX10_3Insts())
67+
return false;
68+
6469
TII = STI.getInstrInfo();
65-
TRI = STI.getRegisterInfo();
66-
MRI = &MF.getRegInfo();
6770
SchedModel.init(&STI);
6871
DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
6972
Loops = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
@@ -79,7 +82,193 @@ bool AMDGPUIfConverter::runOnMachineFunction(MachineFunction &MF) {
7982
return Changed;
8083
}
8184

82-
bool AMDGPUIfConverter::tryConvertIf(MachineBasicBlock *MBB) { return false; }
85+
unsigned getReversedVCMPXOpcode(unsigned Opcode) {
86+
// TODO: this is a placeholder for the real function
87+
switch (Opcode) {
88+
case AMDGPU::V_CMPX_LT_I32_nosdst_e64:
89+
return AMDGPU::V_CMPX_GE_I32_nosdst_e64;
90+
default:
91+
errs() << "unhandled: " << Opcode << "\n";
92+
llvm_unreachable("unhandled vcmp opcode");
93+
}
94+
}
95+
96+
bool needsPredication(const SIInstrInfo *TII, const MachineInstr &I) {
97+
return TII->isVALU(I) || TII->isVMEM(I);
98+
}
99+
100+
struct ExecPredicate : ifcvt::PredicationStrategy {
101+
const SIInstrInfo *TII;
102+
const SIRegisterInfo *RegInfo;
103+
104+
MachineInstr *Cmp = nullptr;
105+
106+
ExecPredicate(const SIInstrInfo *TII)
107+
: TII(TII), RegInfo(&TII->getRegisterInfo()) {}
108+
109+
bool canConvertIf(MachineBasicBlock *Head, MachineBasicBlock *TBB,
110+
MachineBasicBlock *FBB, MachineBasicBlock *Tail,
111+
ArrayRef<MachineOperand> Cond) override {
112+
113+
// check that the cmp is just before the branch and that it is promotable to
114+
// v_cmpx
115+
const unsigned SupportedBranchOpc[]{
116+
AMDGPU::S_CBRANCH_SCC0, AMDGPU::S_CBRANCH_SCC1, AMDGPU::S_CBRANCH_VCCNZ,
117+
AMDGPU::S_CBRANCH_VCCZ};
118+
119+
MachineInstr &CBranch = *Head->getFirstInstrTerminator();
120+
if (!llvm::is_contained(SupportedBranchOpc, CBranch.getOpcode()))
121+
return false;
122+
123+
auto CmpInstr = std::next(CBranch.getReverseIterator());
124+
if (CmpInstr == Head->instr_rend())
125+
return false;
126+
127+
Register SCCorVCC = Cond[1].getReg();
128+
bool ModifiesConditionReg = CmpInstr->modifiesRegister(SCCorVCC, RegInfo);
129+
if (!ModifiesConditionReg)
130+
return false;
131+
132+
Cmp = &*CmpInstr;
133+
134+
unsigned CmpOpc = Cmp->getOpcode();
135+
if (TII->isSALU(*Cmp))
136+
CmpOpc = TII->getVALUOp(*Cmp);
137+
if (AMDGPU::getVCMPXOpFromVCMP(CmpOpc) == -1) {
138+
errs() << *Cmp << "\n";
139+
return false;
140+
}
141+
142+
auto NeedsPredication = [&](const MachineInstr &I) {
143+
return needsPredication(TII, I);
144+
};
145+
auto BlockNeedsPredication = [&](const MachineBasicBlock *MBB) {
146+
if (MBB == Tail)
147+
return false;
148+
auto Insts = llvm::make_range(MBB->begin(), MBB->getFirstTerminator());
149+
return llvm::any_of(Insts, NeedsPredication);
150+
};
151+
152+
MachineBasicBlock *Blocks[] = {TBB, FBB};
153+
154+
if (llvm::none_of(Blocks, BlockNeedsPredication))
155+
return false;
156+
157+
return true;
158+
}
159+
160+
bool canPredicate(const MachineInstr &I) override {
161+
162+
// TODO: relax this condition, if exec is masked, check that it goes back to
163+
// normal
164+
// TODO: what about scc or vcc ? Are they taken into acount in the MBB
165+
// live-ins ?
166+
MCRegister Exec = RegInfo->getExec();
167+
bool ModifiesExec = I.modifiesRegister(Exec, RegInfo);
168+
if (ModifiesExec)
169+
return false;
170+
171+
if (needsPredication(TII, I))
172+
return true;
173+
174+
bool DontMoveAcrossStore = true;
175+
bool IsSpeculatable = I.isDereferenceableInvariantLoad() ||
176+
I.isSafeToMove(DontMoveAcrossStore);
177+
if (IsSpeculatable)
178+
return true;
179+
180+
return false;
181+
}
182+
183+
bool predicateBlock(MachineBasicBlock *MBB, ArrayRef<MachineOperand> Cond,
184+
bool Reverse) override {
185+
// save exec
186+
MachineFunction &MF = *MBB->getParent();
187+
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
188+
189+
Register ExecBackup = MFI->getSGPRForEXECCopy();
190+
191+
const DebugLoc &CmpLoc = Cmp->getDebugLoc();
192+
193+
auto FirstInstruction = MBB->begin();
194+
const bool IsSCCLive =
195+
false; // asume not since the live-ins are supposed to be empty
196+
TII->insertScratchExecCopy(MF, *MBB, FirstInstruction, CmpLoc, ExecBackup,
197+
IsSCCLive);
198+
199+
// mask exec
200+
unsigned CmpOpc = Cmp->getOpcode();
201+
if (TII->isSALU(*Cmp))
202+
CmpOpc = TII->getVALUOp(*Cmp);
203+
204+
CmpOpc = AMDGPU::getVCMPXOpFromVCMP(CmpOpc);
205+
if (Reverse)
206+
CmpOpc = getReversedVCMPXOpcode(CmpOpc);
207+
208+
// TODO: handle this properly. The second block may kill those registers.
209+
Cmp->getOperand(0).setIsKill(false);
210+
Cmp->getOperand(1).setIsKill(false);
211+
212+
auto VCmpX = BuildMI(*MBB, FirstInstruction, CmpLoc, TII->get(CmpOpc));
213+
VCmpX->addOperand(Cmp->getOperand(0));
214+
VCmpX->addOperand(Cmp->getOperand(1));
215+
216+
// restore exec
217+
TII->restoreExec(MF, *MBB, MBB->end(), DebugLoc(), ExecBackup);
218+
219+
return true;
220+
}
221+
222+
~ExecPredicate() override = default;
223+
};
224+
225+
/// Update the dominator tree after if-conversion erased some blocks.
226+
void updateDomTree(MachineDominatorTree *DomTree, const SSAIfConv &IfConv,
227+
ArrayRef<MachineBasicBlock *> Removed) {
228+
// convertIf can remove TBB, FBB, and Tail can be merged into Head.
229+
// TBB and FBB should not dominate any blocks.
230+
// Tail children should be transferred to Head.
231+
MachineDomTreeNode *HeadNode = DomTree->getNode(IfConv.Head);
232+
for (auto *B : Removed) {
233+
MachineDomTreeNode *Node = DomTree->getNode(B);
234+
assert(Node != HeadNode && "Cannot erase the head node");
235+
while (Node->getNumChildren()) {
236+
assert(Node->getBlock() == IfConv.Tail && "Unexpected children");
237+
DomTree->changeImmediateDominator(Node->back(), HeadNode);
238+
}
239+
DomTree->eraseNode(B);
240+
}
241+
}
242+
243+
/// Update LoopInfo after if-conversion.
244+
void updateLoops(MachineLoopInfo *Loops,
245+
ArrayRef<MachineBasicBlock *> Removed) {
246+
// If-conversion doesn't change loop structure, and it doesn't mess with back
247+
// edges, so updating LoopInfo is simply removing the dead blocks.
248+
for (auto *B : Removed)
249+
Loops->removeBlock(B);
250+
}
251+
252+
/// Profitability hook queried before each conversion. No cost model exists
/// yet, so every candidate is accepted.
bool AMDGPUIfConverter::shouldConvertIf() {
  // TODO: cost model
  const bool IsProfitable = true;
  return IsProfitable;
}
256+
257+
/// Repeatedly if-converts the construct headed by \p MBB for as long as
/// IfConv accepts it and shouldConvertIf() agrees, keeping the dominator tree
/// and loop info up to date.
///
/// \returns true if at least one conversion was performed.
bool AMDGPUIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
  ExecPredicate Predicate{TII};
  bool Changed = false;
  while (IfConv.canConvertIf(MBB, Predicate) && shouldConvertIf()) {
    // If-convert MBB and update analyses.
    SmallVector<MachineBasicBlock *, 4> RemoveBlocks;
    IfConv.convertIf(RemoveBlocks, Predicate);
    Changed = true;
    // Update both analyses while the removed blocks are still alive, so
    // neither of them is handed a pointer to an already deleted block.
    updateDomTree(DomTree, IfConv, RemoveBlocks);
    updateLoops(Loops, RemoveBlocks);
    for (MachineBasicBlock *Dead : RemoveBlocks)
      Dead->eraseFromParent();
  }
  return Changed;
}
83272

84273
} // namespace
85274

llvm/test/CodeGen/AMDGPU/amdgpu-if-cvt.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,9 @@ define amdgpu_kernel void @scalar_cmp(i32 noundef %value, ptr addrspace(8) nocap
55
; GCN-LABEL: scalar_cmp:
66
; GCN: ; %bb.0: ; %entry
77
; GCN-NEXT: s_load_dword s0, s[2:3], 0x4c
8+
; GCN-NEXT: s_or_saveexec_b32 s105, -1
89
; GCN-NEXT: s_waitcnt lgkmcnt(0)
9-
; GCN-NEXT: s_cmp_lt_i32 s0, 1
10-
; GCN-NEXT: s_cbranch_scc1 .LBB0_2
11-
; GCN-NEXT: ; %bb.1: ; %if.then
10+
; GCN-NEXT: v_cmpx_ge_i32_e64 s0, 1
1211
; GCN-NEXT: s_clause 0x2
1312
; GCN-NEXT: s_load_dword s4, s[2:3], 0x24
1413
; GCN-NEXT: s_load_dword s5, s[2:3], 0x44
@@ -17,7 +16,6 @@ define amdgpu_kernel void @scalar_cmp(i32 noundef %value, ptr addrspace(8) nocap
1716
; GCN-NEXT: v_mov_b32_e32 v0, s4
1817
; GCN-NEXT: v_mov_b32_e32 v1, s5
1918
; GCN-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen
20-
; GCN-NEXT: .LBB0_2: ; %if.end
2119
; GCN-NEXT: s_endpgm
2220
entry:
2321
%cmp = icmp sgt i32 %flag, 0

0 commit comments

Comments
 (0)