Skip to content

Commit f81ef6f

Browse files
alex-tkzhuravl
authored and committed
[AMDGPU] SI CF lowering change.
Change-Id: I8609c5abae7cd9307ffc4f6ace5011be860998e8
1 parent b633543 commit f81ef6f

File tree

328 files changed

+43458
-24622
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

328 files changed

+43458
-24622
lines changed

clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
// GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope
1111
// GFX90A-CAS-LABEL: _Z14atomic_add_casPf
1212
// GFX90A-CAS: flat_atomic_cmpswap
13-
// GFX90A-CAS: s_cbranch_execnz
13+
// GFX90A-CAS: s_cbranch_scc1
1414
__device__ float atomic_add_cas(float *p) {
1515
return __atomic_fetch_add(p, 1.0f, memory_order_relaxed);
1616
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3128,7 +3128,7 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
31283128
[llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
31293129
>;
31303130

3131-
def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
3131+
def int_amdgcn_wave_reconverge : Intrinsic<[], [llvm_anyint_ty],
31323132
[IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
31333133

31343134
// Represent unreachable in a divergent region.

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1559,11 +1559,12 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
15591559
return true;
15601560
}
15611561

1562-
bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
1562+
bool AMDGPUInstructionSelector::selectWaveReconvergeIntrinsic(
1563+
MachineInstr &MI) const {
15631564
// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
15641565
// SelectionDAG uses for wave32 vs wave64.
15651566
MachineBasicBlock *BB = MI.getParent();
1566-
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
1567+
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_WAVE_RECONVERGE))
15671568
.add(MI.getOperand(1));
15681569

15691570
Register Reg = MI.getOperand(1).getReg();
@@ -2121,8 +2122,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
21212122
MachineInstr &I) const {
21222123
unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
21232124
switch (IntrinsicID) {
2124-
case Intrinsic::amdgcn_end_cf:
2125-
return selectEndCfIntrinsic(I);
2125+
case Intrinsic::amdgcn_wave_reconverge:
2126+
return selectWaveReconvergeIntrinsic(I);
21262127
case Intrinsic::amdgcn_ds_ordered_add:
21272128
case Intrinsic::amdgcn_ds_ordered_swap:
21282129
return selectDSOrderedIntrinsic(I, IntrinsicID);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
119119
bool selectReturnAddress(MachineInstr &I) const;
120120
bool selectG_INTRINSIC(MachineInstr &I) const;
121121

122-
bool selectEndCfIntrinsic(MachineInstr &MI) const;
122+
bool selectWaveReconvergeIntrinsic(MachineInstr &MI) const;
123123
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
124124
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
125125
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -785,8 +785,8 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
785785
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
786786
const unsigned MovExecOpc =
787787
Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
788-
const unsigned MovExecTermOpc =
789-
Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
788+
// const unsigned MovExecTermOpc =
789+
// Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
790790

791791
const unsigned XorTermOpc = Subtarget.isWave32() ?
792792
AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
@@ -949,27 +949,29 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
949949

950950
B.setInsertPt(*BodyBB, BodyBB->end());
951951

952+
Register LoopMask = MRI.createVirtualRegister(
953+
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
952954
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
953-
B.buildInstr(XorTermOpc)
954-
.addDef(ExecReg)
955-
.addReg(ExecReg)
956-
.addReg(NewExec);
955+
B.buildInstr(XorTermOpc).addDef(LoopMask).addReg(ExecReg).addReg(NewExec);
957956

958957
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
959958
// s_cbranch_scc0?
960959

961960
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
962-
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
961+
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP)
962+
.addReg(LoopMask)
963+
.addReg(NewExec)
964+
.addMBB(LoopBB);
963965

964966
// Save the EXEC mask before the loop.
965967
BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
966968
.addReg(ExecReg);
967969

968970
// Restore the EXEC mask after the loop.
969-
B.setMBB(*RestoreExecBB);
970-
B.buildInstr(MovExecTermOpc)
971-
.addDef(ExecReg)
972-
.addReg(SaveExecReg);
971+
// B.setMBB(*RestoreExecBB);
972+
// B.buildInstr(MovExecTermOpc)
973+
// .addDef(ExecReg)
974+
// .addReg(SaveExecReg);
973975

974976
// Set the insert point after the original instruction, so any new
975977
// instructions will be in the remainder.
@@ -4940,7 +4942,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49404942
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
49414943
break;
49424944
}
4943-
case Intrinsic::amdgcn_end_cf: {
4945+
case Intrinsic::amdgcn_wave_reconverge: {
49444946
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
49454947
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
49464948
break;

llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp

Lines changed: 52 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
#include "AMDGPU.h"
1515
#include "GCNSubtarget.h"
16+
#include "llvm/Analysis/DomTreeUpdater.h"
1617
#include "llvm/Analysis/LoopInfo.h"
1718
#include "llvm/Analysis/UniformityAnalysis.h"
1819
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -53,7 +54,7 @@ class SIAnnotateControlFlow : public FunctionPass {
5354
Function *Else;
5455
Function *IfBreak;
5556
Function *Loop;
56-
Function *EndCf;
57+
Function *WaveReconverge;
5758

5859
DominatorTree *DT;
5960
StackVector Stack;
@@ -86,7 +87,7 @@ class SIAnnotateControlFlow : public FunctionPass {
8687

8788
bool handleLoop(BranchInst *Term);
8889

89-
bool closeControlFlow(BasicBlock *BB);
90+
bool tryWaveReconverge(BasicBlock *BB);
9091

9192
public:
9293
static char ID;
@@ -141,7 +142,8 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
141142
IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
142143
{ IntMask });
143144
Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
144-
EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
145+
WaveReconverge = Intrinsic::getDeclaration(
146+
&M, Intrinsic::amdgcn_wave_reconverge, {IntMask});
145147
}
146148

147149
/// Is the branch condition uniform or did the StructurizeCFG pass
@@ -204,8 +206,6 @@ bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
204206

205207
/// Open a new "If" block
206208
bool SIAnnotateControlFlow::openIf(BranchInst *Term) {
207-
if (isUniform(Term))
208-
return false;
209209

210210
IRBuilder<> IRB(Term);
211211
Value *IfCall = IRB.CreateCall(If, {Term->getCondition()});
@@ -306,41 +306,43 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
306306
}
307307

308308
/// Close the last opened control flow
309-
bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
310-
llvm::Loop *L = LI->getLoopFor(BB);
309+
bool SIAnnotateControlFlow::tryWaveReconverge(BasicBlock *BB) {
311310

312-
assert(Stack.back().first == BB);
311+
if (succ_empty(BB))
312+
return false;
313313

314-
if (L && L->getHeader() == BB) {
315-
// We can't insert an EndCF call into a loop header, because it will
316-
// get executed on every iteration of the loop, when it should be
317-
// executed only once before the loop.
318-
SmallVector <BasicBlock *, 8> Latches;
319-
L->getLoopLatches(Latches);
314+
BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
315+
if (Term->getNumSuccessors() == 1) {
316+
// The current BBs single successor is a top of the stack. We need to
317+
// reconverge over that path.
318+
BasicBlock *SingleSucc = *succ_begin(BB);
319+
BasicBlock::iterator InsPt = Term ? BasicBlock::iterator(Term) : BB->end();
320320

321-
SmallVector<BasicBlock *, 2> Preds;
322-
for (BasicBlock *Pred : predecessors(BB)) {
323-
if (!is_contained(Latches, Pred))
324-
Preds.push_back(Pred);
321+
if (isTopOfStack(SingleSucc)) {
322+
Value *Exec = Stack.back().second;
323+
IRBuilder<>(BB, InsPt).CreateCall(WaveReconverge, {Exec});
325324
}
326-
327-
BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr,
328-
false);
329-
}
330-
331-
Value *Exec = popSaved();
332-
Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt();
333-
if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
334-
Instruction *ExecDef = cast<Instruction>(Exec);
335-
BasicBlock *DefBB = ExecDef->getParent();
336-
if (!DT->dominates(DefBB, BB)) {
337-
// Split edge to make Def dominate Use
338-
FirstInsertionPt = &*SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
325+
} else {
326+
// We have a uniform conditional branch terminating the block.
327+
// This block may be the last in the Then path of the enclosing divergent
328+
// IF.
329+
if (!isUniform(Term))
330+
// Divergent loop is going to be further processed in another place
331+
return false;
332+
333+
for (auto Succ : Term->successors()) {
334+
if (isTopOfStack(Succ)) {
335+
// Just split to make a room for further WAVE_RECONVERGE insertion
336+
SmallVector<BasicBlock *, 2> Preds;
337+
for (auto P : predecessors(Succ)) {
338+
if (DT->dominates(BB, P))
339+
Preds.push_back(P);
340+
}
341+
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
342+
SplitBlockPredecessors(Succ, Preds, ".reconverge", &DTU, LI, nullptr,
343+
false);
344+
}
339345
}
340-
IRBuilder<> IRB(FirstInsertionPt);
341-
// TODO: Clear dbg location for now as it causes regression in GDB tests.
342-
IRB.SetCurrentDebugLocation(DebugLoc());
343-
IRB.CreateCall(EndCf, {Exec});
344346
}
345347

346348
return true;
@@ -364,14 +366,20 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
364366

365367
if (!Term || Term->isUnconditional()) {
366368
if (isTopOfStack(BB))
367-
Changed |= closeControlFlow(BB);
369+
Stack.pop_back();
370+
371+
Changed |= tryWaveReconverge(BB);
368372

369373
continue;
370374
}
371375

372376
if (I.nodeVisited(Term->getSuccessor(1))) {
373377
if (isTopOfStack(BB))
374-
Changed |= closeControlFlow(BB);
378+
Stack.pop_back();
379+
380+
// Let's take care of uniform loop latch that may be closing the Then
381+
// path of the enclosing divergent branch.
382+
Changed |= tryWaveReconverge(BB);
375383

376384
if (DT->dominates(Term->getSuccessor(1), BB))
377385
Changed |= handleLoop(Term);
@@ -386,10 +394,15 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
386394
continue;
387395
}
388396

389-
Changed |= closeControlFlow(BB);
397+
Stack.pop_back();
390398
}
391399

392-
Changed |= openIf(Term);
400+
if (isUniform(Term))
401+
// Uniform conditional branch may be in the block that closes the Then
402+
// path of the divergent conditional branch.
403+
Changed |= tryWaveReconverge(BB);
404+
else
405+
Changed |= openIf(Term);
393406
}
394407

395408
if (!Stack.empty()) {

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6091,7 +6091,7 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
60916091
return AMDGPUISD::ELSE;
60926092
case Intrinsic::amdgcn_loop:
60936093
return AMDGPUISD::LOOP;
6094-
case Intrinsic::amdgcn_end_cf:
6094+
case Intrinsic::amdgcn_wave_reconverge:
60956095
llvm_unreachable("should not occur");
60966096
default:
60976097
return 0;
@@ -9708,9 +9708,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
97089708

97099709
return SDValue(Load, 0);
97109710
}
9711-
case Intrinsic::amdgcn_end_cf:
9712-
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9713-
Op->getOperand(2), Chain), 0);
9711+
case Intrinsic::amdgcn_wave_reconverge:
9712+
return SDValue(DAG.getMachineNode(AMDGPU::SI_WAVE_RECONVERGE, DL,
9713+
MVT::Other, Op->getOperand(2), Chain),
9714+
0);
97149715
case Intrinsic::amdgcn_s_barrier_init:
97159716
case Intrinsic::amdgcn_s_barrier_join:
97169717
case Intrinsic::amdgcn_s_wakeup_barrier: {
@@ -15486,6 +15487,32 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
1548615487
}
1548715488
}
1548815489

15490+
// ISel inserts copy to regs for the successor PHIs
15491+
// at the BB end. We need to move the SI_WAVE_RECONVERGE right before the
15492+
// branch.
15493+
for (auto &MBB : MF) {
15494+
for (auto &MI : MBB) {
15495+
if (MI.getOpcode() == AMDGPU::SI_WAVE_RECONVERGE) {
15496+
MachineBasicBlock::iterator I(MI);
15497+
MachineBasicBlock::iterator Next = std::next(I);
15498+
bool NeedToMove = false;
15499+
while (Next != MBB.end() && !Next->isBranch()) {
15500+
NeedToMove = true;
15501+
Next++;
15502+
}
15503+
15504+
assert((Next == MBB.end() || !Next->readsRegister(AMDGPU::SCC, TRI)) &&
15505+
"Malformed CFG detected!\n");
15506+
15507+
if (NeedToMove) {
15508+
MBB.splice(Next, &MBB, &MI);
15509+
}
15510+
15511+
break;
15512+
}
15513+
}
15514+
}
15515+
1548915516
// FIXME: This is a hack to fixup AGPR classes to use the properly aligned
1549015517
// classes if required. Ideally the register class constraints would differ
1549115518
// per-subtarget, but there's no easy way to achieve that right now. This is
@@ -16059,7 +16086,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
1605916086
default:
1606016087
Result = false;
1606116088
break;
16062-
case Intrinsic::amdgcn_end_cf:
16089+
case Intrinsic::amdgcn_wave_reconverge:
1606316090
case Intrinsic::amdgcn_loop:
1606416091
Result = true;
1606516092
break;

0 commit comments

Comments
 (0)