Skip to content

Commit 28d5999

Browse files
committed
Revert "[AMDGPU] SI CF lowering change."
This reverts commit f81ef6f. Change-Id: I99c84dee06cd4e62e47abe79cdb0c177664599d0
1 parent 2b99317 commit 28d5999

File tree

328 files changed

+26573
-65675
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

328 files changed

+26573
-65675
lines changed

clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
// GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope
1111
// GFX90A-CAS-LABEL: _Z14atomic_add_casPf
1212
// GFX90A-CAS: flat_atomic_cmpswap
13-
// GFX90A-CAS: s_cbranch_scc1
13+
// GFX90A-CAS: s_cbranch_execnz
1414
__device__ float atomic_add_cas(float *p) {
1515
return __atomic_fetch_add(p, 1.0f, memory_order_relaxed);
1616
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3124,7 +3124,7 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
31243124
[llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
31253125
>;
31263126

3127-
def int_amdgcn_wave_reconverge : Intrinsic<[], [llvm_anyint_ty],
3127+
def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
31283128
[IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
31293129

31303130
// Represent unreachable in a divergent region.

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1559,12 +1559,11 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
15591559
return true;
15601560
}
15611561

1562-
bool AMDGPUInstructionSelector::selectWaveReconvergeIntrinsic(
1563-
MachineInstr &MI) const {
1562+
bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
15641563
// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
15651564
// SelectionDAG uses for wave32 vs wave64.
15661565
MachineBasicBlock *BB = MI.getParent();
1567-
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_WAVE_RECONVERGE))
1566+
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
15681567
.add(MI.getOperand(1));
15691568

15701569
Register Reg = MI.getOperand(1).getReg();
@@ -2122,8 +2121,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
21222121
MachineInstr &I) const {
21232122
unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
21242123
switch (IntrinsicID) {
2125-
case Intrinsic::amdgcn_wave_reconverge:
2126-
return selectWaveReconvergeIntrinsic(I);
2124+
case Intrinsic::amdgcn_end_cf:
2125+
return selectEndCfIntrinsic(I);
21272126
case Intrinsic::amdgcn_ds_ordered_add:
21282127
case Intrinsic::amdgcn_ds_ordered_swap:
21292128
return selectDSOrderedIntrinsic(I, IntrinsicID);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
119119
bool selectReturnAddress(MachineInstr &I) const;
120120
bool selectG_INTRINSIC(MachineInstr &I) const;
121121

122-
bool selectWaveReconvergeIntrinsic(MachineInstr &MI) const;
122+
bool selectEndCfIntrinsic(MachineInstr &MI) const;
123123
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
124124
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
125125
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -785,8 +785,8 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
785785
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
786786
const unsigned MovExecOpc =
787787
Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
788-
// const unsigned MovExecTermOpc =
789-
// Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
788+
const unsigned MovExecTermOpc =
789+
Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
790790

791791
const unsigned XorTermOpc = Subtarget.isWave32() ?
792792
AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
@@ -949,29 +949,27 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
949949

950950
B.setInsertPt(*BodyBB, BodyBB->end());
951951

952-
Register LoopMask = MRI.createVirtualRegister(
953-
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
954952
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
955-
B.buildInstr(XorTermOpc).addDef(LoopMask).addReg(ExecReg).addReg(NewExec);
953+
B.buildInstr(XorTermOpc)
954+
.addDef(ExecReg)
955+
.addReg(ExecReg)
956+
.addReg(NewExec);
956957

957958
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
958959
// s_cbranch_scc0?
959960

960961
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
961-
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP)
962-
.addReg(LoopMask)
963-
.addReg(NewExec)
964-
.addMBB(LoopBB);
962+
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
965963

966964
// Save the EXEC mask before the loop.
967965
BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
968966
.addReg(ExecReg);
969967

970968
// Restore the EXEC mask after the loop.
971-
// B.setMBB(*RestoreExecBB);
972-
// B.buildInstr(MovExecTermOpc)
973-
// .addDef(ExecReg)
974-
// .addReg(SaveExecReg);
969+
B.setMBB(*RestoreExecBB);
970+
B.buildInstr(MovExecTermOpc)
971+
.addDef(ExecReg)
972+
.addReg(SaveExecReg);
975973

976974
// Set the insert point after the original instruction, so any new
977975
// instructions will be in the remainder.
@@ -4942,7 +4940,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49424940
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
49434941
break;
49444942
}
4945-
case Intrinsic::amdgcn_wave_reconverge: {
4943+
case Intrinsic::amdgcn_end_cf: {
49464944
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
49474945
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
49484946
break;

llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp

Lines changed: 39 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313

1414
#include "AMDGPU.h"
1515
#include "GCNSubtarget.h"
16-
#include "llvm/Analysis/DomTreeUpdater.h"
1716
#include "llvm/Analysis/LoopInfo.h"
1817
#include "llvm/Analysis/UniformityAnalysis.h"
1918
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -54,7 +53,7 @@ class SIAnnotateControlFlow : public FunctionPass {
5453
Function *Else;
5554
Function *IfBreak;
5655
Function *Loop;
57-
Function *WaveReconverge;
56+
Function *EndCf;
5857

5958
DominatorTree *DT;
6059
StackVector Stack;
@@ -87,7 +86,7 @@ class SIAnnotateControlFlow : public FunctionPass {
8786

8887
bool handleLoop(BranchInst *Term);
8988

90-
bool tryWaveReconverge(BasicBlock *BB);
89+
bool closeControlFlow(BasicBlock *BB);
9190

9291
public:
9392
static char ID;
@@ -142,8 +141,7 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
142141
IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
143142
{ IntMask });
144143
Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
145-
WaveReconverge = Intrinsic::getDeclaration(
146-
&M, Intrinsic::amdgcn_wave_reconverge, {IntMask});
144+
EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
147145
}
148146

149147
/// Is the branch condition uniform or did the StructurizeCFG pass
@@ -206,6 +204,8 @@ bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
206204

207205
/// Open a new "If" block
208206
bool SIAnnotateControlFlow::openIf(BranchInst *Term) {
207+
if (isUniform(Term))
208+
return false;
209209

210210
IRBuilder<> IRB(Term);
211211
Value *IfCall = IRB.CreateCall(If, {Term->getCondition()});
@@ -306,43 +306,41 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
306306
}
307307

308308
/// Close the last opened control flow
309-
bool SIAnnotateControlFlow::tryWaveReconverge(BasicBlock *BB) {
309+
bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
310+
llvm::Loop *L = LI->getLoopFor(BB);
310311

311-
if (succ_empty(BB))
312-
return false;
312+
assert(Stack.back().first == BB);
313313

314-
BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
315-
if (Term->getNumSuccessors() == 1) {
316-
// The current BBs single successor is a top of the stack. We need to
317-
// reconverge over thaqt path.
318-
BasicBlock *SingleSucc = *succ_begin(BB);
319-
BasicBlock::iterator InsPt = Term ? BasicBlock::iterator(Term) : BB->end();
314+
if (L && L->getHeader() == BB) {
315+
// We can't insert an EndCF call into a loop header, because it will
316+
// get executed on every iteration of the loop, when it should be
317+
// executed only once before the loop.
318+
SmallVector <BasicBlock *, 8> Latches;
319+
L->getLoopLatches(Latches);
320320

321-
if (isTopOfStack(SingleSucc)) {
322-
Value *Exec = Stack.back().second;
323-
IRBuilder<>(BB, InsPt).CreateCall(WaveReconverge, {Exec});
321+
SmallVector<BasicBlock *, 2> Preds;
322+
for (BasicBlock *Pred : predecessors(BB)) {
323+
if (!is_contained(Latches, Pred))
324+
Preds.push_back(Pred);
324325
}
325-
} else {
326-
// We have a uniform conditional branch terminating the block.
327-
// THis block may be the last in the Then path of the enclosing divergent
328-
// IF.
329-
if (!isUniform(Term))
330-
// Divergent loop is going to be further processed in another place
331-
return false;
332-
333-
for (auto Succ : Term->successors()) {
334-
if (isTopOfStack(Succ)) {
335-
// Just split to make a room for further WAVE_RECONVERGE insertion
336-
SmallVector<BasicBlock *, 2> Preds;
337-
for (auto P : predecessors(Succ)) {
338-
if (DT->dominates(BB, P))
339-
Preds.push_back(P);
340-
}
341-
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
342-
SplitBlockPredecessors(Succ, Preds, ".reconverge", &DTU, LI, nullptr,
343-
false);
344-
}
326+
327+
BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr,
328+
false);
329+
}
330+
331+
Value *Exec = popSaved();
332+
Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt();
333+
if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
334+
Instruction *ExecDef = cast<Instruction>(Exec);
335+
BasicBlock *DefBB = ExecDef->getParent();
336+
if (!DT->dominates(DefBB, BB)) {
337+
// Split edge to make Def dominate Use
338+
FirstInsertionPt = &*SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
345339
}
340+
IRBuilder<> IRB(FirstInsertionPt);
341+
// TODO: Clear dbg location for now as it causes regression in GDB tests.
342+
IRB.SetCurrentDebugLocation(DebugLoc());
343+
IRB.CreateCall(EndCf, {Exec});
346344
}
347345

348346
return true;
@@ -366,20 +364,14 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
366364

367365
if (!Term || Term->isUnconditional()) {
368366
if (isTopOfStack(BB))
369-
Stack.pop_back();
370-
371-
Changed |= tryWaveReconverge(BB);
367+
Changed |= closeControlFlow(BB);
372368

373369
continue;
374370
}
375371

376372
if (I.nodeVisited(Term->getSuccessor(1))) {
377373
if (isTopOfStack(BB))
378-
Stack.pop_back();
379-
380-
// Let's take care of uniform loop latch that may be closing the Then
381-
// path of the enclosing divergent branch.
382-
Changed |= tryWaveReconverge(BB);
374+
Changed |= closeControlFlow(BB);
383375

384376
if (DT->dominates(Term->getSuccessor(1), BB))
385377
Changed |= handleLoop(Term);
@@ -394,15 +386,10 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
394386
continue;
395387
}
396388

397-
Stack.pop_back();
389+
Changed |= closeControlFlow(BB);
398390
}
399391

400-
if (isUniform(Term))
401-
// Uniform conditional branch may be in the block that closes the Then
402-
// path of the divergent conditional branch.
403-
Changed |= tryWaveReconverge(BB);
404-
else
405-
Changed |= openIf(Term);
392+
Changed |= openIf(Term);
406393
}
407394

408395
if (!Stack.empty()) {

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 5 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6255,7 +6255,7 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
62556255
return AMDGPUISD::ELSE;
62566256
case Intrinsic::amdgcn_loop:
62576257
return AMDGPUISD::LOOP;
6258-
case Intrinsic::amdgcn_wave_reconverge:
6258+
case Intrinsic::amdgcn_end_cf:
62596259
llvm_unreachable("should not occur");
62606260
default:
62616261
return 0;
@@ -9880,10 +9880,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
98809880

98819881
return SDValue(Load, 0);
98829882
}
9883-
case Intrinsic::amdgcn_wave_reconverge:
9884-
return SDValue(DAG.getMachineNode(AMDGPU::SI_WAVE_RECONVERGE, DL,
9885-
MVT::Other, Op->getOperand(2), Chain),
9886-
0);
9883+
case Intrinsic::amdgcn_end_cf:
9884+
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9885+
Op->getOperand(2), Chain), 0);
98879886
case Intrinsic::amdgcn_s_barrier_init:
98889887
case Intrinsic::amdgcn_s_barrier_join:
98899888
case Intrinsic::amdgcn_s_wakeup_barrier: {
@@ -15657,32 +15656,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
1565715656
}
1565815657
}
1565915658

15660-
// ISel inserts copy to regs for the successor PHIs
15661-
// at the BB end. We need to move the SI_WAVE_RECONVERGE right before the
15662-
// branch.
15663-
for (auto &MBB : MF) {
15664-
for (auto &MI : MBB) {
15665-
if (MI.getOpcode() == AMDGPU::SI_WAVE_RECONVERGE) {
15666-
MachineBasicBlock::iterator I(MI);
15667-
MachineBasicBlock::iterator Next = std::next(I);
15668-
bool NeedToMove = false;
15669-
while (Next != MBB.end() && !Next->isBranch()) {
15670-
NeedToMove = true;
15671-
Next++;
15672-
}
15673-
15674-
assert((Next == MBB.end() || !Next->readsRegister(AMDGPU::SCC, TRI)) &&
15675-
"Malformed CFG detected!\n");
15676-
15677-
if (NeedToMove) {
15678-
MBB.splice(Next, &MBB, &MI);
15679-
}
15680-
15681-
break;
15682-
}
15683-
}
15684-
}
15685-
1568615659
// FIXME: This is a hack to fixup AGPR classes to use the properly aligned
1568715660
// classes if required. Ideally the register class constraints would differ
1568815661
// per-subtarget, but there's no easy way to achieve that right now. This is
@@ -16256,7 +16229,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
1625616229
default:
1625716230
Result = false;
1625816231
break;
16259-
case Intrinsic::amdgcn_wave_reconverge:
16232+
case Intrinsic::amdgcn_end_cf:
1626016233
case Intrinsic::amdgcn_loop:
1626116234
Result = true;
1626216235
break;

0 commit comments

Comments (0)