Skip to content

Commit 28d5999

Browse files
committed
Revert "[AMDGPU] SI CF lowering change."
This reverts commit f81ef6f. Change-Id: I99c84dee06cd4e62e47abe79cdb0c177664599d0
1 parent 2b99317 commit 28d5999

File tree

328 files changed

+26573
-65675
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

328 files changed

+26573
-65675
lines changed

clang/test/CodeGenCUDA/atomics-remarks-gfx90a.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
// GFX90A-CAS: A compare and swap loop was generated for an atomic fadd operation at system memory scope
1111
// GFX90A-CAS-LABEL: _Z14atomic_add_casPf
1212
// GFX90A-CAS: flat_atomic_cmpswap
13-
// GFX90A-CAS: s_cbranch_scc1
13+
// GFX90A-CAS: s_cbranch_execnz
1414
__device__ float atomic_add_cas(float *p) {
1515
return __atomic_fetch_add(p, 1.0f, memory_order_relaxed);
1616
}

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3124,7 +3124,7 @@ def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
31243124
[llvm_anyint_ty], [IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
31253125
>;
31263126

3127-
def int_amdgcn_wave_reconverge : Intrinsic<[], [llvm_anyint_ty],
3127+
def int_amdgcn_end_cf : Intrinsic<[], [llvm_anyint_ty],
31283128
[IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
31293129

31303130
// Represent unreachable in a divergent region.

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1559,12 +1559,11 @@ bool AMDGPUInstructionSelector::selectReturnAddress(MachineInstr &I) const {
15591559
return true;
15601560
}
15611561

1562-
bool AMDGPUInstructionSelector::selectWaveReconvergeIntrinsic(
1563-
MachineInstr &MI) const {
1562+
bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
15641563
// FIXME: Manually selecting to avoid dealing with the SReg_1 trick
15651564
// SelectionDAG uses for wave32 vs wave64.
15661565
MachineBasicBlock *BB = MI.getParent();
1567-
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_WAVE_RECONVERGE))
1566+
BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
15681567
.add(MI.getOperand(1));
15691568

15701569
Register Reg = MI.getOperand(1).getReg();
@@ -2122,8 +2121,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
21222121
MachineInstr &I) const {
21232122
unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
21242123
switch (IntrinsicID) {
2125-
case Intrinsic::amdgcn_wave_reconverge:
2126-
return selectWaveReconvergeIntrinsic(I);
2124+
case Intrinsic::amdgcn_end_cf:
2125+
return selectEndCfIntrinsic(I);
21272126
case Intrinsic::amdgcn_ds_ordered_add:
21282127
case Intrinsic::amdgcn_ds_ordered_swap:
21292128
return selectDSOrderedIntrinsic(I, IntrinsicID);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
119119
bool selectReturnAddress(MachineInstr &I) const;
120120
bool selectG_INTRINSIC(MachineInstr &I) const;
121121

122-
bool selectWaveReconvergeIntrinsic(MachineInstr &MI) const;
122+
bool selectEndCfIntrinsic(MachineInstr &MI) const;
123123
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
124124
bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
125125
bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -785,8 +785,8 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
785785
const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
786786
const unsigned MovExecOpc =
787787
Subtarget.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
788-
// const unsigned MovExecTermOpc =
789-
// Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
788+
const unsigned MovExecTermOpc =
789+
Subtarget.isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
790790

791791
const unsigned XorTermOpc = Subtarget.isWave32() ?
792792
AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
@@ -949,29 +949,27 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
949949

950950
B.setInsertPt(*BodyBB, BodyBB->end());
951951

952-
Register LoopMask = MRI.createVirtualRegister(
953-
TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID));
954952
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
955-
B.buildInstr(XorTermOpc).addDef(LoopMask).addReg(ExecReg).addReg(NewExec);
953+
B.buildInstr(XorTermOpc)
954+
.addDef(ExecReg)
955+
.addReg(ExecReg)
956+
.addReg(NewExec);
956957

957958
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
958959
// s_cbranch_scc0?
959960

960961
// Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
961-
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP)
962-
.addReg(LoopMask)
963-
.addReg(NewExec)
964-
.addMBB(LoopBB);
962+
B.buildInstr(AMDGPU::SI_WATERFALL_LOOP).addMBB(LoopBB);
965963

966964
// Save the EXEC mask before the loop.
967965
BuildMI(MBB, MBB.end(), DL, TII->get(MovExecOpc), SaveExecReg)
968966
.addReg(ExecReg);
969967

970968
// Restore the EXEC mask after the loop.
971-
// B.setMBB(*RestoreExecBB);
972-
// B.buildInstr(MovExecTermOpc)
973-
// .addDef(ExecReg)
974-
// .addReg(SaveExecReg);
969+
B.setMBB(*RestoreExecBB);
970+
B.buildInstr(MovExecTermOpc)
971+
.addDef(ExecReg)
972+
.addReg(SaveExecReg);
975973

976974
// Set the insert point after the original instruction, so any new
977975
// instructions will be in the remainder.
@@ -4942,7 +4940,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
49424940
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
49434941
break;
49444942
}
4945-
case Intrinsic::amdgcn_wave_reconverge: {
4943+
case Intrinsic::amdgcn_end_cf: {
49464944
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
49474945
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
49484946
break;

llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp

Lines changed: 39 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313

1414
#include "AMDGPU.h"
1515
#include "GCNSubtarget.h"
16-
#include "llvm/Analysis/DomTreeUpdater.h"
1716
#include "llvm/Analysis/LoopInfo.h"
1817
#include "llvm/Analysis/UniformityAnalysis.h"
1918
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -54,7 +53,7 @@ class SIAnnotateControlFlow : public FunctionPass {
5453
Function *Else;
5554
Function *IfBreak;
5655
Function *Loop;
57-
Function *WaveReconverge;
56+
Function *EndCf;
5857

5958
DominatorTree *DT;
6059
StackVector Stack;
@@ -87,7 +86,7 @@ class SIAnnotateControlFlow : public FunctionPass {
8786

8887
bool handleLoop(BranchInst *Term);
8988

90-
bool tryWaveReconverge(BasicBlock *BB);
89+
bool closeControlFlow(BasicBlock *BB);
9190

9291
public:
9392
static char ID;
@@ -142,8 +141,7 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
142141
IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
143142
{ IntMask });
144143
Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
145-
WaveReconverge = Intrinsic::getDeclaration(
146-
&M, Intrinsic::amdgcn_wave_reconverge, {IntMask});
144+
EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
147145
}
148146

149147
/// Is the branch condition uniform or did the StructurizeCFG pass
@@ -206,6 +204,8 @@ bool SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
206204

207205
/// Open a new "If" block
208206
bool SIAnnotateControlFlow::openIf(BranchInst *Term) {
207+
if (isUniform(Term))
208+
return false;
209209

210210
IRBuilder<> IRB(Term);
211211
Value *IfCall = IRB.CreateCall(If, {Term->getCondition()});
@@ -306,43 +306,41 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
306306
}
307307

308308
/// Close the last opened control flow
309-
bool SIAnnotateControlFlow::tryWaveReconverge(BasicBlock *BB) {
309+
bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
310+
llvm::Loop *L = LI->getLoopFor(BB);
310311

311-
if (succ_empty(BB))
312-
return false;
312+
assert(Stack.back().first == BB);
313313

314-
BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
315-
if (Term->getNumSuccessors() == 1) {
316-
// The current BBs single successor is a top of the stack. We need to
317-
// reconverge over thaqt path.
318-
BasicBlock *SingleSucc = *succ_begin(BB);
319-
BasicBlock::iterator InsPt = Term ? BasicBlock::iterator(Term) : BB->end();
314+
if (L && L->getHeader() == BB) {
315+
// We can't insert an EndCF call into a loop header, because it will
316+
// get executed on every iteration of the loop, when it should be
317+
// executed only once before the loop.
318+
SmallVector <BasicBlock *, 8> Latches;
319+
L->getLoopLatches(Latches);
320320

321-
if (isTopOfStack(SingleSucc)) {
322-
Value *Exec = Stack.back().second;
323-
IRBuilder<>(BB, InsPt).CreateCall(WaveReconverge, {Exec});
321+
SmallVector<BasicBlock *, 2> Preds;
322+
for (BasicBlock *Pred : predecessors(BB)) {
323+
if (!is_contained(Latches, Pred))
324+
Preds.push_back(Pred);
324325
}
325-
} else {
326-
// We have a uniform conditional branch terminating the block.
327-
// THis block may be the last in the Then path of the enclosing divergent
328-
// IF.
329-
if (!isUniform(Term))
330-
// Divergent loop is going to be further processed in another place
331-
return false;
332-
333-
for (auto Succ : Term->successors()) {
334-
if (isTopOfStack(Succ)) {
335-
// Just split to make a room for further WAVE_RECONVERGE insertion
336-
SmallVector<BasicBlock *, 2> Preds;
337-
for (auto P : predecessors(Succ)) {
338-
if (DT->dominates(BB, P))
339-
Preds.push_back(P);
340-
}
341-
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
342-
SplitBlockPredecessors(Succ, Preds, ".reconverge", &DTU, LI, nullptr,
343-
false);
344-
}
326+
327+
BB = SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, nullptr,
328+
false);
329+
}
330+
331+
Value *Exec = popSaved();
332+
Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt();
333+
if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
334+
Instruction *ExecDef = cast<Instruction>(Exec);
335+
BasicBlock *DefBB = ExecDef->getParent();
336+
if (!DT->dominates(DefBB, BB)) {
337+
// Split edge to make Def dominate Use
338+
FirstInsertionPt = &*SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
345339
}
340+
IRBuilder<> IRB(FirstInsertionPt);
341+
// TODO: Clear dbg location for now as it causes regression in GDB tests.
342+
IRB.SetCurrentDebugLocation(DebugLoc());
343+
IRB.CreateCall(EndCf, {Exec});
346344
}
347345

348346
return true;
@@ -366,20 +364,14 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
366364

367365
if (!Term || Term->isUnconditional()) {
368366
if (isTopOfStack(BB))
369-
Stack.pop_back();
370-
371-
Changed |= tryWaveReconverge(BB);
367+
Changed |= closeControlFlow(BB);
372368

373369
continue;
374370
}
375371

376372
if (I.nodeVisited(Term->getSuccessor(1))) {
377373
if (isTopOfStack(BB))
378-
Stack.pop_back();
379-
380-
// Let's take care of uniform loop latch that may be closing the Then
381-
// path of the enclosing divergent branch.
382-
Changed |= tryWaveReconverge(BB);
374+
Changed |= closeControlFlow(BB);
383375

384376
if (DT->dominates(Term->getSuccessor(1), BB))
385377
Changed |= handleLoop(Term);
@@ -394,15 +386,10 @@ bool SIAnnotateControlFlow::runOnFunction(Function &F) {
394386
continue;
395387
}
396388

397-
Stack.pop_back();
389+
Changed |= closeControlFlow(BB);
398390
}
399391

400-
if (isUniform(Term))
401-
// Uniform conditional branch may be in the block that closes the Then
402-
// path of the divergent conditional branch.
403-
Changed |= tryWaveReconverge(BB);
404-
else
405-
Changed |= openIf(Term);
392+
Changed |= openIf(Term);
406393
}
407394

408395
if (!Stack.empty()) {

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 5 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6255,7 +6255,7 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
62556255
return AMDGPUISD::ELSE;
62566256
case Intrinsic::amdgcn_loop:
62576257
return AMDGPUISD::LOOP;
6258-
case Intrinsic::amdgcn_wave_reconverge:
6258+
case Intrinsic::amdgcn_end_cf:
62596259
llvm_unreachable("should not occur");
62606260
default:
62616261
return 0;
@@ -9880,10 +9880,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
98809880

98819881
return SDValue(Load, 0);
98829882
}
9883-
case Intrinsic::amdgcn_wave_reconverge:
9884-
return SDValue(DAG.getMachineNode(AMDGPU::SI_WAVE_RECONVERGE, DL,
9885-
MVT::Other, Op->getOperand(2), Chain),
9886-
0);
9883+
case Intrinsic::amdgcn_end_cf:
9884+
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
9885+
Op->getOperand(2), Chain), 0);
98879886
case Intrinsic::amdgcn_s_barrier_init:
98889887
case Intrinsic::amdgcn_s_barrier_join:
98899888
case Intrinsic::amdgcn_s_wakeup_barrier: {
@@ -15657,32 +15656,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
1565715656
}
1565815657
}
1565915658

15660-
// ISel inserts copy to regs for the successor PHIs
15661-
// at the BB end. We need to move the SI_WAVE_RECONVERGE right before the
15662-
// branch.
15663-
for (auto &MBB : MF) {
15664-
for (auto &MI : MBB) {
15665-
if (MI.getOpcode() == AMDGPU::SI_WAVE_RECONVERGE) {
15666-
MachineBasicBlock::iterator I(MI);
15667-
MachineBasicBlock::iterator Next = std::next(I);
15668-
bool NeedToMove = false;
15669-
while (Next != MBB.end() && !Next->isBranch()) {
15670-
NeedToMove = true;
15671-
Next++;
15672-
}
15673-
15674-
assert((Next == MBB.end() || !Next->readsRegister(AMDGPU::SCC, TRI)) &&
15675-
"Malformed CFG detected!\n");
15676-
15677-
if (NeedToMove) {
15678-
MBB.splice(Next, &MBB, &MI);
15679-
}
15680-
15681-
break;
15682-
}
15683-
}
15684-
}
15685-
1568615659
// FIXME: This is a hack to fixup AGPR classes to use the properly aligned
1568715660
// classes if required. Ideally the register class constraints would differ
1568815661
// per-subtarget, but there's no easy way to achieve that right now. This is
@@ -16256,7 +16229,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
1625616229
default:
1625716230
Result = false;
1625816231
break;
16259-
case Intrinsic::amdgcn_wave_reconverge:
16232+
case Intrinsic::amdgcn_end_cf:
1626016233
case Intrinsic::amdgcn_loop:
1626116234
Result = true;
1626216235
break;

0 commit comments

Comments (0)