
Commit 01b87dc

[amdgpu] Add llvm.amdgcn.init.whole.wave intrinsic
This intrinsic is meant to be used in functions that have a "tail" that needs to be run with all the lanes enabled. The "tail" may contain complex control flow that makes it unsuitable for the use of the existing WWM intrinsics. Instead, we will pretend that the function starts with all the lanes enabled, then branches into the actual body of the function for the lanes that were meant to run it, and then finally all the lanes will rejoin and run the tail.

As such, the intrinsic will return the EXEC mask for the body of the function, and is meant to be used only as part of a very limited pattern (for now only in amdgpu_cs_chain functions):

```
entry:
  %func_exec = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %func_exec, label %func, label %tail

func:
  ; ... stuff that should run with the actual EXEC mask
  br label %tail

tail:
  ; ... stuff that runs with all the lanes enabled;
  ; can contain more than one basic block
```

It's an error to use the result of this intrinsic for anything other than a branch (but unfortunately checking that in the verifier is non-trivial, because SIAnnotateControlFlow will introduce an amdgcn.if between the intrinsic and the branch).

The intrinsic is lowered to a SI_INIT_WHOLE_WAVE pseudo, which for now is expanded in si-wqm (which is where SI_INIT_EXEC is handled too); however, the information that the function was conceptually started in whole wave mode is stored in the machine function info (hasInitWholeWave). This will be useful in prolog/epilog insertion, where we can skip saving the inactive lanes for CSRs (since if the function started with all the lanes active, there are no inactive lanes to preserve).
1 parent 32cef07 · commit 01b87dc

21 files changed: +2033 −4 lines
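To make the intended shape concrete, here is a hedged sketch of a complete `amdgpu_cs_chain` function using the pattern from the commit message (the function name, arguments, and block contents are illustrative, not taken from this commit):

```llvm
; Sketch only: @example and its arguments are hypothetical.
declare i1 @llvm.amdgcn.init.whole.wave()

define amdgpu_cs_chain void @example(<3 x i32> inreg %sgpr, i32 %vgpr) {
entry:
  ; Conceptually all lanes are active here; %func_exec is true only in the
  ; lanes that were actually active when the function was entered.
  %func_exec = call i1 @llvm.amdgcn.init.whole.wave()
  br i1 %func_exec, label %func, label %tail

func:                                           ; runs with the entry EXEC mask
  %v = add i32 %vgpr, 1
  br label %tail

tail:                                           ; runs with all lanes enabled
  ; The %entry incoming value supplies the inactive lanes; see the phi
  ; handling added to SIISelLowering.cpp below.
  %r = phi i32 [ %vgpr, %entry ], [ %v, %func ]
  ; A real chain function would normally end in a tail call to
  ; llvm.amdgcn.cs.chain (elided here).
  unreachable
}
```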

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 14 additions & 0 deletions
@@ -208,6 +208,20 @@ def int_amdgcn_init_exec_from_input : Intrinsic<[],
   [IntrConvergent, IntrHasSideEffects, IntrNoMem, IntrNoCallback,
    IntrNoFree, IntrWillReturn, ImmArg<ArgIndex<1>>]>;

+// Sets the function into whole-wave-mode and returns whether the lane was
+// active when entering the function. A branch depending on this return will
+// revert the EXEC mask to what it was when entering the function, thus
+// resulting in a no-op. This pattern is used to optimize branches when function
+// tails need to be run in whole-wave-mode. It may also have other consequences
+// (mostly related to WWM CSR handling) that differentiate it from using
+// a plain `amdgcn.init.exec -1`.
+//
+// Can only be used in functions with the `amdgpu_cs_chain` calling convention.
+// Using this intrinsic without immediately branching on its return value is an
+// error.
+def int_amdgcn_init_whole_wave : Intrinsic<[llvm_i1_ty], [], [
+    IntrHasSideEffects, IntrNoMem, IntrNoDuplicate, IntrConvergent]>;
+
 def int_amdgcn_wavefrontsize :
   ClangBuiltin<"__builtin_amdgcn_wavefrontsize">,
   DefaultAttrsIntrinsic<[llvm_i32_ty], [], [NoUndef<RetIndex>, IntrNoMem, IntrSpeculatable]>;
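To illustrate the constraint in the comment above, a use like the following would be invalid (hedged sketch; not from the patch):

```llvm
; Invalid: the i1 result must feed a conditional branch directly.
%active = call i1 @llvm.amdgcn.init.whole.wave()
%bad = zext i1 %active to i32   ; error: result used as data, not branched on
```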

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 5 additions & 0 deletions
@@ -2738,6 +2738,11 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
   case Intrinsic::amdgcn_ds_bvh_stack_rtn:
     SelectDSBvhStackIntrinsic(N);
     return;
+  case Intrinsic::amdgcn_init_whole_wave:
+    CurDAG->getMachineFunction()
+        .getInfo<SIMachineFunctionInfo>()
+        ->setInitWholeWave();
+    break;
   }

   SelectCode(N);

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 10 additions & 0 deletions
@@ -1772,6 +1772,14 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
   return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
 }

+bool AMDGPUInstructionSelector::selectInitWholeWave(MachineInstr &MI) const {
+  MachineFunction *MF = MI.getParent()->getParent();
+  SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
+
+  MFInfo->setInitWholeWave();
+  return selectImpl(MI, *CoverageInfo);
+}
+
 bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
   if (TM.getOptLevel() > CodeGenOptLevel::None) {
     unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
@@ -2099,6 +2107,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
     return selectDSAppendConsume(I, true);
   case Intrinsic::amdgcn_ds_consume:
     return selectDSAppendConsume(I, false);
+  case Intrinsic::amdgcn_init_whole_wave:
+    return selectInitWholeWave(I);
   case Intrinsic::amdgcn_s_barrier:
     return selectSBarrier(I);
   case Intrinsic::amdgcn_raw_buffer_load_lds:

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h

Lines changed: 1 addition & 0 deletions
@@ -120,6 +120,7 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
   bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
   bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
+  bool selectInitWholeWave(MachineInstr &MI) const;
   bool selectSBarrier(MachineInstr &MI) const;
   bool selectDSBvhStackIntrinsic(MachineInstr &MI) const;

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Lines changed: 5 additions & 0 deletions
@@ -67,6 +67,8 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   // Kernel may need limited waves per EU for better performance.
   bool WaveLimiter = false;

+  bool HasInitWholeWave = false;
+
 public:
   AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);

@@ -109,6 +111,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
     return WaveLimiter;
   }

+  bool hasInitWholeWave() const { return HasInitWholeWave; }
+  void setInitWholeWave() { HasInitWholeWave = true; }
+
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
     return allocateLDSGlobal(DL, GV, DynLDSAlign);
   }

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 1 addition & 0 deletions
@@ -4997,6 +4997,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
     break;
   }
+  case Intrinsic::amdgcn_init_whole_wave:
   case Intrinsic::amdgcn_live_mask: {
     OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
     break;

llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td

Lines changed: 1 addition & 0 deletions
@@ -329,6 +329,7 @@ def : SourceOfDivergence<int_amdgcn_mov_dpp>;
 def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
 def : SourceOfDivergence<int_amdgcn_update_dpp>;
 def : SourceOfDivergence<int_amdgcn_writelane>;
+def : SourceOfDivergence<int_amdgcn_init_whole_wave>;

 foreach intr = AMDGPUMFMAIntrinsics908 in
 def : SourceOfDivergence<intr>;

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 3 additions & 0 deletions
@@ -1739,6 +1739,9 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
                            ? DenormalMode::IEEE
                            : DenormalMode::PreserveSign;

+  if (YamlMFI.HasInitWholeWave)
+    MFI->setInitWholeWave();
+
   return false;
 }

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 8 additions & 4 deletions
@@ -1343,10 +1343,14 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(

   // Allocate spill slots for WWM reserved VGPRs.
   // For chain functions, we only need to do this if we have calls to
-  // llvm.amdgcn.cs.chain.
-  bool IsChainWithoutCalls =
-      FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
-  if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
+  // llvm.amdgcn.cs.chain (otherwise there's no one to save them for, since
+  // chain functions do not return) and the function did not contain a call to
+  // llvm.amdgcn.init.whole.wave (since in that case there are no inactive lanes
+  // when entering the function).
+  bool IsChainWithoutRestores =
+      FuncInfo->isChainFunction() &&
+      (!MF.getFrameInfo().hasTailCall() || FuncInfo->hasInitWholeWave());
+  if (!FuncInfo->isEntryFunction() && !IsChainWithoutRestores) {
     for (Register Reg : FuncInfo->getWWMReservedRegs()) {
       const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
       FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 133 additions & 0 deletions
@@ -15677,6 +15677,133 @@ static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
   }
 }

+static void removeInitWholeWaveBranch(MachineFunction &MF,
+                                      MachineRegisterInfo &MRI,
+                                      const SIInstrInfo *TII) {
+  // Remove SI_INIT_WHOLE_WAVE and the following SI_IF/END_CF and instead set
+  // EXEC to -1 at SI_END_CF.
+  auto IWWIt = find_if(MF.begin()->instrs(), [](const MachineInstr &MI) {
+    return MI.getOpcode() == AMDGPU::SI_INIT_WHOLE_WAVE;
+  });
+  if (IWWIt == MF.begin()->instr_end())
+    return; // We've been here before (GISel runs finalizeLowering twice).
+
+  MachineInstr &If = *MRI.use_begin(IWWIt->getOperand(0).getReg())->getParent();
+  assert(If.getOpcode() == AMDGPU::SI_IF &&
+         "Unexpected user for init.whole.wave result");
+  assert(MRI.hasOneUse(IWWIt->getOperand(0).getReg()) &&
+         "Expected simple control flow");
+
+  MachineInstr &EndCf = *MRI.use_begin(If.getOperand(0).getReg())->getParent();
+  MachineBasicBlock *EndBB = EndCf.getParent();
+
+  // Update all the Phis: since we're removing a predecessor, we need to remove
+  // the corresponding pair of operands. However, we can't just drop the value
+  // coming from the 'if' block - that's going to be the value of the inactive
+  // lanes.
+  // %v = phi (%inactive, %if), (%active1, %shader1), ... (%activeN, %shaderN)
+  // should become
+  // %t = phi (%active1, %shader1), ... (%activeN, %shaderN)
+  // %v = v_set_inactive %t, %inactive
+  // Note that usually EndCf will be the first instruction after the phis and as
+  // such will serve as the end of the range when iterating over phis.
+  // Therefore, we shouldn't introduce any new instructions before it.
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  auto AfterEndCf = std::next(EndCf.getIterator());
+  for (auto &Phi : EndBB->phis()) {
+    Register PhiDest = Phi.getOperand(0).getReg();
+    const TargetRegisterClass *PhiRC = MRI.getRegClass(PhiDest);
+
+    Register NewPhiDest = MRI.createVirtualRegister(PhiRC);
+    Phi.getOperand(0).setReg(NewPhiDest);
+
+    unsigned InactiveOpIdx = 0;
+    for (unsigned I = 1; I < Phi.getNumOperands(); I += 2) {
+      if (Phi.getOperand(I + 1).getMBB() == If.getParent()) {
+        InactiveOpIdx = I;
+        break;
+      }
+    }
+    assert(InactiveOpIdx != 0 && "Broken phi?");
+
+    // At this point, the register class could be larger than 32 or 64, so we
+    // might have to use more than one V_SET_INACTIVE instruction.
+    unsigned Size = TRI.getRegSizeInBits(*PhiRC);
+    switch (Size) {
+    case 32:
+      BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+              TII->get(AMDGPU::V_SET_INACTIVE_B32), PhiDest)
+          .addReg(NewPhiDest)
+          .add(Phi.getOperand(InactiveOpIdx));
+      break;
+    case 64:
+      BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+              TII->get(AMDGPU::V_SET_INACTIVE_B64), PhiDest)
+          .addReg(NewPhiDest)
+          .add(Phi.getOperand(InactiveOpIdx));
+      break;
+    default: {
+      // For each 32-bit subregister of the register at InactiveOpIdx, insert
+      // a COPY to a new register, and a V_SET_INACTIVE_B32 using the
+      // corresponding subregisters of PhiDest and NewPhiDest.
+      // FIXME: There has to be a better way to iterate over this...
+      llvm::SmallVector<Register, 16> PhiSubRegs;
+      const unsigned SubRegIndices[] = {
+          AMDGPU::sub0,  AMDGPU::sub1,  AMDGPU::sub2,  AMDGPU::sub3,
+          AMDGPU::sub4,  AMDGPU::sub5,  AMDGPU::sub6,  AMDGPU::sub7,
+          AMDGPU::sub8,  AMDGPU::sub9,  AMDGPU::sub10, AMDGPU::sub11,
+          AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+          AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
+          AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
+          AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
+          AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31};
+      const unsigned NumSubRegs = Size / 32;
+      assert(sizeof(SubRegIndices) / sizeof(SubRegIndices[0]) >= NumSubRegs &&
+             "Not enough subregister indices");
+      for (unsigned I = 0; I != NumSubRegs; ++I) {
+        unsigned SubRegIdx = SubRegIndices[I];
+        Register InactiveSubReg =
+            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(), TII->get(AMDGPU::COPY),
+                InactiveSubReg)
+            .addReg(Phi.getOperand(InactiveOpIdx).getReg(), 0, SubRegIdx);
+
+        Register AllLanesSubReg =
+            MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+        BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+                TII->get(AMDGPU::V_SET_INACTIVE_B32), AllLanesSubReg)
+            .addReg(NewPhiDest, 0, SubRegIdx)
+            .addReg(InactiveSubReg);
+        PhiSubRegs.push_back(AllLanesSubReg);
+      }
+      // Now we need to combine the subregisters into the original register.
+      auto RegSequence = BuildMI(*EndBB, AfterEndCf, Phi.getDebugLoc(),
+                                 TII->get(AMDGPU::REG_SEQUENCE), PhiDest);
+      for (unsigned I = 0; I < NumSubRegs; ++I) {
+        RegSequence.addReg(PhiSubRegs[I]);
+        RegSequence.addImm(SubRegIndices[I]);
+      }
+      break;
+    }
+    }
+
+    Phi.removeOperand(InactiveOpIdx + 1);
+    Phi.removeOperand(InactiveOpIdx);
+  }
+  If.getParent()->removeSuccessor(EndBB);
+
+  BuildMI(*EndBB, AfterEndCf, IWWIt->getDebugLoc(),
+          TII->get(MF.getSubtarget<GCNSubtarget>().isWave32()
+                       ? AMDGPU::S_MOV_B32
+                       : AMDGPU::S_MOV_B64),
+          TII->getRegisterInfo().getExec())
+      .addImm(-1);
+
+  EndCf.eraseFromParent();
+  If.eraseFromParent();
+  IWWIt->eraseFromParent();
+}
+
 // Figure out which registers should be reserved for stack access. Only after
 // the function is legalized do we know all of the non-spill stack objects or if
 // calls are present.
@@ -15687,6 +15814,12 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
   const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
   const SIInstrInfo *TII = ST.getInstrInfo();

+  if (Info->hasInitWholeWave()) {
+    assert(Info->isChainFunction() &&
+           "init.whole.wave may only be used in chain functions");
+    removeInitWholeWaveBranch(MF, MRI, TII);
+  }
+
   if (Info->isEntryFunction()) {
     // Callable functions have fixed registers used for stack access.
     reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
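Putting the pieces of `removeInitWholeWaveBranch` together, its net effect on the tail block looks roughly like this (pseudo-MIR sketch; register and block names are illustrative):

```
; Before: the inactive lanes reach the tail straight from the entry block.
;   %v = PHI %inactive:%entry, %active:%func
;   SI_END_CF %saved_exec
;
; After: the entry edge is removed, SI_INIT_WHOLE_WAVE / SI_IF / SI_END_CF are
; erased, and the inactive-lane value is merged back in explicitly:
;   %t = PHI %active:%func
;   %v = V_SET_INACTIVE_B32 %t, %inactive  ; active lanes: %t, inactive: %inactive
;   $exec = S_MOV_B64 -1                   ; S_MOV_B32 on wave32
```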

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 8 additions & 0 deletions
@@ -583,6 +583,14 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
   let Defs = [EXEC];
 }

+// Sets EXEC to all lanes and returns the previous EXEC.
+def SI_INIT_WHOLE_WAVE : SPseudoInstSI <
+  (outs SReg_1:$dst), (ins),
+  [(set i1:$dst, (int_amdgcn_init_whole_wave))]> {
+  let Defs = [EXEC];
+  let Uses = [EXEC];
+}
+
 // Return for returning shaders to a shader variant epilog.
 def SI_RETURN_TO_EPILOG : SPseudoInstSI <
   (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
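Given that description (save the old EXEC, then enable all lanes), the si-wqm expansion of the pseudo is presumably a single save-exec instruction along these lines (an assumption on my part; the expansion itself is not part of this diff):

```
; Hypothetical wave64 expansion (wave32 would use s_or_saveexec_b32):
$dst = S_OR_SAVEEXEC_B64 -1   ; $dst = old EXEC, then EXEC |= -1 (all lanes)
```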

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 3 additions & 0 deletions
@@ -289,6 +289,8 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
   StringValue SGPRForEXECCopy;
   StringValue LongBranchReservedReg;

+  bool HasInitWholeWave = false;
+
   SIMachineFunctionInfo() = default;
   SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
                         const TargetRegisterInfo &TRI,
@@ -336,6 +338,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
                        StringValue()); // Don't print out when it's empty.
     YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
                        StringValue());
+    YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
   }
 };
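With the mapping above, a MIR dump of a function that used the intrinsic should carry the flag in its YAML block, presumably like this (sketch; only the key name comes from the patch):

```
machineFunctionInfo:
  hasInitWholeWave: true
```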
