
Commit 95d497f

[AMDGPU] W/a hazard if 64 bit shift amount is a highest allocated VGPR
In this case gfx90a uses v0 instead of the correct register. Swap the value temporarily with a lower register and then swap it back. Unfortunately, the hazard recognizer runs after wait count insertion, so we cannot simply reuse an arbitrary register; hence the workaround also includes a full waitcnt. This could be avoided if we ran it from expandPostRAPseudo, but that would be a complete misplacement.

Differential Revision: https://reviews.llvm.org/D133067
Parent: e321c8d
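For illustration only (this sequence is not part of the commit, and the registers are hypothetical): assuming the shift amount of a v_lshlrev_b64 lives in v7, the last VGPR of its allocation block with v8 and above unused, and v1 is free, the workaround expands the shift roughly into:

    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)  ; full wait: the scratch VGPR may have a pending count
    v_swap_b32 v1, v7                        ; temporarily move the shift amount into a lower VGPR
    v_lshlrev_b64 v[2:3], v1, v[2:3]         ; the shift now reads a register unaffected by the bug
    v_swap_b32 v7, v1                        ; swap back, restoring both v7 and v1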

File tree

4 files changed: +375, -0 lines changed
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 114 additions & 0 deletions
@@ -300,6 +300,20 @@ void GCNHazardRecognizer::processBundle() {
   CurrCycleInstr = nullptr;
 }
 
+void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
+  assert(IsHazardRecognizerMode);
+
+  unsigned NumPreNoops = PreEmitNoops(MI);
+  EmitNoops(NumPreNoops);
+  if (MI->isInsideBundle())
+    insertNoopsInBundle(MI, TII, NumPreNoops);
+  else
+    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
+                    NumPreNoops);
+  EmitInstruction(MI);
+  AdvanceCycle();
+}
+
 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
   IsHazardRecognizerMode = true;
   CurrCycleInstr = MI;
@@ -1087,6 +1101,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
   fixVALUPartialForwardingHazard(MI);
   fixVALUTransUseHazard(MI);
   fixWMMAHazards(MI);
+  fixShift64HighRegBug(MI);
 }
 
 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
@@ -1739,6 +1754,105 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
   return true;
 }
 
+bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
+  if (!ST.hasShift64HighRegBug())
+    return false;
+
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case AMDGPU::V_LSHLREV_B64_e64:
+  case AMDGPU::V_LSHRREV_B64_e64:
+  case AMDGPU::V_ASHRREV_I64_e64:
+    break;
+  }
+
+  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
+  if (!Amt->isReg())
+    return false;
+
+  Register AmtReg = Amt->getReg();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  // Check if this is the last VGPR in the allocation block.
+  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
+    return false;
+
+  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
+    return false;
+
+  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
+  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
+  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
+  bool Overlapped = OverlappedSrc || OverlappedDst;
+
+  assert(!OverlappedDst || !OverlappedSrc ||
+         Src1->getReg() == MI->getOperand(0).getReg());
+  assert(ST.needsAlignedVGPRs());
+  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
+
+  Register NewReg;
+  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
+                                   : AMDGPU::VGPR_32RegClass) {
+    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
+      NewReg = Reg;
+      break;
+    }
+  }
+
+  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
+                               : NewReg;
+  Register NewAmtLo;
+
+  if (Overlapped)
+    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
+
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock *MBB = MI->getParent();
+  // Insert a full waitcnt because the found register might have a pending wait.
+  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
+      .addImm(0);
+
+  // Insert V_SWAP_B32 instruction(s) and run the hazard recognizer on them.
+  if (Overlapped)
+    runOnInstruction(
+        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
+            .addDef(AmtReg - 1)
+            .addReg(AmtReg - 1)
+            .addReg(NewAmtLo));
+  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
+                       .addDef(AmtReg)
+                       .addReg(AmtReg)
+                       .addReg(NewAmt));
+
+  // Instructions emitted after the current instruction will be processed by
+  // the parent loop of the hazard recognizer in a natural way.
+  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
+          AmtReg)
+      .addDef(NewAmt)
+      .addReg(NewAmt)
+      .addReg(AmtReg);
+  if (Overlapped)
+    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
+            AmtReg - 1)
+        .addDef(NewAmtLo)
+        .addReg(NewAmtLo)
+        .addReg(AmtReg - 1);
+
+  // Re-running the hazard recognizer on the modified instruction is not needed;
+  // the inserted V_SWAP_B32 has already both read and written the new registers,
+  // so hazards related to these registers have already been handled.
+  Amt->setReg(NewAmt);
+  Amt->setIsKill(false);
+  if (OverlappedDst)
+    MI->getOperand(0).setReg(NewReg);
+  if (OverlappedSrc) {
+    Src1->setReg(NewReg);
+    Src1->setIsKill(false);
+  }
+
+  return true;
+}
+
 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
   int NSAtoVMEMWaitStates = 1;

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h

Lines changed: 5 additions & 0 deletions
@@ -70,6 +70,10 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   // instructions.
   void processBundle();
 
+  // Run on an individual instruction in hazard recognizer mode. This can be
+  // used on a newly inserted instruction before returning from PreEmitNoops.
+  void runOnInstruction(MachineInstr *MI);
+
   int getWaitStatesSince(IsHazardFn IsHazard, int Limit);
   int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit);
   int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit);
@@ -101,6 +105,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   bool fixVALUPartialForwardingHazard(MachineInstr *MI);
   bool fixVALUTransUseHazard(MachineInstr *MI);
   bool fixWMMAHazards(MachineInstr *MI);
+  bool fixShift64HighRegBug(MachineInstr *MI);
 
   int checkMAIHazards(MachineInstr *MI);
   int checkMAIHazards908(MachineInstr *MI);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 6 additions & 0 deletions
@@ -1008,6 +1008,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return HasLdsBranchVmemWARHazard;
   }
 
+  // The shift amount of a 64-bit shift cannot be the highest allocated
+  // register if it is also at the end of the allocation block.
+  bool hasShift64HighRegBug() const {
+    return GFX90AInsts && !GFX940Insts;
+  }
+
   // Has one cycle hazard on transcendental instruction feeding a
   // non transcendental VALU.
   bool hasTransForwardingHazard() const { return GFX940Insts; }
