AMDGPU: Don't add emergency stack slot if all spills are SGPR->VGPR

arsenm · arsenm · commit 7b6c5d28f54c · 2017-02-22T22:23:32.000Z
This should avoid reporting that any stack needs to be allocated in the
case where no stack is truly used. An unused stack slot is still
left around in other cases where there are real stack objects
but no spilling occurs.

llvm-svn: 295891
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -383,6 +383,16 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
 
 }
 
+static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
+  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
+       I != E; ++I) {
+    if (!MFI.isDeadObjectIndex(I))
+      return false;
+  }
+
+  return true;
+}
+
 void SIFrameLowering::processFunctionBeforeFrameFinalized(
   MachineFunction &MF,
   RegScavenger *RS) const {
@@ -391,8 +401,51 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
   if (!MFI.hasStackObjects())
     return;
 
-  bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects();
-  if (MayNeedScavengingEmergencySlot) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  bool AllSGPRSpilledToVGPRs = false;
+
+  if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
+    AllSGPRSpilledToVGPRs = true;
+
+    // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
+    // are spilled to VGPRs, in which case we can eliminate the stack usage.
+    //
+    // XXX - This operates under the assumption that only other SGPR spills are
+    // users of the frame index. I'm not 100% sure this is correct. The
+    // StackColoring pass has a comment saying a future improvement would be to
+    // merging of allocas with spill slots, but for now according to
+    // MachineFrameInfo isSpillSlot can't alias any other object.
+    for (MachineBasicBlock &MBB : MF) {
+      MachineBasicBlock::iterator Next;
+      for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
+        MachineInstr &MI = *I;
+        Next = std::next(I);
+
+        if (TII->isSGPRSpill(MI)) {
+          int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
+          if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
+            bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
+            (void)Spilled;
+            assert(Spilled && "failed to spill SGPR to VGPR when allocated");
+          } else
+            AllSGPRSpilledToVGPRs = false;
+        }
+      }
+    }
+
+    FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
+  }
+
+  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
+  // but currently hasNonSpillStackObjects is set only from source
+  // allocas. Stack temps produced from legalization are not counted currently.
+  if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
+      !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
+    assert(RS && "RegScavenger required if spilling");
+
     // We force this to be at offset 0 so no user object ever has 0 as an
     // address, so we may use 0 as an invalid pointer value. This is because
     // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
@@ -410,40 +463,6 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
       AMDGPU::SGPR_32RegClass.getSize(), 0, false);
     RS->addScavengingFrameIndex(ScavengeFI);
   }
-
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  const SIRegisterInfo &TRI = TII->getRegisterInfo();
-  if (!TRI.spillSGPRToVGPR())
-    return;
-
-  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-  if (!FuncInfo->hasSpilledSGPRs())
-    return;
-
-  // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
-  // are spilled to VGPRs, in which case we can eliminate the stack usage.
-  //
-  // XXX - This operates under the assumption that only other SGPR spills are
-  // users of the frame index. I'm not 100% sure this is correct. The
-  // StackColoring pass has a comment saying a future improvement would be to
-  // merging of allocas with spill slots, but for now according to
-  // MachineFrameInfo isSpillSlot can't alias any other object.
-  for (MachineBasicBlock &MBB : MF) {
-    MachineBasicBlock::iterator Next;
-    for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
-      MachineInstr &MI = *I;
-      Next = std::next(I);
-
-      if (TII->isSGPRSpill(MI)) {
-        int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
-        if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI))
-          TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
-      }
-    }
-  }
-
-  FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
 }
 
 void SIFrameLowering::emitDebuggerPrologue(MachineFunction &MF,
diff --git a/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/llvm/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -18,13 +18,12 @@
 ; GCN: s_mov_b32 m0
 
 ; Make sure scratch space isn't being used for SGPR->VGPR spills
-; FIXME: Seem to be leaving behind unused emergency slot.
 
 ; Writing to M0 from an SMRD instruction will hang the GPU.
 ; GCN-NOT: s_buffer_load_dword m0
 ; GCN: s_endpgm
 
-; TOVGPR: ScratchSize: 4{{$}}
+; TOVGPR: ScratchSize: 0{{$}}
 define amdgpu_ps void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
 main_body:
   %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
@@ -768,7 +767,7 @@ ENDIF66:                                          ; preds = %LOOP65
 
 ; GCN-LABEL: {{^}}main1:
 ; GCN: s_endpgm
-; TOVGPR: ScratchSize: 4{{$}}
+; TOVGPR: ScratchSize: 0{{$}}
 define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
 main_body:
   %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0