Add Frame-Pointer support to stack calls:

dlei6g · sys_zuul · commit a0bbba5e01a5 · 2020-06-22T12:39:05.000-07:00
- The stack-call convention now matches the standard OS implementation instead of relying only on SP and offsets for frame calculation.
- Extra register used to store FP info.
- Stack walking is supported by storing the previous frame's FP at the start of each frame.

Other code refactoring and bug fixes are also included.

Change-Id: I7e5137133e2e9affdf49c3d1f9310cfe2d379077
diff --git a/IGC/Compiler/CISACodeGen/CShader.cpp b/IGC/Compiler/CISACodeGen/CShader.cpp
@@ -99,7 +99,8 @@ void CShader::InitEncoder(SIMDMode simdSize, bool canAbortOnSpill, ShaderDispatc
     m_DBG = nullptr;
     m_HW_TID = nullptr;
     m_SP = nullptr;
-    m_SavedSP = nullptr;
+    m_FP = nullptr;
+    m_SavedFP = nullptr;
     m_ARGV = nullptr;
     m_RETV = nullptr;
 
@@ -247,6 +248,9 @@ CVariable* CShader::CreateSP()
     // create stack-pointer register
     m_SP = GetNewVariable(1, ISA_TYPE_UQ, EALIGN_QWORD, true, 1, "SP");
     encoder.GetVISAPredefinedVar(m_SP, PREDEFINED_FE_SP);
+    // create frame-pointer register
+    m_FP = GetNewVariable(1, ISA_TYPE_UQ, EALIGN_QWORD, true, 1, "FP");
+    encoder.GetVISAPredefinedVar(m_FP, PREDEFINED_FE_FP);
 
     return m_SP;
 }
@@ -269,108 +273,26 @@ uint32_t CShader::GetMaxPrivateMem()
     return MaxPrivateSize;
 }
 
-/// initial stack-pointer at the beginning of the kernel
-void CShader::InitKernelStack(CVariable*& stackBase, CVariable*& stackAllocSize)
+/// save FP of previous frame when entering a stack-call function
+void CShader::SaveStackState()
 {
-    CreateSP();
-    ImplicitArgs implicitArgs(*entry, m_pMdUtils);
-    unsigned numPushArgs = m_ModuleMetadata->pushInfo.pushAnalysisWIInfos.size();
-    unsigned numImplicitArgs = implicitArgs.size();
-    unsigned numFuncArgs = IGCLLVM::GetFuncArgSize(entry) - numImplicitArgs - numPushArgs;
-
-    Argument* kerArg = nullptr;
-    llvm::Function::arg_iterator arg = entry->arg_begin();
-    for (unsigned i = 0; i < numFuncArgs; ++i, ++arg);
-    for (unsigned i = 0; i < numImplicitArgs; ++i, ++arg) {
-        ImplicitArg implicitArg = implicitArgs[i];
-        if (implicitArg.getArgType() == ImplicitArg::ArgType::PRIVATE_BASE)
-        {
-            kerArg = (&*arg);
-            break;
-        }
-    }
-    IGC_ASSERT(kerArg);
-
-    CVariable* pHWTID = GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, 1, "HWTID");
-    encoder.SetSrcRegion(0, 0, 1, 0);
-    encoder.SetSrcSubReg(0, 5);
-    encoder.And(pHWTID, GetR0(), ImmToVariable(0x1ff, ISA_TYPE_UD));
+    IGC_ASSERT(!m_SavedFP && m_FP && m_SP);
+    m_SavedFP = GetNewVariable(m_FP);
+    encoder.Copy(m_SavedFP, m_FP);
     encoder.Push();
-
-    CVariable* pSize = nullptr;
-
-    // Maximun private size in byte, per-workitem
-    // When there's stack call, we don't know the actual stack size being used,
-    // so set a conservative max stack size.
-    uint32_t MaxPrivateSize = GetMaxPrivateMem();
-    if (IGC_IS_FLAG_ENABLED(EnableRuntimeFuncAttributePatching))
-    {
-        // Experimental: Patch private memory size
-        std::string patchName = "INTEL_PATCH_PRIVATE_MEMORY_SIZE";
-        pSize = GetNewVariable(1, ISA_TYPE_UD, CVariable::getAlignment(getGRFSize()), true, CName(patchName));
-        encoder.AddVISASymbol(patchName, pSize);
-    }
-    else
-    {
-        // hard-code per-workitem private-memory size to max size
-        pSize = ImmToVariable(MaxPrivateSize * numLanes(m_dispatchSize), ISA_TYPE_UD);
-    }
-
-    CVariable* pTemp = GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, 1, CName::NONE);
-    encoder.Mul(pTemp, pHWTID, pSize);
-    encoder.Push();
-
-    // reserve space for alloca
-    auto funcMDItr = m_ModuleMetadata->FuncMD.find(entry);
-    if (funcMDItr != m_ModuleMetadata->FuncMD.end())
-    {
-        if (funcMDItr->second.privateMemoryPerWI != 0)
-        {
-            unsigned totalAllocaSize = funcMDItr->second.privateMemoryPerWI * numLanes(m_dispatchSize);
-            encoder.Add(pTemp, pTemp, ImmToVariable(totalAllocaSize, ISA_TYPE_UD));
-            encoder.Push();
-
-            // Set the total alloca size for the entry function
-            encoder.SetFunctionAllocaStackSize(entry, totalAllocaSize);
-
-            if ((uint32_t)funcMDItr->second.privateMemoryPerWI > MaxPrivateSize)
-            {
-                GetContext()->EmitError("Private memory allocation exceeds max allowed size");
-                IGC_ASSERT(0);
-            }
-        }
-    }
-
-    if (!IGC_IS_FLAG_ENABLED(EnableRuntimeFuncAttributePatching))
-    {
-        // If we don't return per-function private memory size,
-        // modify private-memory size to a large setting.
-        // This will be reported through patch-tokens as per-kernel requirement.
-        m_ModuleMetadata->FuncMD[entry].privateMemoryPerWI = MaxPrivateSize;
-    }
-
-    stackBase = GetSymbol(kerArg);
-    stackAllocSize = pTemp;
 }
 
-/// save stack-pointer when entering a stack-call function
-void CShader::SaveSP()
+/// restore SP and FP when exiting a stack-call function
+void CShader::RestoreStackState()
 {
-    IGC_ASSERT(!m_SavedSP);
-    IGC_ASSERT(m_SP);
-    m_SavedSP = GetNewVariable(m_SP);
-    encoder.Copy(m_SavedSP, m_SP);
+    IGC_ASSERT(m_SavedFP && m_FP && m_SP);
+    // Restore SP to current FP
+    encoder.Copy(m_SP, m_FP);
     encoder.Push();
-}
-
-/// restore stack-pointer when exiting a stack-call function
-void CShader::RestoreSP()
-{
-    IGC_ASSERT(m_SavedSP);
-    IGC_ASSERT(m_SP);
-    encoder.Copy(m_SP, m_SavedSP);
+    // Restore FP to previous frame's FP
+    encoder.Copy(m_FP, m_SavedFP);
     encoder.Push();
-    m_SavedSP = nullptr;
+    m_SavedFP = nullptr;
 }
 
 void CShader::CreateImplicitArgs()
@@ -822,13 +744,41 @@ CVariable* CShader::GetHWTID()
 {
     if (!m_HW_TID)
     {
-        m_HW_TID = GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, 1, CName::NONE);
-        encoder.GetVISAPredefinedVar(m_HW_TID, PREDEFINED_HW_TID);
+        {
+            m_HW_TID = GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, 1, "HWTID");
+            encoder.GetVISAPredefinedVar(m_HW_TID, PREDEFINED_HW_TID);
+        }
     }
     return m_HW_TID;
 }
 
+CVariable* CShader::GetPrivateBase()
+{
+    ImplicitArgs implicitArgs(*entry, m_pMdUtils);
+    unsigned numPushArgs = m_ModuleMetadata->pushInfo.pushAnalysisWIInfos.size();
+    unsigned numImplicitArgs = implicitArgs.size();
+    unsigned numFuncArgs = IGCLLVM::GetFuncArgSize(entry) - numImplicitArgs - numPushArgs;
+
+    Argument* kerArg = nullptr;
+    llvm::Function::arg_iterator arg = entry->arg_begin();
+    for (unsigned i = 0; i < numFuncArgs; ++i, ++arg);
+    for (unsigned i = 0; i < numImplicitArgs; ++i, ++arg) {
+        ImplicitArg implicitArg = implicitArgs[i];
+        if (implicitArg.getArgType() == ImplicitArg::ArgType::PRIVATE_BASE)
+        {
+            kerArg = (&*arg);
+            break;
+        }
+    }
+    IGC_ASSERT(kerArg);
+    return GetSymbol(kerArg);
+}
 
+CVariable* CShader::GetFP()
+{
+    IGC_ASSERT(m_FP);
+    return m_FP;
+}
 CVariable* CShader::GetSP()
 {
     IGC_ASSERT(m_SP);
diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.cpp
@@ -41,6 +41,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "ShaderCodeGen.hpp"
 #include "common/allocator.h"
 #include "common/debug/Dump.hpp"
+#include "common/debug/Dump.hpp"
 #include "common/igc_regkeys.hpp"
 #include "common/Stats.hpp"
 #include "Compiler/CISACodeGen/helper.h"
@@ -450,10 +451,7 @@ bool EmitPass::runOnFunction(llvm::Function& F)
         if (hasStackCall)
         {
             m_encoder->InitFuncAttribute(&F, true);
-            CVariable* pStackBase = nullptr;
-            CVariable* pStackSize = nullptr;
-            m_currShader->InitKernelStack(pStackBase, pStackSize);
-            emitAddSP(m_currShader->GetSP(), pStackBase, pStackSize);
+            InitializeKernelStack(&F);
         }
         m_currShader->AddPrologue();
     }
@@ -9380,6 +9378,80 @@ void EmitPass::emitReturn(llvm::ReturnInst* inst)
     m_currShader->AddEpilogue(inst);
 }
 
+/// Initializes the kernel for stack call by initializing the SP and FP
+void EmitPass::InitializeKernelStack(Function* pKernel)
+{
+    m_currShader->CreateSP();
+    auto pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
+    auto pModuleMetadata = pCtx->getModuleMetaData();
+
+    CVariable* pStackBufferBase = m_currShader->GetPrivateBase();
+
+    CVariable* pHWTID = m_currShader->GetHWTID();
+
+    CVariable* pSize = nullptr;
+
+    // Maximum private size in bytes, per-workitem
+    // When there's stack call, we don't know the actual stack size being used,
+    // so set a conservative max stack size.
+    uint32_t MaxPrivateSize = m_currShader->GetMaxPrivateMem();
+    if (IGC_IS_FLAG_ENABLED(EnableRuntimeFuncAttributePatching))
+    {
+        // Experimental: Patch private memory size
+        std::string patchName = "INTEL_PATCH_PRIVATE_MEMORY_SIZE";
+        pSize = m_currShader->GetNewVariable(1, ISA_TYPE_UD, CVariable::getAlignment(getGRFSize()), true, CName(patchName));
+        m_encoder->AddVISASymbol(patchName, pSize);
+    }
+    else
+    {
+        // hard-code per-workitem private-memory size to max size
+        pSize = m_currShader->ImmToVariable(MaxPrivateSize * numLanes(m_currShader->m_dispatchSize), ISA_TYPE_UD);
+    }
+
+    CVariable* pThreadOffset = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, 1, CName::NONE);
+    m_encoder->Mul(pThreadOffset, pHWTID, pSize);
+    m_encoder->Push();
+
+    unsigned totalAllocaSize = 0;
+
+    // reserve space for kernel FP
+    totalAllocaSize += SIZE_OWORD;
+
+    // reserve space for alloca
+    auto funcMDItr = pModuleMetadata->FuncMD.find(pKernel);
+    if (funcMDItr != pModuleMetadata->FuncMD.end())
+    {
+        if (funcMDItr->second.privateMemoryPerWI != 0)
+        {
+            totalAllocaSize += funcMDItr->second.privateMemoryPerWI * numLanes(m_currShader->m_dispatchSize);
+
+            if ((uint32_t)funcMDItr->second.privateMemoryPerWI > MaxPrivateSize)
+            {
+                pCtx->EmitError("Private memory allocation exceeds max allowed size");
+                IGC_ASSERT(0);
+            }
+        }
+    }
+
+    // Set the total alloca size for the entry function
+    m_encoder->SetFunctionAllocaStackSize(pKernel, totalAllocaSize);
+
+    if (!IGC_IS_FLAG_ENABLED(EnableRuntimeFuncAttributePatching))
+    {
+        // If we don't return per-function private memory size,
+        // modify private-memory size to a large setting.
+        // This will be reported through patch-tokens as per-kernel requirement.
+        pModuleMetadata->FuncMD[pKernel].privateMemoryPerWI = MaxPrivateSize;
+    }
+
+    // Initialize SP to per-thread kernel stack base
+    CVariable* pSP = m_currShader->GetSP();
+    emitAddSP(pSP, pStackBufferBase, pThreadOffset);
+
+    // Update FP and SP
+    emitPushToStack(m_currShader->ImmToVariable(totalAllocaSize, ISA_TYPE_UD), true);
+}
+
 /// This function is NOT about the alignment-rule for storing argv into GRF!
 /// It is about the alignment-rule when we pack the arguments into a block for stack-call!
 uint EmitPass::stackCallArgumentAlignment(CVariable* argv)
@@ -9942,35 +10014,41 @@ void EmitPass::emitStackFuncEntry(Function* F)
             }
         }
     }
-    // save SP before allocation
-    m_currShader->SaveSP();
+
+    unsigned totalAllocaSize = 0;
+
+    // reserve space to store caller's FP
+    totalAllocaSize += SIZE_OWORD;
 
     // reserve space for all the alloca in the function subgroup
     auto funcMDItr = m_currShader->m_ModuleMetadata->FuncMD.find(F);
     if (funcMDItr != m_currShader->m_ModuleMetadata->FuncMD.end())
     {
         if (funcMDItr->second.privateMemoryPerWI != 0)
         {
-            CVariable* pSP = m_currShader->GetSP();
-            unsigned totalAllocaSize = funcMDItr->second.privateMemoryPerWI * numLanes(m_currShader->m_dispatchSize);
-            emitAddSP(pSP, pSP, m_currShader->ImmToVariable(totalAllocaSize, ISA_TYPE_UD));
-
-            // Set the per-function private mem size
-            m_encoder->SetFunctionAllocaStackSize(F, totalAllocaSize);
-
+            totalAllocaSize += funcMDItr->second.privateMemoryPerWI * numLanes(m_currShader->m_dispatchSize);
             if ((uint32_t)funcMDItr->second.privateMemoryPerWI > m_currShader->GetMaxPrivateMem())
             {
                 m_currShader->GetContext()->EmitError("Private memory allocation exceeds max allowed size");
                 IGC_ASSERT(0);
             }
         }
     }
+
+    // save FP before allocation
+    m_currShader->SaveStackState();
+
+    // Update SP and FP
+    emitPushToStack(m_currShader->ImmToVariable(totalAllocaSize, ISA_TYPE_UD), false);
+
+    // Set the per-function private mem size
+    m_encoder->SetFunctionAllocaStackSize(F, totalAllocaSize);
 }
 
 void EmitPass::emitStackFuncExit(llvm::ReturnInst* inst)
 {
-    // restore SP
-    m_currShader->RestoreSP();
+    // restore SP and FP
+    m_currShader->RestoreStackState();
 
     llvm::Function* F = inst->getParent()->getParent();
     llvm::Type* RetTy = F->getReturnType();
@@ -15955,6 +16033,35 @@ void EmitPass::emitGenISACopy(GenIntrinsicInst* GenCopyInst)
     emitCopyAll(Dst, Src, Ty);
 }
 
+// Puts FP on stack, update FP to SP, then update SP by pushOffset
+// If isKernel, write special FP value instead to indicate base of the stack
+void EmitPass::emitPushToStack(CVariable* pushOffset, bool isKernel)
+{
+    CVariable* pFP = m_currShader->GetFP();
+    CVariable* pSP = m_currShader->GetSP();
+    if (isKernel)
+    {
+        // Put 0 into FP to indicate kernel stack base
+        m_encoder->Copy(pFP, m_currShader->ImmToVariable(0, ISA_TYPE_UQ));
+        m_encoder->Push();
+    }
+
+    // Store FP value into current SP
+    bool is64BitAddr = (pSP->GetSize() > 4);
+    if (is64BitAddr)
+        m_encoder->OWStoreA64(pFP, pSP, SIZE_OWORD, 0);
+    else
+        m_encoder->OWStore(pFP, ESURFACE_STATELESS, nullptr, pSP, SIZE_OWORD, 0);
+    m_encoder->Push();
+
+    // Set FP = SP
+    m_encoder->Copy(pFP, pSP);
+    m_encoder->Push();
+
+    // Update SP by pushOffset
+    emitAddSP(pSP, pSP, pushOffset);
+}
+
 void EmitPass::emitAddSP(CVariable* Dst, CVariable* Src, CVariable* offset)
 {
     if (m_currShader->m_Platform->hasNoInt64Inst() &&
diff --git a/IGC/Compiler/CISACodeGen/EmitVISAPass.hpp b/IGC/Compiler/CISACodeGen/EmitVISAPass.hpp
@@ -143,6 +143,7 @@ class EmitPass : public llvm::FunctionPass
     void emitStackFuncEntry(llvm::Function* F);
     void emitStackFuncExit(llvm::ReturnInst* inst);
     uint stackCallArgumentAlignment(CVariable* argv);
+    void InitializeKernelStack(llvm::Function* pKernel);
 
     // emits the visa relocation instructions for function/global symbols
     void emitSymbolRelocation(llvm::Function& F);
@@ -419,6 +420,7 @@ class EmitPass : public llvm::FunctionPass
         uint32_t DstSubRegOffset = 0, uint32_t SrcSubRegOffset = 0);
     void emitCopyAll(CVariable* Dst, CVariable* Src, llvm::Type* Ty);
 
+    void emitPushToStack(CVariable* pushOffset, bool isKernel);
     void emitAddSP(CVariable* Dst, CVariable* Src, CVariable* offset);
     // emitAddPair - emulate 64bit addtition by 32-bit operations.
     // Dst and Src0 must be a 64-bit type variable.
diff --git a/IGC/Compiler/CISACodeGen/ShaderCodeGen.hpp b/IGC/Compiler/CISACodeGen/ShaderCodeGen.hpp