@@ -41,6 +41,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
41
41
#include "ShaderCodeGen.hpp"
42
42
#include "common/allocator.h"
43
43
#include "common/debug/Dump.hpp"
44
+ #include "common/debug/Dump.hpp"
44
45
#include "common/igc_regkeys.hpp"
45
46
#include "common/Stats.hpp"
46
47
#include "Compiler/CISACodeGen/helper.h"
@@ -450,10 +451,7 @@ bool EmitPass::runOnFunction(llvm::Function& F)
450
451
if (hasStackCall)
451
452
{
452
453
m_encoder->InitFuncAttribute(&F, true);
453
- CVariable* pStackBase = nullptr;
454
- CVariable* pStackSize = nullptr;
455
- m_currShader->InitKernelStack(pStackBase, pStackSize);
456
- emitAddSP(m_currShader->GetSP(), pStackBase, pStackSize);
454
+ InitializeKernelStack(&F);
457
455
}
458
456
m_currShader->AddPrologue();
459
457
}
@@ -9380,6 +9378,80 @@ void EmitPass::emitReturn(llvm::ReturnInst* inst)
9380
9378
m_currShader->AddEpilogue(inst);
9381
9379
}
9382
9380
9381
+ /// Initializes the kernel for stack call by initializing the SP and FP
9382
+ void EmitPass::InitializeKernelStack(Function* pKernel)
9383
+ {
9384
+ m_currShader->CreateSP();
9385
+ auto pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
9386
+ auto pModuleMetadata = pCtx->getModuleMetaData();
9387
+
9388
+ CVariable* pStackBufferBase = m_currShader->GetPrivateBase();
9389
+
9390
+ CVariable* pHWTID = m_currShader->GetHWTID();
9391
+
9392
+ CVariable* pSize = nullptr;
9393
+
9394
+ // Maximun private size in byte, per-workitem
9395
+ // When there's stack call, we don't know the actual stack size being used,
9396
+ // so set a conservative max stack size.
9397
+ uint32_t MaxPrivateSize = m_currShader->GetMaxPrivateMem();
9398
+ if (IGC_IS_FLAG_ENABLED(EnableRuntimeFuncAttributePatching))
9399
+ {
9400
+ // Experimental: Patch private memory size
9401
+ std::string patchName = "INTEL_PATCH_PRIVATE_MEMORY_SIZE";
9402
+ pSize = m_currShader->GetNewVariable(1, ISA_TYPE_UD, CVariable::getAlignment(getGRFSize()), true, CName(patchName));
9403
+ m_encoder->AddVISASymbol(patchName, pSize);
9404
+ }
9405
+ else
9406
+ {
9407
+ // hard-code per-workitem private-memory size to max size
9408
+ pSize = m_currShader->ImmToVariable(MaxPrivateSize * numLanes(m_currShader->m_dispatchSize), ISA_TYPE_UD);
9409
+ }
9410
+
9411
+ CVariable* pThreadOffset = m_currShader->GetNewVariable(1, ISA_TYPE_UD, EALIGN_DWORD, true, 1, CName::NONE);
9412
+ m_encoder->Mul(pThreadOffset, pHWTID, pSize);
9413
+ m_encoder->Push();
9414
+
9415
+ unsigned totalAllocaSize = 0;
9416
+
9417
+ // reserve space for kernel FP
9418
+ totalAllocaSize += SIZE_OWORD;
9419
+
9420
+ // reserve space for alloca
9421
+ auto funcMDItr = pModuleMetadata->FuncMD.find(pKernel);
9422
+ if (funcMDItr != pModuleMetadata->FuncMD.end())
9423
+ {
9424
+ if (funcMDItr->second.privateMemoryPerWI != 0)
9425
+ {
9426
+ totalAllocaSize += funcMDItr->second.privateMemoryPerWI * numLanes(m_currShader->m_dispatchSize);
9427
+
9428
+ if ((uint32_t)funcMDItr->second.privateMemoryPerWI > MaxPrivateSize)
9429
+ {
9430
+ pCtx->EmitError("Private memory allocation exceeds max allowed size");
9431
+ IGC_ASSERT(0);
9432
+ }
9433
+ }
9434
+ }
9435
+
9436
+ // Set the total alloca size for the entry function
9437
+ m_encoder->SetFunctionAllocaStackSize(pKernel, totalAllocaSize);
9438
+
9439
+ if (!IGC_IS_FLAG_ENABLED(EnableRuntimeFuncAttributePatching))
9440
+ {
9441
+ // If we don't return per-function private memory size,
9442
+ // modify private-memory size to a large setting.
9443
+ // This will be reported through patch-tokens as per-kernel requirement.
9444
+ pModuleMetadata->FuncMD[pKernel].privateMemoryPerWI = MaxPrivateSize;
9445
+ }
9446
+
9447
+ // Initialize SP to per-thread kernel stack base
9448
+ CVariable* pSP = m_currShader->GetSP();
9449
+ emitAddSP(pSP, pStackBufferBase, pThreadOffset);
9450
+
9451
+ // Update FP and SP
9452
+ emitPushToStack(m_currShader->ImmToVariable(totalAllocaSize, ISA_TYPE_UD), true);
9453
+ }
9454
+
9383
9455
/// This function is NOT about the alignment-rule for storing argv into GRF!
9384
9456
/// It is about the alignment-rule when we pack the arguments into a block for stack-call!
9385
9457
uint EmitPass::stackCallArgumentAlignment(CVariable* argv)
@@ -9942,35 +10014,41 @@ void EmitPass::emitStackFuncEntry(Function* F)
9942
10014
}
9943
10015
}
9944
10016
}
9945
- // save SP before allocation
9946
- m_currShader->SaveSP();
10017
+
10018
+ unsigned totalAllocaSize = 0;
10019
+
10020
+ // reserve space to store caller's FP
10021
+ totalAllocaSize += SIZE_OWORD;
9947
10022
9948
10023
// reserve space for all the alloca in the function subgroup
9949
10024
auto funcMDItr = m_currShader->m_ModuleMetadata->FuncMD.find(F);
9950
10025
if (funcMDItr != m_currShader->m_ModuleMetadata->FuncMD.end())
9951
10026
{
9952
10027
if (funcMDItr->second.privateMemoryPerWI != 0)
9953
10028
{
9954
- CVariable* pSP = m_currShader->GetSP();
9955
- unsigned totalAllocaSize = funcMDItr->second.privateMemoryPerWI * numLanes(m_currShader->m_dispatchSize);
9956
- emitAddSP(pSP, pSP, m_currShader->ImmToVariable(totalAllocaSize, ISA_TYPE_UD));
9957
-
9958
- // Set the per-function private mem size
9959
- m_encoder->SetFunctionAllocaStackSize(F, totalAllocaSize);
9960
-
10029
+ totalAllocaSize += funcMDItr->second.privateMemoryPerWI * numLanes(m_currShader->m_dispatchSize);
9961
10030
if ((uint32_t)funcMDItr->second.privateMemoryPerWI > m_currShader->GetMaxPrivateMem())
9962
10031
{
9963
10032
m_currShader->GetContext()->EmitError("Private memory allocation exceeds max allowed size");
9964
10033
IGC_ASSERT(0);
9965
10034
}
9966
10035
}
9967
10036
}
10037
+
10038
+ // save FP before allocation
10039
+ m_currShader->SaveStackState();
10040
+
10041
+ // Update SP and FP
10042
+ emitPushToStack(m_currShader->ImmToVariable(totalAllocaSize, ISA_TYPE_UD), false);
10043
+
10044
+ // Set the per-function private mem size
10045
+ m_encoder->SetFunctionAllocaStackSize(F, totalAllocaSize);
9968
10046
}
9969
10047
9970
10048
void EmitPass::emitStackFuncExit(llvm::ReturnInst* inst)
9971
10049
{
9972
- // restore SP
9973
- m_currShader->RestoreSP ();
10050
+ // restore SP and FP
10051
+ m_currShader->RestoreStackState ();
9974
10052
9975
10053
llvm::Function* F = inst->getParent()->getParent();
9976
10054
llvm::Type* RetTy = F->getReturnType();
@@ -15955,6 +16033,35 @@ void EmitPass::emitGenISACopy(GenIntrinsicInst* GenCopyInst)
15955
16033
emitCopyAll(Dst, Src, Ty);
15956
16034
}
15957
16035
16036
+ // Puts FP on stack, update FP to SP, then update SP by pushOffset
16037
+ // If isKernel, write special FP value instead to indicate base of the stack
16038
+ void EmitPass::emitPushToStack(CVariable* pushOffset, bool isKernel)
16039
+ {
16040
+ CVariable* pFP = m_currShader->GetFP();
16041
+ CVariable* pSP = m_currShader->GetSP();
16042
+ if (isKernel)
16043
+ {
16044
+ // Put 0 into FP to indicate kernel stack base
16045
+ m_encoder->Copy(pFP, m_currShader->ImmToVariable(0, ISA_TYPE_UQ));
16046
+ m_encoder->Push();
16047
+ }
16048
+
16049
+ // Store FP value into current SP
16050
+ bool is64BitAddr = (pSP->GetSize() > 4);
16051
+ if (is64BitAddr)
16052
+ m_encoder->OWStoreA64(pFP, pSP, SIZE_OWORD, 0);
16053
+ else
16054
+ m_encoder->OWStore(pFP, ESURFACE_STATELESS, nullptr, pSP, SIZE_OWORD, 0);
16055
+ m_encoder->Push();
16056
+
16057
+ // Set FP = SP
16058
+ m_encoder->Copy(pFP, pSP);
16059
+ m_encoder->Push();
16060
+
16061
+ // Update SP by pushOffset
16062
+ emitAddSP(pSP, pSP, pushOffset);
16063
+ }
16064
+
15958
16065
void EmitPass::emitAddSP(CVariable* Dst, CVariable* Src, CVariable* offset)
15959
16066
{
15960
16067
if (m_currShader->m_Platform->hasNoInt64Inst() &&
0 commit comments