
Commit 3b10634

mgrzywacigcbot authored and committed

Stateless scratch space for OGL

Implement logic to detect whether a stateless scratch space pointer is needed (per-thread scratch space size > 256 KB), and set the corresponding information in the compiler output and payload.

1 parent 3872664 commit 3b10634
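
The core of the change is a size check: if the per-thread scratch requirement exceeds what the stateful scratch path can address (256 KB per thread, per the commit message), the compiler must fall back to a stateless scratch pointer and report that in its output. A minimal standalone sketch of that decision follows; the constant and function names (kMaxStatefulScratchPerThread, needsStatelessScratchPointer) are hypothetical stand-ins for the actual IGC interfaces such as platform.maxPerThreadScratchSpace().

    #include <cstdint>
    #include <iostream>

    // Hypothetical constant standing in for platform.maxPerThreadScratchSpace():
    // the largest per-thread scratch size reachable through the stateful path.
    constexpr uint32_t kMaxStatefulScratchPerThread = 256 * 1024; // 256 KB

    // Sketch: decide whether the payload needs a stateless scratch base pointer.
    bool needsStatelessScratchPointer(uint32_t perThreadScratchBytes,
                                      bool driverSupportsStatelessPrivateMemory)
    {
        return driverSupportsStatelessPrivateMemory &&
               perThreadScratchBytes > kMaxStatefulScratchPerThread;
    }

    int main()
    {
        // 300 KB of per-thread scratch exceeds the stateful limit, so the
        // stateless pointer would be requested in the compiler output.
        std::cout << std::boolalpha
                  << needsStatelessScratchPointer(300 * 1024, true) << "\n"  // true
                  << needsStatelessScratchPointer(128 * 1024, true) << "\n"; // false
    }
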

File tree

8 files changed: +479 −381 lines changed


IGC/Compiler/CISACodeGen/CShader.cpp

Lines changed: 3 additions & 1 deletion

@@ -3458,7 +3458,9 @@ void CShader::PackAndCopyVariable(
 
 bool CShader::CompileSIMDSizeInCommon(SIMDMode simdMode)
 {
-    bool ret = (m_ScratchSpaceSize <= m_ctx->platform.maxPerThreadScratchSpace());
+    bool ret = ((m_ScratchSpaceSize <= m_ctx->platform.maxPerThreadScratchSpace()) ||
+        m_ctx->m_DriverInfo.supportsStatelessSpacePrivateMemory());
+
     m_simdProgram.setScratchSpaceUsedByShader(m_ScratchSpaceSize);
     if (m_ctx->platform.hasScratchSurface() && m_ctx->m_DriverInfo.supportsSeparatingSpillAndPrivateScratchMemorySpace()) {
         ret = (m_simdProgram.getScratchSpaceUsageInSlot0() <= m_ctx->platform.maxPerThreadScratchSpace());

IGC/Compiler/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -66,6 +66,7 @@ set(IGC_BUILD__SRC__Compiler
     "${CMAKE_CURRENT_SOURCE_DIR}/SampleMultiversioning.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/HandleFRemInstructions.cpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/GenRotate.cpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/ModuleAllocaAnalysis.cpp"
     "${IGC_BUILD__GFX_DEV_SRC_DIR}/skuwa/ibdw_wa.c"
     "${IGC_BUILD__GFX_DEV_SRC_DIR}/skuwa/ichv_wa.c"
     "${IGC_BUILD__GFX_DEV_SRC_DIR}/skuwa/ibxt_wa.c"
@@ -146,6 +147,7 @@ set(IGC_BUILD__HDR__Compiler
     "${CMAKE_CURRENT_SOURCE_DIR}/SampleMultiversioning.hpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/HandleFRemInstructions.hpp"
     "${CMAKE_CURRENT_SOURCE_DIR}/GenRotate.hpp"
+    "${CMAKE_CURRENT_SOURCE_DIR}/ModuleAllocaAnalysis.hpp"
     ${IGC_BUILD__HDR__Compiler_CISACodeGen}
     ${IGC_BUILD__HDR__Compiler_DebugInfo}
     ${IGC_BUILD__HDR__Compiler_Legalizer}

IGC/Compiler/ModuleAllocaAnalysis.cpp

Lines changed: 352 additions & 0 deletions (new file)

@@ -0,0 +1,352 @@
/*========================== begin_copyright_notice ============================

Copyright (C) 2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "ModuleAllocaAnalysis.hpp"
#include "Compiler/Optimizer/OpenCLPasses/KernelArgs.hpp"
#include "Compiler/MetaDataUtilsWrapper.h"
#include "Compiler/IGCPassSupport.h"
#include "Compiler/CISACodeGen/GenCodeGenModule.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

namespace IGC
{
    ModuleAllocaAnalysis::ModuleAllocaAnalysis() : ModulePass(ID)
    {
        initializeModuleAllocaAnalysisPass(*PassRegistry::getPassRegistry());
    }

    ModuleAllocaAnalysis::~ModuleAllocaAnalysis()
    {
        for (auto I = InfoMap.begin(), E = InfoMap.end(); I != E; ++I)
            delete I->second;
    }

    void ModuleAllocaAnalysis::getAnalysisUsage(AnalysisUsage& AU) const
    {
        AU.setPreservesAll();
        AU.addRequired<MetaDataUtilsWrapper>();
        AU.addRequired<CodeGenContextWrapper>();
        AU.addRequired<GenXFunctionGroupAnalysis>();
    }

    StringRef ModuleAllocaAnalysis::getPassName() const
    {
        return "ModuleAllocaAnalysis";
    }

    bool ModuleAllocaAnalysis::runOnModule(Module& mod)
    {
        M = &mod;
        FGA = getAnalysisIfAvailable<GenXFunctionGroupAnalysis>();
        analyze();

        return false;
    }

    bool ModuleAllocaAnalysis::safeToUseScratchSpace() const
    {
        IGC_ASSERT(M);

        // Get the analysis
        IGCMD::MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
        const DataLayout* DL = &M->getDataLayout();
        ModuleMetaData& modMD = *getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
        CodeGenContext& Ctx = *getAnalysis<CodeGenContextWrapper>().getCodeGenContext();

        //
        // Update UseScratchSpacePrivateMemory based on WA and be consistent with
        // the implementation of CEncoder::ByteScatter().
        //
        if (Ctx.m_DriverInfo.NeedWAToTransformA32MessagesToA64()
            && Ctx.platform.getWATable().WaNoA32ByteScatteredStatelessMessages)
        {
            return false;
        }

        //
        // For now, all APIs that use scratch space for private memory must use scratch
        // memory except OpenCL, which can also use non-scratch space. For debugging
        // purposes, a registry key is used for OCL to turn ocl-use-scratch on/off.
        //
        bool supportsScratchSpacePrivateMemory = Ctx.m_DriverInfo.supportsScratchSpacePrivateMemory();
        bool supportsStatelessSpacePrivateMemory = Ctx.m_DriverInfo.supportsStatelessSpacePrivateMemory();
        bool bOCLLegacyStatelessCheck = true;

        if (Ctx.allocatePrivateAsGlobalBuffer())
        {
            return false;
        }

        if ((modMD.compOpt.OptDisable && bOCLLegacyStatelessCheck) || !supportsScratchSpacePrivateMemory)
        {
            return false;
        }

        //
        // Do not use scratch space if the module has any stack call.
        //
        if (bOCLLegacyStatelessCheck) {
            if (FGA) {
                if (FGA->getModule() == M) {
                    for (auto& I : *FGA) {
                        if (I->hasStackCall())
                            return false;
                    }
                }
            }
        }

        for (auto& F : *M) {
            if (F.isDeclaration())
                continue;

            // Check each instruction of this function.
            for (auto& BB : F) {
                for (auto& I : BB) {
                    if (AddrSpaceCastInst* CI = dyn_cast<AddrSpaceCastInst>(&I)) {
                        // It is not safe to use scratch space as private memory if the kernel does
                        // AS casting to ADDRESS_SPACE_GLOBAL_OR_PRIVATE or ADDRESS_SPACE_PRIVATE.
                        // See the special hack CI code generated at ProgramScopeConstantResolution.
                        const ADDRESS_SPACE targetAS = (ADDRESS_SPACE)(cast<PointerType>(CI->getType()))->getAddressSpace();
                        if (targetAS == ADDRESS_SPACE_GLOBAL_OR_PRIVATE || targetAS == ADDRESS_SPACE_PRIVATE) {
                            return false;
                        }
                    }
                }
            }

            if (!isEntryFunc(pMdUtils, &F))
                continue;

            //
            // OCL kernel arguments with types like queue_t and struct are expressed as
            // pointer types. Since there is no explicit AS associated with those pointers,
            // e.g., %opencl.queue_t*, to have both host and device use the same pointer
            // size for those arguments, it is better to disable the use of scratch memory.
            //
            // TODO: fix those types (they should be in the global address space).
            if (Ctx.type == ShaderType::OPENCL_SHADER && IGC_IS_FLAG_ENABLED(ForceStatelessForQueueT)) {
                if (!F.arg_empty()) {
                    KernelArgs kernelArgs(F, DL, pMdUtils, &modMD, Ctx.platform.getGRFSize());
                    for (auto arg : kernelArgs) {
                        const KernelArg::ArgType argTy = arg.getArgType();
                        if (argTy == KernelArg::ArgType::PTR_DEVICE_QUEUE)
                        {
                            return false;
                        }
                    }
                }
            }

            //
            // Each thread has up to 2 MB of scratch space to use. That is, each WI
            // has up to (2*1024*1024 / 8) bytes of scratch space in SIMD8 mode.
            //
            auto funcInfoMD = pMdUtils->getFunctionsInfoItem(&F);
            bool isGeometryStageShader = Ctx.type == ShaderType::VERTEX_SHADER ||
                Ctx.type == ShaderType::HULL_SHADER ||
                Ctx.type == ShaderType::DOMAIN_SHADER ||
                Ctx.type == ShaderType::GEOMETRY_SHADER;

            // Start with simd16, which allows the medium size of space per WI
            // (simd8: largest, simd32: smallest). In doing so, there will be
            // some space left for spilling in simd8 if spilling happens.
            int32_t simd_size = isGeometryStageShader ? numLanes(Ctx.platform.getMinDispatchMode()) :
                numLanes(SIMDMode::SIMD16);
            const int32_t subGrpSize = funcInfoMD->getSubGroupSize()->getSIMD_size();
            if (subGrpSize > simd_size)
                simd_size = std::min(subGrpSize, static_cast<int32_t>(numLanes(SIMDMode::SIMD32)));
            int32_t groupSize = IGCMD::IGCMetaDataHelper::getThreadGroupSize(*pMdUtils, &F);
            if (groupSize == 0)
                groupSize = IGCMD::IGCMetaDataHelper::getThreadGroupSizeHint(*pMdUtils, &F);
            if (groupSize > simd_size)
                simd_size = std::min(groupSize, static_cast<int32_t>(numLanes(SIMDMode::SIMD32)));

            unsigned maxScratchSpaceBytes = Ctx.platform.maxPerThreadScratchSpace();
            unsigned scratchSpaceLimitPerWI = maxScratchSpaceBytes / simd_size;
            //
            // If a spill happens, since the offset of the scratch block rw send message
            // has only 12 bits, an assertion will be triggered if the used scratch space
            // size >= 128 KB, here 128 KB = 2^12 * 256b.
            //
            const unsigned int totalPrivateMemPerWI = getTotalPrivateMemPerWI(&F);

            if (totalPrivateMemPerWI > scratchSpaceLimitPerWI) {
                return false;
            }
        }

        // It is safe to use scratch space for private memory.
        return true;
    }

    unsigned ModuleAllocaAnalysis::getConstBufferOffset(AllocaInst* AI) const {
        IGC_ASSERT(isa<ConstantInt>(AI->getArraySize()));
        Function* F = AI->getParent()->getParent();
        return getFuncAllocaInfo(F)->AllocaDesc[AI].first;
    }

    unsigned ModuleAllocaAnalysis::getConstBufferSize(AllocaInst* AI) const {
        IGC_ASSERT(isa<ConstantInt>(AI->getArraySize()));
        Function* F = AI->getParent()->getParent();
        return getFuncAllocaInfo(F)->AllocaDesc[AI].second;
    }

    SmallVector<AllocaInst*, 8>& ModuleAllocaAnalysis::getAllocaInsts(Function* F) const {
        return getFuncAllocaInfo(F)->Allocas;
    }

    unsigned ModuleAllocaAnalysis::getTotalPrivateMemPerWI(Function* F) const {
        auto FI = getFuncAllocaInfo(F);
        return FI ? FI->TotalSize : 0;
    }

    ModuleAllocaAnalysis::FunctionAllocaInfo* ModuleAllocaAnalysis::getFuncAllocaInfo(Function* F) const {
        auto Iter = InfoMap.find(F);
        if (Iter != InfoMap.end())
            return Iter->second;
        return nullptr;
    }

    ModuleAllocaAnalysis::FunctionAllocaInfo* ModuleAllocaAnalysis::getOrCreateFuncAllocaInfo(Function* F) {
        auto Iter = InfoMap.find(F);
        if (Iter != InfoMap.end())
            return Iter->second;

        auto AllocaInfo = new FunctionAllocaInfo;
        InfoMap[F] = AllocaInfo;
        return AllocaInfo;
    }

    void ModuleAllocaAnalysis::analyze() {
        if (FGA && FGA->getModule()) {
            IGC_ASSERT(FGA->getModule() == M);
            for (auto FG : *FGA)
                analyze(FG);
        }
        else {
            for (auto& F : M->getFunctionList()) {
                if (F.empty())
                    continue;

                unsigned Offset = 0;
                unsigned Alignment = 0;
                analyze(&F, Offset, Alignment);
                if (Alignment > 0)
                    Offset = iSTD::Align(Offset, Alignment);
                getOrCreateFuncAllocaInfo(&F)->TotalSize = Offset;
            }
        }
    }

    void ModuleAllocaAnalysis::analyze(IGC::FunctionGroup* FG)
    {
        // Calculate the size of private memory we need to allocate to
        // every function sub-group. Each sub-group is led by a kernel or
        // a stack-call function.
        // Note that the function order does affect the final total amount of
        // private memory due to possible alignment constraints.
        //
        for (auto SubG : FG->Functions) {
            unsigned Offset = 0;
            unsigned Alignment = 0;
            for (Function* F : *SubG) {
                if (F->empty())
                    continue;
                analyze(F, Offset, Alignment);
            }

            // Use the final offset as the total size.
            if (Alignment > 0)
                Offset = iSTD::Align(Offset, Alignment);

            // All functions in this group will get the same final size.
            for (Function* F : *SubG) {
                if (F->empty())
                    continue;
                getOrCreateFuncAllocaInfo(F)->TotalSize = Offset;
            }
        }
    }

    void ModuleAllocaAnalysis::analyze(Function* F, unsigned& Offset, unsigned& MaxAlignment)
    {
        const DataLayout* DL = &M->getDataLayout();

        // Create alloca info even when there is no alloca, so that each function gets
        // an info entry.
        FunctionAllocaInfo* AllocaInfo = getOrCreateFuncAllocaInfo(F);

        // Collect allocas.
        SmallVector<AllocaInst*, 8> Allocas;
        for (auto& BB : F->getBasicBlockList()) {
            for (auto& Inst : BB.getInstList()) {
                if (AllocaInst* AI = dyn_cast<AllocaInst>(&Inst)) {
                    Allocas.push_back(AI);
                }
            }
        }

        if (Allocas.empty())
            return;

        // Group by alignment, smallest first.
        auto getAlignment = [=](AllocaInst* AI) -> unsigned {
            unsigned Alignment = AI->getAlignment();
            if (Alignment == 0)
                Alignment = DL->getABITypeAlignment(AI->getAllocatedType());
            return Alignment;
        };

        std::sort(Allocas.begin(), Allocas.end(),
            [=](AllocaInst* AI1, AllocaInst* AI2) {
                return getAlignment(AI1) < getAlignment(AI2);
            });

        for (auto AI : Allocas) {
            // Align alloca offset.
            unsigned Alignment = getAlignment(AI);
            Offset = iSTD::Align(Offset, Alignment);

            // Keep track of the maximal alignment seen so far.
            if (Alignment > MaxAlignment)
                MaxAlignment = Alignment;

            // Compute alloca size. We don't know the variable-length
            // alloca size, so skip it.
            if (!isa<ConstantInt>(AI->getArraySize())) {
                continue;
            }
            ConstantInt* const SizeVal = cast<ConstantInt>(AI->getArraySize());
            IGC_ASSERT(nullptr != SizeVal);
            unsigned CurSize = (unsigned)(SizeVal->getZExtValue() *
                DL->getTypeAllocSize(AI->getAllocatedType()));
            AllocaInfo->setAllocaDesc(AI, Offset, CurSize);

            // Increment the current offset for the next alloca.
            Offset += CurSize;
        }

        // Update collected allocas into the function alloca info object.
        AllocaInfo->Allocas.swap(Allocas);
    }

    char ModuleAllocaAnalysis::ID = 0;

    // Register pass to igc-opt
    #define PASS_FLAG "igc-module-alloca-info"
    #define PASS_DESCRIPTION "Analyse memory usage based on alloca instructions"
    #define PASS_CFG_ONLY true
    #define PASS_ANALYSIS true
    IGC_INITIALIZE_PASS_BEGIN(ModuleAllocaAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
    IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
    IGC_INITIALIZE_PASS_DEPENDENCY(GenXFunctionGroupAnalysis)
    IGC_INITIALIZE_PASS_END(ModuleAllocaAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
} // namespace IGC
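
Because the new pass is registered as an analysis (PASS_ANALYSIS is true, flag "igc-module-alloca-info" under igc-opt), other codegen passes can declare a dependency on it and read per-work-item private memory sizes instead of recomputing them. A minimal consumer sketch follows; only ModuleAllocaAnalysis and its accessor getTotalPrivateMemPerWI come from this commit, while the consumer pass name (PrivateMemBudgetCheck) and the surrounding plumbing are illustrative assumptions.

    // Hypothetical consumer of the new analysis, using the LLVM legacy pass manager.
    #include "ModuleAllocaAnalysis.hpp"
    #include "llvm/Pass.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    namespace {
        class PrivateMemBudgetCheck : public ModulePass {
        public:
            static char ID;
            PrivateMemBudgetCheck() : ModulePass(ID) {}

            void getAnalysisUsage(AnalysisUsage& AU) const override {
                AU.setPreservesAll();
                // Request the alloca analysis so its results are available here.
                AU.addRequired<IGC::ModuleAllocaAnalysis>();
            }

            bool runOnModule(Module& M) override {
                auto& MAA = getAnalysis<IGC::ModuleAllocaAnalysis>();
                for (Function& F : M) {
                    if (F.isDeclaration())
                        continue;
                    // Per-work-item private memory as computed by the analysis;
                    // a caller could compare this against a scratch-space budget
                    // before choosing the scratch addressing mode.
                    unsigned bytesPerWI = MAA.getTotalPrivateMemPerWI(&F);
                    (void)bytesPerWI;
                }
                return false; // read-only: no IR change
            }
        };
    }

    char PrivateMemBudgetCheck::ID = 0;
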
