/*========================== begin_copyright_notice ============================

Copyright (C) 2021 Intel Corporation

SPDX-License-Identifier: MIT

============================= end_copyright_notice ===========================*/

#include "ModuleAllocaAnalysis.hpp"
#include "Compiler/Optimizer/OpenCLPasses/KernelArgs.hpp"
#include "Compiler/MetaDataUtilsWrapper.h"
#include "Compiler/IGCPassSupport.h"
#include "Compiler/CISACodeGen/GenCodeGenModule.h"
#include "llvm/IR/Instructions.h"

#include <algorithm> // for std::sort used below

using namespace llvm;

namespace IGC
{
    ModuleAllocaAnalysis::ModuleAllocaAnalysis() : ModulePass(ID)
    {
        initializeModuleAllocaAnalysisPass(*PassRegistry::getPassRegistry());
    }

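    // Delete the per-function alloca info records owned by InfoMap.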
    ModuleAllocaAnalysis::~ModuleAllocaAnalysis()
    {
        for (auto I = InfoMap.begin(), E = InfoMap.end(); I != E; ++I)
            delete I->second;
    }

    void ModuleAllocaAnalysis::getAnalysisUsage(AnalysisUsage& AU) const
    {
        AU.setPreservesAll();
        AU.addRequired<MetaDataUtilsWrapper>();
        AU.addRequired<CodeGenContextWrapper>();
        AU.addRequired<GenXFunctionGroupAnalysis>();
    }

    StringRef ModuleAllocaAnalysis::getPassName() const
    {
        return "ModuleAllocaAnalysis";
    }

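    // Cache the module and the optional function-group analysis, then compute
    // alloca offsets. This is an analysis pass: it never modifies the IR, so
    // runOnModule always returns false.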
    bool ModuleAllocaAnalysis::runOnModule(Module& mod)
    {
        M = &mod;
        FGA = getAnalysisIfAvailable<GenXFunctionGroupAnalysis>();
        analyze();

        return false;
    }

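    // Decide whether private memory may be placed in scratch space for this
    // module. Any condition that could overflow or misuse scratch space (stack
    // calls, casts to the private address space, queue_t kernel arguments, or
    // per-WI private memory above the per-thread limit) forces stateless
    // private memory instead.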
    bool ModuleAllocaAnalysis::safeToUseScratchSpace() const
    {
        IGC_ASSERT(M);

        // Get the analysis
        IGCMD::MetaDataUtils* pMdUtils = getAnalysis<MetaDataUtilsWrapper>().getMetaDataUtils();
        const DataLayout* DL = &M->getDataLayout();
        ModuleMetaData& modMD = *getAnalysis<MetaDataUtilsWrapper>().getModuleMetaData();
        CodeGenContext& Ctx = *getAnalysis<CodeGenContextWrapper>().getCodeGenContext();

        //
        // Update UseScratchSpacePrivateMemory based on WA and be consistent with
        // the implementation of CEncoder::ByteScatter().
        //
        if (Ctx.m_DriverInfo.NeedWAToTransformA32MessagesToA64()
            && Ctx.platform.getWATable().WaNoA32ByteScatteredStatelessMessages)
        {
            return false;
        }

        //
        // For now, all APIs that use scratch space for private memory must use
        // scratch space, except OpenCL, which may also use non-scratch (stateless)
        // space. For debugging purposes, a registry key is used for OCL to turn
        // ocl-use-scratch on/off.
        //
        bool supportsScratchSpacePrivateMemory = Ctx.m_DriverInfo.supportsScratchSpacePrivateMemory();
        bool supportsStatelessSpacePrivateMemory = Ctx.m_DriverInfo.supportsStatelessSpacePrivateMemory();
        bool bOCLLegacyStatelessCheck = true;

        if (Ctx.allocatePrivateAsGlobalBuffer())
        {
            return false;
        }

        if ((modMD.compOpt.OptDisable && bOCLLegacyStatelessCheck) || !supportsScratchSpacePrivateMemory)
        {
            return false;
        }

        //
        // Do not use scratch space if the module has any stack call.
        //
        if (bOCLLegacyStatelessCheck) {
            if (FGA) {
                if (FGA->getModule() == M) {
                    for (auto& I : *FGA) {
                        if (I->hasStackCall())
                            return false;
                    }
                }
            }
        }

        for (auto& F : *M) {
            if (F.isDeclaration())
                continue;

            // Check each instruction of this function.
            for (auto& BB : F) {
                for (auto& I : BB) {
                    if (AddrSpaceCastInst* CI = dyn_cast<AddrSpaceCastInst>(&I)) {
                        // It is not safe to use scratch space as private memory if the
                        // kernel casts to ADDRESS_SPACE_GLOBAL_OR_PRIVATE or
                        // ADDRESS_SPACE_PRIVATE. See the special hack CI code generated
                        // at ProgramScopeConstantResolution.
                        const ADDRESS_SPACE targetAS = (ADDRESS_SPACE)(cast<PointerType>(CI->getType()))->getAddressSpace();
                        if (targetAS == ADDRESS_SPACE_GLOBAL_OR_PRIVATE || targetAS == ADDRESS_SPACE_PRIVATE) {
                            return false;
                        }
                    }
                }
            }

            if (!isEntryFunc(pMdUtils, &F))
                continue;

            //
            // OCL kernel arguments with types like queue_t and struct are expressed
            // as pointer types. Since there is no explicit AS associated with those
            // pointers, e.g., %opencl.queue_t*, to have both host and device use the
            // same pointer size for those arguments, it is better to disable the use
            // of scratch memory.
            //
            // TODO: fix those types (they should be in the global address space).
            if (Ctx.type == ShaderType::OPENCL_SHADER && IGC_IS_FLAG_ENABLED(ForceStatelessForQueueT)) {
                if (!F.arg_empty()) {
                    KernelArgs kernelArgs(F, DL, pMdUtils, &modMD, Ctx.platform.getGRFSize());
                    for (auto arg : kernelArgs) {
                        const KernelArg::ArgType argTy = arg.getArgType();
                        if (argTy == KernelArg::ArgType::PTR_DEVICE_QUEUE)
                        {
                            return false;
                        }
                    }
                }
            }

            //
            // Each thread has up to 2 MB of scratch space to use. That is, each WI
            // has up to (2*1024*1024 / 8) bytes of scratch space in SIMD8 mode.
            //
            auto funcInfoMD = pMdUtils->getFunctionsInfoItem(&F);
            bool isGeometryStageShader = Ctx.type == ShaderType::VERTEX_SHADER ||
                Ctx.type == ShaderType::HULL_SHADER ||
                Ctx.type == ShaderType::DOMAIN_SHADER ||
                Ctx.type == ShaderType::GEOMETRY_SHADER;

            // Start with simd16, which allows the medium amount of space per WI
            // (simd8: largest, simd32: smallest). In doing so, there will be
            // some space left for spilling in simd8 if spilling happens.
            int32_t simd_size = isGeometryStageShader ? numLanes(Ctx.platform.getMinDispatchMode()) :
                numLanes(SIMDMode::SIMD16);
            const int32_t subGrpSize = funcInfoMD->getSubGroupSize()->getSIMD_size();
            if (subGrpSize > simd_size)
                simd_size = std::min(subGrpSize, static_cast<int32_t>(numLanes(SIMDMode::SIMD32)));
            int32_t groupSize = IGCMD::IGCMetaDataHelper::getThreadGroupSize(*pMdUtils, &F);
            if (groupSize == 0)
                groupSize = IGCMD::IGCMetaDataHelper::getThreadGroupSizeHint(*pMdUtils, &F);
            if (groupSize > simd_size)
                simd_size = std::min(groupSize, static_cast<int32_t>(numLanes(SIMDMode::SIMD32)));

            unsigned maxScratchSpaceBytes = Ctx.platform.maxPerThreadScratchSpace();
            unsigned scratchSpaceLimitPerWI = maxScratchSpaceBytes / simd_size;
            //
            // If a spill happens, since the offset field of the scratch block
            // read/write send message is only 12 bits, an assertion will be
            // triggered if the used scratch space size is >= 128 KB, where
            // 128 KB = 2^12 * 256 bits.
            //
            const unsigned int totalPrivateMemPerWI = getTotalPrivateMemPerWI(&F);

            if (totalPrivateMemPerWI > scratchSpaceLimitPerWI) {
                return false;
            }
        }

        // It is safe to use scratch space for private memory.
        return true;
    }

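    // Return the byte offset assigned to a constant-size alloca within its
    // function's private-memory block.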
    unsigned ModuleAllocaAnalysis::getConstBufferOffset(AllocaInst* AI) const {
        IGC_ASSERT(isa<ConstantInt>(AI->getArraySize()));
        Function* F = AI->getParent()->getParent();
        return getFuncAllocaInfo(F)->AllocaDesc[AI].first;
    }

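    // Return the allocated size in bytes recorded for a constant-size alloca.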
    unsigned ModuleAllocaAnalysis::getConstBufferSize(AllocaInst* AI) const {
        IGC_ASSERT(isa<ConstantInt>(AI->getArraySize()));
        Function* F = AI->getParent()->getParent();
        return getFuncAllocaInfo(F)->AllocaDesc[AI].second;
    }

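    // Return the allocas recorded for F. F must already have an info entry.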
    SmallVector<AllocaInst*, 8>& ModuleAllocaAnalysis::getAllocaInsts(Function* F) const {
        return getFuncAllocaInfo(F)->Allocas;
    }

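    // Return the total private-memory size per work-item computed for F, or 0
    // when F was never analyzed.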
    unsigned ModuleAllocaAnalysis::getTotalPrivateMemPerWI(Function* F) const {
        auto FI = getFuncAllocaInfo(F);
        return FI ? FI->TotalSize : 0;
    }

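    // Look up the alloca info for F; return nullptr if F has no entry.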
    ModuleAllocaAnalysis::FunctionAllocaInfo* ModuleAllocaAnalysis::getFuncAllocaInfo(Function* F) const {
        auto Iter = InfoMap.find(F);
        if (Iter != InfoMap.end())
            return Iter->second;
        return nullptr;
    }

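    // Return the existing alloca info for F, creating an empty entry on first use.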
    ModuleAllocaAnalysis::FunctionAllocaInfo* ModuleAllocaAnalysis::getOrCreateFuncAllocaInfo(Function* F) {
        auto Iter = InfoMap.find(F);
        if (Iter != InfoMap.end())
            return Iter->second;

        auto AllocaInfo = new FunctionAllocaInfo;
        InfoMap[F] = AllocaInfo;
        return AllocaInfo;
    }

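    // Entry point of the analysis. With function-group information available,
    // private memory is sized per sub-group so that all functions in a
    // sub-group share one total size; otherwise each non-empty function is
    // laid out independently.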
    void ModuleAllocaAnalysis::analyze() {
        if (FGA && FGA->getModule()) {
            IGC_ASSERT(FGA->getModule() == M);
            for (auto FG : *FGA)
                analyze(FG);
        }
        else {
            for (auto& F : M->getFunctionList()) {
                if (F.empty())
                    continue;

                unsigned Offset = 0;
                unsigned Alignment = 0;
                analyze(&F, Offset, Alignment);
                if (Alignment > 0)
                    Offset = iSTD::Align(Offset, Alignment);
                getOrCreateFuncAllocaInfo(&F)->TotalSize = Offset;
            }
        }
    }

    void ModuleAllocaAnalysis::analyze(IGC::FunctionGroup* FG)
    {
        // Calculate the size of private memory we need to allocate for
        // every function sub-group. Each sub-group is led by a kernel or
        // a stack-call function.
        // Note that the function order does affect the final total amount of
        // private memory due to possible alignment constraints.
        //
        for (auto SubG : FG->Functions) {
            unsigned Offset = 0;
            unsigned Alignment = 0;
            for (Function* F : *SubG) {
                if (F->empty())
                    continue;
                analyze(F, Offset, Alignment);
            }

            // Use the final offset as the total size.
            if (Alignment > 0)
                Offset = iSTD::Align(Offset, Alignment);

            // All functions in this group will get the same final size.
            for (Function* F : *SubG) {
                if (F->empty())
                    continue;
                getOrCreateFuncAllocaInfo(F)->TotalSize = Offset;
            }
        }
    }

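    // Lay out the allocas of F at increasing offsets starting from the incoming
    // Offset, and raise MaxAlignment to the largest alignment seen. Offset is
    // carried across calls so several functions can be packed into one
    // contiguous private-memory block.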
    void ModuleAllocaAnalysis::analyze(Function* F, unsigned& Offset, unsigned& MaxAlignment)
    {
        const DataLayout* DL = &M->getDataLayout();

        // Create alloca info even when there is no alloca, so that each function gets
        // an info entry.
        FunctionAllocaInfo* AllocaInfo = getOrCreateFuncAllocaInfo(F);

        // Collect allocas.
        SmallVector<AllocaInst*, 8> Allocas;
        for (auto& BB : F->getBasicBlockList()) {
            for (auto& Inst : BB.getInstList()) {
                if (AllocaInst* AI = dyn_cast<AllocaInst>(&Inst)) {
                    Allocas.push_back(AI);
                }
            }
        }

        if (Allocas.empty())
            return;

        // Group by alignment, smallest first.
        auto getAlignment = [=](AllocaInst* AI) -> unsigned {
            unsigned Alignment = AI->getAlignment();
            if (Alignment == 0)
                Alignment = DL->getABITypeAlignment(AI->getAllocatedType());
            return Alignment;
        };

        std::sort(Allocas.begin(), Allocas.end(),
            [=](AllocaInst* AI1, AllocaInst* AI2) {
                return getAlignment(AI1) < getAlignment(AI2);
            });

        for (auto AI : Allocas) {
            // Align the alloca offset.
            unsigned Alignment = getAlignment(AI);
            Offset = iSTD::Align(Offset, Alignment);

            // Keep track of the maximal alignment seen so far.
            if (Alignment > MaxAlignment)
                MaxAlignment = Alignment;

            // Compute the alloca size. We don't know the size of a
            // variable-length alloca, so skip it.
            if (!isa<ConstantInt>(AI->getArraySize())) {
                continue;
            }
            ConstantInt* const SizeVal = cast<ConstantInt>(AI->getArraySize());
            IGC_ASSERT(nullptr != SizeVal);
            unsigned CurSize = (unsigned)(SizeVal->getZExtValue() *
                DL->getTypeAllocSize(AI->getAllocatedType()));
            AllocaInfo->setAllocaDesc(AI, Offset, CurSize);

            // Increment the current offset for the next alloca.
            Offset += CurSize;
        }

        // Move the collected allocas into the function alloca info object.
        AllocaInfo->Allocas.swap(Allocas);
    }

    char ModuleAllocaAnalysis::ID = 0;

    // Register pass to igc-opt
#define PASS_FLAG "igc-module-alloca-info"
#define PASS_DESCRIPTION "Analyse memory usage based on alloca instructions"
#define PASS_CFG_ONLY true
#define PASS_ANALYSIS true
    IGC_INITIALIZE_PASS_BEGIN(ModuleAllocaAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
    IGC_INITIALIZE_PASS_DEPENDENCY(MetaDataUtilsWrapper)
    IGC_INITIALIZE_PASS_DEPENDENCY(GenXFunctionGroupAnalysis)
    IGC_INITIALIZE_PASS_END(ModuleAllocaAnalysis, PASS_FLAG, PASS_DESCRIPTION, PASS_CFG_ONLY, PASS_ANALYSIS)
} // namespace IGC