@@ -16,8 +16,27 @@ SPDX-License-Identifier: MIT
16
16
// / VISA regalloc makes sure that dests & sources of the generated instructions
17
17
// / are allocated to the proper predefined VISA regs
18
18
// /
19
+ // / Let's say that before the transformation we have a call with an argument loaded from a surface:
20
+ // / %arg = call <8 x float> @llvm.genx.oword.ld.v8f32(i32 0, i32 1, i32 %addr)
21
+ // / %ret = call spir_func <8 x float> @foo(<8 x float> %arg)
22
+ // /
23
+ // / Then, after the transformation, we will see something like this:
24
+ // / %arg = call <8 x float> @llvm.genx.oword.ld.v8f32(i32 0, i32 1, i32 %addr)
25
+ // / %R8 = call <256 x float> read.predef.reg(i32 PREDEFINED_ARG, undef)
26
+ // / %NEWR8 = <256 x float> wrregion(<256 x float> %R8, <8 x float> %arg,
27
+ // / i32 0, i32 8, i32 1, OFFSET, ...)
28
+ // / ; Here OFFSET starts from 0 and is the argument's offset in the predefined register
29
+ // / %newarg = call <8 x float> write.predef.reg(i32 PREDEFINED_ARG,
30
+ // / <256 x float> %NEWR8)
31
+ // / %ret = call spir_func <8 x float> @foo(<8 x float> %newarg)
32
+ // /
33
+ // / So the stack layout is made explicit relatively early, which allows 64-bit
34
+ // / splitting to be applied in later stages if 64-bit pointers are used as SP/FP
35
+ // /
19
36
// ===----------------------------------------------------------------------===//
20
37
38
+ #define DEBUG_TYPE " GENX_PROLOGUE"
39
+
21
40
#include " GenX.h"
22
41
#include " GenXIntrinsics.h"
23
42
#include " GenXModule.h"
@@ -42,6 +61,7 @@ SPDX-License-Identifier: MIT
42
61
#include " llvm/IR/InstVisitor.h"
43
62
#include " llvm/IR/Module.h"
44
63
#include " llvm/Pass.h"
64
+ #include " llvm/Support/Debug.h"
45
65
#include " llvm/Support/MathExtras.h"
46
66
47
67
#include " Probe/Assertion.h"
@@ -111,7 +131,13 @@ class GenXPrologEpilogInsertion
111
131
void generateKernelProlog (Function &F);
112
132
void generateFunctionProlog (Function &F);
113
133
void generateFunctionEpilog (Function &F, ReturnInst &I);
134
+
135
+ // caller side argument layout
114
136
void generateStackCall (CallInst *CI);
137
+
138
+ // generateStackCall subroutine
139
+ unsigned writeArgs (CallInst *CI, Value *SpArgs, IRBuilder<> &IRB);
140
+
115
141
void generateAlloca (CallInst *CI);
116
142
117
143
Value *push (Value *V, IRBuilder<> &IRB, Value *InitSP);
@@ -216,12 +242,16 @@ bool GenXPrologEpilogInsertion::runOnFunction(Function &F) {
216
242
.getGenXSubtarget ();
217
243
ArgRegSize = visa::ArgRegSizeInGRFs * ST->getGRFWidth ();
218
244
RetRegSize = visa::RetRegSizeInGRFs * ST->getGRFWidth ();
219
- if (!(BEConf->useNewStackBuilder () && ST->isOCLRuntime ()))
245
+ if (!(BEConf->useNewStackBuilder () && ST->isOCLRuntime ())) {
246
+ LLVM_DEBUG (dbgs () << " Old builder or CMRT used in " << F.getName () << " \n " );
220
247
return false ;
248
+ }
221
249
NumCalls = CallsCalculator ().getNumCalls (F);
222
250
UseGlobalMem =
223
251
F.getParent ()->getModuleFlag (ModuleMD::UseSVMStack) != nullptr ;
252
+ LLVM_DEBUG (dbgs () << " Visiting all calls in " << F.getName () << " \n " );
224
253
visit (F);
254
+ LLVM_DEBUG (dbgs () << " Visiting finished\n " );
225
255
if (genx::isKernel (&F)) {
226
256
generateKernelProlog (F);
227
257
// no epilog is required for kernels
@@ -260,6 +290,8 @@ void GenXPrologEpilogInsertion::visitReturnInst(ReturnInst &I) {
260
290
261
291
// FE_SP = PrivateBase + HWTID * PrivMemPerThread
262
292
void GenXPrologEpilogInsertion::generateKernelProlog (Function &F) {
293
+ LLVM_DEBUG (dbgs () << " Generating kernel prologue for " << F.getName ()
294
+ << " \n " );
263
295
IRBuilder<> IRB (&F.getEntryBlock ().front ());
264
296
Function *HWID = GenXIntrinsic::getGenXDeclaration (
265
297
F.getParent (), llvm::GenXIntrinsic::genx_get_hwid, {});
@@ -289,6 +321,8 @@ void GenXPrologEpilogInsertion::generateKernelProlog(Function &F) {
289
321
// else:
290
322
// read from stackmem
291
323
void GenXPrologEpilogInsertion::generateFunctionProlog (Function &F) {
324
+ LLVM_DEBUG (dbgs () << " Generating function prologue for " << F.getName ()
325
+ << " \n " );
292
326
IRBuilder<> IRB (&F.getEntryBlock ().front ());
293
327
unsigned Offset = 0 ;
294
328
Value *Sp = buildReadPredefReg (PreDefined_Vars::PREDEFINED_FE_SP, IRB,
@@ -346,6 +380,8 @@ void GenXPrologEpilogInsertion::generateFunctionProlog(Function &F) {
346
380
// write to stackmem
347
381
void GenXPrologEpilogInsertion::generateFunctionEpilog (Function &F,
348
382
ReturnInst &I) {
383
+ LLVM_DEBUG (dbgs () << " Generating function epilogue for " << F.getName ()
384
+ << " \n " );
349
385
IRBuilder<> IRB (&I);
350
386
unsigned RetSize = 0 ;
351
387
if (!F.getReturnType ()->isVoidTy ()) {
@@ -423,14 +459,20 @@ void GenXPrologEpilogInsertion::generateFunctionEpilog(Function &F,
423
459
divideCeil (RetSize, ST->getGRFWidth ())))));
424
460
}
425
461
426
- void GenXPrologEpilogInsertion::generateStackCall (CallInst *CI) {
427
- IRBuilder<> IRB (CI);
462
+ // write stack call args
463
+ // returns total offset
464
+ unsigned GenXPrologEpilogInsertion::writeArgs (CallInst *CI, Value *SpArgs,
465
+ IRBuilder<> &IRB) {
428
466
unsigned Offset = 0 ;
429
- Value *OrigSp = buildReadPredefReg (PreDefined_Vars::PREDEFINED_FE_SP, IRB,
430
- IRB.getInt64Ty (), true );
431
- auto *SpArgs = OrigSp;
432
- // write args
467
+ std::map<Value *, Value *> ReplaceArgs;
468
+
433
469
for (auto &Arg : CI->arg_operands ()) {
470
+ // it is tempting to skip here if Arg already is in ReplaceArgs map
471
+ // but it will be wrong to do so, because consider:
472
+ // foo(x, x, y, y, x, y)
473
+ // on callee side we are expecting 6 positions in predef args
474
+ // we can not optimize these out on caller side
475
+
434
476
auto *OrigTy = Arg->getType ();
435
477
if (OrigTy->getScalarType ()->isIntegerTy (1 )) {
436
478
if (!HandleMaskArgs)
@@ -460,11 +502,27 @@ void GenXPrologEpilogInsertion::generateStackCall(CallInst *CI) {
460
502
if (OrigTy->getScalarType ()->isIntegerTy (1 ))
461
503
ArgRegWrite = cast<Instruction>(
462
504
IRB.CreateBitOrPointerCast (ArgRegWrite,OrigTy));
463
- CI-> replaceUsesOfWith ( Arg, ArgRegWrite) ;
505
+ ReplaceArgs[ Arg] = ArgRegWrite;
464
506
Offset += ArgSize;
465
507
}
466
508
}
467
509
510
+ for (auto &&Pair : ReplaceArgs)
511
+ CI->replaceUsesOfWith (Pair.first , Pair.second );
512
+ return Offset;
513
+ }
514
+
515
+ // generate caller site of stack call
516
+ void GenXPrologEpilogInsertion::generateStackCall (CallInst *CI) {
517
+ LLVM_DEBUG (dbgs () << " Generating stack call for:\n " );
518
+ LLVM_DEBUG (CI->dump ());
519
+ LLVM_DEBUG (dbgs () << " \n " );
520
+ IRBuilder<> IRB (CI);
521
+ Value *OrigSp = buildReadPredefReg (PreDefined_Vars::PREDEFINED_FE_SP, IRB,
522
+ IRB.getInt64Ty (), true );
523
+ // write args, return total offset in arg register
524
+ unsigned Offset = writeArgs (CI, OrigSp, IRB);
525
+
468
526
CI->setMetadata (
469
527
InstMD::FuncArgSize,
470
528
MDNode::get (CI->getContext (),
0 commit comments