Skip to content

Commit f471182

Browse files
Konstantin Vladimirovigcbot
authored andcommitted
fix stackcalls bug with repeated arguments
We had situation where foo(x, y, y) was buggy because of wrong compiler optimization in prologue/epilogue insertion
1 parent 3614379 commit f471182

File tree

1 file changed

+66
-8
lines changed

1 file changed

+66
-8
lines changed

IGC/VectorCompiler/lib/GenXCodeGen/GenXPrologEpilogInsertion.cpp

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,27 @@ SPDX-License-Identifier: MIT
1616
/// VISA regalloc makes sure that dests & sources of the generated instructions
1717
/// are allocated to the proper predefined VISA regs
1818
///
19+
/// Lets say before transformation we have call with arg loaded from surface:
20+
/// %arg = call <8 x float> @llvm.genx.oword.ld.v8f32(i32 0, i32 1, i32 %addr)
21+
/// %ret = call spir_func <8 x float> @foo(<8 x float> %arg)
22+
///
23+
/// Then after we will see something like this:
24+
/// %arg = call <8 x float> @llvm.genx.oword.ld.v8f32(i32 0, i32 1, i32 %addr)
25+
/// %R8 = call <256 x float> read.predef.reg(i32 PREDEFINED_ARG, undef)
26+
/// %NEWR8 = <256 x float> wrregion(<256 x float> %R8, <8 x float> %arg,
27+
/// i32 0, i32 8, i32 1, OFFSET, ...)
28+
/// ; Here OFFSET starts from 0 and is argument offset in predef register
29+
/// %newarg = call <8 x float> write.predef.reg(i32 PREDEFINED_ARG,
30+
/// <256 x float> %NEWR8)
31+
/// %ret = call spir_func <8 x float> @foo(<8 x float> %newarg)
32+
///
33+
/// So we have explicit stack layout relatively early to use 64-bit splitting
34+
/// on later stages if 64-bit pointers are used as SP/FP
35+
///
1936
//===----------------------------------------------------------------------===//
2037

38+
#define DEBUG_TYPE "GENX_PROLOGUE"
39+
2140
#include "GenX.h"
2241
#include "GenXIntrinsics.h"
2342
#include "GenXModule.h"
@@ -42,6 +61,7 @@ SPDX-License-Identifier: MIT
4261
#include "llvm/IR/InstVisitor.h"
4362
#include "llvm/IR/Module.h"
4463
#include "llvm/Pass.h"
64+
#include "llvm/Support/Debug.h"
4565
#include "llvm/Support/MathExtras.h"
4666

4767
#include "Probe/Assertion.h"
@@ -111,7 +131,13 @@ class GenXPrologEpilogInsertion
111131
void generateKernelProlog(Function &F);
112132
void generateFunctionProlog(Function &F);
113133
void generateFunctionEpilog(Function &F, ReturnInst &I);
134+
135+
// caller side argument layout
114136
void generateStackCall(CallInst *CI);
137+
138+
// generateStackCall subroutine
139+
unsigned writeArgs(CallInst *CI, Value *SpArgs, IRBuilder<> &IRB);
140+
115141
void generateAlloca(CallInst *CI);
116142

117143
Value *push(Value *V, IRBuilder<> &IRB, Value *InitSP);
@@ -216,12 +242,16 @@ bool GenXPrologEpilogInsertion::runOnFunction(Function &F) {
216242
.getGenXSubtarget();
217243
ArgRegSize = visa::ArgRegSizeInGRFs * ST->getGRFWidth();
218244
RetRegSize = visa::RetRegSizeInGRFs * ST->getGRFWidth();
219-
if (!(BEConf->useNewStackBuilder() && ST->isOCLRuntime()))
245+
if (!(BEConf->useNewStackBuilder() && ST->isOCLRuntime())) {
246+
LLVM_DEBUG(dbgs() << "Old builder or CMRT used in " << F.getName() << "\n");
220247
return false;
248+
}
221249
NumCalls = CallsCalculator().getNumCalls(F);
222250
UseGlobalMem =
223251
F.getParent()->getModuleFlag(ModuleMD::UseSVMStack) != nullptr;
252+
LLVM_DEBUG(dbgs() << "Visiting all calls in " << F.getName() << "\n");
224253
visit(F);
254+
LLVM_DEBUG(dbgs() << "Visiting finished\n");
225255
if (genx::isKernel(&F)) {
226256
generateKernelProlog(F);
227257
// no epilog is required for kernels
@@ -260,6 +290,8 @@ void GenXPrologEpilogInsertion::visitReturnInst(ReturnInst &I) {
260290

261291
// FE_SP = PrivateBase + HWTID * PrivMemPerThread
262292
void GenXPrologEpilogInsertion::generateKernelProlog(Function &F) {
293+
LLVM_DEBUG(dbgs() << "Generating kernel prologue for " << F.getName()
294+
<< "\n");
263295
IRBuilder<> IRB(&F.getEntryBlock().front());
264296
Function *HWID = GenXIntrinsic::getGenXDeclaration(
265297
F.getParent(), llvm::GenXIntrinsic::genx_get_hwid, {});
@@ -289,6 +321,8 @@ void GenXPrologEpilogInsertion::generateKernelProlog(Function &F) {
289321
// else:
290322
// read from stackmem
291323
void GenXPrologEpilogInsertion::generateFunctionProlog(Function &F) {
324+
LLVM_DEBUG(dbgs() << "Generating function prologue for " << F.getName()
325+
<< "\n");
292326
IRBuilder<> IRB(&F.getEntryBlock().front());
293327
unsigned Offset = 0;
294328
Value *Sp = buildReadPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB,
@@ -346,6 +380,8 @@ void GenXPrologEpilogInsertion::generateFunctionProlog(Function &F) {
346380
// write to stackmem
347381
void GenXPrologEpilogInsertion::generateFunctionEpilog(Function &F,
348382
ReturnInst &I) {
383+
LLVM_DEBUG(dbgs() << "Generating function epilogue for " << F.getName()
384+
<< "\n");
349385
IRBuilder<> IRB(&I);
350386
unsigned RetSize = 0;
351387
if (!F.getReturnType()->isVoidTy()) {
@@ -423,14 +459,20 @@ void GenXPrologEpilogInsertion::generateFunctionEpilog(Function &F,
423459
divideCeil(RetSize, ST->getGRFWidth())))));
424460
}
425461

426-
void GenXPrologEpilogInsertion::generateStackCall(CallInst *CI) {
427-
IRBuilder<> IRB(CI);
462+
// write stack call args
463+
// returns total offset
464+
unsigned GenXPrologEpilogInsertion::writeArgs(CallInst *CI, Value *SpArgs,
465+
IRBuilder<> &IRB) {
428466
unsigned Offset = 0;
429-
Value *OrigSp = buildReadPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB,
430-
IRB.getInt64Ty(), true);
431-
auto *SpArgs = OrigSp;
432-
// write args
467+
std::map<Value *, Value *> ReplaceArgs;
468+
433469
for (auto &Arg : CI->arg_operands()) {
470+
// it is tempting to skip here if Arg already is in ReplaceArgs map
471+
// but it will be wrong to do so, because consider:
472+
// foo(x, x, y, y, x, y)
473+
// on callee side we are expecting 6 positions in predef args
474+
// we can not optimize these out on caller side
475+
434476
auto *OrigTy = Arg->getType();
435477
if (OrigTy->getScalarType()->isIntegerTy(1)) {
436478
if (!HandleMaskArgs)
@@ -460,11 +502,27 @@ void GenXPrologEpilogInsertion::generateStackCall(CallInst *CI) {
460502
if (OrigTy->getScalarType()->isIntegerTy(1))
461503
ArgRegWrite = cast<Instruction>(
462504
IRB.CreateBitOrPointerCast(ArgRegWrite,OrigTy));
463-
CI->replaceUsesOfWith(Arg, ArgRegWrite);
505+
ReplaceArgs[Arg] = ArgRegWrite;
464506
Offset += ArgSize;
465507
}
466508
}
467509

510+
for (auto &&Pair : ReplaceArgs)
511+
CI->replaceUsesOfWith(Pair.first, Pair.second);
512+
return Offset;
513+
}
514+
515+
// generate caller site of stack call
516+
void GenXPrologEpilogInsertion::generateStackCall(CallInst *CI) {
517+
LLVM_DEBUG(dbgs() << "Generating stack call for:\n");
518+
LLVM_DEBUG(CI->dump());
519+
LLVM_DEBUG(dbgs() << "\n");
520+
IRBuilder<> IRB(CI);
521+
Value *OrigSp = buildReadPredefReg(PreDefined_Vars::PREDEFINED_FE_SP, IRB,
522+
IRB.getInt64Ty(), true);
523+
// write args, return total offset in arg register
524+
unsigned Offset = writeArgs(CI, OrigSp, IRB);
525+
468526
CI->setMetadata(
469527
InstMD::FuncArgSize,
470528
MDNode::get(CI->getContext(),

0 commit comments

Comments
 (0)