@@ -135,8 +135,11 @@ class GenXPrologEpilogInsertion
135
135
// caller side argument layout
136
136
void generateStackCall (CallInst *CI);
137
137
138
- // generateStackCall subroutine
138
+ // generateStackCall subroutines: writing args, extracting results
139
139
unsigned writeArgs (CallInst *CI, Value *SpArgs, IRBuilder<> &IRB);
140
+ std::vector<std::pair<Instruction *, Instruction *>>
141
+ buildWorkList (CallInst *CI, Value *OrigSp, bool UseMemForRet);
142
+ void extractResults (CallInst *CI, Value *OrigSp, IRBuilder<> &IRB);
140
143
141
144
void generateAlloca (CallInst *CI);
142
145
@@ -464,15 +467,15 @@ void GenXPrologEpilogInsertion::generateFunctionEpilog(Function &F,
464
467
unsigned GenXPrologEpilogInsertion::writeArgs (CallInst *CI, Value *SpArgs,
465
468
IRBuilder<> &IRB) {
466
469
unsigned Offset = 0 ;
467
- std::map<Value *, Value *> ReplaceArgs;
470
+ std::vector<std::pair<int , Value *>> ReplaceArgs; // ArgNo, Arg
471
+ ReplaceArgs.reserve (CI->getNumArgOperands ());
468
472
469
473
for (auto &Arg : CI->arg_operands ()) {
470
474
// it is tempting to skip here if Arg is already in ReplaceArgs
471
475
// but it will be wrong to do so, because consider:
472
476
// foo(x, x, y, y, x, y)
473
477
// on callee side we are expecting 6 positions in predef args
474
478
// we can not optimize these out on caller side
475
-
476
479
auto *OrigTy = Arg->getType ();
477
480
if (OrigTy->getScalarType ()->isIntegerTy (1 )) {
478
481
if (!HandleMaskArgs)
@@ -502,51 +505,28 @@ unsigned GenXPrologEpilogInsertion::writeArgs(CallInst *CI, Value *SpArgs,
502
505
if (OrigTy->getScalarType ()->isIntegerTy (1 ))
503
506
ArgRegWrite = cast<Instruction>(
504
507
IRB.CreateBitOrPointerCast (ArgRegWrite,OrigTy));
505
- ReplaceArgs[ Arg] = ArgRegWrite;
508
+ ReplaceArgs. emplace_back ( Arg. getOperandNo (), ArgRegWrite) ;
506
509
Offset += ArgSize;
507
510
}
508
511
}
509
512
510
- for (auto &&Pair : ReplaceArgs)
511
- CI->replaceUsesOfWith (Pair.first , Pair.second );
513
+ // here ">=" used to account for memory-passing of argument tail
514
+ IGC_ASSERT_MESSAGE (CI->getNumArgOperands () >= ReplaceArgs.size (),
515
+ " ReplaceArgs too large" );
516
+ for (auto &&NewArg : ReplaceArgs)
517
+ CI->setArgOperand (NewArg.first , NewArg.second );
512
518
return Offset;
513
519
}
514
520
515
- // generate caller site of stack call
516
- void GenXPrologEpilogInsertion::generateStackCall (CallInst *CI) {
517
- LLVM_DEBUG (dbgs () << " Generating stack call for:\n " );
518
- LLVM_DEBUG (CI->dump ());
519
- LLVM_DEBUG (dbgs () << " \n " );
520
- IRBuilder<> IRB (CI);
521
- Value *OrigSp = buildReadPredefReg (PreDefined_Vars::PREDEFINED_FE_SP, IRB,
522
- IRB.getInt64Ty (), true );
523
- // write args, return total offset in arg register
524
- unsigned Offset = writeArgs (CI, OrigSp, IRB);
525
-
526
- CI->setMetadata (
527
- InstMD::FuncArgSize,
528
- MDNode::get (CI->getContext (),
529
- ConstantAsMetadata::get (IRB.getInt32 (
530
- (Offset + ST->getGRFWidth () - 1 ) / ST->getGRFWidth ()))));
531
- bool isVoidCall = CI->getType ()->isVoidTy ();
532
- CI->setMetadata (
533
- InstMD::FuncRetSize,
534
- MDNode::get (CI->getContext (),
535
- ConstantAsMetadata::get (IRB.getInt32 (divideCeil (
536
- (isVoidCall ? 0
537
- : (DL->getTypeSizeInBits (CI->getType ())) /
538
- genx::ByteBits),
539
- ST->getGRFWidth ())))));
540
- if (isVoidCall)
541
- return ;
542
- IRB.SetInsertPoint (CI->getNextNode ());
543
- bool UseMemForRet =
544
- ForceRetMemPassing ||
545
- DL->getTypeSizeInBits (CI->getType ()) / genx::ByteBits > RetRegSize;
546
- if (UseMemForRet)
547
- OrigSp = buildReadPredefReg (PreDefined_Vars::PREDEFINED_FE_SP, IRB,
548
- IRB.getInt64Ty (), CI, true );
549
- // read retvalue
521
+ // build worklist for extraction
522
+ // worklist entry format:
523
+ // first: actual return
524
+ // second: return insertion point
525
+ // this might be critical for structure return due to the odd convention for
526
+ // returning structures
527
+ std::vector<std::pair<Instruction *, Instruction *>>
528
+ GenXPrologEpilogInsertion::buildWorkList (CallInst *CI, Value *OrigSp,
529
+ bool UseMemForRet) {
550
530
std::vector<std::pair<Instruction *, Instruction *>> Worklist;
551
531
if (isa<StructType>(CI->getType ())) {
552
532
for (auto *U : CI->users ()) {
@@ -556,7 +536,26 @@ void GenXPrologEpilogInsertion::generateStackCall(CallInst *CI) {
556
536
Worklist.push_back ({cast<Instruction>(U), cast<Instruction>(U)});
557
537
}
558
538
} else
539
+ // OrigSP as instruction is read.predef.reg
559
540
Worklist.push_back ({CI, UseMemForRet ? cast<Instruction>(OrigSp) : CI});
541
+ return Worklist;
542
+ }
543
+
544
+ // extract results from stack call return
545
+ void GenXPrologEpilogInsertion::extractResults (CallInst *CI, Value *OrigSp,
546
+ IRBuilder<> &IRB) {
547
+ IRB.SetInsertPoint (CI->getNextNode ());
548
+ bool UseMemForRet =
549
+ ForceRetMemPassing ||
550
+ DL->getTypeSizeInBits (CI->getType ()) / genx::ByteBits > RetRegSize;
551
+ if (UseMemForRet)
552
+ OrigSp = buildReadPredefReg (PreDefined_Vars::PREDEFINED_FE_SP, IRB,
553
+ IRB.getInt64Ty (), CI, true );
554
+
555
+ // collect return slots
556
+ auto Worklist = buildWorkList (CI, OrigSp, UseMemForRet);
557
+
558
+ // process return slots
560
559
for (auto &I : Worklist) {
561
560
auto *ActualRet = I.first ;
562
561
IRB.SetInsertPoint (I.second ->getNextNode ());
@@ -628,6 +627,38 @@ void GenXPrologEpilogInsertion::generateStackCall(CallInst *CI) {
628
627
}
629
628
}
630
629
630
+ // generate caller site of stack call
631
+ void GenXPrologEpilogInsertion::generateStackCall (CallInst *CI) {
632
+ LLVM_DEBUG (dbgs () << " Generating stack call for:\n " );
633
+ LLVM_DEBUG (CI->dump ());
634
+ LLVM_DEBUG (dbgs () << " \n " );
635
+ IRBuilder<> IRB (CI);
636
+ Value *OrigSp = buildReadPredefReg (PreDefined_Vars::PREDEFINED_FE_SP, IRB,
637
+ IRB.getInt64Ty (), true );
638
+ // write args, return total offset in arg register
639
+ unsigned Offset = writeArgs (CI, OrigSp, IRB);
640
+
641
+ CI->setMetadata (
642
+ InstMD::FuncArgSize,
643
+ MDNode::get (CI->getContext (),
644
+ ConstantAsMetadata::get (IRB.getInt32 (
645
+ (Offset + ST->getGRFWidth () - 1 ) / ST->getGRFWidth ()))));
646
+ bool isVoidCall = CI->getType ()->isVoidTy ();
647
+ CI->setMetadata (
648
+ InstMD::FuncRetSize,
649
+ MDNode::get (CI->getContext (),
650
+ ConstantAsMetadata::get (IRB.getInt32 (divideCeil (
651
+ (isVoidCall ? 0
652
+ : (DL->getTypeSizeInBits (CI->getType ())) /
653
+ genx::ByteBits),
654
+ ST->getGRFWidth ())))));
655
+ if (isVoidCall)
656
+ return ;
657
+
658
+ // read retvalue
659
+ extractResults (CI, OrigSp, IRB);
660
+ }
661
+
631
662
// alloca_base = FE_SP
632
663
// FE_SP += sizeof(alloca)
633
664
void GenXPrologEpilogInsertion::generateAlloca (CallInst *CI) {
0 commit comments