Skip to content

Commit aba71d6

Browse files
dlei6gigcbot
authored and committed
Create FunctionCloningThreshold flag to limit the number of times functions can be cloned.
Functions are cloned for each function group they belong to. This can cause a function to be compiled N times when N function groups all call it. When N is large, this greatly increases compile time and memory usage. This flag limits the number of times a function can be cloned: if the limit is exceeded, the function is converted to an indirectly called function instead. It is then compiled only once, and runtime relocation is used to patch the function's address into each function group it belongs to. The default value is 0, which means there is no limit.
1 parent c13b45e commit aba71d6

File tree

4 files changed

+57
-11
lines changed

4 files changed

+57
-11
lines changed

IGC/AdaptorCommon/ProcessFuncAttributes.cpp

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -887,22 +887,23 @@ bool InsertDummyKernelForSymbolTable::runOnModule(Module& M)
887887

888888
bool needDummyKernel = false;
889889

890-
// Creates an empty dummy kernel.
891-
// This kernel will only be used for creating the symbol table.
892-
// All indirectly called functions will also be attached to this kernel's binary.
893-
if (IGC_IS_FLAG_ENABLED(EnableFunctionPointer) &&
894-
pCtx->type == ShaderType::OPENCL_SHADER)
890+
// Check when we need to generate a dummy kernel. This is only useful for attaching
891+
// the symbol table to its program output for indirect calls and global variable relocation.
892+
if (IGC_IS_FLAG_ENABLED(EnableFunctionPointer) && pCtx->type == ShaderType::OPENCL_SHADER)
895893
{
896-
if (pCtx->m_enableFunctionPointer)
897-
{
894+
if (pCtx->m_enableFunctionPointer) {
898895
// Symbols are needed for external functions and function pointers
899896
needDummyKernel = true;
900897
}
901-
else if (!modMD->inlineProgramScopeOffsets.empty())
902-
{
898+
else if (!modMD->inlineProgramScopeOffsets.empty()) {
903899
// Create one also if global variables are present and require symbols
904900
needDummyKernel = true;
905901
}
902+
else if (IGC_GET_FLAG_VALUE(FunctionCloningThreshold) > 0 && pCtx->enableFunctionCall()) {
903+
// If this flag is enabled and there are any function calls, conservatively create a dummy kernel
904+
// in case we need to transform normal calls into indirect calls to avoid cloning in GenCodeGenModule.cpp
905+
needDummyKernel = true;
906+
}
906907
}
907908

908909
if (needDummyKernel)
@@ -925,7 +926,7 @@ bool InsertDummyKernelForSymbolTable::runOnModule(Module& M)
925926

926927
// Promote SIMD size information from kernels, which has indirectly called
927928
// functions. All such functions will be connected to the default kernel in
928-
// GenCodeGenModule.cpp (addIndirectFuncsToKernelGroup)
929+
// GenCodeGenModule.cpp
929930
for (auto I = M.begin(), E = M.end(); I != E; ++I)
930931
{
931932
Function* F = &(*I);

IGC/Compiler/CISACodeGen/GenCodeGenModule.cpp

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,25 @@ void GenXCodeGenModule::processFunction(Function& F)
149149
}
150150

151151
IGC_ASSERT(CallerFGs.size() >= 1);
152+
153+
// Get the cloning threshold. If the number of function groups a function belongs to
154+
// exceeds the threshold, instead of cloning the function N times, make it an indirect call
155+
// and use relocation instead. The function will only be compiled once and runtime must relocate
156+
// its address for each caller. This greatly saves on compile time when there are many function
157+
// groups that all call the same function.
158+
auto cloneTheshold = IGC_GET_FLAG_VALUE(FunctionCloningThreshold);
159+
if (cloneTheshold > 0 && CallerFGs.size() > cloneTheshold)
160+
{
161+
auto pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
162+
auto IFG = FGA->getIndirectCallGroup();
163+
IGC_ASSERT(IFG);
164+
F.addFnAttr("referenced-indirectly");
165+
F.addFnAttr("visaStackCall");
166+
pCtx->m_enableFunctionPointer = true;
167+
FGA->addToFunctionGroup(&F, IFG, &F);
168+
return;
169+
}
170+
152171
bool FirstPair = true;
153172
for (auto FGPair : CallerFGs)
154173
{
@@ -209,6 +228,25 @@ void GenXCodeGenModule::processSCC(std::vector<llvm::CallGraphNode*>* SCCNodes)
209228
}
210229
}
211230
IGC_ASSERT(CallerFGs.size() >= 1);
231+
232+
// Use the same cloning threshold for single function SCCs, but making every stack function
233+
// in the SCC indirect calls to prevent cloning the entire SCC N times.
234+
auto cloneTheshold = IGC_GET_FLAG_VALUE(FunctionCloningThreshold);
235+
if (cloneTheshold > 0 && CallerFGs.size() > cloneTheshold)
236+
{
237+
auto pCtx = getAnalysis<CodeGenContextWrapper>().getCodeGenContext();
238+
for (CallGraphNode* Node : (*SCCNodes))
239+
{
240+
Function* F = Node->getFunction();
241+
auto IFG = FGA->getIndirectCallGroup();
242+
IGC_ASSERT(IFG && F->hasFnAttribute("visaStackCall"));
243+
F->addFnAttr("referenced-indirectly");
244+
pCtx->m_enableFunctionPointer = true;
245+
FGA->addToFunctionGroup(F, IFG, F);
246+
}
247+
return;
248+
}
249+
212250
bool FirstPair = true;
213251
for (auto FG : CallerFGs)
214252
{
@@ -602,7 +640,6 @@ void GenXFunctionGroupAnalysis::addIndirectFuncsToKernelGroup(llvm::Module* pMod
602640
Function* F = &(*I);
603641
if (F->isDeclaration() || isEntryFunc(pMdUtils, F)) continue;
604642

605-
// Add non-used function to default group
606643
if (F->hasFnAttribute("referenced-indirectly") || F->getNumUses() == 0)
607644
{
608645
IGC_ASSERT(getGroup(F) == nullptr);

IGC/Compiler/CISACodeGen/GenCodeGenModule.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -241,6 +241,10 @@ namespace IGC {
241241
return FG != nullptr && FG == IndirectCallGroup;
242242
}
243243

244+
FunctionGroup* getIndirectCallGroup() {
245+
return IndirectCallGroup;
246+
}
247+
244248
/// \brief Check whether this is a group header.
245249
bool isGroupHead(llvm::Function* F) {
246250
return getGroupForHead(F) != nullptr;

IGC/common/igc_flags.def

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,10 @@ DECLARE_IGC_REGKEY(bool, EnableLTODebug, false, "Enable debug inf
360360
DECLARE_IGC_REGKEY(DWORD, FunctionControl, 0, "Control function inlining/subroutine/stackcall. See value defs in igc_flags.hpp.", true)
361361
DECLARE_IGC_REGKEY(bool, EnableStackCallFuncCall, false, "If enabled, the default function call mode will be set to stack call. Otherwise, subroutine call is used.", false)
362362
DECLARE_IGC_REGKEY(bool, ForceInlineStackCallWithImplArg, false, "If enabled, stack calls that uses implicit args will be force inlined.", true)
363+
DECLARE_IGC_REGKEY(DWORD, FunctionCloningThreshold, 0,
364+
"Limits how many times functions can be cloned when called from multiple function groups." \
365+
"If exceeding the cloning threshold, compile the function only once and use address relocation instead." \
366+
"A value of '0' means no limit on times it can be cloned", true)
363367
DECLARE_IGC_REGKEY(DWORD, OCLInlineThreshold, 512, "Setting OCL inline thershold", true)
364368
DECLARE_IGC_REGKEY(bool, DisableAddingAlwaysAttribute, false, "Disable adding always attribute", true)
365369
DECLARE_IGC_REGKEY(bool, EnableForceGroupSize, false, "Enable forcing thread Group Size ForceGroupSizeX and ForceGroupSizeY", false)

0 commit comments

Comments
 (0)