
Commit fb42ffb

pkwasnie-intel authored and igcbot committed
precompiled emulation inlining improvements
Improvements to the inlining mechanism for precompiled emulation functions (int64 math, DP math, etc.). Instead of doing no inlining at all when the total number of inlined instructions reaches the threshold, inline as much as possible until the threshold is reached.
1 parent 1174017 commit fb42ffb
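
To picture the new behavior outside the pass: rather than abandoning inlining altogether once the emulation functions would blow past the instruction budget, the pass now greedily marks whole functions always-inline while they still fit, and can split a remaining function so that at least some of its call sites are inlined. The sketch below is illustrative only, not IGC code; Candidate, selectForInlining, and budget are hypothetical stand-ins for the pass's ImportedFunction bookkeeping and the InlinedEmulationThreshold flag.

// Illustrative sketch only (not IGC code). Candidate, selectForInlining and
// `budget` are hypothetical stand-ins for the pass's ImportedFunction list
// and the InlinedEmulationThreshold flag.
#include <algorithm>
#include <cstdint>
#include <vector>

struct Candidate
{
    uint32_t instsPerCall;     // instructions added per inlined call site
    uint32_t numCalls;         // number of call sites in the shader
    bool alwaysInline = false;
    uint32_t total() const { return instsPerCall * numCalls; }
};

// Greedily mark candidates for inlining until the instruction budget is used up,
// instead of disabling inlining for everything once the budget would be exceeded.
static uint32_t selectForInlining(std::vector<Candidate>& funcs, uint32_t budget)
{
    // Cheapest candidates first, so as many whole functions as possible fit.
    std::sort(funcs.begin(), funcs.end(),
              [](const Candidate& a, const Candidate& b) { return a.total() < b.total(); });

    uint32_t used = 0;
    for (Candidate& c : funcs)
    {
        if (used + c.total() <= budget)
        {
            // Whole function fits: inline every call site.
            c.alwaysInline = true;
            used += c.total();
        }
        else if (c.instsPerCall <= budget - used)
        {
            // Partial fit: only the first n call sites can be inlined. The pass
            // does this by cloning an always-inline copy of the function and
            // redirecting n calls to it; the rest keep calling the original.
            uint32_t n = (budget - used) / c.instsPerCall;
            used += n * c.instsPerCall;
        }
    }
    return used; // instructions actually spent on inlining
}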

6 files changed: +368 -66 lines changed

IGC/Compiler/Optimizer/PreCompiledFuncImport.cpp

Lines changed: 216 additions & 65 deletions
@@ -17,6 +17,7 @@ SPDX-License-Identifier: MIT
 #include "llvm/IR/InstIterator.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/GenericDomTree.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/Linker/Linker.h"
@@ -632,11 +633,16 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
     m_changed = false;
 
     // When we test it, we need to set emuKind
-    if (IGC_IS_FLAG_ENABLED(TestIGCPreCompiledFunctions))
+    if (IGC_GET_FLAG_VALUE(TestIGCPreCompiledFunctions) == 1)
     {
         m_emuKind = EmuKind::EMU_DP;
         checkAndSetEnableSubroutine();
     }
+    else if (IGC_GET_FLAG_VALUE(TestIGCPreCompiledFunctions) == 2)
+    {
+        m_emuKind = EmuKind::EMU_DP_DIV_SQRT;
+        checkAndSetEnableSubroutine();
+    }
     // sanity check
     if (m_emuKind == 0) {
         // Nothing to emulate
@@ -826,12 +832,11 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
         }
     }
 
-    unsigned totalNumberOfInlinedInst = 0;
+    llvm::SmallVector<ImportedFunction, 32> importedFunctions;
+    unsigned totalNumberOfInlinedInst = 0, totalNumberOfPotentiallyInlinedInst = 0;
     int emuFC = (int)IGC_GET_FLAG_VALUE(EmulationFunctionControl);
 
-    // Post processing, set those imported functions as internal linkage
-    // and alwaysinline. Also count how many instructions would be added
-    // to the shader if inlining occurred.
+    // Post processing, set those imported functions as internal linkage.
     for (auto II = M.begin(), IE = M.end(); II != IE; )
     {
         Function* Func = &(*II);
@@ -853,92 +858,106 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
                 continue;
             }
 
-            // Remove noinline/AlwaysInline attr if present.
-            Func->removeFnAttr(llvm::Attribute::NoInline);
+            if (std::find(importedFunctions.begin(), importedFunctions.end(), Func) == importedFunctions.end())
+                importedFunctions.push_back(Func);
+        }
+        else
+        {
+            // Make sure original func isn't inlined accidentally.
             Func->removeFnAttr(llvm::Attribute::AlwaysInline);
+        }
+    }
 
-            if (m_enableCallForEmulation &&
-                emuFC != FLAG_FCALL_DEFAULT &&
-                emuFC != FLAG_FCALL_FORCE_INLINE)
-            {
-                // Disable inlining completely.
-                continue;
-            }
-
-            if (Func->hasOneUse() || emuFC == FLAG_FCALL_FORCE_INLINE)
-            {
-                Func->addFnAttr(llvm::Attribute::AlwaysInline);
-                continue;
-            }
+    // Sort imported instructions in preferred inlining order.
+    std::sort(importedFunctions.begin(), importedFunctions.end(), ImportedFunction::compare);
 
-            // Count number of instructions in the function
-            unsigned NumInst = 0;
-            for (BasicBlock& BB : Func->getBasicBlockList()) {
-                NumInst += BB.getInstList().size();
-            }
+    // Post processing, set those imported functions as alwaysinline.
+    // Also count how many instructions would be added to the shader
+    // if inlining occurred.
+    for (auto II = importedFunctions.begin(), IE = importedFunctions.end(); II != IE; ++II)
+    {
+        Function* Func = II->F;
 
-            // Don't want to subroutine small functions
-            if (NumInst <= 5)
-            {
-                // Add AlwaysInline attribute to force inlining all calls.
-                Func->addFnAttr(llvm::Attribute::AlwaysInline);
+        // Remove noinline/AlwaysInline attr if present.
+        Func->removeFnAttr(llvm::Attribute::NoInline);
+        Func->removeFnAttr(llvm::Attribute::AlwaysInline);
 
-                continue;
-            }
+        if (m_enableCallForEmulation &&
+            emuFC != FLAG_FCALL_DEFAULT &&
+            emuFC != FLAG_FCALL_FORCE_INLINE)
+        {
+            // Disable inlining completely.
+            continue;
+        }
 
-            totalNumberOfInlinedInst += NumInst * Func->getNumUses();
+        if (Func->hasOneUse() || emuFC == FLAG_FCALL_FORCE_INLINE)
+        {
+            Func->addFnAttr(llvm::Attribute::AlwaysInline);
+            continue;
         }
-        else
+
+        // Don't want to subroutine small functions
+        if (II->funcInstructions <= 5)
        {
-            // Make sure original func isn't inlined accidentally.
-            Func->removeFnAttr(llvm::Attribute::AlwaysInline);
+            // Add AlwaysInline attribute to force inlining all calls.
+            Func->addFnAttr(llvm::Attribute::AlwaysInline);
+
+            continue;
        }
-    }
 
-    // If true, it is a slow version of DP emu functions. Those functions
-    // are the original ones for just passing conformance, not for perf.
-    auto isSlowDPEmuFunc = [](Function* F) {
-        StringRef FN = F->getName();
-        if (FN.equals("__igcbuiltin_dp_add") ||
-            FN.equals("__igcbuiltin_dp_sub") ||
-            FN.equals("__igcbuiltin_dp_fma") ||
-            FN.equals("__igcbuiltin_dp_mul") ||
-            FN.equals("__igcbuiltin_dp_div") ||
-            FN.equals("__igcbuiltin_dp_cmp") ||
-            FN.equals("__igcbuiltin_dp_to_int32") ||
-            FN.equals("__igcbuiltin_dp_to_uint32") ||
-            FN.equals("__igcbuiltin_int32_to_dp") ||
-            FN.equals("__igcbuiltin_uint32_to_dp") ||
-            FN.equals("__igcbuiltin_dp_to_sp") ||
-            FN.equals("__igcbuiltin_sp_to_dp") ||
-            FN.equals("__igcbuiltin_dp_sqrt")) {
-            return true;
+        // Don't inline original slow DP emu functions, they are only for passing
+        // conformance, not for perf.
+        if (isDPEmu() && II->isSlowDPEmuFunc())
+            continue;
+
+        totalNumberOfPotentiallyInlinedInst += II->totalInstructions;
+
+        // If function fits in threshold, always inline.
+        if (totalNumberOfInlinedInst + II->totalInstructions <= (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold))
+        {
+            totalNumberOfInlinedInst += II->totalInstructions;
+            Func->addFnAttr(llvm::Attribute::AlwaysInline);
         }
-        return false;
-    };
+    }
 
-    for (auto II = M.begin(), IE = M.end(); II != IE; )
+    // Check if more functions can fit in threshold if they would be split into inline/noinline copies.
+    if (m_enableCallForEmulation && emuFC == FLAG_FCALL_DEFAULT && totalNumberOfInlinedInst < (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold))
     {
-        Function* Func = &(*II);
-        ++II;
-        if (!Func || Func->isDeclaration())
+        for (auto II = importedFunctions.begin(); II != importedFunctions.end(); ++II)
        {
-            continue;
+            Function* Func = II->F;
+
+            if (Func->hasFnAttribute(llvm::Attribute::AlwaysInline) || (isDPEmu() && II->isSlowDPEmuFunc()))
+                continue;
+
+            unsigned calls = ((unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold) - totalNumberOfInlinedInst) / II->funcInstructions;
+            if (calls > 0)
+            {
+                // Split function into inline/no-inline copies.
+                ImportedFunction copy = createInlinedCopy(*II, calls);
+                importedFunctions.push_back(copy);
+                totalNumberOfInlinedInst += copy.totalInstructions;
+            }
        }
+    }
+
+    for (auto II = importedFunctions.begin(), IE = importedFunctions.end(); II != IE; ++II)
+    {
+        Function* Func = II->F;
 
-        if (!origFunctions.count(Func) && !Func->hasFnAttribute(llvm::Attribute::AlwaysInline))
+        if (!Func->hasFnAttribute(llvm::Attribute::AlwaysInline))
        {
            // Special handling of DP functions: any one that has not been marked as inline
            // at this point, it will be either subroutine or stackcall.
-            const bool isDPCallFunc = (isDPEmu() && isSlowDPEmuFunc(Func));
+            const bool isDPCallFunc = (isDPEmu() && II->isSlowDPEmuFunc());
 
            // Use subroutine/stackcall for some DP emulation functions if
            // EmulationFunctionControl is set so, or
            // use subroutines if total number of instructions added when
            // all emulated functions are inlined exceed InlinedEmulationThreshold.
            // If Func is a slow version of DP emu func, perf isn't important.
            if (m_enableCallForEmulation &&
-                (totalNumberOfInlinedInst > (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold) ||
+                (totalNumberOfPotentiallyInlinedInst > (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold) ||
                 isDPCallFunc))
            {
                Func->addFnAttr(llvm::Attribute::NoInline);
10031022
return m_changed;
10041023
}
10051024

1025+
PreCompiledFuncImport::ImportedFunction::ImportedFunction(Function* F)
1026+
: F(F), type(EmuType::OTHER), funcInstructions(0), totalInstructions(0)
1027+
{
1028+
// Count number of new instructions added by inlining.
1029+
for (BasicBlock& BB : F->getBasicBlockList())
1030+
funcInstructions += BB.getInstList().size();
1031+
1032+
updateUses();
1033+
1034+
// Get type of imported function.
1035+
StringRef name = F->getName();
1036+
1037+
if (name.equals("__igcbuiltin_dp_div_nomadm_ieee") ||
1038+
name.equals("__igcbuiltin_dp_div_nomadm_fast") ||
1039+
name.equals("__igcbuiltin_dp_sqrt_nomadm_ieee") ||
1040+
name.equals("__igcbuiltin_dp_sqrt_nomadm_fast"))
1041+
{
1042+
type = EmuType::FASTDP;
1043+
}
1044+
else if (name.equals("__igcbuiltin_dp_add") ||
1045+
name.equals("__igcbuiltin_dp_sub") ||
1046+
name.equals("__igcbuiltin_dp_fma") ||
1047+
name.equals("__igcbuiltin_dp_mul") ||
1048+
name.equals("__igcbuiltin_dp_div") ||
1049+
name.equals("__igcbuiltin_dp_cmp") ||
1050+
name.equals("__igcbuiltin_dp_to_int32") ||
1051+
name.equals("__igcbuiltin_dp_to_uint32") ||
1052+
name.equals("__igcbuiltin_int32_to_dp") ||
1053+
name.equals("__igcbuiltin_uint32_to_dp") ||
1054+
name.equals("__igcbuiltin_dp_to_sp") ||
1055+
name.equals("__igcbuiltin_sp_to_dp") ||
1056+
name.equals("__igcbuiltin_dp_sqrt"))
1057+
{
1058+
// If true, it is a slow version of DP emu functions. Those functions
1059+
// are the original ones for just passing conformance, not for perf.
1060+
type = EmuType::SLOWDP;
1061+
}
1062+
else
1063+
{
1064+
for (int i = 0; i < NUM_FUNCTIONS && type == EmuType::OTHER; ++i)
1065+
{
1066+
for (int j = 0; j < NUM_TYPES && type == EmuType::OTHER; ++j)
1067+
{
1068+
if (name.equals(m_Int64SpDivRemFunctionNames[i][j]) ||
1069+
name.equals(m_Int64DpDivRemFunctionNames[i][j]))
1070+
{
1071+
type = EmuType::INT64;
1072+
}
1073+
}
1074+
}
1075+
}
1076+
}
1077+
1078+
void PreCompiledFuncImport::ImportedFunction::updateUses()
1079+
{
1080+
totalInstructions = funcInstructions * F->getNumUses();
1081+
}
1082+
1083+
PreCompiledFuncImport::ImportedFunction PreCompiledFuncImport::ImportedFunction::copy(ImportedFunction& other)
1084+
{
1085+
ValueToValueMapTy VM;
1086+
Function* copy = CloneFunction(other.F, VM);
1087+
return PreCompiledFuncImport::ImportedFunction(copy, other.type, other.funcInstructions, 0);
1088+
}
1089+
1090+
// Compare two imported functions in order preferred for inlining.
1091+
bool PreCompiledFuncImport::ImportedFunction::compare(ImportedFunction& L, ImportedFunction& R)
1092+
{
1093+
// First sort by preferred type of emulation.
1094+
if (L.type != R.type)
1095+
return L.type < R.type;
1096+
1097+
// Then sort by number of inlined instructions.
1098+
return L.totalInstructions < R.totalInstructions;
1099+
};
1100+
1101+
PreCompiledFuncImport::ImportedFunction PreCompiledFuncImport::createInlinedCopy(ImportedFunction& IF, unsigned n)
1102+
{
1103+
std::vector<CallInst*> toDelete;
1104+
1105+
// Make copy that is always inlined.
1106+
ImportedFunction copy = ImportedFunction::copy(IF);
1107+
copy.F->setName(IF.F->getName() + "_always_inline");
1108+
copy.F->addFnAttr(llvm::Attribute::AlwaysInline);
1109+
1110+
// Collect first n calls to replace with copy.
1111+
llvm::SmallVector<CallInst*, 8> calls;
1112+
auto it = IF.F->user_begin();
1113+
for (unsigned i = 0; i < n; ++i)
1114+
{
1115+
CallInst* oldCall = dyn_cast<CallInst>(*(it++));
1116+
IGC_ASSERT(oldCall);
1117+
calls.push_back(oldCall);
1118+
}
1119+
1120+
// Replace with always inlined copy.
1121+
for (CallInst* oldCall : calls)
1122+
{
1123+
std::vector<Value*> args;
1124+
for (unsigned arg = 0; arg < IGCLLVM::getNumArgOperands(oldCall); ++arg)
1125+
args.push_back(oldCall->getArgOperand(arg));
1126+
1127+
// Create new call and insert it before old one
1128+
CallInst* newCall = CallInst::Create(copy.F, args, "", oldCall);
1129+
1130+
newCall->setCallingConv(copy.F->getCallingConv());
1131+
newCall->setAttributes(oldCall->getAttributes());
1132+
newCall->setDebugLoc(oldCall->getDebugLoc());
1133+
1134+
oldCall->replaceAllUsesWith(newCall);
1135+
toDelete.push_back(oldCall);
1136+
}
1137+
1138+
for (auto C : toDelete)
1139+
C->eraseFromParent();
1140+
1141+
copy.updateUses();
1142+
IF.updateUses();
1143+
1144+
return copy;
1145+
}
1146+
10061147
void PreCompiledFuncImport::visitBinaryOperator(BinaryOperator& I)
10071148
{
10081149
if (I.getOperand(0)->getType()->isIntOrIntVectorTy())
@@ -2547,6 +2688,7 @@ void PreCompiledFuncImport::checkAndSetEnableSubroutine()
     bool SPDiv = isSPDiv();
     bool DPEmu = isDPEmu();
     bool DPDivSqrtEmu = isDPDivSqrtEmu();
+    bool I64DivRem = isI64DivRem();
 
     Module* M = m_pCtx->getModule();
     for (auto FI = M->begin(), FE = M->end(); FI != FE; ++FI)
@@ -2589,6 +2731,15 @@ void PreCompiledFuncImport::checkAndSetEnableSubroutine()
                     m_enableCallForEmulation = true;
                 }
                 break;
+            case Instruction::UDiv:
+            case Instruction::URem:
+            case Instruction::SDiv:
+            case Instruction::SRem:
+                if (I64DivRem && I->getOperand(0)->getType()->isIntegerTy(64))
+                {
+                    m_enableCallForEmulation = true;
+                }
+                break;
             }
 
             GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(I);
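
The ImportedFunction helpers added above (ImportedFunction::copy and createInlinedCopy) clone an emulation function with llvm::CloneFunction, mark the clone alwaysinline, and move the first n call sites over to it so that only those calls get inlined. Below is a minimal standalone sketch of that call-redirection pattern, assuming plain LLVM APIs; it is simplified relative to the commit, which constructs a fresh CallInst for each call and copies the calling convention, attributes, and debug location rather than just swapping the callee.

// Illustrative sketch only: clone a function, mark the clone alwaysinline,
// and redirect the first N direct call sites to it. Simplified relative to
// the commit's createInlinedCopy.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Cloning.h"

using namespace llvm;

static Function* redirectFirstNCallsToInlinedClone(Function* F, unsigned N)
{
    // Clone into the same module; VMap records old->new value mappings.
    ValueToValueMapTy VMap;
    Function* Clone = CloneFunction(F, VMap);
    Clone->setName(F->getName() + "_always_inline");
    Clone->addFnAttr(Attribute::AlwaysInline);

    // Collect up to N direct call sites of the original function.
    SmallVector<CallInst*, 8> Calls;
    for (User* U : F->users())
    {
        if (Calls.size() >= N)
            break;
        if (auto* CI = dyn_cast<CallInst>(U))
            if (CI->getCalledFunction() == F)
                Calls.push_back(CI);
    }

    // Repoint the collected calls at the always-inline clone; the remaining
    // call sites keep calling the original (non-inlined) function.
    for (CallInst* CI : Calls)
        CI->setCalledFunction(Clone);

    return Clone;
}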
