Skip to content

Commit c8e1ef7

Browse files
pkwasnie-intel authored and igcbot committed
Revert "precompiled emulation inlining improvements"
This reverts commit fb42ffb.
1 parent 7c5f3d2 commit c8e1ef7

File tree

6 files changed

+66
-368
lines changed

6 files changed

+66
-368
lines changed

IGC/Compiler/Optimizer/PreCompiledFuncImport.cpp

Lines changed: 65 additions & 216 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,6 @@ SPDX-License-Identifier: MIT
1717
#include "llvm/IR/InstIterator.h"
1818
#include "llvm/Support/MemoryBuffer.h"
1919
#include "llvm/Support/GenericDomTree.h"
20-
#include "llvm/Transforms/Utils/Cloning.h"
2120
#include "llvm/Bitcode/BitcodeReader.h"
2221
#include "llvm/Bitcode/BitcodeWriter.h"
2322
#include "llvm/Linker/Linker.h"
@@ -633,16 +632,11 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
633632
m_changed = false;
634633

635634
// When we test it, we need to set emuKind
636-
if (IGC_GET_FLAG_VALUE(TestIGCPreCompiledFunctions) == 1)
635+
if (IGC_IS_FLAG_ENABLED(TestIGCPreCompiledFunctions))
637636
{
638637
m_emuKind = EmuKind::EMU_DP;
639638
checkAndSetEnableSubroutine();
640639
}
641-
else if (IGC_GET_FLAG_VALUE(TestIGCPreCompiledFunctions) == 2)
642-
{
643-
m_emuKind = EmuKind::EMU_DP_DIV_SQRT;
644-
checkAndSetEnableSubroutine();
645-
}
646640
// sanity check
647641
if (m_emuKind == 0) {
648642
// Nothing to emulate
@@ -832,11 +826,12 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
832826
}
833827
}
834828

835-
llvm::SmallVector<ImportedFunction, 32> importedFunctions;
836-
unsigned totalNumberOfInlinedInst = 0, totalNumberOfPotentiallyInlinedInst = 0;
829+
unsigned totalNumberOfInlinedInst = 0;
837830
int emuFC = (int)IGC_GET_FLAG_VALUE(EmulationFunctionControl);
838831

839-
// Post processing, set those imported functions as internal linkage.
832+
// Post processing, set those imported functions as internal linkage
833+
// and alwaysinline. Also count how many instructions would be added
834+
// to the shader if inlining occurred.
840835
for (auto II = M.begin(), IE = M.end(); II != IE; )
841836
{
842837
Function* Func = &(*II);
@@ -858,106 +853,92 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
858853
continue;
859854
}
860855

861-
if (std::find(importedFunctions.begin(), importedFunctions.end(), Func) == importedFunctions.end())
862-
importedFunctions.push_back(Func);
863-
}
864-
else
865-
{
866-
// Make sure original func isn't inlined accidentally.
856+
// Remove noinline/AlwaysInline attr if present.
857+
Func->removeFnAttr(llvm::Attribute::NoInline);
867858
Func->removeFnAttr(llvm::Attribute::AlwaysInline);
868-
}
869-
}
870-
871-
// Sort imported instructions in preferred inlining order.
872-
std::sort(importedFunctions.begin(), importedFunctions.end(), ImportedFunction::compare);
873859

874-
// Post processing, set those imported functions as alwaysinline.
875-
// Also count how many instructions would be added to the shader
876-
// if inlining occurred.
877-
for (auto II = importedFunctions.begin(), IE = importedFunctions.end(); II != IE; ++II)
878-
{
879-
Function* Func = II->F;
860+
if (m_enableCallForEmulation &&
861+
emuFC != FLAG_FCALL_DEFAULT &&
862+
emuFC != FLAG_FCALL_FORCE_INLINE)
863+
{
864+
// Disable inlining completely.
865+
continue;
866+
}
880867

881-
// Remove noinline/AlwaysInline attr if present.
882-
Func->removeFnAttr(llvm::Attribute::NoInline);
883-
Func->removeFnAttr(llvm::Attribute::AlwaysInline);
868+
if (Func->hasOneUse() || emuFC == FLAG_FCALL_FORCE_INLINE)
869+
{
870+
Func->addFnAttr(llvm::Attribute::AlwaysInline);
871+
continue;
872+
}
884873

885-
if (m_enableCallForEmulation &&
886-
emuFC != FLAG_FCALL_DEFAULT &&
887-
emuFC != FLAG_FCALL_FORCE_INLINE)
888-
{
889-
// Disable inlining completely.
890-
continue;
891-
}
874+
// Count number of instructions in the function
875+
unsigned NumInst = 0;
876+
for (BasicBlock& BB : Func->getBasicBlockList()) {
877+
NumInst += BB.getInstList().size();
878+
}
892879

893-
if (Func->hasOneUse() || emuFC == FLAG_FCALL_FORCE_INLINE)
894-
{
895-
Func->addFnAttr(llvm::Attribute::AlwaysInline);
896-
continue;
897-
}
880+
// Don't want to subroutine small functions
881+
if (NumInst <= 5)
882+
{
883+
// Add AlwaysInline attribute to force inlining all calls.
884+
Func->addFnAttr(llvm::Attribute::AlwaysInline);
898885

899-
// Don't want to subroutine small functions
900-
if (II->funcInstructions <= 5)
901-
{
902-
// Add AlwaysInline attribute to force inlining all calls.
903-
Func->addFnAttr(llvm::Attribute::AlwaysInline);
886+
continue;
887+
}
904888

905-
continue;
889+
totalNumberOfInlinedInst += NumInst * Func->getNumUses();
906890
}
907-
908-
// Don't inline original slow DP emu functions, they are only for passing
909-
// conformance, not for perf.
910-
if (isDPEmu() && II->isSlowDPEmuFunc())
911-
continue;
912-
913-
totalNumberOfPotentiallyInlinedInst += II->totalInstructions;
914-
915-
// If function fits in threshold, always inline.
916-
if (totalNumberOfInlinedInst + II->totalInstructions <= (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold))
891+
else
917892
{
918-
totalNumberOfInlinedInst += II->totalInstructions;
919-
Func->addFnAttr(llvm::Attribute::AlwaysInline);
893+
// Make sure original func isn't inlined accidentally.
894+
Func->removeFnAttr(llvm::Attribute::AlwaysInline);
920895
}
921896
}
922897

923-
// Check if more functions can fit in threshold if they would be split into inline/noinline copies.
924-
if (m_enableCallForEmulation && emuFC == FLAG_FCALL_DEFAULT && totalNumberOfInlinedInst < (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold))
925-
{
926-
for (auto II = importedFunctions.begin(); II != importedFunctions.end(); ++II)
927-
{
928-
Function* Func = II->F;
929-
930-
if (Func->hasFnAttribute(llvm::Attribute::AlwaysInline) || (isDPEmu() && II->isSlowDPEmuFunc()))
931-
continue;
932-
933-
unsigned calls = ((unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold) - totalNumberOfInlinedInst) / II->funcInstructions;
934-
if (calls > 0)
935-
{
936-
// Split function into inline/no-inline copies.
937-
ImportedFunction copy = createInlinedCopy(*II, calls);
938-
importedFunctions.push_back(copy);
939-
totalNumberOfInlinedInst += copy.totalInstructions;
940-
}
898+
// If true, it is a slow version of DP emu functions. Those functions
899+
// are the original ones for just passing conformance, not for perf.
900+
auto isSlowDPEmuFunc = [](Function* F) {
901+
StringRef FN = F->getName();
902+
if (FN.equals("__igcbuiltin_dp_add") ||
903+
FN.equals("__igcbuiltin_dp_sub") ||
904+
FN.equals("__igcbuiltin_dp_fma") ||
905+
FN.equals("__igcbuiltin_dp_mul") ||
906+
FN.equals("__igcbuiltin_dp_div") ||
907+
FN.equals("__igcbuiltin_dp_cmp") ||
908+
FN.equals("__igcbuiltin_dp_to_int32") ||
909+
FN.equals("__igcbuiltin_dp_to_uint32") ||
910+
FN.equals("__igcbuiltin_int32_to_dp") ||
911+
FN.equals("__igcbuiltin_uint32_to_dp") ||
912+
FN.equals("__igcbuiltin_dp_to_sp") ||
913+
FN.equals("__igcbuiltin_sp_to_dp") ||
914+
FN.equals("__igcbuiltin_dp_sqrt")) {
915+
return true;
941916
}
942-
}
917+
return false;
918+
};
943919

944-
for (auto II = importedFunctions.begin(), IE = importedFunctions.end(); II != IE; ++II)
920+
for (auto II = M.begin(), IE = M.end(); II != IE; )
945921
{
946-
Function* Func = II->F;
922+
Function* Func = &(*II);
923+
++II;
924+
if (!Func || Func->isDeclaration())
925+
{
926+
continue;
927+
}
947928

948-
if (!Func->hasFnAttribute(llvm::Attribute::AlwaysInline))
929+
if (!origFunctions.count(Func) && !Func->hasFnAttribute(llvm::Attribute::AlwaysInline))
949930
{
950931
// Special handling of DP functions: any one that has not been marked as inline
951932
// at this point, it will be either subroutine or stackcall.
952-
const bool isDPCallFunc = (isDPEmu() && II->isSlowDPEmuFunc());
933+
const bool isDPCallFunc = (isDPEmu() && isSlowDPEmuFunc(Func));
953934

954935
// Use subroutine/stackcall for some DP emulation functions if
955936
// EmulationFunctionControl is set so, or
956937
// use subroutines if total number of instructions added when
957938
// all emulated functions are inlined exceed InlinedEmulationThreshold.
958939
// If Func is a slow version of DP emu func, perf isn't important.
959940
if (m_enableCallForEmulation &&
960-
(totalNumberOfPotentiallyInlinedInst > (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold) ||
941+
(totalNumberOfInlinedInst > (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold) ||
961942
isDPCallFunc))
962943
{
963944
Func->addFnAttr(llvm::Attribute::NoInline);
@@ -1022,128 +1003,6 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
10221003
return m_changed;
10231004
}
10241005

1025-
PreCompiledFuncImport::ImportedFunction::ImportedFunction(Function* F)
1026-
: F(F), type(EmuType::OTHER), funcInstructions(0), totalInstructions(0)
1027-
{
1028-
// Count number of new instructions added by inlining.
1029-
for (BasicBlock& BB : F->getBasicBlockList())
1030-
funcInstructions += BB.getInstList().size();
1031-
1032-
updateUses();
1033-
1034-
// Get type of imported function.
1035-
StringRef name = F->getName();
1036-
1037-
if (name.equals("__igcbuiltin_dp_div_nomadm_ieee") ||
1038-
name.equals("__igcbuiltin_dp_div_nomadm_fast") ||
1039-
name.equals("__igcbuiltin_dp_sqrt_nomadm_ieee") ||
1040-
name.equals("__igcbuiltin_dp_sqrt_nomadm_fast"))
1041-
{
1042-
type = EmuType::FASTDP;
1043-
}
1044-
else if (name.equals("__igcbuiltin_dp_add") ||
1045-
name.equals("__igcbuiltin_dp_sub") ||
1046-
name.equals("__igcbuiltin_dp_fma") ||
1047-
name.equals("__igcbuiltin_dp_mul") ||
1048-
name.equals("__igcbuiltin_dp_div") ||
1049-
name.equals("__igcbuiltin_dp_cmp") ||
1050-
name.equals("__igcbuiltin_dp_to_int32") ||
1051-
name.equals("__igcbuiltin_dp_to_uint32") ||
1052-
name.equals("__igcbuiltin_int32_to_dp") ||
1053-
name.equals("__igcbuiltin_uint32_to_dp") ||
1054-
name.equals("__igcbuiltin_dp_to_sp") ||
1055-
name.equals("__igcbuiltin_sp_to_dp") ||
1056-
name.equals("__igcbuiltin_dp_sqrt"))
1057-
{
1058-
// If true, it is a slow version of DP emu functions. Those functions
1059-
// are the original ones for just passing conformance, not for perf.
1060-
type = EmuType::SLOWDP;
1061-
}
1062-
else
1063-
{
1064-
for (int i = 0; i < NUM_FUNCTIONS && type == EmuType::OTHER; ++i)
1065-
{
1066-
for (int j = 0; j < NUM_TYPES && type == EmuType::OTHER; ++j)
1067-
{
1068-
if (name.equals(m_Int64SpDivRemFunctionNames[i][j]) ||
1069-
name.equals(m_Int64DpDivRemFunctionNames[i][j]))
1070-
{
1071-
type = EmuType::INT64;
1072-
}
1073-
}
1074-
}
1075-
}
1076-
}
1077-
1078-
void PreCompiledFuncImport::ImportedFunction::updateUses()
1079-
{
1080-
totalInstructions = funcInstructions * F->getNumUses();
1081-
}
1082-
1083-
PreCompiledFuncImport::ImportedFunction PreCompiledFuncImport::ImportedFunction::copy(ImportedFunction& other)
1084-
{
1085-
ValueToValueMapTy VM;
1086-
Function* copy = CloneFunction(other.F, VM);
1087-
return PreCompiledFuncImport::ImportedFunction(copy, other.type, other.funcInstructions, 0);
1088-
}
1089-
1090-
// Compare two imported functions in order preferred for inlining.
1091-
bool PreCompiledFuncImport::ImportedFunction::compare(ImportedFunction& L, ImportedFunction& R)
1092-
{
1093-
// First sort by preferred type of emulation.
1094-
if (L.type != R.type)
1095-
return L.type < R.type;
1096-
1097-
// Then sort by number of inlined instructions.
1098-
return L.totalInstructions < R.totalInstructions;
1099-
};
1100-
1101-
PreCompiledFuncImport::ImportedFunction PreCompiledFuncImport::createInlinedCopy(ImportedFunction& IF, unsigned n)
1102-
{
1103-
std::vector<CallInst*> toDelete;
1104-
1105-
// Make copy that is always inlined.
1106-
ImportedFunction copy = ImportedFunction::copy(IF);
1107-
copy.F->setName(IF.F->getName() + "_always_inline");
1108-
copy.F->addFnAttr(llvm::Attribute::AlwaysInline);
1109-
1110-
// Collect first n calls to replace with copy.
1111-
llvm::SmallVector<CallInst*, 8> calls;
1112-
auto it = IF.F->user_begin();
1113-
for (unsigned i = 0; i < n; ++i)
1114-
{
1115-
CallInst* oldCall = dyn_cast<CallInst>(*(it++));
1116-
IGC_ASSERT(oldCall);
1117-
calls.push_back(oldCall);
1118-
}
1119-
1120-
// Replace with always inlined copy.
1121-
for (CallInst* oldCall : calls)
1122-
{
1123-
std::vector<Value*> args;
1124-
for (unsigned arg = 0; arg < IGCLLVM::getNumArgOperands(oldCall); ++arg)
1125-
args.push_back(oldCall->getArgOperand(arg));
1126-
1127-
// Create new call and insert it before old one
1128-
CallInst* newCall = CallInst::Create(copy.F, args, "", oldCall);
1129-
1130-
newCall->setCallingConv(copy.F->getCallingConv());
1131-
newCall->setAttributes(oldCall->getAttributes());
1132-
newCall->setDebugLoc(oldCall->getDebugLoc());
1133-
1134-
oldCall->replaceAllUsesWith(newCall);
1135-
toDelete.push_back(oldCall);
1136-
}
1137-
1138-
for (auto C : toDelete)
1139-
C->eraseFromParent();
1140-
1141-
copy.updateUses();
1142-
IF.updateUses();
1143-
1144-
return copy;
1145-
}
1146-
11471006
void PreCompiledFuncImport::visitBinaryOperator(BinaryOperator& I)
11481007
{
11491008
if (I.getOperand(0)->getType()->isIntOrIntVectorTy())
@@ -2688,7 +2547,6 @@ void PreCompiledFuncImport::checkAndSetEnableSubroutine()
26882547
bool SPDiv = isSPDiv();
26892548
bool DPEmu = isDPEmu();
26902549
bool DPDivSqrtEmu = isDPDivSqrtEmu();
2691-
bool I64DivRem = isI64DivRem();
26922550

26932551
Module* M = m_pCtx->getModule();
26942552
for (auto FI = M->begin(), FE = M->end(); FI != FE; ++FI)
@@ -2731,15 +2589,6 @@ void PreCompiledFuncImport::checkAndSetEnableSubroutine()
27312589
m_enableCallForEmulation = true;
27322590
}
27332591
break;
2734-
case Instruction::UDiv:
2735-
case Instruction::URem:
2736-
case Instruction::SDiv:
2737-
case Instruction::SRem:
2738-
if (I64DivRem && I->getOperand(0)->getType()->isIntegerTy(64))
2739-
{
2740-
m_enableCallForEmulation = true;
2741-
}
2742-
break;
27432592
}
27442593

27452594
GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(I);

0 commit comments

Comments
 (0)