@@ -17,6 +17,7 @@ SPDX-License-Identifier: MIT
 #include "llvm/IR/InstIterator.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/GenericDomTree.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/Linker/Linker.h"
@@ -632,11 +633,16 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
     m_changed = false;

     // When we test it, we need to set emuKind
-    if (IGC_IS_FLAG_ENABLED(TestIGCPreCompiledFunctions))
+    if (IGC_GET_FLAG_VALUE(TestIGCPreCompiledFunctions) == 1)
     {
         m_emuKind = EmuKind::EMU_DP;
         checkAndSetEnableSubroutine();
     }
+    else if (IGC_GET_FLAG_VALUE(TestIGCPreCompiledFunctions) == 2)
+    {
+        m_emuKind = EmuKind::EMU_DP_DIV_SQRT;
+        checkAndSetEnableSubroutine();
+    }

     // sanity check
     if (m_emuKind == 0) {
         // Nothing to emulate
@@ -826,12 +832,11 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
         }
     }

-    unsigned totalNumberOfInlinedInst = 0;
+    llvm::SmallVector<ImportedFunction, 32> importedFunctions;
+    unsigned totalNumberOfInlinedInst = 0, totalNumberOfPotentiallyInlinedInst = 0;
     int emuFC = (int)IGC_GET_FLAG_VALUE(EmulationFunctionControl);

-    // Post processing, set those imported functions as internal linkage
-    // and alwaysinline. Also count how many instructions would be added
-    // to the shader if inlining occurred.
+    // Post processing, set those imported functions as internal linkage.
     for (auto II = M.begin(), IE = M.end(); II != IE; )
     {
         Function* Func = &(*II);
@@ -853,92 +858,106 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
                 continue;
             }

-            // Remove noinline/AlwaysInline attr if present.
-            Func->removeFnAttr(llvm::Attribute::NoInline);
+            if (std::find(importedFunctions.begin(), importedFunctions.end(), Func) == importedFunctions.end())
+                importedFunctions.push_back(Func);
+        }
+        else
+        {
+            // Make sure original func isn't inlined accidentally.
             Func->removeFnAttr(llvm::Attribute::AlwaysInline);
+        }
+    }

-            if (m_enableCallForEmulation &&
-                emuFC != FLAG_FCALL_DEFAULT &&
-                emuFC != FLAG_FCALL_FORCE_INLINE)
-            {
-                // Disable inlining completely.
-                continue;
-            }
-
-            if (Func->hasOneUse() || emuFC == FLAG_FCALL_FORCE_INLINE)
-            {
-                Func->addFnAttr(llvm::Attribute::AlwaysInline);
-                continue;
-            }
+    // Sort imported instructions in preferred inlining order.
+    std::sort(importedFunctions.begin(), importedFunctions.end(), ImportedFunction::compare);

-            // Count number of instructions in the function
-            unsigned NumInst = 0;
-            for (BasicBlock& BB : Func->getBasicBlockList()) {
-                NumInst += BB.getInstList().size();
-            }
+    // Post processing, set those imported functions as alwaysinline.
+    // Also count how many instructions would be added to the shader
+    // if inlining occurred.
+    for (auto II = importedFunctions.begin(), IE = importedFunctions.end(); II != IE; ++II)
+    {
+        Function* Func = II->F;

-            // Don't want to subroutine small functions
-            if (NumInst <= 5)
-            {
-                // Add AlwaysInline attribute to force inlining all calls.
-                Func->addFnAttr(llvm::Attribute::AlwaysInline);
+        // Remove noinline/AlwaysInline attr if present.
+        Func->removeFnAttr(llvm::Attribute::NoInline);
+        Func->removeFnAttr(llvm::Attribute::AlwaysInline);

-                continue;
-            }
+        if (m_enableCallForEmulation &&
+            emuFC != FLAG_FCALL_DEFAULT &&
+            emuFC != FLAG_FCALL_FORCE_INLINE)
+        {
+            // Disable inlining completely.
+            continue;
+        }

-            totalNumberOfInlinedInst += NumInst * Func->getNumUses();
+        if (Func->hasOneUse() || emuFC == FLAG_FCALL_FORCE_INLINE)
+        {
+            Func->addFnAttr(llvm::Attribute::AlwaysInline);
+            continue;
         }
-        else
+
+        // Don't want to subroutine small functions
+        if (II->funcInstructions <= 5)
         {
-            // Make sure original func isn't inlined accidentally.
-            Func->removeFnAttr(llvm::Attribute::AlwaysInline);
+            // Add AlwaysInline attribute to force inlining all calls.
+            Func->addFnAttr(llvm::Attribute::AlwaysInline);
+
+            continue;
         }
-    }

-    // If true, it is a slow version of DP emu functions. Those functions
-    // are the original ones for just passing conformance, not for perf.
-    auto isSlowDPEmuFunc = [](Function* F) {
-        StringRef FN = F->getName();
-        if (FN.equals("__igcbuiltin_dp_add") ||
-            FN.equals("__igcbuiltin_dp_sub") ||
-            FN.equals("__igcbuiltin_dp_fma") ||
-            FN.equals("__igcbuiltin_dp_mul") ||
-            FN.equals("__igcbuiltin_dp_div") ||
-            FN.equals("__igcbuiltin_dp_cmp") ||
-            FN.equals("__igcbuiltin_dp_to_int32") ||
-            FN.equals("__igcbuiltin_dp_to_uint32") ||
-            FN.equals("__igcbuiltin_int32_to_dp") ||
-            FN.equals("__igcbuiltin_uint32_to_dp") ||
-            FN.equals("__igcbuiltin_dp_to_sp") ||
-            FN.equals("__igcbuiltin_sp_to_dp") ||
-            FN.equals("__igcbuiltin_dp_sqrt")) {
-            return true;
+        // Don't inline original slow DP emu functions, they are only for passing
+        // conformance, not for perf.
+        if (isDPEmu() && II->isSlowDPEmuFunc())
+            continue;
+
+        totalNumberOfPotentiallyInlinedInst += II->totalInstructions;
+
+        // If function fits in threshold, always inline.
+        if (totalNumberOfInlinedInst + II->totalInstructions <= (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold))
+        {
+            totalNumberOfInlinedInst += II->totalInstructions;
+            Func->addFnAttr(llvm::Attribute::AlwaysInline);
         }
-        return false;
-    };
+    }

-    for (auto II = M.begin(), IE = M.end(); II != IE; )
+    // Check if more functions can fit in threshold if they would be split into inline/noinline copies.
+    if (m_enableCallForEmulation && emuFC == FLAG_FCALL_DEFAULT && totalNumberOfInlinedInst < (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold))
     {
-        Function* Func = &(*II);
-        ++II;
-        if (!Func || Func->isDeclaration())
+        for (auto II = importedFunctions.begin(); II != importedFunctions.end(); ++II)
         {
-            continue;
+            Function* Func = II->F;
+
+            if (Func->hasFnAttribute(llvm::Attribute::AlwaysInline) || (isDPEmu() && II->isSlowDPEmuFunc()))
+                continue;
+
+            unsigned calls = ((unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold) - totalNumberOfInlinedInst) / II->funcInstructions;
+            if (calls > 0)
+            {
+                // Split function into inline/no-inline copies.
+                ImportedFunction copy = createInlinedCopy(*II, calls);
+                importedFunctions.push_back(copy);
+                totalNumberOfInlinedInst += copy.totalInstructions;
+            }
         }
+    }
+
+    for (auto II = importedFunctions.begin(), IE = importedFunctions.end(); II != IE; ++II)
+    {
+        Function* Func = II->F;

-        if (!origFunctions.count(Func) && !Func->hasFnAttribute(llvm::Attribute::AlwaysInline))
+        if (!Func->hasFnAttribute(llvm::Attribute::AlwaysInline))
         {
             // Special handling of DP functions: any one that has not been marked as inline
             // at this point, it will be either subroutine or stackcall.
-            const bool isDPCallFunc = (isDPEmu() && isSlowDPEmuFunc(Func));
+            const bool isDPCallFunc = (isDPEmu() && II->isSlowDPEmuFunc());

             // Use subroutine/stackcall for some DP emulation functions if
             // EmulationFunctionControl is set so, or
             // use subroutines if total number of instructions added when
             // all emulated functions are inlined exceed InlinedEmulationThreshold.
             // If Func is a slow version of DP emu func, perf isn't important.
             if (m_enableCallForEmulation &&
-                (totalNumberOfInlinedInst > (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold) ||
+                (totalNumberOfPotentiallyInlinedInst > (unsigned)IGC_GET_FLAG_VALUE(InlinedEmulationThreshold) ||
                 isDPCallFunc))
             {
                 Func->addFnAttr(llvm::Attribute::NoInline);
@@ -1003,6 +1022,128 @@ bool PreCompiledFuncImport::runOnModule(Module& M)
     return m_changed;
 }

+PreCompiledFuncImport::ImportedFunction::ImportedFunction(Function* F)
+    : F(F), type(EmuType::OTHER), funcInstructions(0), totalInstructions(0)
+{
+    // Count number of new instructions added by inlining.
+    for (BasicBlock& BB : F->getBasicBlockList())
+        funcInstructions += BB.getInstList().size();
+
+    updateUses();
+
+    // Get type of imported function.
+    StringRef name = F->getName();
+
+    if (name.equals("__igcbuiltin_dp_div_nomadm_ieee") ||
+        name.equals("__igcbuiltin_dp_div_nomadm_fast") ||
+        name.equals("__igcbuiltin_dp_sqrt_nomadm_ieee") ||
+        name.equals("__igcbuiltin_dp_sqrt_nomadm_fast"))
+    {
+        type = EmuType::FASTDP;
+    }
+    else if (name.equals("__igcbuiltin_dp_add") ||
+        name.equals("__igcbuiltin_dp_sub") ||
+        name.equals("__igcbuiltin_dp_fma") ||
+        name.equals("__igcbuiltin_dp_mul") ||
+        name.equals("__igcbuiltin_dp_div") ||
+        name.equals("__igcbuiltin_dp_cmp") ||
+        name.equals("__igcbuiltin_dp_to_int32") ||
+        name.equals("__igcbuiltin_dp_to_uint32") ||
+        name.equals("__igcbuiltin_int32_to_dp") ||
+        name.equals("__igcbuiltin_uint32_to_dp") ||
+        name.equals("__igcbuiltin_dp_to_sp") ||
+        name.equals("__igcbuiltin_sp_to_dp") ||
+        name.equals("__igcbuiltin_dp_sqrt"))
+    {
+        // If true, it is a slow version of DP emu functions. Those functions
+        // are the original ones for just passing conformance, not for perf.
+        type = EmuType::SLOWDP;
+    }
+    else
+    {
+        for (int i = 0; i < NUM_FUNCTIONS && type == EmuType::OTHER; ++i)
+        {
+            for (int j = 0; j < NUM_TYPES && type == EmuType::OTHER; ++j)
+            {
+                if (name.equals(m_Int64SpDivRemFunctionNames[i][j]) ||
+                    name.equals(m_Int64DpDivRemFunctionNames[i][j]))
+                {
+                    type = EmuType::INT64;
+                }
+            }
+        }
+    }
+}
+
+void PreCompiledFuncImport::ImportedFunction::updateUses()
+{
+    totalInstructions = funcInstructions * F->getNumUses();
+}
+
+PreCompiledFuncImport::ImportedFunction PreCompiledFuncImport::ImportedFunction::copy(ImportedFunction& other)
+{
+    ValueToValueMapTy VM;
+    Function* copy = CloneFunction(other.F, VM);
+    return PreCompiledFuncImport::ImportedFunction(copy, other.type, other.funcInstructions, 0);
+}
+
+// Compare two imported functions in order preferred for inlining.
+bool PreCompiledFuncImport::ImportedFunction::compare(ImportedFunction& L, ImportedFunction& R)
+{
+    // First sort by preferred type of emulation.
+    if (L.type != R.type)
+        return L.type < R.type;
+
+    // Then sort by number of inlined instructions.
+    return L.totalInstructions < R.totalInstructions;
+};
+
+PreCompiledFuncImport::ImportedFunction PreCompiledFuncImport::createInlinedCopy(ImportedFunction& IF, unsigned n)
+{
+    std::vector<CallInst*> toDelete;
+
+    // Make copy that is always inlined.
+    ImportedFunction copy = ImportedFunction::copy(IF);
+    copy.F->setName(IF.F->getName() + "_always_inline");
+    copy.F->addFnAttr(llvm::Attribute::AlwaysInline);
+
+    // Collect first n calls to replace with copy.
+    llvm::SmallVector<CallInst*, 8> calls;
+    auto it = IF.F->user_begin();
+    for (unsigned i = 0; i < n; ++i)
+    {
+        CallInst* oldCall = dyn_cast<CallInst>(*(it++));
+        IGC_ASSERT(oldCall);
+        calls.push_back(oldCall);
+    }
+
+    // Replace with always inlined copy.
+    for (CallInst* oldCall : calls)
+    {
+        std::vector<Value*> args;
+        for (unsigned arg = 0; arg < IGCLLVM::getNumArgOperands(oldCall); ++arg)
+            args.push_back(oldCall->getArgOperand(arg));
+
+        // Create new call and insert it before old one
+        CallInst* newCall = CallInst::Create(copy.F, args, "", oldCall);
+
+        newCall->setCallingConv(copy.F->getCallingConv());
+        newCall->setAttributes(oldCall->getAttributes());
+        newCall->setDebugLoc(oldCall->getDebugLoc());
+
+        oldCall->replaceAllUsesWith(newCall);
+        toDelete.push_back(oldCall);
+    }
+
+    for (auto C : toDelete)
+        C->eraseFromParent();
+
+    copy.updateUses();
+    IF.updateUses();
+
+    return copy;
+}
+
 void PreCompiledFuncImport::visitBinaryOperator(BinaryOperator& I)
 {
     if (I.getOperand(0)->getType()->isIntOrIntVectorTy())
@@ -2547,6 +2688,7 @@ void PreCompiledFuncImport::checkAndSetEnableSubroutine()
     bool SPDiv = isSPDiv();
     bool DPEmu = isDPEmu();
     bool DPDivSqrtEmu = isDPDivSqrtEmu();
+    bool I64DivRem = isI64DivRem();

     Module* M = m_pCtx->getModule();
     for (auto FI = M->begin(), FE = M->end(); FI != FE; ++FI)
@@ -2589,6 +2731,15 @@
                     m_enableCallForEmulation = true;
                 }
                 break;
+            case Instruction::UDiv:
+            case Instruction::URem:
+            case Instruction::SDiv:
+            case Instruction::SRem:
+                if (I64DivRem && I->getOperand(0)->getType()->isIntegerTy(64))
+                {
+                    m_enableCallForEmulation = true;
+                }
+                break;
             }

             GenIntrinsicInst* GII = dyn_cast<GenIntrinsicInst>(I);
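The split heuristic introduced in this commit reduces to a simple budget computation: given the remaining instruction budget under InlinedEmulationThreshold, the pass inlines as many call sites of an emulation function as still fit, via an "_always_inline" clone, and leaves the rest calling the no-inline original. The standalone sketch below illustrates that arithmetic with made-up numbers; the names and values are illustrative only, the real pass reads the budget from the InlinedEmulationThreshold flag and tracks sizes through ImportedFunction.

// Hypothetical, self-contained sketch of the inline/no-inline budget math.
// All values here are examples; they do not come from the commit.
#include <cstdio>

int main()
{
    unsigned threshold = 1024;       // stand-in for InlinedEmulationThreshold
    unsigned inlinedSoFar = 900;     // instructions already committed to inlining
    unsigned funcInstructions = 40;  // size of one emulation function
    unsigned uses = 10;              // number of call sites of that function

    // Inlining every call would need 900 + 40 * 10 = 1300 instructions, over budget.
    // Number of call sites that still fit in the remaining budget:
    unsigned calls = (threshold - inlinedSoFar) / funcInstructions;  // (1024 - 900) / 40 = 3

    // Those 3 call sites would be redirected to an always-inline clone;
    // the remaining 7 keep calling the no-inline original (subroutine/stackcall).
    printf("inline %u of %u calls\n", calls, uses);
    return 0;
}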