@@ -178,6 +178,12 @@ static cl::opt<bool>
178
178
cl::desc (" Salvage stale MemProf profile" ),
179
179
cl::init(false ), cl::Hidden);
180
180
181
+ static cl::opt<bool > ClMemProfAttachCalleeGuids (
182
+ " memprof-attach-calleeguids" ,
183
+ cl::desc (
184
+ " Attach calleeguids as value profile metadata for indirect calls." ),
185
+ cl::init(true ), cl::Hidden);
186
+
181
187
extern cl::opt<bool > MemProfReportHintedSizes;
182
188
extern cl::opt<unsigned > MinClonedColdBytePercent;
183
189
extern cl::opt<unsigned > MinCallsiteColdBytePercent;
@@ -952,6 +958,46 @@ undriftMemProfRecord(const DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
952
958
UndriftCallStack (CS.Frames );
953
959
}
954
960
961
+ // Helper function to process CalleeGuids and create value profile metadata
962
+ static void addVPMetadata (Module &M, Instruction &I,
963
+ ArrayRef<GlobalValue::GUID> CalleeGuids) {
964
+ if (!ClMemProfAttachCalleeGuids || CalleeGuids.empty ())
965
+ return ;
966
+
967
+ if (I.getMetadata (LLVMContext::MD_prof)) {
968
+ uint64_t Unused;
969
+ // TODO: When merging is implemented, increase this to a typical ICP value
970
+ // (e.g., 3-6) For now, we only need to check if existing data exists, so 1
971
+ // is sufficient
972
+ auto ExistingVD = getValueProfDataFromInst (I, IPVK_IndirectCallTarget,
973
+ /* MaxNumValueData=*/ 1 , Unused);
974
+ // We don't know how to merge value profile data yet.
975
+ if (!ExistingVD.empty ()) {
976
+ return ;
977
+ }
978
+ }
979
+
980
+ SmallVector<InstrProfValueData, 4 > VDs;
981
+ uint64_t TotalCount = 0 ;
982
+
983
+ for (const GlobalValue::GUID CalleeGUID : CalleeGuids) {
984
+ InstrProfValueData VD;
985
+ VD.Value = CalleeGUID;
986
+ // For MemProf, we don't have actual call counts, so we assign
987
+ // a weight of 1 to each potential target.
988
+ // TODO: Consider making this weight configurable or increasing it to
989
+ // improve effectiveness for ICP.
990
+ VD.Count = 1 ;
991
+ VDs.push_back (VD);
992
+ TotalCount += VD.Count ;
993
+ }
994
+
995
+ if (!VDs.empty ()) {
996
+ annotateValueSite (M, I, VDs, TotalCount, IPVK_IndirectCallTarget,
997
+ VDs.size ());
998
+ }
999
+ }
1000
+
955
1001
static void
956
1002
readMemprof (Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
957
1003
const TargetLibraryInfo &TLI,
@@ -1020,15 +1066,35 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
1020
1066
// Build maps of the location hash to all profile data with that leaf location
1021
1067
// (allocation info and the callsites).
1022
1068
std::map<uint64_t , std::set<const AllocationInfo *>> LocHashToAllocInfo;
1023
- // A hash function for std::unordered_set<ArrayRef<Frame>> to work.
1024
- struct CallStackHash {
1025
- size_t operator ()(ArrayRef<Frame> CS) const {
1026
- return computeFullStackId (CS);
1069
+
1070
+ // Helper struct for maintaining refs to callsite data. As an alternative we
1071
+ // could store a pointer to the CallSiteInfo struct but we also need the frame
1072
+ // index. Using ArrayRefs instead makes it a little easier to read.
1073
+ struct CallSiteEntry {
1074
+ // Subset of frames for the corresponding CallSiteInfo.
1075
+ ArrayRef<Frame> Frames;
1076
+ // Potential targets for indirect calls.
1077
+ ArrayRef<GlobalValue::GUID> CalleeGuids;
1078
+
1079
+ // Only compare Frame contents.
1080
+ // Use pointer-based equality instead of ArrayRef's operator== which does
1081
+ // element-wise comparison. We want to check if it's the same slice of the
1082
+ // underlying array, not just equivalent content.
1083
+ bool operator ==(const CallSiteEntry &Other) const {
1084
+ return Frames.data () == Other.Frames .data () &&
1085
+ Frames.size () == Other.Frames .size ();
1086
+ }
1087
+ };
1088
+
1089
+ struct CallSiteEntryHash {
1090
+ size_t operator ()(const CallSiteEntry &Entry) const {
1091
+ return computeFullStackId (Entry.Frames );
1027
1092
}
1028
1093
};
1094
+
1029
1095
// For the callsites we need to record slices of the frame array (see comments
1030
- // below where the map entries are added).
1031
- std::map<uint64_t , std::unordered_set<ArrayRef<Frame>, CallStackHash >>
1096
+ // below where the map entries are added) along with their CalleeGuids .
1097
+ std::map<uint64_t , std::unordered_set<CallSiteEntry, CallSiteEntryHash >>
1032
1098
LocHashToCallSites;
1033
1099
for (auto &AI : MemProfRec->AllocSites ) {
1034
1100
NumOfMemProfAllocContextProfiles++;
@@ -1046,8 +1112,10 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
1046
1112
unsigned Idx = 0 ;
1047
1113
for (auto &StackFrame : CS.Frames ) {
1048
1114
uint64_t StackId = computeStackId (StackFrame);
1049
- LocHashToCallSites[StackId].insert (
1050
- ArrayRef<Frame>(CS.Frames ).drop_front (Idx++));
1115
+ ArrayRef<Frame> FrameSlice = ArrayRef<Frame>(CS.Frames ).drop_front (Idx++);
1116
+ ArrayRef<GlobalValue::GUID> CalleeGuids (CS.CalleeGuids );
1117
+ LocHashToCallSites[StackId].insert ({FrameSlice, CalleeGuids});
1118
+
1051
1119
ProfileHasColumns |= StackFrame.Column ;
1052
1120
// Once we find this function, we can stop recording.
1053
1121
if (StackFrame.Function == FuncGUID)
@@ -1191,13 +1259,18 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
1191
1259
// Otherwise, add callsite metadata. If we reach here then we found the
1192
1260
// instruction's leaf location in the callsites map and not the allocation
1193
1261
// map.
1194
- for (auto CallStackIdx : CallSitesIter->second ) {
1262
+ for (const auto &CallSiteEntry : CallSitesIter->second ) {
1195
1263
// If we found and thus matched all frames on the call, create and
1196
1264
// attach call stack metadata.
1197
- if (stackFrameIncludesInlinedCallStack (CallStackIdx ,
1265
+ if (stackFrameIncludesInlinedCallStack (CallSiteEntry. Frames ,
1198
1266
InlinedCallStack)) {
1199
1267
NumOfMemProfMatchedCallSites++;
1200
1268
addCallsiteMetadata (I, InlinedCallStack, Ctx);
1269
+
1270
+ // Try to attach indirect call metadata if possible.
1271
+ if (!CalledFunction)
1272
+ addVPMetadata (M, I, CallSiteEntry.CalleeGuids );
1273
+
1201
1274
// Only need to find one with a matching call stack and add a single
1202
1275
// callsite metadata.
1203
1276
0 commit comments