|
23 | 23 | #include "llvm/Analysis/BlockFrequencyInfo.h"
|
24 | 24 | #include "llvm/Analysis/CallGraph.h"
|
25 | 25 | #include "llvm/Analysis/CaptureTracking.h"
|
| 26 | +#include "llvm/Analysis/CtxProfAnalysis.h" |
26 | 27 | #include "llvm/Analysis/IndirectCallVisitor.h"
|
27 | 28 | #include "llvm/Analysis/InstructionSimplify.h"
|
28 | 29 | #include "llvm/Analysis/MemoryProfileInfo.h"
|
|
46 | 47 | #include "llvm/IR/Dominators.h"
|
47 | 48 | #include "llvm/IR/EHPersonalities.h"
|
48 | 49 | #include "llvm/IR/Function.h"
|
| 50 | +#include "llvm/IR/GlobalVariable.h" |
49 | 51 | #include "llvm/IR/IRBuilder.h"
|
50 | 52 | #include "llvm/IR/InlineAsm.h"
|
51 | 53 | #include "llvm/IR/InstrTypes.h"
|
|
71 | 73 | #include <algorithm>
|
72 | 74 | #include <cassert>
|
73 | 75 | #include <cstdint>
|
| 76 | +#include <deque> |
74 | 77 | #include <iterator>
|
75 | 78 | #include <limits>
|
76 | 79 | #include <optional>
|
@@ -2116,6 +2119,203 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind,
|
2116 | 2119 | }
|
2117 | 2120 | }
|
2118 | 2121 |
|
| 2122 | +// In contextual profiling, when an inline succeeds, we want to remap the |
| 2123 | +// indices of the callee in the index space of the caller. We can't just leave |
| 2124 | +// them as-is because the same callee may appear in other places in this caller |
| 2125 | +// (other callsites), and its (callee's) counters and sub-contextual profile |
| 2126 | +// tree would be potentially different. |
| 2127 | +// Not all BBs of the callee may survive the opportunistic DCE InlineFunction |
| 2128 | +// does (same goes for callsites in the callee). |
| 2129 | +// We will return a pair of vectors, one for basic block IDs and one for |
| 2130 | +// callsites. For such a vector V, V[Idx] will be -1 if the callee |
| 2131 | +// instrumentation with index Idx did not survive inlining, and a new value |
| 2132 | +// otherwise. |
| 2133 | +// This function will update the instrumentation intrinsics accordingly, |
| 2134 | +// mapping indices as described above. We also replace the "name" operand |
| 2135 | +// because we use it to distinguish between "own" instrumentation and "from |
| 2136 | +// callee" instrumentation when performing the traversal of the CFG of the |
| 2137 | +// caller. We traverse depth-first from the callsite's BB and up to the point we |
| 2138 | +// hit owned BBs. |
| 2139 | +// The return values will be then used to update the contextual |
| 2140 | +// profile. Note: we only update the "name" and "index" operands in the |
| 2141 | +// instrumentation intrinsics, we leave the hash and total nr of indices as-is, |
| 2142 | +// it's not worth updating those. |
| 2143 | +static const std::pair<std::vector<int64_t>, std::vector<int64_t>> |
| 2144 | +remapIndices(Function &Caller, BasicBlock *StartBB, |
| 2145 | + CtxProfAnalysis::Result &CtxProf, uint32_t CalleeCounters, |
| 2146 | + uint32_t CalleeCallsites) { |
| 2147 | + // We'll allocate a new ID to imported callsite counters and callsites. We're |
| 2148 | + // using -1 to indicate a counter we delete. Most likely the entry, for |
| 2149 | + // example, will be deleted - we don't want 2 IDs in the same BB, and the |
| 2150 | + // entry would have been cloned in the callsite's old BB. |
| 2151 | + std::vector<int64_t> CalleeCounterMap; |
| 2152 | + std::vector<int64_t> CalleeCallsiteMap; |
| 2153 | + CalleeCounterMap.resize(CalleeCounters, -1); |
| 2154 | + CalleeCallsiteMap.resize(CalleeCallsites, -1); |
| 2155 | + |
| 2156 | + auto RewriteInstrIfNeeded = [&](InstrProfIncrementInst &Ins) -> bool { |
| 2157 | + if (Ins.getNameValue() == &Caller) |
| 2158 | + return false; |
| 2159 | + const auto OldID = static_cast<uint32_t>(Ins.getIndex()->getZExtValue()); |
| 2160 | + if (CalleeCounterMap[OldID] == -1) |
| 2161 | + CalleeCounterMap[OldID] = CtxProf.allocateNextCounterIndex(Caller); |
| 2162 | + const auto NewID = static_cast<uint32_t>(CalleeCounterMap[OldID]); |
| 2163 | + |
| 2164 | + Ins.setNameValue(&Caller); |
| 2165 | + Ins.setIndex(NewID); |
| 2166 | + return true; |
| 2167 | + }; |
| 2168 | + |
| 2169 | + auto RewriteCallsiteInsIfNeeded = [&](InstrProfCallsite &Ins) -> bool { |
| 2170 | + if (Ins.getNameValue() == &Caller) |
| 2171 | + return false; |
| 2172 | + const auto OldID = static_cast<uint32_t>(Ins.getIndex()->getZExtValue()); |
| 2173 | + if (CalleeCallsiteMap[OldID] == -1) |
| 2174 | + CalleeCallsiteMap[OldID] = CtxProf.allocateNextCallsiteIndex(Caller); |
| 2175 | + const auto NewID = static_cast<uint32_t>(CalleeCallsiteMap[OldID]); |
| 2176 | + |
| 2177 | + Ins.setNameValue(&Caller); |
| 2178 | + Ins.setIndex(NewID); |
| 2179 | + return true; |
| 2180 | + }; |
| 2181 | + |
| 2182 | + std::deque<BasicBlock *> Worklist; |
| 2183 | + DenseSet<const BasicBlock *> Seen; |
| 2184 | + // We will traverse the BBs starting from the callsite BB. The callsite BB |
| 2185 | + // will have at least a BB ID - maybe its own, and in any case the one coming |
| 2186 | + // from the cloned function's entry BB. The other BBs we'll start seeing from |
| 2187 | + // there on may or may not have BB IDs. BBs with IDs belonging to our caller |
| 2188 | + // are definitely not coming from the imported function and form a boundary |
| 2189 | + // past which we don't need to traverse anymore. BBs may have no |
| 2190 | + // instrumentation (because we originally inserted instrumentation as per |
| 2191 | + // MST), in which case we'll traverse past them. An invariant we'll keep is |
| 2192 | + // that a BB will have at most 1 BB ID. For example, in the callsite BB, we |
| 2193 | + // will delete the callee BB's instrumentation. This doesn't result in |
| 2194 | + // information loss: the entry BB of the caller will have the same count as |
| 2195 | + // the callsite's BB. At the end of this traversal, all the callee's |
| 2196 | + // instrumentation would be mapped into the caller's instrumentation index |
| 2197 | + // space. Some of the callee's counters may be deleted (as mentioned, this |
| 2198 | + // should result in no loss of information). |
| 2199 | + Worklist.push_back(StartBB); |
| 2200 | + while (!Worklist.empty()) { |
| 2201 | + auto *BB = Worklist.front(); |
| 2202 | + Worklist.pop_front(); |
| 2203 | + bool Changed = false; |
| 2204 | + auto *BBID = CtxProfAnalysis::getBBInstrumentation(*BB); |
| 2205 | + if (BBID) { |
| 2206 | + Changed |= RewriteInstrIfNeeded(*BBID); |
| 2207 | + // this may be the entryblock from the inlined callee, coming into a BB |
| 2208 | + // that didn't have instrumentation because of MST decisions. Let's make |
| 2209 | + // sure it's placed accordingly. This is a noop elsewhere. |
| 2210 | + BBID->moveBefore(&*BB->getFirstInsertionPt()); |
| 2211 | + } |
| 2212 | + for (auto &I : llvm::make_early_inc_range(*BB)) { |
| 2213 | + if (auto *Inc = dyn_cast<InstrProfIncrementInst>(&I)) { |
| 2214 | + if (Inc != BBID) { |
| 2215 | + Inc->eraseFromParent(); |
| 2216 | + Changed = true; |
| 2217 | + } |
| 2218 | + } else if (auto *CS = dyn_cast<InstrProfCallsite>(&I)) { |
| 2219 | + Changed |= RewriteCallsiteInsIfNeeded(*CS); |
| 2220 | + } |
| 2221 | + } |
| 2222 | + if (!BBID || Changed) |
| 2223 | + for (auto *Succ : successors(BB)) |
| 2224 | + if (Seen.insert(Succ).second) |
| 2225 | + Worklist.push_back(Succ); |
| 2226 | + } |
| 2227 | + return {std::move(CalleeCounterMap), std::move(CalleeCallsiteMap)}; |
| 2228 | +} |
| 2229 | + |
| 2230 | +llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, |
| 2231 | + CtxProfAnalysis::Result &CtxProf, |
| 2232 | + bool MergeAttributes, |
| 2233 | + AAResults *CalleeAAR, |
| 2234 | + bool InsertLifetime, |
| 2235 | + Function *ForwardVarArgsTo) { |
| 2236 | + if (!CtxProf) |
| 2237 | + return InlineFunction(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime, |
| 2238 | + ForwardVarArgsTo); |
| 2239 | + |
| 2240 | + auto &Caller = *CB.getCaller(); |
| 2241 | + auto &Callee = *CB.getCalledFunction(); |
| 2242 | + auto *StartBB = CB.getParent(); |
| 2243 | + |
| 2244 | + // Get some preliminary data about the callsite before it might get inlined. |
| 2245 | + // Inlining shouldn't delete the callee, but it's cleaner (and low-cost) to |
| 2246 | + // get this data upfront and rely less on InlineFunction's behavior. |
| 2247 | + const auto CalleeGUID = AssignGUIDPass::getGUID(Callee); |
| 2248 | + auto *CallsiteIDIns = CtxProfAnalysis::getCallsiteInstrumentation(CB); |
| 2249 | + const auto CallsiteID = |
| 2250 | + static_cast<uint32_t>(CallsiteIDIns->getIndex()->getZExtValue()); |
| 2251 | + |
| 2252 | + const auto NrCalleeCounters = CtxProf.getNrCounters(Callee); |
| 2253 | + const auto NrCalleeCallsites = CtxProf.getNrCallsites(Callee); |
| 2254 | + |
| 2255 | + auto Ret = InlineFunction(CB, IFI, MergeAttributes, CalleeAAR, InsertLifetime, |
| 2256 | + ForwardVarArgsTo); |
| 2257 | + if (!Ret.isSuccess()) |
| 2258 | + return Ret; |
| 2259 | + |
| 2260 | + // Inlining succeeded, we don't need the instrumentation of the inlined |
| 2261 | + // callsite. |
| 2262 | + CallsiteIDIns->eraseFromParent(); |
| 2263 | + |
| 2264 | + // Assinging Maps and then capturing references into it in the lambda because |
| 2265 | + // captured structured bindings are a C++20 extension. We do also need a |
| 2266 | + // capture here, though. |
| 2267 | + const auto IndicesMaps = remapIndices(Caller, StartBB, CtxProf, |
| 2268 | + NrCalleeCounters, NrCalleeCallsites); |
| 2269 | + const uint32_t NewCountersSize = CtxProf.getNrCounters(Caller); |
| 2270 | + |
| 2271 | + auto Updater = [&](PGOCtxProfContext &Ctx) { |
| 2272 | + assert(Ctx.guid() == AssignGUIDPass::getGUID(Caller)); |
| 2273 | + const auto &[CalleeCounterMap, CalleeCallsiteMap] = IndicesMaps; |
| 2274 | + assert( |
| 2275 | + (Ctx.counters().size() + |
| 2276 | + llvm::count_if(CalleeCounterMap, [](auto V) { return V != -1; }) == |
| 2277 | + NewCountersSize) && |
| 2278 | + "The caller's counters size should have grown by the number of new " |
| 2279 | + "distinct counters inherited from the inlined callee."); |
| 2280 | + Ctx.resizeCounters(NewCountersSize); |
| 2281 | + // If the callsite wasn't exercised in this context, the value of the |
| 2282 | + // counters coming from it is 0 - which it is right now, after resizing them |
| 2283 | + // - and so we're done. |
| 2284 | + auto CSIt = Ctx.callsites().find(CallsiteID); |
| 2285 | + if (CSIt == Ctx.callsites().end()) |
| 2286 | + return; |
| 2287 | + auto CalleeCtxIt = CSIt->second.find(CalleeGUID); |
| 2288 | + // The callsite was exercised, but not with this callee (so presumably this |
| 2289 | + // is an indirect callsite). Again, we're done here. |
| 2290 | + if (CalleeCtxIt == CSIt->second.end()) |
| 2291 | + return; |
| 2292 | + |
| 2293 | + // Let's pull in the counter values and the subcontexts coming from the |
| 2294 | + // inlined callee. |
| 2295 | + auto &CalleeCtx = CalleeCtxIt->second; |
| 2296 | + assert(CalleeCtx.guid() == CalleeGUID); |
| 2297 | + |
| 2298 | + for (auto I = 0U; I < CalleeCtx.counters().size(); ++I) { |
| 2299 | + const int64_t NewIndex = CalleeCounterMap[I]; |
| 2300 | + if (NewIndex >= 0) |
| 2301 | + Ctx.counters()[NewIndex] = CalleeCtx.counters()[I]; |
| 2302 | + } |
| 2303 | + for (auto &[I, OtherSet] : CalleeCtx.callsites()) { |
| 2304 | + const int64_t NewCSIdx = CalleeCallsiteMap[I]; |
| 2305 | + if (NewCSIdx >= 0) |
| 2306 | + Ctx.ingestAllContexts(NewCSIdx, std::move(OtherSet)); |
| 2307 | + } |
| 2308 | + // We know the traversal is preorder, so it wouldn't have yet looked at the |
| 2309 | + // sub-contexts of this context that it's currently visiting. Meaning, the |
| 2310 | + // erase below invalidates no iterators. |
| 2311 | + auto Deleted = Ctx.callsites().erase(CallsiteID); |
| 2312 | + assert(Deleted); |
| 2313 | + (void)Deleted; |
| 2314 | + }; |
| 2315 | + CtxProf.update(Updater, &Caller); |
| 2316 | + return Ret; |
| 2317 | +} |
| 2318 | + |
2119 | 2319 | /// This function inlines the called function into the basic block of the
|
2120 | 2320 | /// caller. This returns false if it is not possible to inline this call.
|
2121 | 2321 | /// The program is still in a well defined state if this occurs though.
|
|
0 commit comments