1 //==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements support for context disambiguation of allocation
10 // calls for profile guided heap optimization. Specifically, it uses Memprof
11 // profiles which indicate context specific allocation behavior (currently
12 // distinguishing cold vs hot memory allocations). Cloning is performed to
13 // expose the cold allocation call contexts, and the allocation calls are
14 // subsequently annotated with an attribute for later transformation.
15 //
16 // The transformations can be performed either directly on IR (regular LTO), or
17 // on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18 // Both types of LTO operate on the same base graph representation, which
19 // uses CRTP to support either IR or Index formats.
20 //
21 //===----------------------------------------------------------------------===//
22
23 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24 #include "llvm/ADT/DenseMap.h"
25 #include "llvm/ADT/DenseSet.h"
26 #include "llvm/ADT/MapVector.h"
27 #include "llvm/ADT/SetOperations.h"
28 #include "llvm/ADT/SmallPtrSet.h"
29 #include "llvm/ADT/SmallSet.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/Analysis/MemoryProfileInfo.h"
33 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
34 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
35 #include "llvm/Bitcode/BitcodeReader.h"
36 #include "llvm/IR/Constants.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/Module.h"
39 #include "llvm/IR/ModuleSummaryIndex.h"
40 #include "llvm/Pass.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/FileSystem.h"
43 #include "llvm/Support/GraphWriter.h"
44 #include "llvm/Support/raw_ostream.h"
45 #include "llvm/Transforms/IPO.h"
46 #include "llvm/Transforms/Utils/Cloning.h"
47 #include <deque>
48 #include <sstream>
49 #include <unordered_map>
50 #include <vector>
51 using namespace llvm;
52 using namespace llvm::memprof;
53
54 #define DEBUG_TYPE "memprof-context-disambiguation"
55
56 STATISTIC(FunctionClonesAnalysis,
57 "Number of function clones created during whole program analysis");
58 STATISTIC(FunctionClonesThinBackend,
59 "Number of function clones created during ThinLTO backend");
60 STATISTIC(FunctionsClonedThinBackend,
61 "Number of functions that had clones created during ThinLTO backend");
62 STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
63 "cloned) during whole program analysis");
64 STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
65 "during whole program analysis");
66 STATISTIC(AllocTypeNotColdThinBackend,
67 "Number of not cold static allocations (possibly cloned) during "
68 "ThinLTO backend");
69 STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
70 "(possibly cloned) during ThinLTO backend");
71 STATISTIC(OrigAllocsThinBackend,
72 "Number of original (not cloned) allocations with memprof profiles "
73 "during ThinLTO backend");
74 STATISTIC(
75 AllocVersionsThinBackend,
76 "Number of allocation versions (including clones) during ThinLTO backend");
77 STATISTIC(MaxAllocVersionsThinBackend,
78 "Maximum number of allocation versions created for an original "
79 "allocation during ThinLTO backend");
80 STATISTIC(UnclonableAllocsThinBackend,
81 "Number of unclonable ambiguous allocations during ThinLTO backend");
82 STATISTIC(RemovedEdgesWithMismatchedCallees,
83 "Number of edges removed due to mismatched callees (profiled vs IR)");
84 STATISTIC(FoundProfiledCalleeCount,
85 "Number of profiled callees found via tail calls");
86 STATISTIC(FoundProfiledCalleeDepth,
87 "Aggregate depth of profiled callees found via tail calls");
88 STATISTIC(FoundProfiledCalleeMaxDepth,
89 "Maximum depth of profiled callees found via tail calls");
90 STATISTIC(FoundProfiledCalleeNonUniquelyCount,
91 "Number of profiled callees found via multiple tail call chains");
92
93 static cl::opt<std::string> DotFilePathPrefix(
94 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
95 cl::value_desc("filename"),
96 cl::desc("Specify the path prefix of the MemProf dot files."));
97
98 static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
99 cl::Hidden,
100 cl::desc("Export graph to dot files."));
101
102 static cl::opt<bool>
103 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
104 cl::desc("Dump CallingContextGraph to stdout after each stage."));
105
106 static cl::opt<bool>
107 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
108 cl::desc("Perform verification checks on CallingContextGraph."));
109
110 static cl::opt<bool>
111 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
112 cl::desc("Perform frequent verification checks on nodes."));
113
114 static cl::opt<std::string> MemProfImportSummary(
115 "memprof-import-summary",
116 cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
117 cl::Hidden);
118
119 static cl::opt<unsigned>
120 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
121 cl::Hidden,
122 cl::desc("Max depth to recursively search for missing "
123 "frames through tail calls."));
124
125 namespace llvm {
126 cl::opt<bool> EnableMemProfContextDisambiguation(
127 "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
128 cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
129
130 // Indicate we are linking with an allocator that supports hot/cold operator
131 // new interfaces.
132 cl::opt<bool> SupportsHotColdNew(
133 "supports-hot-cold-new", cl::init(false), cl::Hidden,
134 cl::desc("Linking with hot/cold operator new interfaces"));
135 } // namespace llvm
136
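// Defined elsewhere; when enabled, the profiled total size of each context is
// tracked (see ContextIdToTotalSize below).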
137 extern cl::opt<bool> MemProfReportHintedSizes;
138
139 namespace {
140 /// CRTP base for graphs built from either IR or ThinLTO summary index.
141 ///
142 /// The graph represents the call contexts in all memprof metadata on allocation
143 /// calls, with nodes for the allocations themselves, as well as for the calls
144 /// in each context. The graph is initially built from the allocation memprof
145 /// metadata (or summary) MIBs. It is then updated to match calls with callsite
146 /// metadata onto the nodes, reflecting any inlining performed on
147 /// those calls.
148 ///
149 /// Each MIB (representing an allocation's call context with allocation
150 /// behavior) is assigned a unique context id during the graph build. The edges
151 /// and nodes in the graph are decorated with the context ids they carry. This
152 /// is used to correctly update the graph when cloning is performed so that we
153 /// can uniquify the context for a single (possibly cloned) allocation.
154 template <typename DerivedCCG, typename FuncTy, typename CallTy>
155 class CallsiteContextGraph {
156 public:
157 CallsiteContextGraph() = default;
158 CallsiteContextGraph(const CallsiteContextGraph &) = default;
159 CallsiteContextGraph(CallsiteContextGraph &&) = default;
160
161 /// Main entry point to perform analysis and transformations on graph.
162 bool process();
163
164 /// Perform cloning on the graph necessary to uniquely identify the allocation
165 /// behavior of an allocation based on its context.
166 void identifyClones();
167
168 /// Assign callsite clones to functions, cloning functions as needed to
169 /// accommodate the combinations of their callsite clones reached by callers.
170 /// For regular LTO this clones functions and callsites in the IR, but for
171 /// ThinLTO the cloning decisions are noted in the summaries and later applied
172 /// in applyImport.
173 bool assignFunctions();
174
175 void dump() const;
176 void print(raw_ostream &OS) const;
177 void printTotalSizes(raw_ostream &OS) const;
178
179 friend raw_ostream &operator<<(raw_ostream &OS,
180 const CallsiteContextGraph &CCG) {
181 CCG.print(OS);
182 return OS;
183 }
184
185 friend struct GraphTraits<
186 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
187 friend struct DOTGraphTraits<
188 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
189
190 void exportToDot(std::string Label) const;
191
192 /// Represents a function clone via FuncTy pointer and clone number pair.
193 struct FuncInfo final
194 : public std::pair<FuncTy *, unsigned /*Clone number*/> {
195 using Base = std::pair<FuncTy *, unsigned>;
196 FuncInfo(const Base &B) : Base(B) {}
197 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
198 explicit operator bool() const { return this->first != nullptr; }
199 FuncTy *func() const { return this->first; }
200 unsigned cloneNo() const { return this->second; }
201 };
202
203 /// Represents a callsite clone via CallTy and clone number pair.
204 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
205 using Base = std::pair<CallTy, unsigned>;
206 CallInfo(const Base &B) : Base(B) {}
207 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
208 : Base(Call, CloneNo) {}
209 explicit operator bool() const { return (bool)this->first; }
210 CallTy call() const { return this->first; }
211 unsigned cloneNo() const { return this->second; }
212 void setCloneNo(unsigned N) { this->second = N; }
213 void print(raw_ostream &OS) const {
214 if (!operator bool()) {
215 assert(!cloneNo());
216 OS << "null Call";
217 return;
218 }
219 call()->print(OS);
220 OS << "\t(clone " << cloneNo() << ")";
221 }
222 void dump() const {
223 print(dbgs());
224 dbgs() << "\n";
225 }
226 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
227 Call.print(OS);
228 return OS;
229 }
230 };
231
232 struct ContextEdge;
233
234 /// Node in the Callsite Context Graph
235 struct ContextNode {
236 // Keep this for now since in the IR case where we have an Instruction* it
237 // is not as immediately discoverable. Used for printing richer information
238 // when dumping graph.
239 bool IsAllocation;
240
241 // Keeps track of when the Call was reset to null because there was
242 // recursion.
243 bool Recursive = false;
244
245 // The corresponding allocation or interior call.
246 CallInfo Call;
247
248 // For alloc nodes this is a unique id assigned when constructed, and for
249 // callsite stack nodes it is the original stack id when the node is
250 // constructed from the memprof MIB metadata on the alloc nodes. Note that
251 // this is only used when matching callsite metadata onto the stack nodes
252 // created when processing the allocation memprof MIBs, and for labeling
253 // nodes in the dot graph. Therefore we don't bother to assign a value for
254 // clones.
255 uint64_t OrigStackOrAllocId = 0;
256
257 // This will be formed by ORing together the AllocationType enum values
258 // for contexts including this node.
259 uint8_t AllocTypes = 0;
260
261 // Edges to all callees in the profiled call stacks.
262 // TODO: Should this be a map (from Callee node) for more efficient lookup?
263 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
264
265 // Edges to all callers in the profiled call stacks.
266 // TODO: Should this be a map (from Caller node) for more efficient lookup?
267 std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
268
269 // Get the list of edges from which we can compute allocation information
270 // such as the context ids and allocation type of this node.
271 const std::vector<std::shared_ptr<ContextEdge>> *
272 getEdgesWithAllocInfo() const {
273 // If node has any callees, compute from those, otherwise compute from
274 // callers (i.e. if this is the leaf allocation node).
275 if (!CalleeEdges.empty())
276 return &CalleeEdges;
277 if (!CallerEdges.empty()) {
278 // A node with caller edges but no callee edges must be the allocation
279 // node.
280 assert(IsAllocation);
281 return &CallerEdges;
282 }
283 return nullptr;
284 }
285
286 // Compute the context ids for this node from the union of its edge context
287 // ids.
288 DenseSet<uint32_t> getContextIds() const {
289 DenseSet<uint32_t> ContextIds;
290 auto *Edges = getEdgesWithAllocInfo();
291 if (!Edges)
292 return {};
293 unsigned Count = 0;
294 for (auto &Edge : *Edges)
295 Count += Edge->getContextIds().size();
296 ContextIds.reserve(Count);
297 for (auto &Edge : *Edges)
298 ContextIds.insert(Edge->getContextIds().begin(),
299 Edge->getContextIds().end());
300 return ContextIds;
301 }
302
303 // Compute the allocation type for this node from the OR of its edge
304 // allocation types.
305 uint8_t computeAllocType() const {
306 auto *Edges = getEdgesWithAllocInfo();
307 if (!Edges)
308 return (uint8_t)AllocationType::None;
309 uint8_t BothTypes =
310 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
311 uint8_t AllocType = (uint8_t)AllocationType::None;
312 for (auto &Edge : *Edges) {
313 AllocType |= Edge->AllocTypes;
314 // Bail early if alloc type reached both, no further refinement.
315 if (AllocType == BothTypes)
316 return AllocType;
317 }
318 return AllocType;
319 }
320
321 // The context ids set for this node is empty if its edge context ids are
322 // also all empty.
323 bool emptyContextIds() const {
324 auto *Edges = getEdgesWithAllocInfo();
325 if (!Edges)
326 return true;
327 for (auto &Edge : *Edges) {
328 if (!Edge->getContextIds().empty())
329 return false;
330 }
331 return true;
332 }
333
334 // List of clones of this ContextNode, initially empty.
335 std::vector<ContextNode *> Clones;
336
337 // If a clone, points to the original uncloned node.
338 ContextNode *CloneOf = nullptr;
339
340 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
341
342 ContextNode(bool IsAllocation, CallInfo C)
343 : IsAllocation(IsAllocation), Call(C) {}
344
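// Record Clone as a clone of this node (or of this node's original, if this
// is itself a clone), and point the new clone back at the original via
// CloneOf.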
345 void addClone(ContextNode *Clone) {
346 if (CloneOf) {
347 CloneOf->Clones.push_back(Clone);
348 Clone->CloneOf = CloneOf;
349 } else {
350 Clones.push_back(Clone);
351 assert(!Clone->CloneOf);
352 Clone->CloneOf = this;
353 }
354 }
355
356 ContextNode *getOrigNode() {
357 if (!CloneOf)
358 return this;
359 return CloneOf;
360 }
361
362 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
363 unsigned int ContextId);
364
365 ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
366 ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
367 void eraseCalleeEdge(const ContextEdge *Edge);
368 void eraseCallerEdge(const ContextEdge *Edge);
369
370 void setCall(CallInfo C) { Call = C; }
371
372 bool hasCall() const { return (bool)Call.call(); }
373
374 void printCall(raw_ostream &OS) const { Call.print(OS); }
375
376 // True if this node was effectively removed from the graph, in which case
377 // it should have an allocation type of None and empty context ids.
378 bool isRemoved() const {
379 assert((AllocTypes == (uint8_t)AllocationType::None) ==
380 emptyContextIds());
381 return AllocTypes == (uint8_t)AllocationType::None;
382 }
383
384 void dump() const;
385 void print(raw_ostream &OS) const;
386
387 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
388 Node.print(OS);
389 return OS;
390 }
391 };
392
393 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
394 /// callee.
395 struct ContextEdge {
396 ContextNode *Callee;
397 ContextNode *Caller;
398
399 // This will be formed by ORing together the AllocationType enum values
400 // for contexts including this edge.
401 uint8_t AllocTypes = 0;
402
403 // The set of IDs for contexts including this edge.
404 DenseSet<uint32_t> ContextIds;
405
406 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
407 DenseSet<uint32_t> ContextIds)
408 : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
409 ContextIds(std::move(ContextIds)) {}
410
411 DenseSet<uint32_t> &getContextIds() { return ContextIds; }
412
413 void dump() const;
414 void print(raw_ostream &OS) const;
415
416 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
417 Edge.print(OS);
418 return OS;
419 }
420 };
421
422 /// Helpers to remove callee edges that have allocation type None (due to not
423 /// carrying any context ids) after transformations.
424 void removeNoneTypeCalleeEdges(ContextNode *Node);
425 void
426 recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
427 DenseSet<const ContextNode *> &Visited);
428
429 protected:
430 /// Get a list of nodes corresponding to the stack ids in the given callsite
431 /// context.
432 template <class NodeT, class IteratorT>
433 std::vector<uint64_t>
434 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
435
436 /// Adds nodes for the given allocation and any stack ids on its memprof MIB
437 /// metadata (or summary).
438 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
439
440 /// Adds nodes for the given MIB stack ids.
441 template <class NodeT, class IteratorT>
442 void addStackNodesForMIB(ContextNode *AllocNode,
443 CallStack<NodeT, IteratorT> &StackContext,
444 CallStack<NodeT, IteratorT> &CallsiteContext,
445 AllocationType AllocType, uint64_t TotalSize);
446
447 /// Matches all callsite metadata (or summary) to the nodes created for
448 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
449 /// inlining performed on those callsite instructions.
450 void updateStackNodes();
451
452 /// Update graph to conservatively handle any callsite stack nodes that target
453 /// multiple different callee target functions.
454 void handleCallsitesWithMultipleTargets();
455
456 /// Save lists of calls with MemProf metadata in each function, for faster
457 /// iteration.
458 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
459
460 /// Map from callsite node to the enclosing caller function.
461 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
462
463 private:
464 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
465
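/// Tuple of: the call, the stack ids for its callsite context, the enclosing
/// function, and the saved context ids (see the structured bindings in
/// assignStackNodesPostOrder).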
466 using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
467 const FuncTy *, DenseSet<uint32_t>>;
468
469 /// Assigns the given Node to calls at or inlined into the location with
470 /// the Node's stack id, after post order traversing and processing its
471 /// caller nodes. Uses the call information recorded in the given
472 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
473 /// as needed. Called by updateStackNodes which sets up the given
474 /// StackIdToMatchingCalls map.
475 void assignStackNodesPostOrder(
476 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
477 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
478
479 /// Duplicates the given set of context ids, updating the provided
480 /// map from each original id with the newly generated context ids,
481 /// and returning the new duplicated id set.
482 DenseSet<uint32_t> duplicateContextIds(
483 const DenseSet<uint32_t> &StackSequenceContextIds,
484 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
485
486 /// Propagates all duplicated context ids across the graph.
487 void propagateDuplicateContextIds(
488 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
489
490 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
491 /// else to its callers. Also updates OrigNode's edges to remove any context
492 /// ids moved to the newly created edge.
493 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
494 bool TowardsCallee,
495 DenseSet<uint32_t> RemainingContextIds);
496
497 /// Get the stack id corresponding to the given Id or Index (for IR this will
498 /// return itself, for a summary index this will return the id recorded in the
499 /// index for that stack id index value).
500 uint64_t getStackId(uint64_t IdOrIndex) const {
501 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
502 }
503
504 /// Returns true if the given call targets the callee of the given edge, or if
505 /// we were able to identify the call chain through intermediate tail calls.
506 /// In the latter case new context nodes are added to the graph for the
507 /// identified tail calls, and their synthesized nodes are added to
508 /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
509 /// the updated edges and to prepare it for an increment in the caller.
510 bool
511 calleesMatch(CallTy Call, EdgeIter &EI,
512 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
513
514 /// Returns true if the given call targets the given function, or if we were
515 /// able to identify the call chain through intermediate tail calls (in which
516 /// case FoundCalleeChain will be populated).
517 bool calleeMatchesFunc(
518 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
519 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
520 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
521 Call, Func, CallerFunc, FoundCalleeChain);
522 }
523
524 /// Get a list of nodes corresponding to the stack ids in the given
525 /// callsite's context.
526 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
527 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
528 Call);
529 }
530
531 /// Get the last stack id in the context for callsite.
532 uint64_t getLastStackId(CallTy Call) {
533 return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
534 }
535
536 /// Update the allocation call to record type of allocated memory.
537 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
538 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
539 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
540 }
541
542 /// Update non-allocation call to invoke (possibly cloned) function
543 /// CalleeFunc.
544 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
545 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
546 }
547
548 /// Clone the given function for the given callsite, recording mapping of all
549 /// of the functions tracked calls to their new versions in the CallMap.
550 /// Assigns new clones to clone number CloneNo.
551 FuncInfo cloneFunctionForCallsite(
552 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
553 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
554 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
555 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
556 }
557
558 /// Gets a label to use in the dot graph for the given call clone in the given
559 /// function.
560 std::string getLabel(const FuncTy *Func, const CallTy Call,
561 unsigned CloneNo) const {
562 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
563 }
564
565 /// Helpers to find the node corresponding to the given call or stackid.
566 ContextNode *getNodeForInst(const CallInfo &C);
567 ContextNode *getNodeForAlloc(const CallInfo &C);
568 ContextNode *getNodeForStackId(uint64_t StackId);
569
570 /// Computes the alloc type corresponding to the given context ids, by
571 /// unioning their recorded alloc types.
572 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
573
574 /// Returns the allocation type of the intersection of the contexts of two
575 /// nodes (based on their provided context id sets), optimized for the case
576 /// when Node1Ids is smaller than Node2Ids.
577 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
578 const DenseSet<uint32_t> &Node2Ids);
579
580 /// Returns the allocation type of the intersection of the contexts of two
581 /// nodes (based on their provided context id sets).
582 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
583 const DenseSet<uint32_t> &Node2Ids);
584
585 /// Create a clone of Edge's callee and move Edge to that new callee node,
586 /// performing the necessary context id and allocation type updates.
587 /// If callee's caller edge iterator is supplied, it is updated when removing
588 /// the edge from that list. If ContextIdsToMove is non-empty, only that
589 /// subset of Edge's ids are moved to an edge to the new callee.
590 ContextNode *
591 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
592 EdgeIter *CallerEdgeI = nullptr,
593 DenseSet<uint32_t> ContextIdsToMove = {});
594
595 /// Change the callee of Edge to existing callee clone NewCallee, performing
596 /// the necessary context id and allocation type updates.
597 /// If callee's caller edge iterator is supplied, it is updated when removing
598 /// the edge from that list. If ContextIdsToMove is non-empty, only that
599 /// subset of Edge's ids are moved to an edge to the new callee.
600 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
601 ContextNode *NewCallee,
602 EdgeIter *CallerEdgeI = nullptr,
603 bool NewClone = false,
604 DenseSet<uint32_t> ContextIdsToMove = {});
605
606 /// Recursively perform cloning on the graph for the given Node and its
607 /// callers, in order to uniquely identify the allocation behavior of an
608 /// allocation given its context. The context ids of the allocation being
609 /// processed are given in AllocContextIds.
610 void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
611 const DenseSet<uint32_t> &AllocContextIds);
612
613 /// Map from each context ID to the AllocationType assigned to that context.
614 DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
615
616 /// Map from each contextID to the profiled aggregate allocation size,
617 /// optionally populated when requested (via MemProfReportHintedSizes).
618 DenseMap<uint32_t, uint64_t> ContextIdToTotalSize;
619
620 /// Identifies the context node created for a stack id when adding the MIB
621 /// contexts to the graph. This is used to locate the context nodes when
622 /// trying to assign the corresponding callsites with those stack ids to these
623 /// nodes.
624 DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
625
626 /// Maps to track the calls to their corresponding nodes in the graph.
627 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
628 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
629
630 /// Owner of all ContextNode unique_ptrs.
631 std::vector<std::unique_ptr<ContextNode>> NodeOwner;
632
633 /// Perform sanity checks on graph when requested.
634 void check() const;
635
636 /// Keeps track of the last unique context id assigned.
637 unsigned int LastContextId = 0;
638 };
639
640 template <typename DerivedCCG, typename FuncTy, typename CallTy>
641 using ContextNode =
642 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
643 template <typename DerivedCCG, typename FuncTy, typename CallTy>
644 using ContextEdge =
645 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
646 template <typename DerivedCCG, typename FuncTy, typename CallTy>
647 using FuncInfo =
648 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
649 template <typename DerivedCCG, typename FuncTy, typename CallTy>
650 using CallInfo =
651 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
652
653 /// CRTP derived class for graphs built from IR (regular LTO).
654 class ModuleCallsiteContextGraph
655 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
656 Instruction *> {
657 public:
658 ModuleCallsiteContextGraph(
659 Module &M,
660 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
661
662 private:
663 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
664 Instruction *>;
665
666 uint64_t getStackId(uint64_t IdOrIndex) const;
667 bool calleeMatchesFunc(
668 Instruction *Call, const Function *Func, const Function *CallerFunc,
669 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
670 bool findProfiledCalleeThroughTailCalls(
671 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
672 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
673 bool &FoundMultipleCalleeChains);
674 uint64_t getLastStackId(Instruction *Call);
675 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
676 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
677 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
678 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
679 Instruction *>::FuncInfo
680 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
681 std::map<CallInfo, CallInfo> &CallMap,
682 std::vector<CallInfo> &CallsWithMetadataInFunc,
683 unsigned CloneNo);
684 std::string getLabel(const Function *Func, const Instruction *Call,
685 unsigned CloneNo) const;
686
687 const Module &Mod;
688 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
689 };
690
691 /// Represents a call in the summary index graph, which can either be an
692 /// allocation or an interior callsite node in an allocation's context.
693 /// Holds a pointer to the corresponding data structure in the index.
694 struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
695 IndexCall() : PointerUnion() {}
696 IndexCall(std::nullptr_t) : IndexCall() {}
697 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
698 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
699 IndexCall(PointerUnion PT) : PointerUnion(PT) {}
700
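// Returning 'this' lets CallInfo members use call()-> syntax (e.g. in
// CallInfo::print) uniformly for both the Instruction* and IndexCall cases.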
701 IndexCall *operator->() { return this; }
702
703 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
704
705 void print(raw_ostream &OS) const {
706 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
707 OS << *AI;
708 } else {
709 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
710 assert(CI);
711 OS << *CI;
712 }
713 }
714 };
715
716 /// CRTP derived class for graphs built from summary index (ThinLTO).
717 class IndexCallsiteContextGraph
718 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
719 IndexCall> {
720 public:
721 IndexCallsiteContextGraph(
722 ModuleSummaryIndex &Index,
723 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
724 isPrevailing);
725
726 ~IndexCallsiteContextGraph() {
727 // Now that we are done with the graph it is safe to add the new
728 // CallsiteInfo structs to the function summary vectors. The graph nodes
729 // point into locations within these vectors, so we don't want to add them
730 // any earlier.
731 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
732 auto *FS = I.first;
733 for (auto &Callsite : I.second)
734 FS->addCallsite(*Callsite.second);
735 }
736 }
737
738 private:
739 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
740 IndexCall>;
741
742 uint64_t getStackId(uint64_t IdOrIndex) const;
743 bool calleeMatchesFunc(
744 IndexCall &Call, const FunctionSummary *Func,
745 const FunctionSummary *CallerFunc,
746 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
747 bool findProfiledCalleeThroughTailCalls(
748 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
749 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
750 bool &FoundMultipleCalleeChains);
751 uint64_t getLastStackId(IndexCall &Call);
752 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
753 void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
754 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
755 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
756 IndexCall>::FuncInfo
757 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
758 std::map<CallInfo, CallInfo> &CallMap,
759 std::vector<CallInfo> &CallsWithMetadataInFunc,
760 unsigned CloneNo);
761 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
762 unsigned CloneNo) const;
763
764 // Saves the mapping from each function summary containing memprof records
765 // back to its ValueInfo, for use in checking and debugging.
766 std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
767
768 const ModuleSummaryIndex &Index;
769 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
770 isPrevailing;
771
772 // Saves/owns the callsite info structures synthesized for missing tail call
773 // frames that we discover while building the graph.
774 // It maps from the summary of the function making the tail call, to a map
775 // of callee ValueInfo to corresponding synthesized callsite info.
776 std::unordered_map<FunctionSummary *,
777 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
778 FunctionCalleesToSynthesizedCallsiteInfos;
779 };
780 } // namespace
781
782 namespace llvm {
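// Specializations allowing CallInfo (a pair) and IndexCall (a PointerUnion)
// to be used as DenseMap/MapVector keys, by reusing their base types' traits.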
783 template <>
784 struct DenseMapInfo<typename CallsiteContextGraph<
785 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
786 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
787 template <>
788 struct DenseMapInfo<typename CallsiteContextGraph<
789 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
790 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
791 template <>
792 struct DenseMapInfo<IndexCall>
793 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
794 } // end namespace llvm
795
796 namespace {
797
798 struct FieldSeparator {
799 bool Skip = true;
800 const char *Sep;
801
802 FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
803 };
804
805 raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
806 if (FS.Skip) {
807 FS.Skip = false;
808 return OS;
809 }
810 return OS << FS.Sep;
811 }
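// Example usage (illustrative): streaming FS before each item prints nothing
// for the first item and Sep for subsequent ones, e.g.:
//   FieldSeparator FS;
//   for (auto Id : ContextIds)
//     OS << FS << Id;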
812
813 // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
814 // type we should actually use on the corresponding allocation.
815 // If we can't clone a node that has NotCold+Cold alloc type, we will fall
816 // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
817 // from NotCold.
818 AllocationType allocTypeToUse(uint8_t AllocTypes) {
819 assert(AllocTypes != (uint8_t)AllocationType::None);
820 if (AllocTypes ==
821 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
822 return AllocationType::NotCold;
823 else
824 return (AllocationType)AllocTypes;
825 }
826
827 // Helper to check if the alloc types for all edges recorded in the
828 // InAllocTypes vector match the alloc types for all edges in the Edges
829 // vector.
830 template <typename DerivedCCG, typename FuncTy, typename CallTy>
831 bool allocTypesMatch(
832 const std::vector<uint8_t> &InAllocTypes,
833 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
834 &Edges) {
835 return std::equal(
836 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
837 [](const uint8_t &l,
838 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
839 // Can share if one of the edges is None type - don't
840 // care about the type along that edge as it doesn't
841 // exist for those context ids.
842 if (l == (uint8_t)AllocationType::None ||
843 r->AllocTypes == (uint8_t)AllocationType::None)
844 return true;
845 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
846 });
847 }
848
849 } // end anonymous namespace
850
851 template <typename DerivedCCG, typename FuncTy, typename CallTy>
852 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
853 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
854 const CallInfo &C) {
855 ContextNode *Node = getNodeForAlloc(C);
856 if (Node)
857 return Node;
858
859 return NonAllocationCallToContextNodeMap.lookup(C);
860 }
861
862 template <typename DerivedCCG, typename FuncTy, typename CallTy>
863 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
864 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
865 const CallInfo &C) {
866 return AllocationCallToContextNodeMap.lookup(C);
867 }
868
869 template <typename DerivedCCG, typename FuncTy, typename CallTy>
870 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
871 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
872 uint64_t StackId) {
873 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
874 if (StackEntryNode != StackEntryIdToContextNodeMap.end())
875 return StackEntryNode->second;
876 return nullptr;
877 }
878
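// Add an edge from this (callee) node to Caller carrying ContextId; if an
// edge to Caller already exists, fold the context id and alloc type into it.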
879 template <typename DerivedCCG, typename FuncTy, typename CallTy>
880 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
881 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
882 unsigned int ContextId) {
883 for (auto &Edge : CallerEdges) {
884 if (Edge->Caller == Caller) {
885 Edge->AllocTypes |= (uint8_t)AllocType;
886 Edge->getContextIds().insert(ContextId);
887 return;
888 }
889 }
890 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
891 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
892 CallerEdges.push_back(Edge);
893 Caller->CalleeEdges.push_back(Edge);
894 }
895
896 template <typename DerivedCCG, typename FuncTy, typename CallTy>
897 void CallsiteContextGraph<
898 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
899 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
900 auto Edge = *EI;
901 if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
902 assert(Edge->ContextIds.empty());
903 Edge->Callee->eraseCallerEdge(Edge.get());
904 EI = Node->CalleeEdges.erase(EI);
905 } else
906 ++EI;
907 }
908 }
909
910 template <typename DerivedCCG, typename FuncTy, typename CallTy>
911 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
912 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
913 findEdgeFromCallee(const ContextNode *Callee) {
914 for (const auto &Edge : CalleeEdges)
915 if (Edge->Callee == Callee)
916 return Edge.get();
917 return nullptr;
918 }
919
920 template <typename DerivedCCG, typename FuncTy, typename CallTy>
921 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
922 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
923 findEdgeFromCaller(const ContextNode *Caller) {
924 for (const auto &Edge : CallerEdges)
925 if (Edge->Caller == Caller)
926 return Edge.get();
927 return nullptr;
928 }
929
930 template <typename DerivedCCG, typename FuncTy, typename CallTy>
931 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
932 eraseCalleeEdge(const ContextEdge *Edge) {
933 auto EI = llvm::find_if(
934 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
935 return CalleeEdge.get() == Edge;
936 });
937 assert(EI != CalleeEdges.end());
938 CalleeEdges.erase(EI);
939 }
940
941 template <typename DerivedCCG, typename FuncTy, typename CallTy>
942 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
943 eraseCallerEdge(const ContextEdge *Edge) {
944 auto EI = llvm::find_if(
945 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
946 return CallerEdge.get() == Edge;
947 });
948 assert(EI != CallerEdges.end());
949 CallerEdges.erase(EI);
950 }
951
952 template <typename DerivedCCG, typename FuncTy, typename CallTy>
953 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
954 DenseSet<uint32_t> &ContextIds) {
955 uint8_t BothTypes =
956 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
957 uint8_t AllocType = (uint8_t)AllocationType::None;
958 for (auto Id : ContextIds) {
959 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
960 // Bail early if alloc type reached both, no further refinement.
961 if (AllocType == BothTypes)
962 return AllocType;
963 }
964 return AllocType;
965 }
966
967 template <typename DerivedCCG, typename FuncTy, typename CallTy>
968 uint8_t
969 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
970 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
971 uint8_t BothTypes =
972 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
973 uint8_t AllocType = (uint8_t)AllocationType::None;
974 for (auto Id : Node1Ids) {
975 if (!Node2Ids.count(Id))
976 continue;
977 AllocType |= (uint8_t)ContextIdToAllocationType[Id];
978 // Bail early if alloc type reached both, no further refinement.
979 if (AllocType == BothTypes)
980 return AllocType;
981 }
982 return AllocType;
983 }
984
985 template <typename DerivedCCG, typename FuncTy, typename CallTy>
986 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
987 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
988 if (Node1Ids.size() < Node2Ids.size())
989 return intersectAllocTypesImpl(Node1Ids, Node2Ids);
990 else
991 return intersectAllocTypesImpl(Node2Ids, Node1Ids);
992 }
993
994 template <typename DerivedCCG, typename FuncTy, typename CallTy>
995 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
996 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
997 CallInfo Call, const FuncTy *F) {
998 assert(!getNodeForAlloc(Call));
999 NodeOwner.push_back(
1000 std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
1001 ContextNode *AllocNode = NodeOwner.back().get();
1002 AllocationCallToContextNodeMap[Call] = AllocNode;
1003 NodeToCallingFunc[AllocNode] = F;
1004 // Use LastContextId as a unique id for MIB allocation nodes.
1005 AllocNode->OrigStackOrAllocId = LastContextId;
1006 // Alloc type should be updated as we add in the MIBs. We should assert
1007 // afterwards that it is not still None.
1008 AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1009
1010 return AllocNode;
1011 }
1012
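// Helper to stringify the alloc type bit vector; if both bits are set this
// produces "NotColdCold".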
1013 static std::string getAllocTypeString(uint8_t AllocTypes) {
1014 if (!AllocTypes)
1015 return "None";
1016 std::string Str;
1017 if (AllocTypes & (uint8_t)AllocationType::NotCold)
1018 Str += "NotCold";
1019 if (AllocTypes & (uint8_t)AllocationType::Cold)
1020 Str += "Cold";
1021 return Str;
1022 }
1023
1024 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1025 template <class NodeT, class IteratorT>
1026 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1027 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1028 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1029 uint64_t TotalSize) {
1030 assert(!MemProfReportHintedSizes || TotalSize > 0);
1031 // Treat the hot alloc type as NotCold before the disambiguation for "hot"
1032 // is done.
1033 if (AllocType == AllocationType::Hot)
1034 AllocType = AllocationType::NotCold;
1035
1036 ContextIdToAllocationType[++LastContextId] = AllocType;
1037
1038 if (MemProfReportHintedSizes) {
1039 assert(TotalSize);
1040 ContextIdToTotalSize[LastContextId] = TotalSize;
1041 }
1042
1043 // Update alloc type and context ids for this MIB.
1044 AllocNode->AllocTypes |= (uint8_t)AllocType;
1045
1046 // Now add or update nodes for each stack id in alloc's context.
1047 // Later when processing the stack ids on non-alloc callsites we will adjust
1048 // for any inlining in the context.
1049 ContextNode *PrevNode = AllocNode;
1050 // Look for recursion (direct recursion should have been collapsed by
1051 // module summary analysis, here we should just be detecting mutual
1052 // recursion). Mark these nodes so we don't try to clone.
1053 SmallSet<uint64_t, 8> StackIdSet;
1054 // Skip any on the allocation call (inlining).
1055 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1056 ContextIter != StackContext.end(); ++ContextIter) {
1057 auto StackId = getStackId(*ContextIter);
1058 ContextNode *StackNode = getNodeForStackId(StackId);
1059 if (!StackNode) {
1060 NodeOwner.push_back(
1061 std::make_unique<ContextNode>(/*IsAllocation=*/false));
1062 StackNode = NodeOwner.back().get();
1063 StackEntryIdToContextNodeMap[StackId] = StackNode;
1064 StackNode->OrigStackOrAllocId = StackId;
1065 }
1066 auto Ins = StackIdSet.insert(StackId);
1067 if (!Ins.second)
1068 StackNode->Recursive = true;
1069 StackNode->AllocTypes |= (uint8_t)AllocType;
1070 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1071 PrevNode = StackNode;
1072 }
1073 }
1074
1075 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1076 DenseSet<uint32_t>
1077 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1078 const DenseSet<uint32_t> &StackSequenceContextIds,
1079 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1080 DenseSet<uint32_t> NewContextIds;
1081 for (auto OldId : StackSequenceContextIds) {
1082 NewContextIds.insert(++LastContextId);
1083 OldToNewContextIds[OldId].insert(LastContextId);
1084 assert(ContextIdToAllocationType.count(OldId));
1085 // The new context has the same allocation type as original.
1086 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1087 // For now set this to 0 so we don't duplicate sizes. Not clear how to divvy
1088 // up the size. Assume that if we are able to duplicate context ids that we
1089 // will be able to disambiguate all copies.
1090 ContextIdToTotalSize[LastContextId] = 0;
1091 }
1092 return NewContextIds;
1093 }
1094
1095 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1096 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1097 propagateDuplicateContextIds(
1098 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1099 // Build a set of duplicated context ids corresponding to the input id set.
1100 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1101 DenseSet<uint32_t> NewIds;
1102 for (auto Id : ContextIds)
1103 if (auto NewId = OldToNewContextIds.find(Id);
1104 NewId != OldToNewContextIds.end())
1105 NewIds.insert(NewId->second.begin(), NewId->second.end());
1106 return NewIds;
1107 };
1108
1109 // Recursively update context ids sets along caller edges.
1110 auto UpdateCallers = [&](ContextNode *Node,
1111 DenseSet<const ContextEdge *> &Visited,
1112 auto &&UpdateCallers) -> void {
1113 for (const auto &Edge : Node->CallerEdges) {
1114 auto Inserted = Visited.insert(Edge.get());
1115 if (!Inserted.second)
1116 continue;
1117 ContextNode *NextNode = Edge->Caller;
1118 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1119 // Only need to recursively iterate to NextNode via this caller edge if
1120 // it resulted in any added ids to NextNode.
1121 if (!NewIdsToAdd.empty()) {
1122 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1123 UpdateCallers(NextNode, Visited, UpdateCallers);
1124 }
1125 }
1126 };
1127
1128 DenseSet<const ContextEdge *> Visited;
1129 for (auto &Entry : AllocationCallToContextNodeMap) {
1130 auto *Node = Entry.second;
1131 UpdateCallers(Node, Visited, UpdateCallers);
1132 }
1133 }
1134
1135 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1136 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1137 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1138 // This must be passed by value to make a copy since it will be adjusted
1139 // as ids are moved.
1140 DenseSet<uint32_t> RemainingContextIds) {
1141 auto &OrigEdges =
1142 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1143 // Increment iterator in loop so that we can remove edges as needed.
1144 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1145 auto Edge = *EI;
1146 // Remove any matching context ids from Edge and return the set that were
1147 // found and removed; these become the new edge's context ids. Also update
1148 // the remaining (not found) ids.
1149 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1150 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1151 NotFoundContextIds);
1152 RemainingContextIds.swap(NotFoundContextIds);
1153 // If no matching context ids for this edge, skip it.
1154 if (NewEdgeContextIds.empty()) {
1155 ++EI;
1156 continue;
1157 }
1158 if (TowardsCallee) {
1159 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1160 auto NewEdge = std::make_shared<ContextEdge>(
1161 Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1162 NewNode->CalleeEdges.push_back(NewEdge);
1163 NewEdge->Callee->CallerEdges.push_back(NewEdge);
1164 } else {
1165 uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1166 auto NewEdge = std::make_shared<ContextEdge>(
1167 NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1168 NewNode->CallerEdges.push_back(NewEdge);
1169 NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1170 }
1171 // Remove old edge if context ids empty.
1172 if (Edge->getContextIds().empty()) {
1173 if (TowardsCallee) {
1174 Edge->Callee->eraseCallerEdge(Edge.get());
1175 EI = OrigNode->CalleeEdges.erase(EI);
1176 } else {
1177 Edge->Caller->eraseCalleeEdge(Edge.get());
1178 EI = OrigNode->CallerEdges.erase(EI);
1179 }
1180 continue;
1181 }
1182 ++EI;
1183 }
1184 }
1185
1186 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1187 static void checkEdge(
1188 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1189 // Confirm that alloc type is not None and that we have at least one context
1190 // id.
1191 assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1192 assert(!Edge->ContextIds.empty());
1193 }
1194
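// Check that the node's context ids are the union of its callee edges' ids
// and a superset of its caller edges' ids (contexts may terminate at this
// node).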
1195 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1196 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1197 bool CheckEdges = true) {
1198 if (Node->isRemoved())
1199 return;
1200 #ifndef NDEBUG
1201 // Compute node's context ids once for use in asserts.
1202 auto NodeContextIds = Node->getContextIds();
1203 #endif
1204 // Node's context ids should be the union of both its callee and caller edge
1205 // context ids.
1206 if (Node->CallerEdges.size()) {
1207 DenseSet<uint32_t> CallerEdgeContextIds(
1208 Node->CallerEdges.front()->ContextIds);
1209 for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1210 if (CheckEdges)
1211 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1212 set_union(CallerEdgeContextIds, Edge->ContextIds);
1213 }
1214 // Node can have more context ids than callers if some contexts terminate at
1215 // node and some are longer.
1216 assert(NodeContextIds == CallerEdgeContextIds ||
1217 set_is_subset(CallerEdgeContextIds, NodeContextIds));
1218 }
1219 if (Node->CalleeEdges.size()) {
1220 DenseSet<uint32_t> CalleeEdgeContextIds(
1221 Node->CalleeEdges.front()->ContextIds);
1222 for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1223 if (CheckEdges)
1224 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1225 set_union(CalleeEdgeContextIds, Edge->getContextIds());
1226 }
1227 assert(NodeContextIds == CalleeEdgeContextIds);
1228 }
1229 }
1230
1231 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1232 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1233 assignStackNodesPostOrder(ContextNode *Node,
1234 DenseSet<const ContextNode *> &Visited,
1235 DenseMap<uint64_t, std::vector<CallContextInfo>>
1236 &StackIdToMatchingCalls) {
1237 auto Inserted = Visited.insert(Node);
1238 if (!Inserted.second)
1239 return;
1240 // Post order traversal. Iterate over a copy since we may add nodes and
1241 // therefore new callers during the recursive call, invalidating any
1242 // iterator over the original edge vector. We don't need to process these
1243 // new nodes as they were already processed on creation.
1244 auto CallerEdges = Node->CallerEdges;
1245 for (auto &Edge : CallerEdges) {
1246 // Skip any that have been removed during the recursion.
1247 if (!Edge)
1248 continue;
1249 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1250 }
1251
1252 // If this node's stack id is in the map, update the graph to contain new
1253 // nodes representing any inlining at interior callsites. Note we move the
1254 // associated context ids over to the new nodes.
1255
1256 // Ignore this node if it is for an allocation or we didn't record any
1257 // stack id lists ending at it.
1258 if (Node->IsAllocation ||
1259 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1260 return;
1261
1262 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1263 // Handle the simple case first. A single call with a single stack id.
1264 // In this case there is no need to create any new context nodes, simply
1265 // assign the context node for stack id to this Call.
1266 if (Calls.size() == 1) {
1267 auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1268 if (Ids.size() == 1) {
1269 assert(SavedContextIds.empty());
1270 // It should be this Node
1271 assert(Node == getNodeForStackId(Ids[0]));
1272 if (Node->Recursive)
1273 return;
1274 Node->setCall(Call);
1275 NonAllocationCallToContextNodeMap[Call] = Node;
1276 NodeToCallingFunc[Node] = Func;
1277 return;
1278 }
1279 }
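// Otherwise this call either spans multiple stack ids (due to inlining) or
// shares its last stack id with other calls; handle those cases below by
// synthesizing new context nodes.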
1280
1281 // Find the node for the last stack id, which should be the same
1282 // across all calls recorded for this id, and is this node's id.
1283 uint64_t LastId = Node->OrigStackOrAllocId;
1284 ContextNode *LastNode = getNodeForStackId(LastId);
1285 // We should only have kept stack ids that had nodes.
1286 assert(LastNode);
1287
1288 for (unsigned I = 0; I < Calls.size(); I++) {
1289 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1290 // Skip any for which we didn't assign any ids; these don't get a node in
1291 // the graph.
1292 if (SavedContextIds.empty())
1293 continue;
1294
1295 assert(LastId == Ids.back());
1296
1297 ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1298 assert(FirstNode);
1299
1300 // Recompute the context ids for this stack id sequence (the
1301 // intersection of the context ids of the corresponding nodes).
1302 // Start with the ids we saved in the map for this call, which could be
1303 // duplicated context ids. We have to recompute as we might have overlap
1304 // between the saved context ids for different last nodes, and may have
1305 // already removed some of them during the post order traversal.
1306 set_intersect(SavedContextIds, FirstNode->getContextIds());
1307 ContextNode *PrevNode = nullptr;
1308 for (auto Id : Ids) {
1309 ContextNode *CurNode = getNodeForStackId(Id);
1310 // We should only have kept stack ids that had nodes and weren't
1311 // recursive.
1312 assert(CurNode);
1313 assert(!CurNode->Recursive);
1314 if (!PrevNode) {
1315 PrevNode = CurNode;
1316 continue;
1317 }
1318 auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1319 if (!Edge) {
1320 SavedContextIds.clear();
1321 break;
1322 }
1323 PrevNode = CurNode;
1324 set_intersect(SavedContextIds, Edge->getContextIds());
1325
1326 // If we now have no context ids for the clone, skip this call.
1327 if (SavedContextIds.empty())
1328 break;
1329 }
1330 if (SavedContextIds.empty())
1331 continue;
1332
1333 // Create new context node.
1334 NodeOwner.push_back(
1335 std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1336 ContextNode *NewNode = NodeOwner.back().get();
1337 NodeToCallingFunc[NewNode] = Func;
1338 NonAllocationCallToContextNodeMap[Call] = NewNode;
1339 NewNode->AllocTypes = computeAllocType(SavedContextIds);
1340
1341 // Connect to callees of innermost stack frame in inlined call chain.
1342 // This updates context ids for FirstNode's callees to reflect those
1343 // moved to NewNode.
1344 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1345
1346 // Connect to callers of outermost stack frame in inlined call chain.
1347 // This updates context ids for LastNode's callers to reflect those
1348 // moved to NewNode.
1349 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1350
1351 // Now we need to remove context ids from edges/nodes between FirstNode
1352 // and LastNode.
1353 PrevNode = nullptr;
1354 for (auto Id : Ids) {
1355 ContextNode *CurNode = getNodeForStackId(Id);
1356 // We should only have kept stack ids that had nodes.
1357 assert(CurNode);
1358
1359 // Remove the context ids moved to NewNode from CurNode, and the
1360 // edge from the prior node.
1361 if (PrevNode) {
1362 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1363 assert(PrevEdge);
1364 set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1365 if (PrevEdge->getContextIds().empty()) {
1366 PrevNode->eraseCallerEdge(PrevEdge);
1367 CurNode->eraseCalleeEdge(PrevEdge);
1368 }
1369 }
1370 // Since we update the edges from leaf to tail, only look at the callee
1371 // edges. This isn't an alloc node, so if there are no callee edges, the
1372 // alloc type is None.
1373 CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1374 ? (uint8_t)AllocationType::None
1375 : CurNode->computeAllocType();
1376 PrevNode = CurNode;
1377 }
1378 if (VerifyNodes) {
1379 checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1380 for (auto Id : Ids) {
1381 ContextNode *CurNode = getNodeForStackId(Id);
1382 // We should only have kept stack ids that had nodes.
1383 assert(CurNode);
1384 checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1385 }
1386 }
1387 }
1388 }
1389
1390 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1391 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1392 // Map of stack id to all calls with that as the last (outermost caller)
1393 // callsite id that has a context node (some might not due to pruning
1394 // performed during matching of the allocation profile contexts).
1395 // The CallContextInfo contains the Call and a list of its stack ids with
1396 // ContextNodes, the function containing Call, and the set of context ids
1397 // the analysis will eventually identify for use in any new node created
1398 // for that callsite.
1399 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1400 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1401 for (auto &Call : CallsWithMetadata) {
1402 // Ignore allocations, already handled.
1403 if (AllocationCallToContextNodeMap.count(Call))
1404 continue;
1405 auto StackIdsWithContextNodes =
1406 getStackIdsWithContextNodesForCall(Call.call());
1407 // If there were no nodes created for MIBs on allocs (maybe this was in
1408 // the unambiguous part of the MIB stack that was pruned), ignore.
1409 if (StackIdsWithContextNodes.empty())
1410 continue;
1411 // Otherwise, record this Call along with the list of ids for the last
1412 // (outermost caller) stack id with a node.
1413 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1414 {Call.call(), StackIdsWithContextNodes, Func, {}});
1415 }
1416 }
1417
1418 // First make a pass through all stack ids that correspond to a call,
1419 // as identified in the above loop. Compute the context ids corresponding to
1420 // each of these calls when they correspond to multiple stack ids due to
1421 // inlining. Perform any duplication of context ids required when
1422 // there is more than one call with the same stack ids. Their (possibly newly
1423 // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1424 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1425 for (auto &It : StackIdToMatchingCalls) {
1426 auto &Calls = It.getSecond();
1427 // Skip single calls with a single stack id. These don't need a new node.
1428 if (Calls.size() == 1) {
1429 auto &Ids = std::get<1>(Calls[0]);
1430 if (Ids.size() == 1)
1431 continue;
1432 }
1433 // In order to do the best and maximal matching of inlined calls to context
1434 // node sequences we will sort the vectors of stack ids in descending order
1435 // of length, and within each length, lexicographically by stack id. The
1436 // latter is so that we can specially handle calls that have identical stack
1437 // id sequences (either due to cloning or artificially because of the MIB
1438 // context pruning).
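// For example (hypothetical stack ids), the sequences {1,2,3}, {1,3} and
// {1,2} would be ordered as {1,2,3}, {1,2}, {1,3}: longest first, then
// lexicographically within the same length.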
1439 std::stable_sort(Calls.begin(), Calls.end(),
1440 [](const CallContextInfo &A, const CallContextInfo &B) {
1441 auto &IdsA = std::get<1>(A);
1442 auto &IdsB = std::get<1>(B);
1443 return IdsA.size() > IdsB.size() ||
1444 (IdsA.size() == IdsB.size() && IdsA < IdsB);
1445 });
1446
1447 // Find the node for the last stack id, which should be the same
1448 // across all calls recorded for this id, and is the id for this
1449 // entry in the StackIdToMatchingCalls map.
1450 uint64_t LastId = It.getFirst();
1451 ContextNode *LastNode = getNodeForStackId(LastId);
1452 // We should only have kept stack ids that had nodes.
1453 assert(LastNode);
1454
1455 if (LastNode->Recursive)
1456 continue;
1457
1458 // Initialize the context ids with the last node's. We will subsequently
1459 // refine the context ids by computing the intersection along all edges.
1460 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1461 assert(!LastNodeContextIds.empty());
1462
1463 for (unsigned I = 0; I < Calls.size(); I++) {
1464 auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1465 assert(SavedContextIds.empty());
1466 assert(LastId == Ids.back());
1467
1468 // First compute the context ids for this stack id sequence (the
1469 // intersection of the context ids of the corresponding nodes).
1470 // Start with the remaining saved ids for the last node.
1471 assert(!LastNodeContextIds.empty());
1472 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1473
1474 ContextNode *PrevNode = LastNode;
1475 ContextNode *CurNode = LastNode;
1476 bool Skip = false;
1477
1478 // Iterate backwards through the stack Ids, starting after the last Id
1479 // in the list, which was handled once outside for all Calls.
1480 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1481 auto Id = *IdIter;
1482 CurNode = getNodeForStackId(Id);
1483 // We should only have kept stack ids that had nodes.
1484 assert(CurNode);
1485
1486 if (CurNode->Recursive) {
1487 Skip = true;
1488 break;
1489 }
1490
1491 auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1492 // If there is no edge then the nodes belong to different MIB contexts,
1493 // and we should skip this inlined context sequence. For example, this
1494 // particular inlined context may include stack ids A->B, and we may
1495 // indeed have nodes for both A and B, but it is possible that they were
1496 // never profiled in sequence in a single MIB for any allocation (i.e.
1497 // we might have profiled an allocation that involves the callsite A,
1498 // but through a different one of its callee callsites, and we might
1499 // have profiled an allocation that involves callsite B, but reached
1500 // from a different caller callsite).
1501 if (!Edge) {
1502 Skip = true;
1503 break;
1504 }
1505 PrevNode = CurNode;
1506
1507 // Update the context ids, which is the intersection of the ids along
1508 // all edges in the sequence.
1509 set_intersect(StackSequenceContextIds, Edge->getContextIds());
1510
1511 // If we now have no context ids for the clone, skip this call.
1512 if (StackSequenceContextIds.empty()) {
1513 Skip = true;
1514 break;
1515 }
1516 }
1517 if (Skip)
1518 continue;
1519
1520 // If some of this call's stack ids did not have corresponding nodes (due
1521 // to pruning), don't include any context ids for contexts that extend
1522 // beyond these nodes. Otherwise we would be matching part of unrelated /
1523 // not fully matching stack contexts. To do this, subtract any context ids
1524 // found in caller nodes of the last node found above.
1525 if (Ids.back() != getLastStackId(Call)) {
1526 for (const auto &PE : LastNode->CallerEdges) {
1527 set_subtract(StackSequenceContextIds, PE->getContextIds());
1528 if (StackSequenceContextIds.empty())
1529 break;
1530 }
1531 // If we now have no context ids for the clone, skip this call.
1532 if (StackSequenceContextIds.empty())
1533 continue;
1534 }
1535
1536 // Check if the next set of stack ids is the same (since the Calls vector
1537 // of tuples is sorted by the stack ids we can just look at the next one).
1538 bool DuplicateContextIds = false;
1539 if (I + 1 < Calls.size()) {
1540 auto NextIds = std::get<1>(Calls[I + 1]);
1541 DuplicateContextIds = Ids == NextIds;
1542 }
1543
1544 // If we don't have duplicate context ids, then we can assign all the
1545 // context ids computed for the original node sequence to this call.
1546 // If there are duplicate calls with the same stack ids then we synthesize
1547 // new context ids that are duplicates of the originals. These are
1548 // assigned to SavedContextIds, which is a reference into the map entry
1549 // for this call, allowing us to access these ids later on.
1550 OldToNewContextIds.reserve(OldToNewContextIds.size() +
1551 StackSequenceContextIds.size());
1552 SavedContextIds =
1553 DuplicateContextIds
1554 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1555 : StackSequenceContextIds;
1556 assert(!SavedContextIds.empty());
1557
1558 if (!DuplicateContextIds) {
1559 // Update saved last node's context ids to remove those that are
1560 // assigned to other calls, so that it is ready for the next call at
1561 // this stack id.
1562 set_subtract(LastNodeContextIds, StackSequenceContextIds);
1563 if (LastNodeContextIds.empty())
1564 break;
1565 }
1566 }
1567 }
1568
1569 // Propagate the duplicate context ids over the graph.
1570 propagateDuplicateContextIds(OldToNewContextIds);
1571
1572 if (VerifyCCG)
1573 check();
1574
1575 // Now perform a post-order traversal over the graph, starting with the
1576 // allocation nodes, essentially processing nodes from callers to callees.
1577 // For any that contains an id in the map, update the graph to contain new
1578 // nodes representing any inlining at interior callsites. Note we move the
1579 // associated context ids over to the new nodes.
1580 DenseSet<const ContextNode *> Visited;
1581 for (auto &Entry : AllocationCallToContextNodeMap)
1582 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1583 if (VerifyCCG)
1584 check();
1585 }
1586
1587 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1588 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1589 Call->getMetadata(LLVMContext::MD_callsite));
1590 return CallsiteContext.back();
1591 }
1592
1593 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1594 assert(isa<CallsiteInfo *>(Call.getBase()));
1595 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1596 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1597 // Need to convert index into stack id.
1598 return Index.getStackIdAtIndex(CallsiteContext.back());
1599 }
1600
1601 static const std::string MemProfCloneSuffix = ".memprof.";
1602
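// Returns the name to use for a function clone. E.g., a hypothetical function
// "foo" with CloneNo 2 becomes "foo.memprof.2", while CloneNo 0 keeps "foo".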
1603 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1604 // We use CloneNo == 0 to refer to the original version, which doesn't get
1605 // renamed with a suffix.
1606 if (!CloneNo)
1607 return Base.str();
1608 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1609 }
1610
1611 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1612 const Instruction *Call,
1613 unsigned CloneNo) const {
1614 return (Twine(Call->getFunction()->getName()) + " -> " +
1615 cast<CallBase>(Call)->getCalledFunction()->getName())
1616 .str();
1617 }
1618
1619 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1620 const IndexCall &Call,
1621 unsigned CloneNo) const {
1622 auto VI = FSToVIMap.find(Func);
1623 assert(VI != FSToVIMap.end());
1624 if (isa<AllocInfo *>(Call.getBase()))
1625 return (VI->second.name() + " -> alloc").str();
1626 else {
1627 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1628 return (VI->second.name() + " -> " +
1629 getMemProfFuncName(Callsite->Callee.name(),
1630 Callsite->Clones[CloneNo]))
1631 .str();
1632 }
1633 }
1634
1635 std::vector<uint64_t>
1636 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1637 Instruction *Call) {
1638 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1639 Call->getMetadata(LLVMContext::MD_callsite));
1640 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1641 CallsiteContext);
1642 }
1643
1644 std::vector<uint64_t>
1645 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1646 assert(isa<CallsiteInfo *>(Call.getBase()));
1647 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1648 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1649 return getStackIdsWithContextNodes<CallsiteInfo,
1650 SmallVector<unsigned>::const_iterator>(
1651 CallsiteContext);
1652 }
1653
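// Walks the callsite context from the innermost frame outwards, collecting
// stack ids until reaching one with no context node (e.g. because it was in
// the pruned, unambiguous portion of the MIB contexts).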
1654 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1655 template <class NodeT, class IteratorT>
1656 std::vector<uint64_t>
1657 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1658 CallStack<NodeT, IteratorT> &CallsiteContext) {
1659 std::vector<uint64_t> StackIds;
1660 for (auto IdOrIndex : CallsiteContext) {
1661 auto StackId = getStackId(IdOrIndex);
1662 ContextNode *Node = getNodeForStackId(StackId);
1663 if (!Node)
1664 break;
1665 StackIds.push_back(StackId);
1666 }
1667 return StackIds;
1668 }
1669
1670 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1671 Module &M,
1672 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1673 : Mod(M), OREGetter(OREGetter) {
1674 for (auto &F : M) {
1675 std::vector<CallInfo> CallsWithMetadata;
1676 for (auto &BB : F) {
1677 for (auto &I : BB) {
1678 if (!isa<CallBase>(I))
1679 continue;
1680 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1681 CallsWithMetadata.push_back(&I);
1682 auto *AllocNode = addAllocNode(&I, &F);
1683 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1684 assert(CallsiteMD);
1685 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1686 // Add all of the MIBs and their stack nodes.
1687 for (auto &MDOp : MemProfMD->operands()) {
1688 auto *MIBMD = cast<const MDNode>(MDOp);
1689 MDNode *StackNode = getMIBStackNode(MIBMD);
1690 assert(StackNode);
1691 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1692 addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1693 AllocNode, StackContext, CallsiteContext,
1694 getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD));
1695 }
1696 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1697 // Memprof and callsite metadata on memory allocations no longer
1698 // needed.
1699 I.setMetadata(LLVMContext::MD_memprof, nullptr);
1700 I.setMetadata(LLVMContext::MD_callsite, nullptr);
1701 }
1702 // For callsite metadata, add to list for this function for later use.
1703 else if (I.getMetadata(LLVMContext::MD_callsite))
1704 CallsWithMetadata.push_back(&I);
1705 }
1706 }
1707 if (!CallsWithMetadata.empty())
1708 FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1709 }
1710
1711 if (DumpCCG) {
1712 dbgs() << "CCG before updating call stack chains:\n";
1713 dbgs() << *this;
1714 }
1715
1716 if (ExportToDot)
1717 exportToDot("prestackupdate");
1718
1719 updateStackNodes();
1720
1721 handleCallsitesWithMultipleTargets();
1722
1723 // Strip off remaining callsite metadata, no longer needed.
1724 for (auto &FuncEntry : FuncToCallsWithMetadata)
1725 for (auto &Call : FuncEntry.second)
1726 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1727 }
1728
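// Summary-index-based graph construction, mirroring the IR-based constructor
// above: walk each prevailing FunctionSummary, add allocation nodes and their
// MIB stack nodes, and record callsite summaries for the later stack node
// update and multiple-target handling passes.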
1729 IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1730 ModuleSummaryIndex &Index,
1731 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1732 isPrevailing)
1733 : Index(Index), isPrevailing(isPrevailing) {
1734 for (auto &I : Index) {
1735 auto VI = Index.getValueInfo(I);
1736 for (auto &S : VI.getSummaryList()) {
1737 // We should only add the prevailing nodes. Otherwise we may try to clone
1738 // in a weak copy that won't be linked (and may be different than the
1739 // prevailing version).
1740 // We only keep the memprof summary on the prevailing copy now when
1741 // building the combined index, as a space optimization; however, don't
1742 // rely on this optimization. The linker doesn't resolve local linkage
1743 // values, so don't check whether those are prevailing.
1744 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1745 !isPrevailing(VI.getGUID(), S.get()))
1746 continue;
1747 auto *FS = dyn_cast<FunctionSummary>(S.get());
1748 if (!FS)
1749 continue;
1750 std::vector<CallInfo> CallsWithMetadata;
1751 if (!FS->allocs().empty()) {
1752 for (auto &AN : FS->mutableAllocs()) {
1753 // This can happen because of recursion elimination handling that
1754 // currently exists in ModuleSummaryAnalysis. Skip these for now.
1755 // We still added them to the summary because we need to be able to
1756 // correlate properly in applyImport in the backends.
1757 if (AN.MIBs.empty())
1758 continue;
1759 CallsWithMetadata.push_back({&AN});
1760 auto *AllocNode = addAllocNode({&AN}, FS);
1761 // Pass an empty CallStack to the CallsiteContext (second)
1762 // parameter, since for ThinLTO we already collapsed out the inlined
1763 // stack ids on the allocation call during ModuleSummaryAnalysis.
1764 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1765 EmptyContext;
1766 unsigned I = 0;
1767 assert(!MemProfReportHintedSizes ||
1768 AN.TotalSizes.size() == AN.MIBs.size());
1769 // Now add all of the MIBs and their stack nodes.
1770 for (auto &MIB : AN.MIBs) {
1771 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1772 StackContext(&MIB);
1773 uint64_t TotalSize = 0;
1774 if (MemProfReportHintedSizes)
1775 TotalSize = AN.TotalSizes[I];
1776 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1777 AllocNode, StackContext, EmptyContext, MIB.AllocType,
1778 TotalSize);
1779 I++;
1780 }
1781 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1782 // Initialize version 0 on the summary alloc node to the current alloc
1783 // type, unless it has both types, in which case make it default, so that
1784 // if we aren't able to clone, the original version always ends up with
1785 // the default allocation behavior.
1786 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1787 }
1788 }
1789 // For callsite metadata, add to list for this function for later use.
1790 if (!FS->callsites().empty())
1791 for (auto &SN : FS->mutableCallsites())
1792 CallsWithMetadata.push_back({&SN});
1793
1794 if (!CallsWithMetadata.empty())
1795 FuncToCallsWithMetadata[FS] = CallsWithMetadata;
1796
1797 if (!FS->allocs().empty() || !FS->callsites().empty())
1798 FSToVIMap[FS] = VI;
1799 }
1800 }
1801
1802 if (DumpCCG) {
1803 dbgs() << "CCG before updating call stack chains:\n";
1804 dbgs() << *this;
1805 }
1806
1807 if (ExportToDot)
1808 exportToDot("prestackupdate");
1809
1810 updateStackNodes();
1811
1812 handleCallsitesWithMultipleTargets();
1813 }
1814
1815 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1816 void CallsiteContextGraph<DerivedCCG, FuncTy,
1817 CallTy>::handleCallsitesWithMultipleTargets() {
1818 // Look for and workaround callsites that call multiple functions.
1819 // This can happen for indirect calls, which need better handling, and in
1820 // rarer cases (e.g. macro expansion).
1821 // TODO: To fix this for indirect calls we will want to perform speculative
1822 // devirtualization using either the normal PGO info with ICP, or using the
1823 // information in the profiled MemProf contexts. We can do this prior to
1824 // this transformation for regular LTO, and for ThinLTO we can simulate that
1825 // effect in the summary and perform the actual speculative devirtualization
1826 // while cloning in the ThinLTO backend.
1827
1828 // Keep track of the new nodes synthesized for discovered tail calls missing
1829 // from the profiled contexts.
1830 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
1831
1832 for (auto &Entry : NonAllocationCallToContextNodeMap) {
1833 auto *Node = Entry.second;
1834 assert(Node->Clones.empty());
1835 // Check all node callees and see if in the same function.
1836 auto Call = Node->Call.call();
1837 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
1838 ++EI) {
1839 auto Edge = *EI;
1840 if (!Edge->Callee->hasCall())
1841 continue;
1842 assert(NodeToCallingFunc.count(Edge->Callee));
1843 // Check if the called function matches that of the callee node.
1844 if (calleesMatch(Call, EI, TailCallToContextNodeMap))
1845 continue;
1846 RemovedEdgesWithMismatchedCallees++;
1847 // Work around by setting Node to have a null call, so it gets
1848 // skipped during cloning. Otherwise assignFunctions will assert
1849 // because its data structures are not designed to handle this case.
1850 Node->setCall(CallInfo());
1851 break;
1852 }
1853 }
1854
1855 // Remove all mismatched nodes identified in the above loop from the node map
1856 // (checking whether they have a null call which is set above). For a
1857 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
1858 // to do the removal via remove_if than by individually erasing entries above.
1859 NonAllocationCallToContextNodeMap.remove_if(
1860 [](const auto &it) { return !it.second->hasCall(); });
1861
1862 // Add the new nodes after the above loop so that the iteration is not
1863 // invalidated.
1864 for (auto &[Call, Node] : TailCallToContextNodeMap)
1865 NonAllocationCallToContextNodeMap[Call] = Node;
1866 }
1867
1868 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1869 // In the Module (IR) case this is already the Id.
1870 return IdOrIndex;
1871 }
1872
1873 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1874 // In the Index case this is an index into the stack id list in the summary
1875 // index, convert it to an Id.
1876 return Index.getStackIdAtIndex(IdOrIndex);
1877 }
1878
1879 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1880 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
1881 CallTy Call, EdgeIter &EI,
1882 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
1883 auto Edge = *EI;
1884 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
1885 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
1886 // Will be populated in order of callee to caller if we find a chain of tail
1887 // calls between the profiled caller and callee.
1888 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
1889 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1890 FoundCalleeChain))
1891 return false;
1892
1893 // The usual case where the profiled callee matches that of the IR/summary.
1894 if (FoundCalleeChain.empty())
1895 return true;
1896
1897 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
1898 auto *CurEdge = Callee->findEdgeFromCaller(Caller);
1899 // If there is already an edge between these nodes, simply update it and
1900 // return.
1901 if (CurEdge) {
1902 CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
1903 Edge->ContextIds.end());
1904 CurEdge->AllocTypes |= Edge->AllocTypes;
1905 return;
1906 }
1907 // Otherwise, create a new edge and insert it into the caller and callee
1908 // lists.
1909 auto NewEdge = std::make_shared<ContextEdge>(
1910 Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
1911 Callee->CallerEdges.push_back(NewEdge);
1912 if (Caller == Edge->Caller) {
1913 // If we are inserting the new edge into the current edge's caller, insert
1914 // the new edge before the current iterator position, and then increment
1915 // back to the current edge.
1916 EI = Caller->CalleeEdges.insert(EI, NewEdge);
1917 ++EI;
1918 assert(*EI == Edge &&
1919 "Iterator position not restored after insert and increment");
1920 } else
1921 Caller->CalleeEdges.push_back(NewEdge);
1922 };
1923
1924 // Create new nodes for each found callee and connect in between the profiled
1925 // caller and callee.
1926 auto *CurCalleeNode = Edge->Callee;
1927 for (auto &[NewCall, Func] : FoundCalleeChain) {
1928 ContextNode *NewNode = nullptr;
1929 // First check if we have already synthesized a node for this tail call.
1930 if (TailCallToContextNodeMap.count(NewCall)) {
1931 NewNode = TailCallToContextNodeMap[NewCall];
1932 NewNode->AllocTypes |= Edge->AllocTypes;
1933 } else {
1934 FuncToCallsWithMetadata[Func].push_back({NewCall});
1935 // Create Node and record node info.
1936 NodeOwner.push_back(
1937 std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));
1938 NewNode = NodeOwner.back().get();
1939 NodeToCallingFunc[NewNode] = Func;
1940 TailCallToContextNodeMap[NewCall] = NewNode;
1941 NewNode->AllocTypes = Edge->AllocTypes;
1942 }
1943
1944 // Hook up node to its callee node
1945 AddEdge(NewNode, CurCalleeNode);
1946
1947 CurCalleeNode = NewNode;
1948 }
1949
1950 // Hook up edge's original caller to new callee node.
1951 AddEdge(Edge->Caller, CurCalleeNode);
1952
1953 // Remove old edge
1954 Edge->Callee->eraseCallerEdge(Edge.get());
1955 EI = Edge->Caller->CalleeEdges.erase(EI);
1956
1957 // To simplify the increment of EI in the caller, subtract one from EI.
1958 // In the final AddEdge call we would have either added a new callee edge,
1959 // to Edge->Caller, or found an existing one. Either way we are guaranteed
1960 // that there is at least one callee edge.
1961 assert(!Edge->Caller->CalleeEdges.empty());
1962 --EI;
1963
1964 return true;
1965 }
1966
1967 bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1968 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
1969 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
1970 bool &FoundMultipleCalleeChains) {
1971 // Stop recursive search if we have already explored the maximum specified
1972 // depth.
1973 if (Depth > TailCallSearchDepth)
1974 return false;
1975
1976 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
1977 FoundCalleeChain.push_back({Callsite, F});
1978 };
1979
1980 auto *CalleeFunc = dyn_cast<Function>(CurCallee);
1981 if (!CalleeFunc) {
1982 auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
1983 assert(Alias);
1984 CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
1985 assert(CalleeFunc);
1986 }
1987
1988 // Look for tail calls in this function, and check if they either call the
1989 // profiled callee directly, or indirectly (via a recursive search).
1990 // Only succeed if there is a single unique tail call chain found between the
1991 // profiled caller and callee, otherwise we could perform incorrect cloning.
1992 bool FoundSingleCalleeChain = false;
1993 for (auto &BB : *CalleeFunc) {
1994 for (auto &I : BB) {
1995 auto *CB = dyn_cast<CallBase>(&I);
1996 if (!CB || !CB->isTailCall())
1997 continue;
1998 auto *CalledValue = CB->getCalledOperand();
1999 auto *CalledFunction = CB->getCalledFunction();
2000 if (CalledValue && !CalledFunction) {
2001 CalledValue = CalledValue->stripPointerCasts();
2002 // Stripping pointer casts can reveal a called function.
2003 CalledFunction = dyn_cast<Function>(CalledValue);
2004 }
2005 // Check if this is an alias to a function. If so, get the
2006 // called aliasee for the checks below.
2007 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2008 assert(!CalledFunction &&
2009 "Expected null called function in callsite for alias");
2010 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2011 }
2012 if (!CalledFunction)
2013 continue;
2014 if (CalledFunction == ProfiledCallee) {
2015 if (FoundSingleCalleeChain) {
2016 FoundMultipleCalleeChains = true;
2017 return false;
2018 }
2019 FoundSingleCalleeChain = true;
2020 FoundProfiledCalleeCount++;
2021 FoundProfiledCalleeDepth += Depth;
2022 if (Depth > FoundProfiledCalleeMaxDepth)
2023 FoundProfiledCalleeMaxDepth = Depth;
2024 SaveCallsiteInfo(&I, CalleeFunc);
2025 } else if (findProfiledCalleeThroughTailCalls(
2026 ProfiledCallee, CalledFunction, Depth + 1,
2027 FoundCalleeChain, FoundMultipleCalleeChains)) {
2028 // findProfiledCalleeThroughTailCalls should not have returned
2029 // true if FoundMultipleCalleeChains.
2030 assert(!FoundMultipleCalleeChains);
2031 if (FoundSingleCalleeChain) {
2032 FoundMultipleCalleeChains = true;
2033 return false;
2034 }
2035 FoundSingleCalleeChain = true;
2036 SaveCallsiteInfo(&I, CalleeFunc);
2037 } else if (FoundMultipleCalleeChains)
2038 return false;
2039 }
2040 }
2041
2042 return FoundSingleCalleeChain;
2043 }
2044
2045 bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2046 Instruction *Call, const Function *Func, const Function *CallerFunc,
2047 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2048 auto *CB = dyn_cast<CallBase>(Call);
2049 if (!CB->getCalledOperand())
2050 return false;
2051 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2052 auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2053 if (CalleeFunc == Func)
2054 return true;
2055 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2056 if (Alias && Alias->getAliasee() == Func)
2057 return true;
2058
2059 // Recursively search for the profiled callee through tail calls starting with
2060 // the actual Callee. The discovered tail call chain is saved in
2061 // FoundCalleeChain, and we will fixup the graph to include these callsites
2062 // after returning.
2063 // FIXME: We will currently redo the same recursive walk if we find the same
2064 // mismatched callee from another callsite. We can improve this with more
2065 // bookkeeping of the created chain of new nodes for each mismatch.
2066 unsigned Depth = 1;
2067 bool FoundMultipleCalleeChains = false;
2068 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2069 FoundCalleeChain,
2070 FoundMultipleCalleeChains)) {
2071 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2072 << Func->getName() << " from " << CallerFunc->getName()
2073 << " that actually called " << CalleeVal->getName()
2074 << (FoundMultipleCalleeChains
2075 ? " (found multiple possible chains)"
2076 : "")
2077 << "\n");
2078 if (FoundMultipleCalleeChains)
2079 FoundProfiledCalleeNonUniquelyCount++;
2080 return false;
2081 }
2082
2083 return true;
2084 }
2085
2086 bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2087 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2088 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2089 bool &FoundMultipleCalleeChains) {
2090 // Stop recursive search if we have already explored the maximum specified
2091 // depth.
2092 if (Depth > TailCallSearchDepth)
2093 return false;
2094
2095 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2096 // Make a CallsiteInfo for each discovered callee, if one hasn't already
2097 // been synthesized.
2098 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2099 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2100 // StackIds is empty (we don't have debug info available in the index for
2101 // these callsites)
2102 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2103 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2104 CallsiteInfo *NewCallsiteInfo =
2105 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2106 FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2107 };
2108
2109 // Look for tail calls in this function, and check if they either call the
2110 // profiled callee directly, or indirectly (via a recursive search).
2111 // Only succeed if there is a single unique tail call chain found between the
2112 // profiled caller and callee, otherwise we could perform incorrect cloning.
2113 bool FoundSingleCalleeChain = false;
2114 for (auto &S : CurCallee.getSummaryList()) {
2115 if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2116 !isPrevailing(CurCallee.getGUID(), S.get()))
2117 continue;
2118 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2119 if (!FS)
2120 continue;
2121 auto FSVI = CurCallee;
2122 auto *AS = dyn_cast<AliasSummary>(S.get());
2123 if (AS)
2124 FSVI = AS->getAliaseeVI();
2125 for (auto &CallEdge : FS->calls()) {
2126 if (!CallEdge.second.hasTailCall())
2127 continue;
2128 if (CallEdge.first == ProfiledCallee) {
2129 if (FoundSingleCalleeChain) {
2130 FoundMultipleCalleeChains = true;
2131 return false;
2132 }
2133 FoundSingleCalleeChain = true;
2134 FoundProfiledCalleeCount++;
2135 FoundProfiledCalleeDepth += Depth;
2136 if (Depth > FoundProfiledCalleeMaxDepth)
2137 FoundProfiledCalleeMaxDepth = Depth;
2138 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2139 // Add FS to FSToVIMap in case it isn't already there.
2140 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2141 FSToVIMap[FS] = FSVI;
2142 } else if (findProfiledCalleeThroughTailCalls(
2143 ProfiledCallee, CallEdge.first, Depth + 1,
2144 FoundCalleeChain, FoundMultipleCalleeChains)) {
2145 // findProfiledCalleeThroughTailCalls should not have returned
2146 // true if FoundMultipleCalleeChains.
2147 assert(!FoundMultipleCalleeChains);
2148 if (FoundSingleCalleeChain) {
2149 FoundMultipleCalleeChains = true;
2150 return false;
2151 }
2152 FoundSingleCalleeChain = true;
2153 CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2154 // Add FS to FSToVIMap in case it isn't already there.
2155 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2156 FSToVIMap[FS] = FSVI;
2157 } else if (FoundMultipleCalleeChains)
2158 return false;
2159 }
2160 }
2161
2162 return FoundSingleCalleeChain;
2163 }
2164
2165 bool IndexCallsiteContextGraph::calleeMatchesFunc(
2166 IndexCall &Call, const FunctionSummary *Func,
2167 const FunctionSummary *CallerFunc,
2168 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2169 ValueInfo Callee =
2170 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2171 // If there is no summary list then this is a call to an externally defined
2172 // symbol.
2173 AliasSummary *Alias =
2174 Callee.getSummaryList().empty()
2175 ? nullptr
2176 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2177 assert(FSToVIMap.count(Func));
2178 auto FuncVI = FSToVIMap[Func];
2179 if (Callee == FuncVI ||
2180 // If callee is an alias, check the aliasee, since only function
2181 // summary base objects will contain the stack node summaries and thus
2182 // get a context node.
2183 (Alias && Alias->getAliaseeVI() == FuncVI))
2184 return true;
2185
2186 // Recursively search for the profiled callee through tail calls starting with
2187 // the actual Callee. The discovered tail call chain is saved in
2188 // FoundCalleeChain, and we will fixup the graph to include these callsites
2189 // after returning.
2190 // FIXME: We will currently redo the same recursive walk if we find the same
2191 // mismatched callee from another callsite. We can improve this with more
2192 // bookkeeping of the created chain of new nodes for each mismatch.
2193 unsigned Depth = 1;
2194 bool FoundMultipleCalleeChains = false;
2195 if (!findProfiledCalleeThroughTailCalls(
2196 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2197 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2198 << " from " << FSToVIMap[CallerFunc]
2199 << " that actually called " << Callee
2200 << (FoundMultipleCalleeChains
2201 ? " (found multiple possible chains)"
2202 : "")
2203 << "\n");
2204 if (FoundMultipleCalleeChains)
2205 FoundProfiledCalleeNonUniquelyCount++;
2206 return false;
2207 }
2208
2209 return true;
2210 }
2211
2212 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2213 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2214 const {
2215 print(dbgs());
2216 dbgs() << "\n";
2217 }
2218
2219 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2220 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2221 raw_ostream &OS) const {
2222 OS << "Node " << this << "\n";
2223 OS << "\t";
2224 printCall(OS);
2225 if (Recursive)
2226 OS << " (recursive)";
2227 OS << "\n";
2228 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2229 OS << "\tContextIds:";
2230 // Make a copy of the computed context ids that we can sort for stability.
2231 auto ContextIds = getContextIds();
2232 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2233 std::sort(SortedIds.begin(), SortedIds.end());
2234 for (auto Id : SortedIds)
2235 OS << " " << Id;
2236 OS << "\n";
2237 OS << "\tCalleeEdges:\n";
2238 for (auto &Edge : CalleeEdges)
2239 OS << "\t\t" << *Edge << "\n";
2240 OS << "\tCallerEdges:\n";
2241 for (auto &Edge : CallerEdges)
2242 OS << "\t\t" << *Edge << "\n";
2243 if (!Clones.empty()) {
2244 OS << "\tClones: ";
2245 FieldSeparator FS;
2246 for (auto *Clone : Clones)
2247 OS << FS << Clone;
2248 OS << "\n";
2249 } else if (CloneOf) {
2250 OS << "\tClone of " << CloneOf << "\n";
2251 }
2252 }
2253
2254 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2255 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2256 const {
2257 print(dbgs());
2258 dbgs() << "\n";
2259 }
2260
2261 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2262 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2263 raw_ostream &OS) const {
2264 OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2265 << " AllocTypes: " << getAllocTypeString(AllocTypes);
2266 OS << " ContextIds:";
2267 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2268 std::sort(SortedIds.begin(), SortedIds.end());
2269 for (auto Id : SortedIds)
2270 OS << " " << Id;
2271 }
2272
2273 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2274 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2275 print(dbgs());
2276 }
2277
2278 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2279 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2280 raw_ostream &OS) const {
2281 OS << "Callsite Context Graph:\n";
2282 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2283 for (const auto Node : nodes<GraphType>(this)) {
2284 if (Node->isRemoved())
2285 continue;
2286 Node->print(OS);
2287 OS << "\n";
2288 }
2289 }
2290
2291 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2292 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
2293 raw_ostream &OS) const {
2294 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2295 for (const auto Node : nodes<GraphType>(this)) {
2296 if (Node->isRemoved())
2297 continue;
2298 if (!Node->IsAllocation)
2299 continue;
2300 DenseSet<uint32_t> ContextIds = Node->getContextIds();
2301 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2302 std::sort(SortedIds.begin(), SortedIds.end());
2303 for (auto Id : SortedIds) {
2304 auto SizeI = ContextIdToTotalSize.find(Id);
2305 assert(SizeI != ContextIdToTotalSize.end());
2306 auto TypeI = ContextIdToAllocationType.find(Id);
2307 assert(TypeI != ContextIdToAllocationType.end());
2308 OS << getAllocTypeString((uint8_t)TypeI->second) << " context " << Id
2309 << " with total size " << SizeI->second << " is "
2310 << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
2311 }
2312 }
2313 }
2314
2315 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2316 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2317 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2318 for (const auto Node : nodes<GraphType>(this)) {
2319 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2320 for (auto &Edge : Node->CallerEdges)
2321 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2322 }
2323 }
2324
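// GraphTraits specialization so generic graph utilities (e.g. the DOT writer
// used by exportToDot) can traverse the context graph: nodes are enumerated
// from NodeOwner and a node's children are reached via its callee edges.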
2325 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2326 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2327 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2328 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2329
2330 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2331 static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2332
2333 using nodes_iterator =
2334 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2335 decltype(&getNode)>;
2336
2337 static nodes_iterator nodes_begin(GraphType G) {
2338 return nodes_iterator(G->NodeOwner.begin(), &getNode);
2339 }
2340
2341 static nodes_iterator nodes_end(GraphType G) {
2342 return nodes_iterator(G->NodeOwner.end(), &getNode);
2343 }
2344
2345 static NodeRef getEntryNode(GraphType G) {
2346 return G->NodeOwner.begin()->get();
2347 }
2348
2349 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2350 static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2351 GetCallee(const EdgePtrTy &P) {
2352 return P->Callee;
2353 }
2354
2355 using ChildIteratorType =
2356 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2357 DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2358 decltype(&GetCallee)>;
2359
2360 static ChildIteratorType child_begin(NodeRef N) {
2361 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2362 }
2363
2364 static ChildIteratorType child_end(NodeRef N) {
2365 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2366 }
2367 };
2368
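// DOT rendering: node labels show the original stack/alloc id and the call,
// fill colors encode the allocation types (see getColor), clones are outlined
// in blue, and nodes removed from the graph are hidden.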
2369 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2370 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2371 : public DefaultDOTGraphTraits {
2372 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2373
2374 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2375 using GTraits = GraphTraits<GraphType>;
2376 using NodeRef = typename GTraits::NodeRef;
2377 using ChildIteratorType = typename GTraits::ChildIteratorType;
2378
2379 static std::string getNodeLabel(NodeRef Node, GraphType G) {
2380 std::string LabelString =
2381 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2382 Twine(Node->OrigStackOrAllocId))
2383 .str();
2384 LabelString += "\n";
2385 if (Node->hasCall()) {
2386 auto Func = G->NodeToCallingFunc.find(Node);
2387 assert(Func != G->NodeToCallingFunc.end());
2388 LabelString +=
2389 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2390 } else {
2391 LabelString += "null call";
2392 if (Node->Recursive)
2393 LabelString += " (recursive)";
2394 else
2395 LabelString += " (external)";
2396 }
2397 return LabelString;
2398 }
2399
2400 static std::string getNodeAttributes(NodeRef Node, GraphType) {
2401 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2402 getContextIds(Node->getContextIds()) + "\"")
2403 .str();
2404 AttributeString +=
2405 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2406 AttributeString += ",style=\"filled\"";
2407 if (Node->CloneOf) {
2408 AttributeString += ",color=\"blue\"";
2409 AttributeString += ",style=\"filled,bold,dashed\"";
2410 } else
2411 AttributeString += ",style=\"filled\"";
2412 return AttributeString;
2413 }
2414
2415 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2416 GraphType) {
2417 auto &Edge = *(ChildIter.getCurrent());
2418 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2419 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2420 .str();
2421 }
2422
2423 // Since the NodeOwner list includes nodes that are no longer connected to
2424 // the graph, skip them here.
2425 static bool isNodeHidden(NodeRef Node, GraphType) {
2426 return Node->isRemoved();
2427 }
2428
2429 private:
2430 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2431 std::string IdString = "ContextIds:";
2432 if (ContextIds.size() < 100) {
2433 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2434 std::sort(SortedIds.begin(), SortedIds.end());
2435 for (auto Id : SortedIds)
2436 IdString += (" " + Twine(Id)).str();
2437 } else {
2438 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2439 }
2440 return IdString;
2441 }
2442
2443 static std::string getColor(uint8_t AllocTypes) {
2444 if (AllocTypes == (uint8_t)AllocationType::NotCold)
2445 // Color "brown1" actually looks like a lighter red.
2446 return "brown1";
2447 if (AllocTypes == (uint8_t)AllocationType::Cold)
2448 return "cyan";
2449 if (AllocTypes ==
2450 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
2451 // Lighter purple.
2452 return "mediumorchid1";
2453 return "gray";
2454 }
2455
2456 static std::string getNodeId(NodeRef Node) {
2457 std::stringstream SStream;
2458 SStream << std::hex << "N0x" << (unsigned long long)Node;
2459 std::string Result = SStream.str();
2460 return Result;
2461 }
2462 };
2463
2464 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2465 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
2466 std::string Label) const {
2467 WriteGraph(this, "", false, Label,
2468 DotFilePathPrefix + "ccg." + Label + ".dot");
2469 }
2470
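// Creates a new clone of Edge's callee node and moves the given context ids
// (or all of Edge's ids when ContextIdsToMove is empty) onto it via
// moveEdgeToExistingCalleeClone below.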
2471 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2472 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
2473 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
2474 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI,
2475 DenseSet<uint32_t> ContextIdsToMove) {
2476 ContextNode *Node = Edge->Callee;
2477 NodeOwner.push_back(
2478 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
2479 ContextNode *Clone = NodeOwner.back().get();
2480 Node->addClone(Clone);
2481 assert(NodeToCallingFunc.count(Node));
2482 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
2483 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
2484 ContextIdsToMove);
2485 return Clone;
2486 }
2487
2488 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2489 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2490 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
2491 ContextNode *NewCallee, EdgeIter *CallerEdgeI,
2492 bool NewClone,
2493 DenseSet<uint32_t> ContextIdsToMove) {
2494 // NewCallee and Edge's current callee must be clones of the same original
2495 // node (Edge's current callee may be the original node too).
2496 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
2497
2498 ContextNode *OldCallee = Edge->Callee;
2499
2500 // We might already have an edge to the new callee from earlier cloning for a
2501 // different allocation. If one exists we will reuse it.
2502 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
2503
2504 // Callers will pass an empty ContextIdsToMove set when they want to move
2505 // the whole edge. Copy in Edge's ids for simplicity.
2506 if (ContextIdsToMove.empty())
2507 ContextIdsToMove = Edge->getContextIds();
2508
2509 // If we are moving all of Edge's ids, then just move the whole Edge.
2510 // Otherwise only move the specified subset, to a new edge if needed.
2511 if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
2512 // Moving the whole Edge.
2513 if (CallerEdgeI)
2514 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
2515 else
2516 OldCallee->eraseCallerEdge(Edge.get());
2517 if (ExistingEdgeToNewCallee) {
2518 // Since we already have an edge to NewCallee, simply move the ids
2519 // onto it, and remove the existing Edge.
2520 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2521 ContextIdsToMove.end());
2522 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
2523 assert(Edge->ContextIds == ContextIdsToMove);
2524 Edge->ContextIds.clear();
2525 Edge->AllocTypes = (uint8_t)AllocationType::None;
2526 Edge->Caller->eraseCalleeEdge(Edge.get());
2527 } else {
2528 // Otherwise just reconnect Edge to NewCallee.
2529 Edge->Callee = NewCallee;
2530 NewCallee->CallerEdges.push_back(Edge);
2531 // Don't need to update Edge's context ids since we are simply
2532 // reconnecting it.
2533 }
2534 // In either case, need to update the alloc types on NewCallee.
2535 NewCallee->AllocTypes |= Edge->AllocTypes;
2536 } else {
2537 // Only moving a subset of Edge's ids.
2538 if (CallerEdgeI)
2539 ++CallerEdgeI;
2540 // Compute the alloc type of the subset of ids being moved.
2541 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
2542 if (ExistingEdgeToNewCallee) {
2543 // Since we already have an edge to NewCallee, simply move the ids
2544 // onto it.
2545 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2546 ContextIdsToMove.end());
2547 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
2548 } else {
2549 // Otherwise, create a new edge to NewCallee for the ids being moved.
2550 auto NewEdge = std::make_shared<ContextEdge>(
2551 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
2552 Edge->Caller->CalleeEdges.push_back(NewEdge);
2553 NewCallee->CallerEdges.push_back(NewEdge);
2554 }
2555 // In either case, need to update the alloc types on NewCallee, and remove
2556 // those ids and update the alloc type on the original Edge.
2557 NewCallee->AllocTypes |= CallerEdgeAllocType;
2558 set_subtract(Edge->ContextIds, ContextIdsToMove);
2559 Edge->AllocTypes = computeAllocType(Edge->ContextIds);
2560 }
2561 // Now walk the old callee node's callee edges and move Edge's context ids
2562 // over to the corresponding edge into the clone (which is created here if
2563 // this is a newly created clone).
2564 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2565 // The context ids moving to the new callee are the subset of this edge's
2566 // context ids and the context ids on the caller edge being moved.
2567 DenseSet<uint32_t> EdgeContextIdsToMove =
2568 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
2569 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2570 OldCalleeEdge->AllocTypes =
2571 computeAllocType(OldCalleeEdge->getContextIds());
2572 if (!NewClone) {
2573 // Update context ids / alloc type on corresponding edge to NewCallee.
2574 // There is a chance this may not exist if we are reusing an existing
2575 // clone, specifically during function assignment, where we would have
2576 // removed none type edges after creating the clone. If we can't find
2577 // a corresponding edge there, fall through to the cloning below.
2578 if (auto *NewCalleeEdge =
2579 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2580 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2581 EdgeContextIdsToMove.end());
2582 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2583 continue;
2584 }
2585 }
2586 auto NewEdge = std::make_shared<ContextEdge>(
2587 OldCalleeEdge->Callee, NewCallee,
2588 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2589 NewCallee->CalleeEdges.push_back(NewEdge);
2590 NewEdge->Callee->CallerEdges.push_back(NewEdge);
2591 }
2592 // Recompute the node alloc type now that its callee edges have been
2593 // updated (since we will compute from those edges).
2594 OldCallee->AllocTypes = OldCallee->computeAllocType();
2595 // OldCallee alloc type should be None iff its context id set is now empty.
2596 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2597 OldCallee->emptyContextIds());
2598 if (VerifyCCG) {
2599 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2600 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2601 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2602 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2603 /*CheckEdges=*/false);
2604 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2605 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2606 /*CheckEdges=*/false);
2607 }
2608 }
2609
2610 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2611 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2612     recursivelyRemoveNoneTypeCalleeEdges(
2613 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2614 auto Inserted = Visited.insert(Node);
2615 if (!Inserted.second)
2616 return;
2617
2618 removeNoneTypeCalleeEdges(Node);
2619
2620 for (auto *Clone : Node->Clones)
2621 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
2622
2623 // The recursive call may remove some of this Node's caller edges.
2624 // Iterate over a copy and skip any that were removed.
2625 auto CallerEdges = Node->CallerEdges;
2626 for (auto &Edge : CallerEdges) {
2627 // Skip any that have been removed by an earlier recursive call.
2628 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2629 assert(!is_contained(Node->CallerEdges, Edge));
2630 continue;
2631 }
2632 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
2633 }
2634 }
2635
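// Top-level cloning driver: starting from each allocation node, clone as
// needed to disambiguate its cold vs notcold contexts, then clean up any
// none-type callee edges left behind by the cloning.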
2636 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2637 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2638 DenseSet<const ContextNode *> Visited;
2639 for (auto &Entry : AllocationCallToContextNodeMap) {
2640 Visited.clear();
2641 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
2642 }
2643 Visited.clear();
2644 for (auto &Entry : AllocationCallToContextNodeMap)
2645 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
2646 if (VerifyCCG)
2647 check();
2648 }
2649
2650 // Helper function to check whether an AllocType is cold, notcold, or both.
2651 bool checkColdOrNotCold(uint8_t AllocType) {
2652 return (AllocType == (uint8_t)AllocationType::Cold) ||
2653 (AllocType == (uint8_t)AllocationType::NotCold) ||
2654 (AllocType ==
2655 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2656 }
2657
2658 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2659 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2660 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
2661 const DenseSet<uint32_t> &AllocContextIds) {
2662 if (VerifyNodes)
2663 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2664 assert(!Node->CloneOf);
2665
2666   // If Node has a null call, then either it wasn't found in the module (regular
2667 // LTO) or summary index (ThinLTO), or there were other conditions blocking
2668 // cloning (e.g. recursion, calls multiple targets, etc).
2669 // Do this here so that we don't try to recursively clone callers below, which
2670 // isn't useful at least for this node.
2671 if (!Node->hasCall())
2672 return;
2673
2674 #ifndef NDEBUG
2675 auto Insert =
2676 #endif
2677 Visited.insert(Node);
2678 // We should not have visited this node yet.
2679 assert(Insert.second);
2680 // The recursive call to identifyClones may delete the current edge from the
2681 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2682 // in an iterator and having recursive call erase from it. Other edges may
2683 // also get removed during the recursion, which will have null Callee and
2684 // Caller pointers (and are deleted later), so we skip those below.
2685 {
2686 auto CallerEdges = Node->CallerEdges;
2687 for (auto &Edge : CallerEdges) {
2688 // Skip any that have been removed by an earlier recursive call.
2689 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2690 assert(!llvm::count(Node->CallerEdges, Edge));
2691 continue;
2692 }
2693 // Ignore any caller we previously visited via another edge.
2694 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2695 identifyClones(Edge->Caller, Visited, AllocContextIds);
2696 }
2697 }
2698 }
2699
2700   // Check if we reached an unambiguous call or have only a single caller.
2701 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2702 return;
2703
2704 // We need to clone.
2705
2706 // Try to keep the original version as alloc type NotCold. This will make
2707 // cases with indirect calls or any other situation with an unknown call to
2708 // the original function get the default behavior. We do this by sorting the
2709 // CallerEdges of the Node we will clone by alloc type.
2710 //
2711 // Give NotCold edge the lowest sort priority so those edges are at the end of
2712 // the caller edges vector, and stay on the original version (since the below
2713 // code clones greedily until it finds all remaining edges have the same type
2714 // and leaves the remaining ones on the original Node).
2715 //
2716 // We shouldn't actually have any None type edges, so the sorting priority for
2717 // that is arbitrary, and we assert in that case below.
2718 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2719 /*Cold*/ 1,
2720 /*NotColdCold*/ 2};
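  // For example, given caller edges with alloc types NotCold, Cold, and
  // NotCold+Cold, the sort below orders them Cold (priority 1), NotCold+Cold
  // (2), NotCold (4), so purely-cold contexts are peeled off into clones first
  // and NotCold edges tend to remain on the original node.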
2721 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2722 [&](const std::shared_ptr<ContextEdge> &A,
2723 const std::shared_ptr<ContextEdge> &B) {
2724 // Nodes with non-empty context ids should be sorted before
2725 // those with empty context ids.
2726 if (A->ContextIds.empty())
2727 // Either B ContextIds are non-empty (in which case we
2728 // should return false because B < A), or B ContextIds
2729 // are empty, in which case they are equal, and we should
2730 // maintain the original relative ordering.
2731 return false;
2732 if (B->ContextIds.empty())
2733 return true;
2734
2735 if (A->AllocTypes == B->AllocTypes)
2736 // Use the first context id for each edge as a
2737 // tie-breaker.
2738 return *A->ContextIds.begin() < *B->ContextIds.begin();
2739 return AllocTypeCloningPriority[A->AllocTypes] <
2740 AllocTypeCloningPriority[B->AllocTypes];
2741 });
2742
2743 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2744
2745 // Iterate until we find no more opportunities for disambiguating the alloc
2746 // types via cloning. In most cases this loop will terminate once the Node
2747 // has a single allocation type, in which case no more cloning is needed.
2748 // We need to be able to remove Edge from CallerEdges, so need to adjust
2749 // iterator inside the loop.
2750 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2751 auto CallerEdge = *EI;
2752
2753 // See if cloning the prior caller edge left this node with a single alloc
2754 // type or a single caller. In that case no more cloning of Node is needed.
2755 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2756 break;
2757
2758 // Only need to process the ids along this edge pertaining to the given
2759 // allocation.
2760 auto CallerEdgeContextsForAlloc =
2761 set_intersection(CallerEdge->getContextIds(), AllocContextIds);
2762 if (CallerEdgeContextsForAlloc.empty()) {
2763 ++EI;
2764 continue;
2765 }
2766 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
2767
2768 // Compute the node callee edge alloc types corresponding to the context ids
2769 // for this caller edge.
2770 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2771 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2772 for (auto &CalleeEdge : Node->CalleeEdges)
2773 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2774 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
2775
2776 // Don't clone if doing so will not disambiguate any alloc types amongst
2777 // caller edges (including the callee edges that would be cloned).
2778 // Otherwise we will simply move all edges to the clone.
2779 //
2780 // First check if by cloning we will disambiguate the caller allocation
2781 // type from node's allocation type. Query allocTypeToUse so that we don't
2782 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2783 // neither of these should be None type.
2784 //
2785 // Then check if by cloning node at least one of the callee edges will be
2786 // disambiguated by splitting out different context ids.
2787 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2788 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2789 if (allocTypeToUse(CallerAllocTypeForAlloc) ==
2790 allocTypeToUse(Node->AllocTypes) &&
2791 allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2792 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2793 ++EI;
2794 continue;
2795 }
2796
2797 // First see if we can use an existing clone. Check each clone and its
2798 // callee edges for matching alloc types.
2799 ContextNode *Clone = nullptr;
2800 for (auto *CurClone : Node->Clones) {
2801 if (allocTypeToUse(CurClone->AllocTypes) !=
2802 allocTypeToUse(CallerAllocTypeForAlloc))
2803 continue;
2804
2805 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2806 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2807 continue;
2808 Clone = CurClone;
2809 break;
2810 }
2811
2812 // The edge iterator is adjusted when we move the CallerEdge to the clone.
2813 if (Clone)
2814 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
2815 CallerEdgeContextsForAlloc);
2816 else
2817 Clone =
2818 moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
2819
2820 assert(EI == Node->CallerEdges.end() ||
2821 Node->AllocTypes != (uint8_t)AllocationType::None);
2822 // Sanity check that no alloc types on clone or its edges are None.
2823 assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2824 }
2825
2826 // We should still have some context ids on the original Node.
2827 assert(!Node->emptyContextIds());
2828
2829 // Sanity check that no alloc types on node or edges are None.
2830 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2831
2832 if (VerifyNodes)
2833 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2834 }
2835
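// Tags the IR allocation call with a "memprof" string function attribute whose
// value comes from getAllocTypeAttributeString (e.g. "cold"), and emits an
// optimization remark recording the annotation.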
2836 void ModuleCallsiteContextGraph::updateAllocationCall(
2837 CallInfo &Call, AllocationType AllocType) {
2838 std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2839 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2840 "memprof", AllocTypeString);
2841 cast<CallBase>(Call.call())->addFnAttr(A);
2842 OREGetter(Call.call()->getFunction())
2843 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2844 << ore::NV("AllocationCall", Call.call()) << " in clone "
2845 << ore::NV("Caller", Call.call()->getFunction())
2846 << " marked with memprof allocation attribute "
2847 << ore::NV("Attribute", AllocTypeString));
2848 }
2849
2850 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2851 AllocationType AllocType) {
2852 auto *AI = Call.call().dyn_cast<AllocInfo *>();
2853 assert(AI);
2854 assert(AI->Versions.size() > Call.cloneNo());
2855 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2856 }
2857
2858 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2859 FuncInfo CalleeFunc) {
2860 if (CalleeFunc.cloneNo() > 0)
2861 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2862 OREGetter(CallerCall.call()->getFunction())
2863 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2864 << ore::NV("Call", CallerCall.call()) << " in clone "
2865 << ore::NV("Caller", CallerCall.call()->getFunction())
2866 << " assigned to call function clone "
2867 << ore::NV("Callee", CalleeFunc.func()));
2868 }
2869
2870 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2871 FuncInfo CalleeFunc) {
2872 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2873 assert(CI &&
2874 "Caller cannot be an allocation which should not have profiled calls");
2875 assert(CI->Clones.size() > CallerCall.cloneNo());
2876 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2877 }
2878
2879 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2880 Instruction *>::FuncInfo
2881 ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2882 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2883 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2884 // Use existing LLVM facilities for cloning and obtaining Call in clone
2885 ValueToValueMapTy VMap;
2886 auto *NewFunc = CloneFunction(Func.func(), VMap);
2887 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2888 assert(!Func.func()->getParent()->getFunction(Name));
2889 NewFunc->setName(Name);
2890 for (auto &Inst : CallsWithMetadataInFunc) {
2891 // This map always has the initial version in it.
2892 assert(Inst.cloneNo() == 0);
2893 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2894 }
2895 OREGetter(Func.func())
2896 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2897 << "created clone " << ore::NV("NewFunction", NewFunc));
2898 return {NewFunc, CloneNo};
2899 }
2900
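// For the ThinLTO index there is no IR to copy: "cloning" a function for a
// callsite just appends a new slot to each summary record's Versions/Clones
// array (filled in later by updateAllocationCall/updateCall) and records the
// mapping to the new clone number in CallMap.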
2901 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2902 IndexCall>::FuncInfo
2903 IndexCallsiteContextGraph::cloneFunctionForCallsite(
2904 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2905 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2906 // Check how many clones we have of Call (and therefore function).
2907 // The next clone number is the current size of versions array.
2908 // Confirm this matches the CloneNo provided by the caller, which is based on
2909 // the number of function clones we have.
2910 assert(CloneNo ==
2911 (Call.call().is<AllocInfo *>()
2912 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2913 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2914   // Walk all the calls with metadata in this function. Create a new version for
2915 // each (by adding an entry to the Versions/Clones summary array), and copy
2916 // over the version being called for the function clone being cloned here.
2917 // Additionally, add an entry to the CallMap for the new function clone,
2918 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2919 // to the new call clone.
2920 for (auto &Inst : CallsWithMetadataInFunc) {
2921 // This map always has the initial version in it.
2922 assert(Inst.cloneNo() == 0);
2923 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2924 assert(AI->Versions.size() == CloneNo);
2925 // We assign the allocation type later (in updateAllocationCall), just add
2926 // an entry for it here.
2927 AI->Versions.push_back(0);
2928 } else {
2929 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2930 assert(CI && CI->Clones.size() == CloneNo);
2931 // We assign the clone number later (in updateCall), just add an entry for
2932 // it here.
2933 CI->Clones.push_back(0);
2934 }
2935 CallMap[Inst] = {Inst.call(), CloneNo};
2936 }
2937 return {Func.func(), CloneNo};
2938 }
2939
2940 // This method assigns cloned callsites to functions, cloning the functions as
2941 // needed. The assignment is greedy and proceeds roughly as follows:
2942 //
2943 // For each function Func:
2944 // For each call with graph Node having clones:
2945 // Initialize ClonesWorklist to Node and its clones
2946 // Initialize NodeCloneCount to 0
2947 // While ClonesWorklist is not empty:
2948 // Clone = pop front ClonesWorklist
2949 // NodeCloneCount++
2950 // If Func has been cloned less than NodeCloneCount times:
2951 // If NodeCloneCount is 1:
2952 // Assign Clone to original Func
2953 // Continue
2954 // Create a new function clone
2955 // If other callers not assigned to call a function clone yet:
2956 // Assign them to call new function clone
2957 // Continue
2958 // Assign any other caller calling the cloned version to new clone
2959 //
2960 // For each caller of Clone:
2961 // If caller is assigned to call a specific function clone:
2962 // If we cannot assign Clone to that function clone:
2963 // Create new callsite Clone NewClone
2964 // Add NewClone to ClonesWorklist
2965 // Continue
2966 // Assign Clone to existing caller's called function clone
2967 // Else:
2968 // If Clone not already assigned to a function clone:
2969 // Assign to first function clone without assignment
2970 // Assign caller to selected function clone
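// Example (hypothetical names): suppose function Foo contains callsite C whose
// graph node acquired one clone C' during identifyClones. The worklist first
// assigns C to the original copy of Foo; when C' is popped there are fewer
// function copies than callsite clones processed, so a new clone of Foo is
// created via cloneFunctionForCallsite, C' is assigned to it, and the callers
// reaching C' are recorded as calling that new function clone.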
2971 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2972 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2973 bool Changed = false;
2974
2975 // Keep track of the assignment of nodes (callsites) to function clones they
2976 // call.
2977 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2978
2979 // Update caller node to call function version CalleeFunc, by recording the
2980 // assignment in CallsiteToCalleeFuncCloneMap.
2981 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2982 const FuncInfo &CalleeFunc) {
2983 assert(Caller->hasCall());
2984 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2985 };
2986
2987 // Walk all functions for which we saw calls with memprof metadata, and handle
2988   // cloning for each of their calls.
2989 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2990 FuncInfo OrigFunc(Func);
2991 // Map from each clone of OrigFunc to a map of remappings of each call of
2992 // interest (from original uncloned call to the corresponding cloned call in
2993 // that function clone).
2994 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2995 for (auto &Call : CallsWithMetadata) {
2996 ContextNode *Node = getNodeForInst(Call);
2997 // Skip call if we do not have a node for it (all uses of its stack ids
2998 // were either on inlined chains or pruned from the MIBs), or if we did
2999 // not create any clones for it.
3000 if (!Node || Node->Clones.empty())
3001 continue;
3002 assert(Node->hasCall() &&
3003 "Not having a call should have prevented cloning");
3004
3005 // Track the assignment of function clones to clones of the current
3006 // callsite Node being handled.
3007 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
3008
3009 // Assign callsite version CallsiteClone to function version FuncClone,
3010 // and also assign (possibly cloned) Call to CallsiteClone.
3011 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
3012 CallInfo &Call,
3013 ContextNode *CallsiteClone,
3014 bool IsAlloc) {
3015 // Record the clone of callsite node assigned to this function clone.
3016 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
3017
3018 assert(FuncClonesToCallMap.count(FuncClone));
3019 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
3020 CallInfo CallClone(Call);
3021 if (CallMap.count(Call))
3022 CallClone = CallMap[Call];
3023 CallsiteClone->setCall(CallClone);
3024 };
3025
3026 // Keep track of the clones of callsite Node that need to be assigned to
3027 // function clones. This list may be expanded in the loop body below if we
3028 // find additional cloning is required.
3029 std::deque<ContextNode *> ClonesWorklist;
3030 // Ignore original Node if we moved all of its contexts to clones.
3031 if (!Node->emptyContextIds())
3032 ClonesWorklist.push_back(Node);
3033 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
3034 Node->Clones.end());
3035
3036 // Now walk through all of the clones of this callsite Node that we need,
3037 // and determine the assignment to a corresponding clone of the current
3038 // function (creating new function clones as needed).
3039 unsigned NodeCloneCount = 0;
3040 while (!ClonesWorklist.empty()) {
3041 ContextNode *Clone = ClonesWorklist.front();
3042 ClonesWorklist.pop_front();
3043 NodeCloneCount++;
3044 if (VerifyNodes)
3045 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3046
3047 // Need to create a new function clone if we have more callsite clones
3048 // than existing function clones, which would have been assigned to an
3049 // earlier clone in the list (we assign callsite clones to function
3050 // clones greedily).
3051 if (FuncClonesToCallMap.size() < NodeCloneCount) {
3052 // If this is the first callsite copy, assign to original function.
3053 if (NodeCloneCount == 1) {
3054 // Since FuncClonesToCallMap is empty in this case, no clones have
3055 // been created for this function yet, and no callers should have
3056 // been assigned a function clone for this callee node yet.
3057 assert(llvm::none_of(
3058 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3059 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3060 }));
3061 // Initialize with empty call map, assign Clone to original function
3062 // and its callers, and skip to the next clone.
3063 FuncClonesToCallMap[OrigFunc] = {};
3064 AssignCallsiteCloneToFuncClone(
3065 OrigFunc, Call, Clone,
3066 AllocationCallToContextNodeMap.count(Call));
3067 for (auto &CE : Clone->CallerEdges) {
3068 // Ignore any caller that does not have a recorded callsite Call.
3069 if (!CE->Caller->hasCall())
3070 continue;
3071 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
3072 }
3073 continue;
3074 }
3075
3076 // First locate which copy of OrigFunc to clone again. If a caller
3077 // of this callsite clone was already assigned to call a particular
3078 // function clone, we need to redirect all of those callers to the
3079 // new function clone, and update their other callees within this
3080 // function.
3081 FuncInfo PreviousAssignedFuncClone;
3082 auto EI = llvm::find_if(
3083 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3084 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3085 });
3086 bool CallerAssignedToCloneOfFunc = false;
3087 if (EI != Clone->CallerEdges.end()) {
3088 const std::shared_ptr<ContextEdge> &Edge = *EI;
3089 PreviousAssignedFuncClone =
3090 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3091 CallerAssignedToCloneOfFunc = true;
3092 }
3093
3094 // Clone function and save it along with the CallInfo map created
3095 // during cloning in the FuncClonesToCallMap.
3096 std::map<CallInfo, CallInfo> NewCallMap;
3097 unsigned CloneNo = FuncClonesToCallMap.size();
3098 assert(CloneNo > 0 && "Clone 0 is the original function, which "
3099 "should already exist in the map");
3100 FuncInfo NewFuncClone = cloneFunctionForCallsite(
3101 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
3102 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3103 FunctionClonesAnalysis++;
3104 Changed = true;
3105
3106 // If no caller callsites were already assigned to a clone of this
3107 // function, we can simply assign this clone to the new func clone
3108 // and update all callers to it, then skip to the next clone.
3109 if (!CallerAssignedToCloneOfFunc) {
3110 AssignCallsiteCloneToFuncClone(
3111 NewFuncClone, Call, Clone,
3112 AllocationCallToContextNodeMap.count(Call));
3113 for (auto &CE : Clone->CallerEdges) {
3114 // Ignore any caller that does not have a recorded callsite Call.
3115 if (!CE->Caller->hasCall())
3116 continue;
3117 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3118 }
3119 continue;
3120 }
3121
3122 // We may need to do additional node cloning in this case.
3123 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3124 // that were previously assigned to call PreviousAssignedFuncClone,
3125 // to record that they now call NewFuncClone.
3126 for (auto CE : Clone->CallerEdges) {
3127 // Skip any that have been removed on an earlier iteration.
3128 if (!CE)
3129 continue;
3130 // Ignore any caller that does not have a recorded callsite Call.
3131 if (!CE->Caller->hasCall())
3132 continue;
3133
3134 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3135 // We subsequently fall through to later handling that
3136 // will perform any additional cloning required for
3137 // callers that were calling other function clones.
3138 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3139 PreviousAssignedFuncClone)
3140 continue;
3141
3142 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3143
3144 // If we are cloning a function that was already assigned to some
3145 // callers, then essentially we are creating new callsite clones
3146 // of the other callsites in that function that are reached by those
3147 // callers. Clone the other callees of the current callsite's caller
3148 // that were already assigned to PreviousAssignedFuncClone
3149 // accordingly. This is important since we subsequently update the
3150 // calls from the nodes in the graph and their assignments to callee
3151 // functions recorded in CallsiteToCalleeFuncCloneMap.
3152 for (auto CalleeEdge : CE->Caller->CalleeEdges) {
3153 // Skip any that have been removed on an earlier iteration when
3154 // cleaning up newly None type callee edges.
3155 if (!CalleeEdge)
3156 continue;
3157 ContextNode *Callee = CalleeEdge->Callee;
3158 // Skip the current callsite, we are looking for other
3159             // callsites Caller calls, as well as any that do not have a
3160 // recorded callsite Call.
3161 if (Callee == Clone || !Callee->hasCall())
3162 continue;
3163 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3164 removeNoneTypeCalleeEdges(NewClone);
3165 // Moving the edge may have resulted in some none type
3166 // callee edges on the original Callee.
3167 removeNoneTypeCalleeEdges(Callee);
3168 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3169 // If the Callee node was already assigned to call a specific
3170 // function version, make sure its new clone is assigned to call
3171 // that same function clone.
3172 if (CallsiteToCalleeFuncCloneMap.count(Callee))
3173 RecordCalleeFuncOfCallsite(
3174 NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3175 // Update NewClone with the new Call clone of this callsite's Call
3176 // created for the new function clone created earlier.
3177 // Recall that we have already ensured when building the graph
3178 // that each caller can only call callsites within the same
3179 // function, so we are guaranteed that Callee Call is in the
3180 // current OrigFunc.
3181 // CallMap is set up as indexed by original Call at clone 0.
3182 CallInfo OrigCall(Callee->getOrigNode()->Call);
3183 OrigCall.setCloneNo(0);
3184 std::map<CallInfo, CallInfo> &CallMap =
3185 FuncClonesToCallMap[NewFuncClone];
3186 assert(CallMap.count(OrigCall));
3187 CallInfo NewCall(CallMap[OrigCall]);
3188 assert(NewCall);
3189 NewClone->setCall(NewCall);
3190 }
3191 }
3192 // Fall through to handling below to perform the recording of the
3193 // function for this callsite clone. This enables handling of cases
3194 // where the callers were assigned to different clones of a function.
3195 }
3196
3197 // See if we can use existing function clone. Walk through
3198 // all caller edges to see if any have already been assigned to
3199 // a clone of this callsite's function. If we can use it, do so. If not,
3200 // because that function clone is already assigned to a different clone
3201 // of this callsite, then we need to clone again.
3202 // Basically, this checking is needed to handle the case where different
3203 // caller functions/callsites may need versions of this function
3204 // containing different mixes of callsite clones across the different
3205 // callsites within the function. If that happens, we need to create
3206 // additional function clones to handle the various combinations.
3207 //
3208 // Keep track of any new clones of this callsite created by the
3209 // following loop, as well as any existing clone that we decided to
3210 // assign this clone to.
3211 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3212 FuncInfo FuncCloneAssignedToCurCallsiteClone;
3213 // We need to be able to remove Edge from CallerEdges, so need to adjust
3214 // iterator in the loop.
3215 for (auto EI = Clone->CallerEdges.begin();
3216 EI != Clone->CallerEdges.end();) {
3217 auto Edge = *EI;
3218 // Ignore any caller that does not have a recorded callsite Call.
3219 if (!Edge->Caller->hasCall()) {
3220 EI++;
3221 continue;
3222 }
3223           // If this caller is already assigned to call a version of OrigFunc, we need
3224 // to ensure we can assign this callsite clone to that function clone.
3225 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3226 FuncInfo FuncCloneCalledByCaller =
3227 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3228 // First we need to confirm that this function clone is available
3229 // for use by this callsite node clone.
3230 //
3231 // While FuncCloneToCurNodeCloneMap is built only for this Node and
3232 // its callsite clones, one of those callsite clones X could have
3233 // been assigned to the same function clone called by Edge's caller
3234 // - if Edge's caller calls another callsite within Node's original
3235 // function, and that callsite has another caller reaching clone X.
3236 // We need to clone Node again in this case.
3237 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3238 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3239 Clone) ||
3240 // Detect when we have multiple callers of this callsite that
3241 // have already been assigned to specific, and different, clones
3242 // of OrigFunc (due to other unrelated callsites in Func they
3243 // reach via call contexts). Is this Clone of callsite Node
3244 // assigned to a different clone of OrigFunc? If so, clone Node
3245 // again.
3246 (FuncCloneAssignedToCurCallsiteClone &&
3247 FuncCloneAssignedToCurCallsiteClone !=
3248 FuncCloneCalledByCaller)) {
3249 // We need to use a different newly created callsite clone, in
3250 // order to assign it to another new function clone on a
3251 // subsequent iteration over the Clones array (adjusted below).
3252 // Note we specifically do not reset the
3253 // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3254 // when this new clone is processed later we know which version of
3255 // the function to copy (so that other callsite clones we have
3256 // assigned to that function clone are properly cloned over). See
3257 // comments in the function cloning handling earlier.
3258
3259 // Check if we already have cloned this callsite again while
3260 // walking through caller edges, for a caller calling the same
3261 // function clone. If so, we can move this edge to that new clone
3262 // rather than creating yet another new clone.
3263 if (FuncCloneToNewCallsiteCloneMap.count(
3264 FuncCloneCalledByCaller)) {
3265 ContextNode *NewClone =
3266 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3267 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3268 // Cleanup any none type edges cloned over.
3269 removeNoneTypeCalleeEdges(NewClone);
3270 } else {
3271 // Create a new callsite clone.
3272 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
3273 removeNoneTypeCalleeEdges(NewClone);
3274 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3275 NewClone;
3276 // Add to list of clones and process later.
3277 ClonesWorklist.push_back(NewClone);
3278 assert(EI == Clone->CallerEdges.end() ||
3279 Clone->AllocTypes != (uint8_t)AllocationType::None);
3280 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3281 }
3282 // Moving the caller edge may have resulted in some none type
3283 // callee edges.
3284 removeNoneTypeCalleeEdges(Clone);
3285 // We will handle the newly created callsite clone in a subsequent
3286 // iteration over this Node's Clones. Continue here since we
3287 // already adjusted iterator EI while moving the edge.
3288 continue;
3289 }
3290
3291 // Otherwise, we can use the function clone already assigned to this
3292 // caller.
3293 if (!FuncCloneAssignedToCurCallsiteClone) {
3294 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
3295 // Assign Clone to FuncCloneCalledByCaller
3296 AssignCallsiteCloneToFuncClone(
3297 FuncCloneCalledByCaller, Call, Clone,
3298 AllocationCallToContextNodeMap.count(Call));
3299 } else
3300 // Don't need to do anything - callsite is already calling this
3301 // function clone.
3302 assert(FuncCloneAssignedToCurCallsiteClone ==
3303 FuncCloneCalledByCaller);
3304
3305 } else {
3306 // We have not already assigned this caller to a version of
3307 // OrigFunc. Do the assignment now.
3308
3309 // First check if we have already assigned this callsite clone to a
3310 // clone of OrigFunc for another caller during this iteration over
3311 // its caller edges.
3312 if (!FuncCloneAssignedToCurCallsiteClone) {
3313 // Find first function in FuncClonesToCallMap without an assigned
3314 // clone of this callsite Node. We should always have one
3315 // available at this point due to the earlier cloning when the
3316 // FuncClonesToCallMap size was smaller than the clone number.
3317 for (auto &CF : FuncClonesToCallMap) {
3318 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
3319 FuncCloneAssignedToCurCallsiteClone = CF.first;
3320 break;
3321 }
3322 }
3323 assert(FuncCloneAssignedToCurCallsiteClone);
3324 // Assign Clone to FuncCloneAssignedToCurCallsiteClone
3325 AssignCallsiteCloneToFuncClone(
3326 FuncCloneAssignedToCurCallsiteClone, Call, Clone,
3327 AllocationCallToContextNodeMap.count(Call));
3328 } else
3329 assert(FuncCloneToCurNodeCloneMap
3330 [FuncCloneAssignedToCurCallsiteClone] == Clone);
3331 // Update callers to record function version called.
3332 RecordCalleeFuncOfCallsite(Edge->Caller,
3333 FuncCloneAssignedToCurCallsiteClone);
3334 }
3335
3336 EI++;
3337 }
3338 }
3339 if (VerifyCCG) {
3340 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
3341 for (const auto &PE : Node->CalleeEdges)
3342 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3343 for (const auto &CE : Node->CallerEdges)
3344 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3345 for (auto *Clone : Node->Clones) {
3346 checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3347 for (const auto &PE : Clone->CalleeEdges)
3348 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3349 for (const auto &CE : Clone->CallerEdges)
3350 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3351 }
3352 }
3353 }
3354 }
3355
3356 auto UpdateCalls = [&](ContextNode *Node,
3357 DenseSet<const ContextNode *> &Visited,
3358 auto &&UpdateCalls) {
3359 auto Inserted = Visited.insert(Node);
3360 if (!Inserted.second)
3361 return;
3362
3363 for (auto *Clone : Node->Clones)
3364 UpdateCalls(Clone, Visited, UpdateCalls);
3365
3366 for (auto &Edge : Node->CallerEdges)
3367 UpdateCalls(Edge->Caller, Visited, UpdateCalls);
3368
3369 // Skip if either no call to update, or if we ended up with no context ids
3370 // (we moved all edges onto other clones).
3371 if (!Node->hasCall() || Node->emptyContextIds())
3372 return;
3373
3374 if (Node->IsAllocation) {
3375 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3376 return;
3377 }
3378
3379 if (!CallsiteToCalleeFuncCloneMap.count(Node))
3380 return;
3381
3382 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
3383 updateCall(Node->Call, CalleeFunc);
3384 };
3385
3386 // Performs DFS traversal starting from allocation nodes to update calls to
3387 // reflect cloning decisions recorded earlier. For regular LTO this will
3388 // update the actual calls in the IR to call the appropriate function clone
3389 // (and add attributes to allocation calls), whereas for ThinLTO the decisions
3390 // are recorded in the summary entries.
3391 DenseSet<const ContextNode *> Visited;
3392 for (auto &Entry : AllocationCallToContextNodeMap)
3393 UpdateCalls(Entry.second, Visited, UpdateCalls);
3394
3395 return Changed;
3396 }
3397
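// Create NumClones - 1 copies of F in the ThinLTO backend, stripping memprof
// and callsite metadata from each copy and cloning any aliases that refer to
// F. Returns the ValueToValueMapTy for each new copy so callers can locate the
// cloned call instructions (clone 0, the original copy, has no map).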
3398 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
3399 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
3400 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3401 &FuncToAliasMap) {
3402   // The first "clone" is the original copy; we should only call this if we
3403 // needed to create new clones.
3404 assert(NumClones > 1);
3405 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3406 VMaps.reserve(NumClones - 1);
3407 FunctionsClonedThinBackend++;
3408 for (unsigned I = 1; I < NumClones; I++) {
3409 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
3410 auto *NewF = CloneFunction(&F, *VMaps.back());
3411 FunctionClonesThinBackend++;
3412 // Strip memprof and callsite metadata from clone as they are no longer
3413 // needed.
3414 for (auto &BB : *NewF) {
3415 for (auto &Inst : BB) {
3416 Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
3417 Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3418 }
3419 }
3420 std::string Name = getMemProfFuncName(F.getName(), I);
3421 auto *PrevF = M.getFunction(Name);
3422 if (PrevF) {
3423       // We might have created this when adjusting a callsite in another
3424 // function. It should be a declaration.
3425 assert(PrevF->isDeclaration());
3426 NewF->takeName(PrevF);
3427 PrevF->replaceAllUsesWith(NewF);
3428 PrevF->eraseFromParent();
3429 } else
3430 NewF->setName(Name);
3431 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
3432 << "created clone " << ore::NV("NewFunction", NewF));
3433
3434 // Now handle aliases to this function, and clone those as well.
3435 if (!FuncToAliasMap.count(&F))
3436 continue;
3437 for (auto *A : FuncToAliasMap[&F]) {
3438 std::string Name = getMemProfFuncName(A->getName(), I);
3439 auto *PrevA = M.getNamedAlias(Name);
3440 auto *NewA = GlobalAlias::create(A->getValueType(),
3441 A->getType()->getPointerAddressSpace(),
3442 A->getLinkage(), Name, NewF);
3443 NewA->copyAttributesFrom(A);
3444 if (PrevA) {
3445         // We might have created this when adjusting a callsite in another
3446 // function. It should be a declaration.
3447 assert(PrevA->isDeclaration());
3448 NewA->takeName(PrevA);
3449 PrevA->replaceAllUsesWith(NewA);
3450 PrevA->eraseFromParent();
3451 }
3452 }
3453 }
3454 return VMaps;
3455 }
3456
3457 // Locate the summary for F. This is complicated by the fact that it might
3458 // have been internalized or promoted.
3459 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
3460 const ModuleSummaryIndex *ImportSummary) {
3461 // FIXME: Ideally we would retain the original GUID in some fashion on the
3462 // function (e.g. as metadata), but for now do our best to locate the
3463 // summary without that information.
3464 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3465 if (!TheFnVI)
3466     // See if the function was internalized, by checking the index directly with
3467 // original name (this avoids the name adjustment done by getGUID() for
3468 // internal symbols).
3469 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3470 if (TheFnVI)
3471 return TheFnVI;
3472 // Now query with the original name before any promotion was performed.
3473 StringRef OrigName =
3474 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
3475 std::string OrigId = GlobalValue::getGlobalIdentifier(
3476 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
3477 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3478 if (TheFnVI)
3479 return TheFnVI;
3480 // Could be a promoted local imported from another module. We need to pass
3481 // down more info here to find the original module id. For now, try with
3482 // the OrigName which might have been stored in the OidGuidMap in the
3483 // index. This would not work if there were same-named locals in multiple
3484 // modules, however.
3485 auto OrigGUID =
3486 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3487 if (OrigGUID)
3488 TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3489 return TheFnVI;
3490 }
3491
3492 bool MemProfContextDisambiguation::applyImport(Module &M) {
3493 assert(ImportSummary);
3494 bool Changed = false;
3495
3496 auto IsMemProfClone = [](const Function &F) {
3497 return F.getName().contains(MemProfCloneSuffix);
3498 };
3499
3500 // We also need to clone any aliases that reference cloned functions, because
3501 // the modified callsites may invoke via the alias. Keep track of the aliases
3502 // for each function.
3503 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3504 FuncToAliasMap;
3505 for (auto &A : M.aliases()) {
3506 auto *Aliasee = A.getAliaseeObject();
3507 if (auto *F = dyn_cast<Function>(Aliasee))
3508 FuncToAliasMap[F].insert(&A);
3509 }
3510
3511 for (auto &F : M) {
3512 if (F.isDeclaration() || IsMemProfClone(F))
3513 continue;
3514
3515 OptimizationRemarkEmitter ORE(&F);
3516
3517 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3518 bool ClonesCreated = false;
3519 unsigned NumClonesCreated = 0;
3520 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
3521 // We should at least have version 0 which is the original copy.
3522 assert(NumClones > 0);
3523 // If only one copy needed use original.
3524 if (NumClones == 1)
3525 return;
3526 // If we already performed cloning of this function, confirm that the
3527 // requested number of clones matches (the thin link should ensure the
3528 // number of clones for each constituent callsite is consistent within
3529 // each function), before returning.
3530 if (ClonesCreated) {
3531 assert(NumClonesCreated == NumClones);
3532 return;
3533 }
3534 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
3535 // The first "clone" is the original copy, which doesn't have a VMap.
3536 assert(VMaps.size() == NumClones - 1);
3537 Changed = true;
3538 ClonesCreated = true;
3539 NumClonesCreated = NumClones;
3540 };
3541
3542 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
3543 Function *CalledFunction) {
3544 // Perform cloning if not yet done.
3545 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3546
3547 // Should have skipped indirect calls via mayHaveMemprofSummary.
3548 assert(CalledFunction);
3549 assert(!IsMemProfClone(*CalledFunction));
3550
3551 // Update the calls per the summary info.
3552 // Save orig name since it gets updated in the first iteration
3553 // below.
3554 auto CalleeOrigName = CalledFunction->getName();
3555 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3556 // Do nothing if this version calls the original version of its
3557 // callee.
3558 if (!StackNode.Clones[J])
3559 continue;
3560 auto NewF = M.getOrInsertFunction(
3561 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3562 CalledFunction->getFunctionType());
3563 CallBase *CBClone;
3564 // Copy 0 is the original function.
3565 if (!J)
3566 CBClone = CB;
3567 else
3568 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3569 CBClone->setCalledFunction(NewF);
3570 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3571 << ore::NV("Call", CBClone) << " in clone "
3572 << ore::NV("Caller", CBClone->getFunction())
3573 << " assigned to call function clone "
3574 << ore::NV("Callee", NewF.getCallee()));
3575 }
3576 };
3577
3578 // Locate the summary for F.
3579 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
3580 // If not found, this could be an imported local (see comment in
3581 // findValueInfoForFunc). Skip for now as it will be cloned in its original
3582 // module (where it would have been promoted to global scope so should
3583 // satisfy any reference in this module).
3584 if (!TheFnVI)
3585 continue;
3586
3587 auto *GVSummary =
3588 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3589 if (!GVSummary) {
3590       // Must have been imported, use the summary which matches the definition
3591 // (might be multiple if this was a linkonce_odr).
3592 auto SrcModuleMD = F.getMetadata("thinlto_src_module");
3593 assert(SrcModuleMD &&
3594 "enable-import-metadata is needed to emit thinlto_src_module");
3595 StringRef SrcModule =
3596 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
3597 for (auto &GVS : TheFnVI.getSummaryList()) {
3598 if (GVS->modulePath() == SrcModule) {
3599 GVSummary = GVS.get();
3600 break;
3601 }
3602 }
3603 assert(GVSummary && GVSummary->modulePath() == SrcModule);
3604 }
3605
3606 // If this was an imported alias skip it as we won't have the function
3607 // summary, and it should be cloned in the original module.
3608 if (isa<AliasSummary>(GVSummary))
3609 continue;
3610
3611 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
3612
3613 if (FS->allocs().empty() && FS->callsites().empty())
3614 continue;
3615
3616 auto SI = FS->callsites().begin();
3617 auto AI = FS->allocs().begin();
3618
3619 // To handle callsite infos synthesized for tail calls which have missing
3620 // frames in the profiled context, map callee VI to the synthesized callsite
3621 // info.
3622 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
3623 // Iterate the callsites for this function in reverse, since we place all
3624 // those synthesized for tail calls at the end.
3625 for (auto CallsiteIt = FS->callsites().rbegin();
3626 CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
3627 auto &Callsite = *CallsiteIt;
3628 // Stop as soon as we see a non-synthesized callsite info (see comment
3629 // above loop). All the entries added for discovered tail calls have empty
3630 // stack ids.
3631 if (!Callsite.StackIdIndices.empty())
3632 break;
3633 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3634 }
3635
3636 // Assume for now that the instructions are in the exact same order
3637 // as when the summary was created, but confirm this is correct by
3638 // matching the stack ids.
3639 for (auto &BB : F) {
3640 for (auto &I : BB) {
3641 auto *CB = dyn_cast<CallBase>(&I);
3642 // Same handling as when creating module summary.
3643 if (!mayHaveMemprofSummary(CB))
3644 continue;
3645
3646 auto *CalledValue = CB->getCalledOperand();
3647 auto *CalledFunction = CB->getCalledFunction();
3648 if (CalledValue && !CalledFunction) {
3649 CalledValue = CalledValue->stripPointerCasts();
3650 // Stripping pointer casts can reveal a called function.
3651 CalledFunction = dyn_cast<Function>(CalledValue);
3652 }
3653 // Check if this is an alias to a function. If so, get the
3654 // called aliasee for the checks below.
3655 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3656 assert(!CalledFunction &&
3657 "Expected null called function in callsite for alias");
3658 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3659 }
3660
3661 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
3662 I.getMetadata(LLVMContext::MD_callsite));
3663 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3664
3665 // Include allocs that were already assigned a memprof function
3666 // attribute in the statistics.
3667 if (CB->getAttributes().hasFnAttr("memprof")) {
3668 assert(!MemProfMD);
3669 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3670 ? AllocTypeColdThinBackend++
3671 : AllocTypeNotColdThinBackend++;
3672 OrigAllocsThinBackend++;
3673 AllocVersionsThinBackend++;
3674 if (!MaxAllocVersionsThinBackend)
3675 MaxAllocVersionsThinBackend = 1;
3676 // Remove any remaining callsite metadata and we can skip the rest of
3677           // the handling for this instruction, since no cloning is needed.
3678 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3679 continue;
3680 }
3681
3682 if (MemProfMD) {
3683 // Consult the next alloc node.
3684 assert(AI != FS->allocs().end());
3685 auto &AllocNode = *(AI++);
3686
3687 // Sanity check that the MIB stack ids match between the summary and
3688 // instruction metadata.
3689 auto MIBIter = AllocNode.MIBs.begin();
3690 for (auto &MDOp : MemProfMD->operands()) {
3691 assert(MIBIter != AllocNode.MIBs.end());
3692 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3693 MIBIter->StackIdIndices.begin();
3694 auto *MIBMD = cast<const MDNode>(MDOp);
3695 MDNode *StackMDNode = getMIBStackNode(MIBMD);
3696 assert(StackMDNode);
3697 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
3698 auto ContextIterBegin =
3699 StackContext.beginAfterSharedPrefix(CallsiteContext);
3700 // Skip the checking on the first iteration.
3701 uint64_t LastStackContextId =
3702 (ContextIterBegin != StackContext.end() &&
3703 *ContextIterBegin == 0)
3704 ? 1
3705 : 0;
3706 for (auto ContextIter = ContextIterBegin;
3707 ContextIter != StackContext.end(); ++ContextIter) {
3708 // If this is a direct recursion, simply skip the duplicate
3709 // entries, to be consistent with how the summary ids were
3710 // generated during ModuleSummaryAnalysis.
3711 if (LastStackContextId == *ContextIter)
3712 continue;
3713 LastStackContextId = *ContextIter;
3714 assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3715 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3716 *ContextIter);
3717 StackIdIndexIter++;
3718 }
3719 MIBIter++;
3720 }
3721
3722 // Perform cloning if not yet done.
3723 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3724
3725 OrigAllocsThinBackend++;
3726 AllocVersionsThinBackend += AllocNode.Versions.size();
3727 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3728 MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3729
3730 // If there is only one version that means we didn't end up
3731 // considering this function for cloning, and in that case the alloc
3732 // will still be none type or should have gotten the default NotCold.
3733           // Skip it, but only after calling the clone helper above, since that does
3734           // some sanity checks confirming we haven't yet decided that cloning is needed.
3735 if (AllocNode.Versions.size() == 1) {
3736 assert((AllocationType)AllocNode.Versions[0] ==
3737 AllocationType::NotCold ||
3738 (AllocationType)AllocNode.Versions[0] ==
3739 AllocationType::None);
3740 UnclonableAllocsThinBackend++;
3741 continue;
3742 }
3743
3744 // All versions should have a singular allocation type.
3745 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3746 return Type == ((uint8_t)AllocationType::NotCold |
3747 (uint8_t)AllocationType::Cold);
3748 }));
3749
3750 // Update the allocation types per the summary info.
3751 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3752 // Ignore any that didn't get an assigned allocation type.
3753 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3754 continue;
3755 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3756 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3757 : AllocTypeNotColdThinBackend++;
3758 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3759 auto A = llvm::Attribute::get(F.getContext(), "memprof",
3760 AllocTypeString);
3761 CallBase *CBClone;
3762 // Copy 0 is the original function.
3763 if (!J)
3764 CBClone = CB;
3765 else
3766 // Since VMaps are only created for new clones, we index with
3767 // clone J-1 (J==0 is the original clone and does not have a VMaps
3768 // entry).
3769 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3770 CBClone->addFnAttr(A);
3771 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3772 << ore::NV("AllocationCall", CBClone) << " in clone "
3773 << ore::NV("Caller", CBClone->getFunction())
3774 << " marked with memprof allocation attribute "
3775 << ore::NV("Attribute", AllocTypeString));
3776 }
3777 } else if (!CallsiteContext.empty()) {
3778 // Consult the next callsite node.
3779 assert(SI != FS->callsites().end());
3780 auto &StackNode = *(SI++);
3781
3782 #ifndef NDEBUG
3783 // Sanity check that the stack ids match between the summary and
3784 // instruction metadata.
3785 auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3786 for (auto StackId : CallsiteContext) {
3787 assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3788 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3789 StackId);
3790 StackIdIndexIter++;
3791 }
3792 #endif
3793
3794 CloneCallsite(StackNode, CB, CalledFunction);
3795 } else if (CB->isTailCall()) {
3796 // Locate the synthesized callsite info for the callee VI, if any was
3797 // created, and use that for cloning.
3798 ValueInfo CalleeVI =
3799 findValueInfoForFunc(*CalledFunction, M, ImportSummary);
3800 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
3801 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
3802 assert(Callsite != MapTailCallCalleeVIToCallsite.end());
3803 CloneCallsite(Callsite->second, CB, CalledFunction);
3804 }
3805 }
3806 // Memprof and callsite metadata on memory allocations no longer needed.
3807 I.setMetadata(LLVMContext::MD_memprof, nullptr);
3808 I.setMetadata(LLVMContext::MD_callsite, nullptr);
3809 }
3810 }
3811 }
3812
3813 return Changed;
3814 }
3815
3816 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3817 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3818 if (DumpCCG) {
3819 dbgs() << "CCG before cloning:\n";
3820 dbgs() << *this;
3821 }
3822 if (ExportToDot)
3823 exportToDot("postbuild");
3824
3825 if (VerifyCCG) {
3826 check();
3827 }
3828
3829 identifyClones();
3830
3831 if (VerifyCCG) {
3832 check();
3833 }
3834
3835 if (DumpCCG) {
3836 dbgs() << "CCG after cloning:\n";
3837 dbgs() << *this;
3838 }
3839 if (ExportToDot)
3840 exportToDot("cloned");
3841
3842 bool Changed = assignFunctions();
3843
3844 if (DumpCCG) {
3845 dbgs() << "CCG after assigning function clones:\n";
3846 dbgs() << *this;
3847 }
3848 if (ExportToDot)
3849 exportToDot("clonefuncassign");
3850
3851 if (MemProfReportHintedSizes)
3852 printTotalSizes(errs());
3853
3854 return Changed;
3855 }
3856
3857 bool MemProfContextDisambiguation::processModule(
3858 Module &M,
3859 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3860
3861 // If we have an import summary, then the cloning decisions were made during
3862 // the thin link on the index. Apply them and return.
3863 if (ImportSummary)
3864 return applyImport(M);
3865
3866 // TODO: If/when other types of memprof cloning are enabled beyond just for
3867 // hot and cold, we will need to change this to individually control the
3868 // AllocationType passed to addStackNodesForMIB during CCG construction.
3869 // Note that we specifically check this after applying imports above, so that
3870 // the option isn't needed to be passed to distributed ThinLTO backend
3871 // clang processes, which won't necessarily have visibility into the linker
3872 // dependences. Instead the information is communicated from the LTO link to
3873 // the backends via the combined summary index.
3874 if (!SupportsHotColdNew)
3875 return false;
3876
3877 ModuleCallsiteContextGraph CCG(M, OREGetter);
3878 return CCG.process();
3879 }
3880
3881 MemProfContextDisambiguation::MemProfContextDisambiguation(
3882 const ModuleSummaryIndex *Summary)
3883 : ImportSummary(Summary) {
3884 if (ImportSummary) {
3885 // The MemProfImportSummary should only be used for testing ThinLTO
3886 // distributed backend handling via opt, in which case we don't have a
3887 // summary from the pass pipeline.
3888 assert(MemProfImportSummary.empty());
3889 return;
3890 }
3891 if (MemProfImportSummary.empty())
3892 return;
3893
3894 auto ReadSummaryFile =
3895 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3896 if (!ReadSummaryFile) {
3897 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3898 "Error loading file '" + MemProfImportSummary +
3899 "': ");
3900 return;
3901 }
3902 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3903 if (!ImportSummaryForTestingOrErr) {
3904 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3905 "Error parsing file '" + MemProfImportSummary +
3906 "': ");
3907 return;
3908 }
3909 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
3910 ImportSummary = ImportSummaryForTesting.get();
3911 }
3912
3913 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
3914 ModuleAnalysisManager &AM) {
3915 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
3916 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
3917 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
3918 };
3919 if (!processModule(M, OREGetter))
3920 return PreservedAnalyses::all();
3921 return PreservedAnalyses::none();
3922 }
3923
3924 void MemProfContextDisambiguation::run(
3925 ModuleSummaryIndex &Index,
3926 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
3927 isPrevailing) {
3928 // TODO: If/when other types of memprof cloning are enabled beyond just for
3929 // hot and cold, we will need to change this to individually control the
3930 // AllocationType passed to addStackNodesForMIB during CCG construction.
3931 // The index was set from the option, so these should be in sync.
3932 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
3933 if (!SupportsHotColdNew)
3934 return;
3935
3936 IndexCallsiteContextGraph CCG(Index, isPrevailing);
3937 CCG.process();
3938 }
3939