xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp (revision b64c5a0ace59af62eff52bfe110a521dc73c937b)
1 //==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements support for context disambiguation of allocation
10 // calls for profile guided heap optimization. Specifically, it uses Memprof
11 // profiles which indicate context specific allocation behavior (currently
12 // distinguishing cold vs hot memory allocations). Cloning is performed to
13 // expose the cold allocation call contexts, and the allocation calls are
14 // subsequently annotated with an attribute for later transformation.
15 //
16 // The transformations can be performed either directly on IR (regular LTO), or
17 // on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18 // Both types of LTO operate on a the same base graph representation, which
19 // uses CRTP to support either IR or Index formats.
20 //
21 //===----------------------------------------------------------------------===//
22 
23 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24 #include "llvm/ADT/DenseMap.h"
25 #include "llvm/ADT/DenseSet.h"
26 #include "llvm/ADT/MapVector.h"
27 #include "llvm/ADT/SetOperations.h"
28 #include "llvm/ADT/SmallPtrSet.h"
29 #include "llvm/ADT/SmallSet.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/Analysis/MemoryProfileInfo.h"
33 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
34 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
35 #include "llvm/Bitcode/BitcodeReader.h"
36 #include "llvm/IR/Constants.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/Module.h"
39 #include "llvm/IR/ModuleSummaryIndex.h"
40 #include "llvm/Pass.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/FileSystem.h"
43 #include "llvm/Support/GraphWriter.h"
44 #include "llvm/Support/raw_ostream.h"
45 #include "llvm/Transforms/IPO.h"
46 #include "llvm/Transforms/Utils/Cloning.h"
47 #include <deque>
48 #include <sstream>
49 #include <unordered_map>
50 #include <vector>
51 using namespace llvm;
52 using namespace llvm::memprof;
53 
54 #define DEBUG_TYPE "memprof-context-disambiguation"
55 
56 STATISTIC(FunctionClonesAnalysis,
57           "Number of function clones created during whole program analysis");
58 STATISTIC(FunctionClonesThinBackend,
59           "Number of function clones created during ThinLTO backend");
60 STATISTIC(FunctionsClonedThinBackend,
61           "Number of functions that had clones created during ThinLTO backend");
62 STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
63                             "cloned) during whole program analysis");
64 STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
65                          "during whole program analysis");
66 STATISTIC(AllocTypeNotColdThinBackend,
67           "Number of not cold static allocations (possibly cloned) during "
68           "ThinLTO backend");
69 STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
70                                     "(possibly cloned) during ThinLTO backend");
71 STATISTIC(OrigAllocsThinBackend,
72           "Number of original (not cloned) allocations with memprof profiles "
73           "during ThinLTO backend");
74 STATISTIC(
75     AllocVersionsThinBackend,
76     "Number of allocation versions (including clones) during ThinLTO backend");
77 STATISTIC(MaxAllocVersionsThinBackend,
78           "Maximum number of allocation versions created for an original "
79           "allocation during ThinLTO backend");
80 STATISTIC(UnclonableAllocsThinBackend,
81           "Number of unclonable ambigous allocations during ThinLTO backend");
82 STATISTIC(RemovedEdgesWithMismatchedCallees,
83           "Number of edges removed due to mismatched callees (profiled vs IR)");
84 STATISTIC(FoundProfiledCalleeCount,
85           "Number of profiled callees found via tail calls");
86 STATISTIC(FoundProfiledCalleeDepth,
87           "Aggregate depth of profiled callees found via tail calls");
88 STATISTIC(FoundProfiledCalleeMaxDepth,
89           "Maximum depth of profiled callees found via tail calls");
90 STATISTIC(FoundProfiledCalleeNonUniquelyCount,
91           "Number of profiled callees found via multiple tail call chains");
92 
93 static cl::opt<std::string> DotFilePathPrefix(
94     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
95     cl::value_desc("filename"),
96     cl::desc("Specify the path prefix of the MemProf dot files."));
97 
98 static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
99                                  cl::Hidden,
100                                  cl::desc("Export graph to dot files."));
101 
102 static cl::opt<bool>
103     DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
104             cl::desc("Dump CallingContextGraph to stdout after each stage."));
105 
106 static cl::opt<bool>
107     VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
108               cl::desc("Perform verification checks on CallingContextGraph."));
109 
110 static cl::opt<bool>
111     VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
112                 cl::desc("Perform frequent verification checks on nodes."));
113 
114 static cl::opt<std::string> MemProfImportSummary(
115     "memprof-import-summary",
116     cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
117     cl::Hidden);
118 
119 static cl::opt<unsigned>
120     TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
121                         cl::Hidden,
122                         cl::desc("Max depth to recursively search for missing "
123                                  "frames through tail calls."));
124 
125 namespace llvm {
126 cl::opt<bool> EnableMemProfContextDisambiguation(
127     "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
128     cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));
129 
130 // Indicate we are linking with an allocator that supports hot/cold operator
131 // new interfaces.
132 cl::opt<bool> SupportsHotColdNew(
133     "supports-hot-cold-new", cl::init(false), cl::Hidden,
134     cl::desc("Linking with hot/cold operator new interfaces"));
135 } // namespace llvm
136 
137 extern cl::opt<bool> MemProfReportHintedSizes;
138 
139 namespace {
140 /// CRTP base for graphs built from either IR or ThinLTO summary index.
141 ///
142 /// The graph represents the call contexts in all memprof metadata on allocation
143 /// calls, with nodes for the allocations themselves, as well as for the calls
144 /// in each context. The graph is initially built from the allocation memprof
145 /// metadata (or summary) MIBs. It is then updated to match calls with callsite
146 /// metadata onto the nodes, updating it to reflect any inlining performed on
147 /// those calls.
148 ///
149 /// Each MIB (representing an allocation's call context with allocation
150 /// behavior) is assigned a unique context id during the graph build. The edges
151 /// and nodes in the graph are decorated with the context ids they carry. This
152 /// is used to correctly update the graph when cloning is performed so that we
153 /// can uniquify the context for a single (possibly cloned) allocation.
154 template <typename DerivedCCG, typename FuncTy, typename CallTy>
155 class CallsiteContextGraph {
156 public:
157   CallsiteContextGraph() = default;
158   CallsiteContextGraph(const CallsiteContextGraph &) = default;
159   CallsiteContextGraph(CallsiteContextGraph &&) = default;
160 
161   /// Main entry point to perform analysis and transformations on graph.
162   bool process();
163 
164   /// Perform cloning on the graph necessary to uniquely identify the allocation
165   /// behavior of an allocation based on its context.
166   void identifyClones();
167 
168   /// Assign callsite clones to functions, cloning functions as needed to
169   /// accommodate the combinations of their callsite clones reached by callers.
170   /// For regular LTO this clones functions and callsites in the IR, but for
171   /// ThinLTO the cloning decisions are noted in the summaries and later applied
172   /// in applyImport.
173   bool assignFunctions();
174 
175   void dump() const;
176   void print(raw_ostream &OS) const;
177   void printTotalSizes(raw_ostream &OS) const;
178 
179   friend raw_ostream &operator<<(raw_ostream &OS,
180                                  const CallsiteContextGraph &CCG) {
181     CCG.print(OS);
182     return OS;
183   }
184 
185   friend struct GraphTraits<
186       const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
187   friend struct DOTGraphTraits<
188       const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
189 
190   void exportToDot(std::string Label) const;
191 
192   /// Represents a function clone via FuncTy pointer and clone number pair.
193   struct FuncInfo final
194       : public std::pair<FuncTy *, unsigned /*Clone number*/> {
195     using Base = std::pair<FuncTy *, unsigned>;
196     FuncInfo(const Base &B) : Base(B) {}
197     FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
198     explicit operator bool() const { return this->first != nullptr; }
199     FuncTy *func() const { return this->first; }
200     unsigned cloneNo() const { return this->second; }
201   };
202 
203   /// Represents a callsite clone via CallTy and clone number pair.
204   struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
205     using Base = std::pair<CallTy, unsigned>;
206     CallInfo(const Base &B) : Base(B) {}
207     CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
208         : Base(Call, CloneNo) {}
209     explicit operator bool() const { return (bool)this->first; }
210     CallTy call() const { return this->first; }
211     unsigned cloneNo() const { return this->second; }
212     void setCloneNo(unsigned N) { this->second = N; }
213     void print(raw_ostream &OS) const {
214       if (!operator bool()) {
215         assert(!cloneNo());
216         OS << "null Call";
217         return;
218       }
219       call()->print(OS);
220       OS << "\t(clone " << cloneNo() << ")";
221     }
222     void dump() const {
223       print(dbgs());
224       dbgs() << "\n";
225     }
226     friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
227       Call.print(OS);
228       return OS;
229     }
230   };
231 
232   struct ContextEdge;
233 
234   /// Node in the Callsite Context Graph
235   struct ContextNode {
236     // Keep this for now since in the IR case where we have an Instruction* it
237     // is not as immediately discoverable. Used for printing richer information
238     // when dumping graph.
239     bool IsAllocation;
240 
241     // Keeps track of when the Call was reset to null because there was
242     // recursion.
243     bool Recursive = false;
244 
245     // The corresponding allocation or interior call.
246     CallInfo Call;
247 
248     // For alloc nodes this is a unique id assigned when constructed, and for
249     // callsite stack nodes it is the original stack id when the node is
250     // constructed from the memprof MIB metadata on the alloc nodes. Note that
251     // this is only used when matching callsite metadata onto the stack nodes
252     // created when processing the allocation memprof MIBs, and for labeling
253     // nodes in the dot graph. Therefore we don't bother to assign a value for
254     // clones.
255     uint64_t OrigStackOrAllocId = 0;
256 
257     // This will be formed by ORing together the AllocationType enum values
258     // for contexts including this node.
259     uint8_t AllocTypes = 0;
260 
261     // Edges to all callees in the profiled call stacks.
262     // TODO: Should this be a map (from Callee node) for more efficient lookup?
263     std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
264 
265     // Edges to all callers in the profiled call stacks.
266     // TODO: Should this be a map (from Caller node) for more efficient lookup?
267     std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
268 
269     // Get the list of edges from which we can compute allocation information
270     // such as the context ids and allocation type of this node.
271     const std::vector<std::shared_ptr<ContextEdge>> *
272     getEdgesWithAllocInfo() const {
273       // If node has any callees, compute from those, otherwise compute from
274       // callers (i.e. if this is the leaf allocation node).
275       if (!CalleeEdges.empty())
276         return &CalleeEdges;
277       if (!CallerEdges.empty()) {
278         // A node with caller edges but no callee edges must be the allocation
279         // node.
280         assert(IsAllocation);
281         return &CallerEdges;
282       }
283       return nullptr;
284     }
285 
286     // Compute the context ids for this node from the union of its edge context
287     // ids.
288     DenseSet<uint32_t> getContextIds() const {
289       DenseSet<uint32_t> ContextIds;
290       auto *Edges = getEdgesWithAllocInfo();
291       if (!Edges)
292         return {};
293       unsigned Count = 0;
294       for (auto &Edge : *Edges)
295         Count += Edge->getContextIds().size();
296       ContextIds.reserve(Count);
297       for (auto &Edge : *Edges)
298         ContextIds.insert(Edge->getContextIds().begin(),
299                           Edge->getContextIds().end());
300       return ContextIds;
301     }
302 
303     // Compute the allocation type for this node from the OR of its edge
304     // allocation types.
305     uint8_t computeAllocType() const {
306       auto *Edges = getEdgesWithAllocInfo();
307       if (!Edges)
308         return (uint8_t)AllocationType::None;
309       uint8_t BothTypes =
310           (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
311       uint8_t AllocType = (uint8_t)AllocationType::None;
312       for (auto &Edge : *Edges) {
313         AllocType |= Edge->AllocTypes;
314         // Bail early if alloc type reached both, no further refinement.
315         if (AllocType == BothTypes)
316           return AllocType;
317       }
318       return AllocType;
319     }
320 
321     // The context ids set for this node is empty if its edge context ids are
322     // also all empty.
323     bool emptyContextIds() const {
324       auto *Edges = getEdgesWithAllocInfo();
325       if (!Edges)
326         return true;
327       for (auto &Edge : *Edges) {
328         if (!Edge->getContextIds().empty())
329           return false;
330       }
331       return true;
332     }
333 
334     // List of clones of this ContextNode, initially empty.
335     std::vector<ContextNode *> Clones;
336 
337     // If a clone, points to the original uncloned node.
338     ContextNode *CloneOf = nullptr;
339 
340     ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
341 
342     ContextNode(bool IsAllocation, CallInfo C)
343         : IsAllocation(IsAllocation), Call(C) {}
344 
345     void addClone(ContextNode *Clone) {
346       if (CloneOf) {
347         CloneOf->Clones.push_back(Clone);
348         Clone->CloneOf = CloneOf;
349       } else {
350         Clones.push_back(Clone);
351         assert(!Clone->CloneOf);
352         Clone->CloneOf = this;
353       }
354     }
355 
356     ContextNode *getOrigNode() {
357       if (!CloneOf)
358         return this;
359       return CloneOf;
360     }
361 
362     void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
363                                unsigned int ContextId);
364 
365     ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
366     ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
367     void eraseCalleeEdge(const ContextEdge *Edge);
368     void eraseCallerEdge(const ContextEdge *Edge);
369 
370     void setCall(CallInfo C) { Call = C; }
371 
372     bool hasCall() const { return (bool)Call.call(); }
373 
374     void printCall(raw_ostream &OS) const { Call.print(OS); }
375 
376     // True if this node was effectively removed from the graph, in which case
377     // it should have an allocation type of None and empty context ids.
378     bool isRemoved() const {
379       assert((AllocTypes == (uint8_t)AllocationType::None) ==
380              emptyContextIds());
381       return AllocTypes == (uint8_t)AllocationType::None;
382     }
383 
384     void dump() const;
385     void print(raw_ostream &OS) const;
386 
387     friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
388       Node.print(OS);
389       return OS;
390     }
391   };
392 
393   /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
394   /// callee.
395   struct ContextEdge {
396     ContextNode *Callee;
397     ContextNode *Caller;
398 
399     // This will be formed by ORing together the AllocationType enum values
400     // for contexts including this edge.
401     uint8_t AllocTypes = 0;
402 
403     // The set of IDs for contexts including this edge.
404     DenseSet<uint32_t> ContextIds;
405 
406     ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
407                 DenseSet<uint32_t> ContextIds)
408         : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
409           ContextIds(std::move(ContextIds)) {}
410 
411     DenseSet<uint32_t> &getContextIds() { return ContextIds; }
412 
413     void dump() const;
414     void print(raw_ostream &OS) const;
415 
416     friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
417       Edge.print(OS);
418       return OS;
419     }
420   };
421 
422   /// Helpers to remove callee edges that have allocation type None (due to not
423   /// carrying any context ids) after transformations.
424   void removeNoneTypeCalleeEdges(ContextNode *Node);
425   void
426   recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
427                                        DenseSet<const ContextNode *> &Visited);
428 
429 protected:
430   /// Get a list of nodes corresponding to the stack ids in the given callsite
431   /// context.
432   template <class NodeT, class IteratorT>
433   std::vector<uint64_t>
434   getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
435 
436   /// Adds nodes for the given allocation and any stack ids on its memprof MIB
437   /// metadata (or summary).
438   ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
439 
440   /// Adds nodes for the given MIB stack ids.
441   template <class NodeT, class IteratorT>
442   void addStackNodesForMIB(ContextNode *AllocNode,
443                            CallStack<NodeT, IteratorT> &StackContext,
444                            CallStack<NodeT, IteratorT> &CallsiteContext,
445                            AllocationType AllocType, uint64_t TotalSize);
446 
447   /// Matches all callsite metadata (or summary) to the nodes created for
448   /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
449   /// inlining performed on those callsite instructions.
450   void updateStackNodes();
451 
452   /// Update graph to conservatively handle any callsite stack nodes that target
453   /// multiple different callee target functions.
454   void handleCallsitesWithMultipleTargets();
455 
456   /// Save lists of calls with MemProf metadata in each function, for faster
457   /// iteration.
458   MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;
459 
460   /// Map from callsite node to the enclosing caller function.
461   std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
462 
463 private:
464   using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
465 
466   using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
467                                      const FuncTy *, DenseSet<uint32_t>>;
468 
469   /// Assigns the given Node to calls at or inlined into the location with
470   /// the Node's stack id, after post order traversing and processing its
471   /// caller nodes. Uses the call information recorded in the given
472   /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
473   /// as needed. Called by updateStackNodes which sets up the given
474   /// StackIdToMatchingCalls map.
475   void assignStackNodesPostOrder(
476       ContextNode *Node, DenseSet<const ContextNode *> &Visited,
477       DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
478 
479   /// Duplicates the given set of context ids, updating the provided
480   /// map from each original id with the newly generated context ids,
481   /// and returning the new duplicated id set.
482   DenseSet<uint32_t> duplicateContextIds(
483       const DenseSet<uint32_t> &StackSequenceContextIds,
484       DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
485 
486   /// Propagates all duplicated context ids across the graph.
487   void propagateDuplicateContextIds(
488       const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
489 
490   /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
491   /// else to its callers. Also updates OrigNode's edges to remove any context
492   /// ids moved to the newly created edge.
493   void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
494                       bool TowardsCallee,
495                       DenseSet<uint32_t> RemainingContextIds);
496 
497   /// Get the stack id corresponding to the given Id or Index (for IR this will
498   /// return itself, for a summary index this will return the id recorded in the
499   /// index for that stack id index value).
500   uint64_t getStackId(uint64_t IdOrIndex) const {
501     return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
502   }
503 
504   /// Returns true if the given call targets the callee of the given edge, or if
505   /// we were able to identify the call chain through intermediate tail calls.
506   /// In the latter case new context nodes are added to the graph for the
507   /// identified tail calls, and their synthesized nodes are added to
508   /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
509   /// the updated edges and to prepare it for an increment in the caller.
510   bool
511   calleesMatch(CallTy Call, EdgeIter &EI,
512                MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);
513 
514   /// Returns true if the given call targets the given function, or if we were
515   /// able to identify the call chain through intermediate tail calls (in which
516   /// case FoundCalleeChain will be populated).
517   bool calleeMatchesFunc(
518       CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
519       std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
520     return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
521         Call, Func, CallerFunc, FoundCalleeChain);
522   }
523 
524   /// Get a list of nodes corresponding to the stack ids in the given
525   /// callsite's context.
526   std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
527     return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
528         Call);
529   }
530 
531   /// Get the last stack id in the context for callsite.
532   uint64_t getLastStackId(CallTy Call) {
533     return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
534   }
535 
536   /// Update the allocation call to record type of allocated memory.
537   void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
538     AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
539     static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
540   }
541 
542   /// Update non-allocation call to invoke (possibly cloned) function
543   /// CalleeFunc.
544   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
545     static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
546   }
547 
548   /// Clone the given function for the given callsite, recording mapping of all
549   /// of the functions tracked calls to their new versions in the CallMap.
550   /// Assigns new clones to clone number CloneNo.
551   FuncInfo cloneFunctionForCallsite(
552       FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
553       std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
554     return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
555         Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
556   }
557 
558   /// Gets a label to use in the dot graph for the given call clone in the given
559   /// function.
560   std::string getLabel(const FuncTy *Func, const CallTy Call,
561                        unsigned CloneNo) const {
562     return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
563   }
564 
565   /// Helpers to find the node corresponding to the given call or stackid.
566   ContextNode *getNodeForInst(const CallInfo &C);
567   ContextNode *getNodeForAlloc(const CallInfo &C);
568   ContextNode *getNodeForStackId(uint64_t StackId);
569 
570   /// Computes the alloc type corresponding to the given context ids, by
571   /// unioning their recorded alloc types.
572   uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
573 
574   /// Returns the allocation type of the intersection of the contexts of two
575   /// nodes (based on their provided context id sets), optimized for the case
576   /// when Node1Ids is smaller than Node2Ids.
577   uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
578                                   const DenseSet<uint32_t> &Node2Ids);
579 
580   /// Returns the allocation type of the intersection of the contexts of two
581   /// nodes (based on their provided context id sets).
582   uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
583                               const DenseSet<uint32_t> &Node2Ids);
584 
585   /// Create a clone of Edge's callee and move Edge to that new callee node,
586   /// performing the necessary context id and allocation type updates.
587   /// If callee's caller edge iterator is supplied, it is updated when removing
588   /// the edge from that list. If ContextIdsToMove is non-empty, only that
589   /// subset of Edge's ids are moved to an edge to the new callee.
590   ContextNode *
591   moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
592                            EdgeIter *CallerEdgeI = nullptr,
593                            DenseSet<uint32_t> ContextIdsToMove = {});
594 
595   /// Change the callee of Edge to existing callee clone NewCallee, performing
596   /// the necessary context id and allocation type updates.
597   /// If callee's caller edge iterator is supplied, it is updated when removing
598   /// the edge from that list. If ContextIdsToMove is non-empty, only that
599   /// subset of Edge's ids are moved to an edge to the new callee.
600   void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
601                                      ContextNode *NewCallee,
602                                      EdgeIter *CallerEdgeI = nullptr,
603                                      bool NewClone = false,
604                                      DenseSet<uint32_t> ContextIdsToMove = {});
605 
606   /// Recursively perform cloning on the graph for the given Node and its
607   /// callers, in order to uniquely identify the allocation behavior of an
608   /// allocation given its context. The context ids of the allocation being
609   /// processed are given in AllocContextIds.
610   void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
611                       const DenseSet<uint32_t> &AllocContextIds);
612 
613   /// Map from each context ID to the AllocationType assigned to that context.
614   DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;
615 
616   /// Map from each contextID to the profiled aggregate allocation size,
617   /// optionally populated when requested (via MemProfReportHintedSizes).
618   DenseMap<uint32_t, uint64_t> ContextIdToTotalSize;
619 
620   /// Identifies the context node created for a stack id when adding the MIB
621   /// contexts to the graph. This is used to locate the context nodes when
622   /// trying to assign the corresponding callsites with those stack ids to these
623   /// nodes.
624   DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
625 
626   /// Maps to track the calls to their corresponding nodes in the graph.
627   MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
628   MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
629 
630   /// Owner of all ContextNode unique_ptrs.
631   std::vector<std::unique_ptr<ContextNode>> NodeOwner;
632 
633   /// Perform sanity checks on graph when requested.
634   void check() const;
635 
636   /// Keeps track of the last unique context id assigned.
637   unsigned int LastContextId = 0;
638 };
639 
640 template <typename DerivedCCG, typename FuncTy, typename CallTy>
641 using ContextNode =
642     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
643 template <typename DerivedCCG, typename FuncTy, typename CallTy>
644 using ContextEdge =
645     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
646 template <typename DerivedCCG, typename FuncTy, typename CallTy>
647 using FuncInfo =
648     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
649 template <typename DerivedCCG, typename FuncTy, typename CallTy>
650 using CallInfo =
651     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
652 
653 /// CRTP derived class for graphs built from IR (regular LTO).
654 class ModuleCallsiteContextGraph
655     : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
656                                   Instruction *> {
657 public:
658   ModuleCallsiteContextGraph(
659       Module &M,
660       llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
661 
662 private:
663   friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
664                               Instruction *>;
665 
666   uint64_t getStackId(uint64_t IdOrIndex) const;
667   bool calleeMatchesFunc(
668       Instruction *Call, const Function *Func, const Function *CallerFunc,
669       std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
670   bool findProfiledCalleeThroughTailCalls(
671       const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
672       std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
673       bool &FoundMultipleCalleeChains);
674   uint64_t getLastStackId(Instruction *Call);
675   std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
676   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
677   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
678   CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
679                        Instruction *>::FuncInfo
680   cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
681                            std::map<CallInfo, CallInfo> &CallMap,
682                            std::vector<CallInfo> &CallsWithMetadataInFunc,
683                            unsigned CloneNo);
684   std::string getLabel(const Function *Func, const Instruction *Call,
685                        unsigned CloneNo) const;
686 
687   const Module &Mod;
688   llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
689 };
690 
691 /// Represents a call in the summary index graph, which can either be an
692 /// allocation or an interior callsite node in an allocation's context.
693 /// Holds a pointer to the corresponding data structure in the index.
694 struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
695   IndexCall() : PointerUnion() {}
696   IndexCall(std::nullptr_t) : IndexCall() {}
697   IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
698   IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
699   IndexCall(PointerUnion PT) : PointerUnion(PT) {}
700 
701   IndexCall *operator->() { return this; }
702 
703   PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
704 
705   void print(raw_ostream &OS) const {
706     if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
707       OS << *AI;
708     } else {
709       auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
710       assert(CI);
711       OS << *CI;
712     }
713   }
714 };
715 
716 /// CRTP derived class for graphs built from summary index (ThinLTO).
717 class IndexCallsiteContextGraph
718     : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
719                                   IndexCall> {
720 public:
721   IndexCallsiteContextGraph(
722       ModuleSummaryIndex &Index,
723       llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
724           isPrevailing);
725 
726   ~IndexCallsiteContextGraph() {
727     // Now that we are done with the graph it is safe to add the new
728     // CallsiteInfo structs to the function summary vectors. The graph nodes
729     // point into locations within these vectors, so we don't want to add them
730     // any earlier.
731     for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
732       auto *FS = I.first;
733       for (auto &Callsite : I.second)
734         FS->addCallsite(*Callsite.second);
735     }
736   }
737 
738 private:
739   friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
740                               IndexCall>;
741 
742   uint64_t getStackId(uint64_t IdOrIndex) const;
743   bool calleeMatchesFunc(
744       IndexCall &Call, const FunctionSummary *Func,
745       const FunctionSummary *CallerFunc,
746       std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
747   bool findProfiledCalleeThroughTailCalls(
748       ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
749       std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
750       bool &FoundMultipleCalleeChains);
751   uint64_t getLastStackId(IndexCall &Call);
752   std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
753   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
754   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
755   CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
756                        IndexCall>::FuncInfo
757   cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
758                            std::map<CallInfo, CallInfo> &CallMap,
759                            std::vector<CallInfo> &CallsWithMetadataInFunc,
760                            unsigned CloneNo);
761   std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
762                        unsigned CloneNo) const;
763 
764   // Saves mapping from function summaries containing memprof records back to
765   // its VI, for use in checking and debugging.
766   std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
767 
768   const ModuleSummaryIndex &Index;
769   llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
770       isPrevailing;
771 
772   // Saves/owns the callsite info structures synthesized for missing tail call
773   // frames that we discover while building the graph.
774   // It maps from the summary of the function making the tail call, to a map
775   // of callee ValueInfo to corresponding synthesized callsite info.
776   std::unordered_map<FunctionSummary *,
777                      std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
778       FunctionCalleesToSynthesizedCallsiteInfos;
779 };
780 } // namespace
781 
782 namespace llvm {
783 template <>
784 struct DenseMapInfo<typename CallsiteContextGraph<
785     ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
786     : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
787 template <>
788 struct DenseMapInfo<typename CallsiteContextGraph<
789     IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
790     : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
791 template <>
792 struct DenseMapInfo<IndexCall>
793     : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
794 } // end namespace llvm
795 
796 namespace {
797 
798 struct FieldSeparator {
799   bool Skip = true;
800   const char *Sep;
801 
802   FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
803 };
804 
805 raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
806   if (FS.Skip) {
807     FS.Skip = false;
808     return OS;
809   }
810   return OS << FS.Sep;
811 }
812 
813 // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
814 // type we should actually use on the corresponding allocation.
815 // If we can't clone a node that has NotCold+Cold alloc type, we will fall
816 // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
817 // from NotCold.
818 AllocationType allocTypeToUse(uint8_t AllocTypes) {
819   assert(AllocTypes != (uint8_t)AllocationType::None);
820   if (AllocTypes ==
821       ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
822     return AllocationType::NotCold;
823   else
824     return (AllocationType)AllocTypes;
825 }
826 
827 // Helper to check if the alloc types for all edges recorded in the
828 // InAllocTypes vector match the alloc types for all edges in the Edges
829 // vector.
830 template <typename DerivedCCG, typename FuncTy, typename CallTy>
831 bool allocTypesMatch(
832     const std::vector<uint8_t> &InAllocTypes,
833     const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
834         &Edges) {
835   return std::equal(
836       InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
837       [](const uint8_t &l,
838          const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
839         // Can share if one of the edges is None type - don't
840         // care about the type along that edge as it doesn't
841         // exist for those context ids.
842         if (l == (uint8_t)AllocationType::None ||
843             r->AllocTypes == (uint8_t)AllocationType::None)
844           return true;
845         return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
846       });
847 }
848 
849 } // end anonymous namespace
850 
851 template <typename DerivedCCG, typename FuncTy, typename CallTy>
852 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
853 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
854     const CallInfo &C) {
855   ContextNode *Node = getNodeForAlloc(C);
856   if (Node)
857     return Node;
858 
859   return NonAllocationCallToContextNodeMap.lookup(C);
860 }
861 
862 template <typename DerivedCCG, typename FuncTy, typename CallTy>
863 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
864 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
865     const CallInfo &C) {
866   return AllocationCallToContextNodeMap.lookup(C);
867 }
868 
869 template <typename DerivedCCG, typename FuncTy, typename CallTy>
870 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
871 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
872     uint64_t StackId) {
873   auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
874   if (StackEntryNode != StackEntryIdToContextNodeMap.end())
875     return StackEntryNode->second;
876   return nullptr;
877 }
878 
879 template <typename DerivedCCG, typename FuncTy, typename CallTy>
880 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
881     addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
882                           unsigned int ContextId) {
883   for (auto &Edge : CallerEdges) {
884     if (Edge->Caller == Caller) {
885       Edge->AllocTypes |= (uint8_t)AllocType;
886       Edge->getContextIds().insert(ContextId);
887       return;
888     }
889   }
890   std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
891       this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
892   CallerEdges.push_back(Edge);
893   Caller->CalleeEdges.push_back(Edge);
894 }
895 
896 template <typename DerivedCCG, typename FuncTy, typename CallTy>
897 void CallsiteContextGraph<
898     DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
899   for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
900     auto Edge = *EI;
901     if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
902       assert(Edge->ContextIds.empty());
903       Edge->Callee->eraseCallerEdge(Edge.get());
904       EI = Node->CalleeEdges.erase(EI);
905     } else
906       ++EI;
907   }
908 }
909 
910 template <typename DerivedCCG, typename FuncTy, typename CallTy>
911 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
912 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
913     findEdgeFromCallee(const ContextNode *Callee) {
914   for (const auto &Edge : CalleeEdges)
915     if (Edge->Callee == Callee)
916       return Edge.get();
917   return nullptr;
918 }
919 
920 template <typename DerivedCCG, typename FuncTy, typename CallTy>
921 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
922 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
923     findEdgeFromCaller(const ContextNode *Caller) {
924   for (const auto &Edge : CallerEdges)
925     if (Edge->Caller == Caller)
926       return Edge.get();
927   return nullptr;
928 }
929 
930 template <typename DerivedCCG, typename FuncTy, typename CallTy>
931 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
932     eraseCalleeEdge(const ContextEdge *Edge) {
933   auto EI = llvm::find_if(
934       CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
935         return CalleeEdge.get() == Edge;
936       });
937   assert(EI != CalleeEdges.end());
938   CalleeEdges.erase(EI);
939 }
940 
941 template <typename DerivedCCG, typename FuncTy, typename CallTy>
942 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
943     eraseCallerEdge(const ContextEdge *Edge) {
944   auto EI = llvm::find_if(
945       CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
946         return CallerEdge.get() == Edge;
947       });
948   assert(EI != CallerEdges.end());
949   CallerEdges.erase(EI);
950 }
951 
952 template <typename DerivedCCG, typename FuncTy, typename CallTy>
953 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
954     DenseSet<uint32_t> &ContextIds) {
955   uint8_t BothTypes =
956       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
957   uint8_t AllocType = (uint8_t)AllocationType::None;
958   for (auto Id : ContextIds) {
959     AllocType |= (uint8_t)ContextIdToAllocationType[Id];
960     // Bail early if alloc type reached both, no further refinement.
961     if (AllocType == BothTypes)
962       return AllocType;
963   }
964   return AllocType;
965 }
966 
967 template <typename DerivedCCG, typename FuncTy, typename CallTy>
968 uint8_t
969 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
970     const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
971   uint8_t BothTypes =
972       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
973   uint8_t AllocType = (uint8_t)AllocationType::None;
974   for (auto Id : Node1Ids) {
975     if (!Node2Ids.count(Id))
976       continue;
977     AllocType |= (uint8_t)ContextIdToAllocationType[Id];
978     // Bail early if alloc type reached both, no further refinement.
979     if (AllocType == BothTypes)
980       return AllocType;
981   }
982   return AllocType;
983 }
984 
985 template <typename DerivedCCG, typename FuncTy, typename CallTy>
986 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
987     const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
988   if (Node1Ids.size() < Node2Ids.size())
989     return intersectAllocTypesImpl(Node1Ids, Node2Ids);
990   else
991     return intersectAllocTypesImpl(Node2Ids, Node1Ids);
992 }
993 
994 template <typename DerivedCCG, typename FuncTy, typename CallTy>
995 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
996 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
997     CallInfo Call, const FuncTy *F) {
998   assert(!getNodeForAlloc(Call));
999   NodeOwner.push_back(
1000       std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
1001   ContextNode *AllocNode = NodeOwner.back().get();
1002   AllocationCallToContextNodeMap[Call] = AllocNode;
1003   NodeToCallingFunc[AllocNode] = F;
1004   // Use LastContextId as a uniq id for MIB allocation nodes.
1005   AllocNode->OrigStackOrAllocId = LastContextId;
1006   // Alloc type should be updated as we add in the MIBs. We should assert
1007   // afterwards that it is not still None.
1008   AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1009 
1010   return AllocNode;
1011 }
1012 
1013 static std::string getAllocTypeString(uint8_t AllocTypes) {
1014   if (!AllocTypes)
1015     return "None";
1016   std::string Str;
1017   if (AllocTypes & (uint8_t)AllocationType::NotCold)
1018     Str += "NotCold";
1019   if (AllocTypes & (uint8_t)AllocationType::Cold)
1020     Str += "Cold";
1021   return Str;
1022 }
1023 
1024 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1025 template <class NodeT, class IteratorT>
1026 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1027     ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1028     CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1029     uint64_t TotalSize) {
1030   assert(!MemProfReportHintedSizes || TotalSize > 0);
1031   // Treating the hot alloc type as NotCold before the disambiguation for "hot"
1032   // is done.
1033   if (AllocType == AllocationType::Hot)
1034     AllocType = AllocationType::NotCold;
1035 
1036   ContextIdToAllocationType[++LastContextId] = AllocType;
1037 
1038   if (MemProfReportHintedSizes) {
1039     assert(TotalSize);
1040     ContextIdToTotalSize[LastContextId] = TotalSize;
1041   }
1042 
1043   // Update alloc type and context ids for this MIB.
1044   AllocNode->AllocTypes |= (uint8_t)AllocType;
1045 
1046   // Now add or update nodes for each stack id in alloc's context.
1047   // Later when processing the stack ids on non-alloc callsites we will adjust
1048   // for any inlining in the context.
1049   ContextNode *PrevNode = AllocNode;
1050   // Look for recursion (direct recursion should have been collapsed by
1051   // module summary analysis, here we should just be detecting mutual
1052   // recursion). Mark these nodes so we don't try to clone.
1053   SmallSet<uint64_t, 8> StackIdSet;
1054   // Skip any on the allocation call (inlining).
1055   for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1056        ContextIter != StackContext.end(); ++ContextIter) {
1057     auto StackId = getStackId(*ContextIter);
1058     ContextNode *StackNode = getNodeForStackId(StackId);
1059     if (!StackNode) {
1060       NodeOwner.push_back(
1061           std::make_unique<ContextNode>(/*IsAllocation=*/false));
1062       StackNode = NodeOwner.back().get();
1063       StackEntryIdToContextNodeMap[StackId] = StackNode;
1064       StackNode->OrigStackOrAllocId = StackId;
1065     }
1066     auto Ins = StackIdSet.insert(StackId);
1067     if (!Ins.second)
1068       StackNode->Recursive = true;
1069     StackNode->AllocTypes |= (uint8_t)AllocType;
1070     PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1071     PrevNode = StackNode;
1072   }
1073 }
1074 
1075 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1076 DenseSet<uint32_t>
1077 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1078     const DenseSet<uint32_t> &StackSequenceContextIds,
1079     DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1080   DenseSet<uint32_t> NewContextIds;
1081   for (auto OldId : StackSequenceContextIds) {
1082     NewContextIds.insert(++LastContextId);
1083     OldToNewContextIds[OldId].insert(LastContextId);
1084     assert(ContextIdToAllocationType.count(OldId));
1085     // The new context has the same allocation type as original.
1086     ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1087     // For now set this to 0 so we don't duplicate sizes. Not clear how to divvy
1088     // up the size. Assume that if we are able to duplicate context ids that we
1089     // will be able to disambiguate all copies.
1090     ContextIdToTotalSize[LastContextId] = 0;
1091   }
1092   return NewContextIds;
1093 }
1094 
1095 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1096 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1097     propagateDuplicateContextIds(
1098         const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1099   // Build a set of duplicated context ids corresponding to the input id set.
1100   auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1101     DenseSet<uint32_t> NewIds;
1102     for (auto Id : ContextIds)
1103       if (auto NewId = OldToNewContextIds.find(Id);
1104           NewId != OldToNewContextIds.end())
1105         NewIds.insert(NewId->second.begin(), NewId->second.end());
1106     return NewIds;
1107   };
1108 
1109   // Recursively update context ids sets along caller edges.
1110   auto UpdateCallers = [&](ContextNode *Node,
1111                            DenseSet<const ContextEdge *> &Visited,
1112                            auto &&UpdateCallers) -> void {
1113     for (const auto &Edge : Node->CallerEdges) {
1114       auto Inserted = Visited.insert(Edge.get());
1115       if (!Inserted.second)
1116         continue;
1117       ContextNode *NextNode = Edge->Caller;
1118       DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1119       // Only need to recursively iterate to NextNode via this caller edge if
1120       // it resulted in any added ids to NextNode.
1121       if (!NewIdsToAdd.empty()) {
1122         Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
1123         UpdateCallers(NextNode, Visited, UpdateCallers);
1124       }
1125     }
1126   };
1127 
1128   DenseSet<const ContextEdge *> Visited;
1129   for (auto &Entry : AllocationCallToContextNodeMap) {
1130     auto *Node = Entry.second;
1131     UpdateCallers(Node, Visited, UpdateCallers);
1132   }
1133 }
1134 
1135 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1136 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1137     ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1138     // This must be passed by value to make a copy since it will be adjusted
1139     // as ids are moved.
1140     DenseSet<uint32_t> RemainingContextIds) {
1141   auto &OrigEdges =
1142       TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1143   // Increment iterator in loop so that we can remove edges as needed.
1144   for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1145     auto Edge = *EI;
1146     // Remove any matching context ids from Edge, return set that were found and
1147     // removed, these are the new edge's context ids. Also update the remaining
1148     // (not found ids).
1149     DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
1150     set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1151                  NotFoundContextIds);
1152     RemainingContextIds.swap(NotFoundContextIds);
1153     // If no matching context ids for this edge, skip it.
1154     if (NewEdgeContextIds.empty()) {
1155       ++EI;
1156       continue;
1157     }
1158     if (TowardsCallee) {
1159       uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1160       auto NewEdge = std::make_shared<ContextEdge>(
1161           Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1162       NewNode->CalleeEdges.push_back(NewEdge);
1163       NewEdge->Callee->CallerEdges.push_back(NewEdge);
1164     } else {
1165       uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1166       auto NewEdge = std::make_shared<ContextEdge>(
1167           NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1168       NewNode->CallerEdges.push_back(NewEdge);
1169       NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1170     }
1171     // Remove old edge if context ids empty.
1172     if (Edge->getContextIds().empty()) {
1173       if (TowardsCallee) {
1174         Edge->Callee->eraseCallerEdge(Edge.get());
1175         EI = OrigNode->CalleeEdges.erase(EI);
1176       } else {
1177         Edge->Caller->eraseCalleeEdge(Edge.get());
1178         EI = OrigNode->CallerEdges.erase(EI);
1179       }
1180       continue;
1181     }
1182     ++EI;
1183   }
1184 }
1185 
1186 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1187 static void checkEdge(
1188     const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1189   // Confirm that alloc type is not None and that we have at least one context
1190   // id.
1191   assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1192   assert(!Edge->ContextIds.empty());
1193 }
1194 
1195 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1196 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1197                       bool CheckEdges = true) {
1198   if (Node->isRemoved())
1199     return;
1200 #ifndef NDEBUG
1201   // Compute node's context ids once for use in asserts.
1202   auto NodeContextIds = Node->getContextIds();
1203 #endif
1204   // Node's context ids should be the union of both its callee and caller edge
1205   // context ids.
1206   if (Node->CallerEdges.size()) {
1207     DenseSet<uint32_t> CallerEdgeContextIds(
1208         Node->CallerEdges.front()->ContextIds);
1209     for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1210       if (CheckEdges)
1211         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1212       set_union(CallerEdgeContextIds, Edge->ContextIds);
1213     }
1214     // Node can have more context ids than callers if some contexts terminate at
1215     // node and some are longer.
1216     assert(NodeContextIds == CallerEdgeContextIds ||
1217            set_is_subset(CallerEdgeContextIds, NodeContextIds));
1218   }
1219   if (Node->CalleeEdges.size()) {
1220     DenseSet<uint32_t> CalleeEdgeContextIds(
1221         Node->CalleeEdges.front()->ContextIds);
1222     for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1223       if (CheckEdges)
1224         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1225       set_union(CalleeEdgeContextIds, Edge->getContextIds());
1226     }
1227     assert(NodeContextIds == CalleeEdgeContextIds);
1228   }
1229 }
1230 
1231 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1232 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1233     assignStackNodesPostOrder(ContextNode *Node,
1234                               DenseSet<const ContextNode *> &Visited,
1235                               DenseMap<uint64_t, std::vector<CallContextInfo>>
1236                                   &StackIdToMatchingCalls) {
1237   auto Inserted = Visited.insert(Node);
1238   if (!Inserted.second)
1239     return;
1240   // Post order traversal. Iterate over a copy since we may add nodes and
1241   // therefore new callers during the recursive call, invalidating any
1242   // iterator over the original edge vector. We don't need to process these
1243   // new nodes as they were already processed on creation.
1244   auto CallerEdges = Node->CallerEdges;
1245   for (auto &Edge : CallerEdges) {
1246     // Skip any that have been removed during the recursion.
1247     if (!Edge)
1248       continue;
1249     assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1250   }
1251 
1252   // If this node's stack id is in the map, update the graph to contain new
1253   // nodes representing any inlining at interior callsites. Note we move the
1254   // associated context ids over to the new nodes.
1255 
1256   // Ignore this node if it is for an allocation or we didn't record any
1257   // stack id lists ending at it.
1258   if (Node->IsAllocation ||
1259       !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1260     return;
1261 
1262   auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1263   // Handle the simple case first. A single call with a single stack id.
1264   // In this case there is no need to create any new context nodes, simply
1265   // assign the context node for stack id to this Call.
1266   if (Calls.size() == 1) {
1267     auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1268     if (Ids.size() == 1) {
1269       assert(SavedContextIds.empty());
1270       // It should be this Node
1271       assert(Node == getNodeForStackId(Ids[0]));
1272       if (Node->Recursive)
1273         return;
1274       Node->setCall(Call);
1275       NonAllocationCallToContextNodeMap[Call] = Node;
1276       NodeToCallingFunc[Node] = Func;
1277       return;
1278     }
1279   }
1280 
1281   // Find the node for the last stack id, which should be the same
1282   // across all calls recorded for this id, and is this node's id.
1283   uint64_t LastId = Node->OrigStackOrAllocId;
1284   ContextNode *LastNode = getNodeForStackId(LastId);
1285   // We should only have kept stack ids that had nodes.
1286   assert(LastNode);
1287 
1288   for (unsigned I = 0; I < Calls.size(); I++) {
1289     auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1290     // Skip any for which we didn't assign any ids, these don't get a node in
1291     // the graph.
1292     if (SavedContextIds.empty())
1293       continue;
1294 
1295     assert(LastId == Ids.back());
1296 
1297     ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1298     assert(FirstNode);
1299 
1300     // Recompute the context ids for this stack id sequence (the
1301     // intersection of the context ids of the corresponding nodes).
1302     // Start with the ids we saved in the map for this call, which could be
1303     // duplicated context ids. We have to recompute as we might have overlap
1304     // overlap between the saved context ids for different last nodes, and
1305     // removed them already during the post order traversal.
1306     set_intersect(SavedContextIds, FirstNode->getContextIds());
1307     ContextNode *PrevNode = nullptr;
1308     for (auto Id : Ids) {
1309       ContextNode *CurNode = getNodeForStackId(Id);
1310       // We should only have kept stack ids that had nodes and weren't
1311       // recursive.
1312       assert(CurNode);
1313       assert(!CurNode->Recursive);
1314       if (!PrevNode) {
1315         PrevNode = CurNode;
1316         continue;
1317       }
1318       auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1319       if (!Edge) {
1320         SavedContextIds.clear();
1321         break;
1322       }
1323       PrevNode = CurNode;
1324       set_intersect(SavedContextIds, Edge->getContextIds());
1325 
1326       // If we now have no context ids for clone, skip this call.
1327       if (SavedContextIds.empty())
1328         break;
1329     }
1330     if (SavedContextIds.empty())
1331       continue;
1332 
1333     // Create new context node.
1334     NodeOwner.push_back(
1335         std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1336     ContextNode *NewNode = NodeOwner.back().get();
1337     NodeToCallingFunc[NewNode] = Func;
1338     NonAllocationCallToContextNodeMap[Call] = NewNode;
1339     NewNode->AllocTypes = computeAllocType(SavedContextIds);
1340 
1341     // Connect to callees of innermost stack frame in inlined call chain.
1342     // This updates context ids for FirstNode's callee's to reflect those
1343     // moved to NewNode.
1344     connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1345 
1346     // Connect to callers of outermost stack frame in inlined call chain.
1347     // This updates context ids for FirstNode's caller's to reflect those
1348     // moved to NewNode.
1349     connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1350 
1351     // Now we need to remove context ids from edges/nodes between First and
1352     // Last Node.
1353     PrevNode = nullptr;
1354     for (auto Id : Ids) {
1355       ContextNode *CurNode = getNodeForStackId(Id);
1356       // We should only have kept stack ids that had nodes.
1357       assert(CurNode);
1358 
1359       // Remove the context ids moved to NewNode from CurNode, and the
1360       // edge from the prior node.
1361       if (PrevNode) {
1362         auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1363         assert(PrevEdge);
1364         set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1365         if (PrevEdge->getContextIds().empty()) {
1366           PrevNode->eraseCallerEdge(PrevEdge);
1367           CurNode->eraseCalleeEdge(PrevEdge);
1368         }
1369       }
1370       // Since we update the edges from leaf to tail, only look at the callee
1371       // edges. This isn't an alloc node, so if there are no callee edges, the
1372       // alloc type is None.
1373       CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1374                                 ? (uint8_t)AllocationType::None
1375                                 : CurNode->computeAllocType();
1376       PrevNode = CurNode;
1377     }
1378     if (VerifyNodes) {
1379       checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1380       for (auto Id : Ids) {
1381         ContextNode *CurNode = getNodeForStackId(Id);
1382         // We should only have kept stack ids that had nodes.
1383         assert(CurNode);
1384         checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1385       }
1386     }
1387   }
1388 }
1389 
1390 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1391 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1392   // Map of stack id to all calls with that as the last (outermost caller)
1393   // callsite id that has a context node (some might not due to pruning
1394   // performed during matching of the allocation profile contexts).
1395   // The CallContextInfo contains the Call and a list of its stack ids with
1396   // ContextNodes, the function containing Call, and the set of context ids
1397   // the analysis will eventually identify for use in any new node created
1398   // for that callsite.
1399   DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1400   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1401     for (auto &Call : CallsWithMetadata) {
1402       // Ignore allocations, already handled.
1403       if (AllocationCallToContextNodeMap.count(Call))
1404         continue;
1405       auto StackIdsWithContextNodes =
1406           getStackIdsWithContextNodesForCall(Call.call());
1407       // If there were no nodes created for MIBs on allocs (maybe this was in
1408       // the unambiguous part of the MIB stack that was pruned), ignore.
1409       if (StackIdsWithContextNodes.empty())
1410         continue;
1411       // Otherwise, record this Call along with the list of ids for the last
1412       // (outermost caller) stack id with a node.
1413       StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1414           {Call.call(), StackIdsWithContextNodes, Func, {}});
1415     }
1416   }
1417 
1418   // First make a pass through all stack ids that correspond to a call,
1419   // as identified in the above loop. Compute the context ids corresponding to
1420   // each of these calls when they correspond to multiple stack ids due to
1421   // due to inlining. Perform any duplication of context ids required when
1422   // there is more than one call with the same stack ids. Their (possibly newly
1423   // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1424   DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1425   for (auto &It : StackIdToMatchingCalls) {
1426     auto &Calls = It.getSecond();
1427     // Skip single calls with a single stack id. These don't need a new node.
1428     if (Calls.size() == 1) {
1429       auto &Ids = std::get<1>(Calls[0]);
1430       if (Ids.size() == 1)
1431         continue;
1432     }
1433     // In order to do the best and maximal matching of inlined calls to context
1434     // node sequences we will sort the vectors of stack ids in descending order
1435     // of length, and within each length, lexicographically by stack id. The
1436     // latter is so that we can specially handle calls that have identical stack
1437     // id sequences (either due to cloning or artificially because of the MIB
1438     // context pruning).
1439     std::stable_sort(Calls.begin(), Calls.end(),
1440                      [](const CallContextInfo &A, const CallContextInfo &B) {
1441                        auto &IdsA = std::get<1>(A);
1442                        auto &IdsB = std::get<1>(B);
1443                        return IdsA.size() > IdsB.size() ||
1444                               (IdsA.size() == IdsB.size() && IdsA < IdsB);
1445                      });
1446 
1447     // Find the node for the last stack id, which should be the same
1448     // across all calls recorded for this id, and is the id for this
1449     // entry in the StackIdToMatchingCalls map.
1450     uint64_t LastId = It.getFirst();
1451     ContextNode *LastNode = getNodeForStackId(LastId);
1452     // We should only have kept stack ids that had nodes.
1453     assert(LastNode);
1454 
1455     if (LastNode->Recursive)
1456       continue;
1457 
1458     // Initialize the context ids with the last node's. We will subsequently
1459     // refine the context ids by computing the intersection along all edges.
1460     DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1461     assert(!LastNodeContextIds.empty());
1462 
1463     for (unsigned I = 0; I < Calls.size(); I++) {
1464       auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1465       assert(SavedContextIds.empty());
1466       assert(LastId == Ids.back());
1467 
1468       // First compute the context ids for this stack id sequence (the
1469       // intersection of the context ids of the corresponding nodes).
1470       // Start with the remaining saved ids for the last node.
1471       assert(!LastNodeContextIds.empty());
1472       DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1473 
1474       ContextNode *PrevNode = LastNode;
1475       ContextNode *CurNode = LastNode;
1476       bool Skip = false;
1477 
1478       // Iterate backwards through the stack Ids, starting after the last Id
1479       // in the list, which was handled once outside for all Calls.
1480       for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1481         auto Id = *IdIter;
1482         CurNode = getNodeForStackId(Id);
1483         // We should only have kept stack ids that had nodes.
1484         assert(CurNode);
1485 
1486         if (CurNode->Recursive) {
1487           Skip = true;
1488           break;
1489         }
1490 
1491         auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1492         // If there is no edge then the nodes belong to different MIB contexts,
1493         // and we should skip this inlined context sequence. For example, this
1494         // particular inlined context may include stack ids A->B, and we may
1495         // indeed have nodes for both A and B, but it is possible that they were
1496         // never profiled in sequence in a single MIB for any allocation (i.e.
1497         // we might have profiled an allocation that involves the callsite A,
1498         // but through a different one of its callee callsites, and we might
1499         // have profiled an allocation that involves callsite B, but reached
1500         // from a different caller callsite).
1501         if (!Edge) {
1502           Skip = true;
1503           break;
1504         }
1505         PrevNode = CurNode;
1506 
1507         // Update the context ids, which is the intersection of the ids along
1508         // all edges in the sequence.
1509         set_intersect(StackSequenceContextIds, Edge->getContextIds());
1510 
1511         // If we now have no context ids for clone, skip this call.
1512         if (StackSequenceContextIds.empty()) {
1513           Skip = true;
1514           break;
1515         }
1516       }
1517       if (Skip)
1518         continue;
1519 
1520       // If some of this call's stack ids did not have corresponding nodes (due
1521       // to pruning), don't include any context ids for contexts that extend
1522       // beyond these nodes. Otherwise we would be matching part of unrelated /
1523       // not fully matching stack contexts. To do this, subtract any context ids
1524       // found in caller nodes of the last node found above.
1525       if (Ids.back() != getLastStackId(Call)) {
1526         for (const auto &PE : LastNode->CallerEdges) {
1527           set_subtract(StackSequenceContextIds, PE->getContextIds());
1528           if (StackSequenceContextIds.empty())
1529             break;
1530         }
1531         // If we now have no context ids for clone, skip this call.
1532         if (StackSequenceContextIds.empty())
1533           continue;
1534       }
1535 
1536       // Check if the next set of stack ids is the same (since the Calls vector
1537       // of tuples is sorted by the stack ids we can just look at the next one).
1538       bool DuplicateContextIds = false;
1539       if (I + 1 < Calls.size()) {
1540         auto NextIds = std::get<1>(Calls[I + 1]);
1541         DuplicateContextIds = Ids == NextIds;
1542       }
1543 
1544       // If we don't have duplicate context ids, then we can assign all the
1545       // context ids computed for the original node sequence to this call.
1546       // If there are duplicate calls with the same stack ids then we synthesize
1547       // new context ids that are duplicates of the originals. These are
1548       // assigned to SavedContextIds, which is a reference into the map entry
1549       // for this call, allowing us to access these ids later on.
1550       OldToNewContextIds.reserve(OldToNewContextIds.size() +
1551                                  StackSequenceContextIds.size());
1552       SavedContextIds =
1553           DuplicateContextIds
1554               ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1555               : StackSequenceContextIds;
1556       assert(!SavedContextIds.empty());
1557 
1558       if (!DuplicateContextIds) {
1559         // Update saved last node's context ids to remove those that are
1560         // assigned to other calls, so that it is ready for the next call at
1561         // this stack id.
1562         set_subtract(LastNodeContextIds, StackSequenceContextIds);
1563         if (LastNodeContextIds.empty())
1564           break;
1565       }
1566     }
1567   }
1568 
1569   // Propagate the duplicate context ids over the graph.
1570   propagateDuplicateContextIds(OldToNewContextIds);
1571 
1572   if (VerifyCCG)
1573     check();
1574 
1575   // Now perform a post-order traversal over the graph, starting with the
1576   // allocation nodes, essentially processing nodes from callers to callees.
1577   // For any that contains an id in the map, update the graph to contain new
1578   // nodes representing any inlining at interior callsites. Note we move the
1579   // associated context ids over to the new nodes.
1580   DenseSet<const ContextNode *> Visited;
1581   for (auto &Entry : AllocationCallToContextNodeMap)
1582     assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1583   if (VerifyCCG)
1584     check();
1585 }
1586 
1587 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1588   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1589       Call->getMetadata(LLVMContext::MD_callsite));
1590   return CallsiteContext.back();
1591 }
1592 
1593 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1594   assert(isa<CallsiteInfo *>(Call.getBase()));
1595   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1596       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1597   // Need to convert index into stack id.
1598   return Index.getStackIdAtIndex(CallsiteContext.back());
1599 }
1600 
1601 static const std::string MemProfCloneSuffix = ".memprof.";
1602 
1603 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1604   // We use CloneNo == 0 to refer to the original version, which doesn't get
1605   // renamed with a suffix.
1606   if (!CloneNo)
1607     return Base.str();
1608   return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1609 }
1610 
1611 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1612                                                  const Instruction *Call,
1613                                                  unsigned CloneNo) const {
1614   return (Twine(Call->getFunction()->getName()) + " -> " +
1615           cast<CallBase>(Call)->getCalledFunction()->getName())
1616       .str();
1617 }
1618 
1619 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1620                                                 const IndexCall &Call,
1621                                                 unsigned CloneNo) const {
1622   auto VI = FSToVIMap.find(Func);
1623   assert(VI != FSToVIMap.end());
1624   if (isa<AllocInfo *>(Call.getBase()))
1625     return (VI->second.name() + " -> alloc").str();
1626   else {
1627     auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1628     return (VI->second.name() + " -> " +
1629             getMemProfFuncName(Callsite->Callee.name(),
1630                                Callsite->Clones[CloneNo]))
1631         .str();
1632   }
1633 }
1634 
1635 std::vector<uint64_t>
1636 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1637     Instruction *Call) {
1638   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1639       Call->getMetadata(LLVMContext::MD_callsite));
1640   return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1641       CallsiteContext);
1642 }
1643 
1644 std::vector<uint64_t>
1645 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1646   assert(isa<CallsiteInfo *>(Call.getBase()));
1647   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1648       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1649   return getStackIdsWithContextNodes<CallsiteInfo,
1650                                      SmallVector<unsigned>::const_iterator>(
1651       CallsiteContext);
1652 }
1653 
1654 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1655 template <class NodeT, class IteratorT>
1656 std::vector<uint64_t>
1657 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1658     CallStack<NodeT, IteratorT> &CallsiteContext) {
1659   std::vector<uint64_t> StackIds;
1660   for (auto IdOrIndex : CallsiteContext) {
1661     auto StackId = getStackId(IdOrIndex);
1662     ContextNode *Node = getNodeForStackId(StackId);
1663     if (!Node)
1664       break;
1665     StackIds.push_back(StackId);
1666   }
1667   return StackIds;
1668 }
1669 
1670 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1671     Module &M,
1672     llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1673     : Mod(M), OREGetter(OREGetter) {
1674   for (auto &F : M) {
1675     std::vector<CallInfo> CallsWithMetadata;
1676     for (auto &BB : F) {
1677       for (auto &I : BB) {
1678         if (!isa<CallBase>(I))
1679           continue;
1680         if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1681           CallsWithMetadata.push_back(&I);
1682           auto *AllocNode = addAllocNode(&I, &F);
1683           auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1684           assert(CallsiteMD);
1685           CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1686           // Add all of the MIBs and their stack nodes.
1687           for (auto &MDOp : MemProfMD->operands()) {
1688             auto *MIBMD = cast<const MDNode>(MDOp);
1689             MDNode *StackNode = getMIBStackNode(MIBMD);
1690             assert(StackNode);
1691             CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1692             addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1693                 AllocNode, StackContext, CallsiteContext,
1694                 getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD));
1695           }
1696           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1697           // Memprof and callsite metadata on memory allocations no longer
1698           // needed.
1699           I.setMetadata(LLVMContext::MD_memprof, nullptr);
1700           I.setMetadata(LLVMContext::MD_callsite, nullptr);
1701         }
1702         // For callsite metadata, add to list for this function for later use.
1703         else if (I.getMetadata(LLVMContext::MD_callsite))
1704           CallsWithMetadata.push_back(&I);
1705       }
1706     }
1707     if (!CallsWithMetadata.empty())
1708       FuncToCallsWithMetadata[&F] = CallsWithMetadata;
1709   }
1710 
1711   if (DumpCCG) {
1712     dbgs() << "CCG before updating call stack chains:\n";
1713     dbgs() << *this;
1714   }
1715 
1716   if (ExportToDot)
1717     exportToDot("prestackupdate");
1718 
1719   updateStackNodes();
1720 
1721   handleCallsitesWithMultipleTargets();
1722 
1723   // Strip off remaining callsite metadata, no longer needed.
1724   for (auto &FuncEntry : FuncToCallsWithMetadata)
1725     for (auto &Call : FuncEntry.second)
1726       Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1727 }
1728 
1729 IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1730     ModuleSummaryIndex &Index,
1731     llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1732         isPrevailing)
1733     : Index(Index), isPrevailing(isPrevailing) {
1734   for (auto &I : Index) {
1735     auto VI = Index.getValueInfo(I);
1736     for (auto &S : VI.getSummaryList()) {
1737       // We should only add the prevailing nodes. Otherwise we may try to clone
1738       // in a weak copy that won't be linked (and may be different than the
1739       // prevailing version).
1740       // We only keep the memprof summary on the prevailing copy now when
1741       // building the combined index, as a space optimization, however don't
1742       // rely on this optimization. The linker doesn't resolve local linkage
1743       // values so don't check whether those are prevailing.
1744       if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1745           !isPrevailing(VI.getGUID(), S.get()))
1746         continue;
1747       auto *FS = dyn_cast<FunctionSummary>(S.get());
1748       if (!FS)
1749         continue;
1750       std::vector<CallInfo> CallsWithMetadata;
1751       if (!FS->allocs().empty()) {
1752         for (auto &AN : FS->mutableAllocs()) {
1753           // This can happen because of recursion elimination handling that
1754           // currently exists in ModuleSummaryAnalysis. Skip these for now.
1755           // We still added them to the summary because we need to be able to
1756           // correlate properly in applyImport in the backends.
1757           if (AN.MIBs.empty())
1758             continue;
1759           CallsWithMetadata.push_back({&AN});
1760           auto *AllocNode = addAllocNode({&AN}, FS);
1761           // Pass an empty CallStack to the CallsiteContext (second)
1762           // parameter, since for ThinLTO we already collapsed out the inlined
1763           // stack ids on the allocation call during ModuleSummaryAnalysis.
1764           CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1765               EmptyContext;
1766           unsigned I = 0;
1767           assert(!MemProfReportHintedSizes ||
1768                  AN.TotalSizes.size() == AN.MIBs.size());
1769           // Now add all of the MIBs and their stack nodes.
1770           for (auto &MIB : AN.MIBs) {
1771             CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1772                 StackContext(&MIB);
1773             uint64_t TotalSize = 0;
1774             if (MemProfReportHintedSizes)
1775               TotalSize = AN.TotalSizes[I];
1776             addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1777                 AllocNode, StackContext, EmptyContext, MIB.AllocType,
1778                 TotalSize);
1779             I++;
1780           }
1781           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1782           // Initialize version 0 on the summary alloc node to the current alloc
1783           // type, unless it has both types in which case make it default, so
1784           // that in the case where we aren't able to clone the original version
1785           // always ends up with the default allocation behavior.
1786           AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1787         }
1788       }
1789       // For callsite metadata, add to list for this function for later use.
1790       if (!FS->callsites().empty())
1791         for (auto &SN : FS->mutableCallsites())
1792           CallsWithMetadata.push_back({&SN});
1793 
1794       if (!CallsWithMetadata.empty())
1795         FuncToCallsWithMetadata[FS] = CallsWithMetadata;
1796 
1797       if (!FS->allocs().empty() || !FS->callsites().empty())
1798         FSToVIMap[FS] = VI;
1799     }
1800   }
1801 
1802   if (DumpCCG) {
1803     dbgs() << "CCG before updating call stack chains:\n";
1804     dbgs() << *this;
1805   }
1806 
1807   if (ExportToDot)
1808     exportToDot("prestackupdate");
1809 
1810   updateStackNodes();
1811 
1812   handleCallsitesWithMultipleTargets();
1813 }
1814 
1815 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1816 void CallsiteContextGraph<DerivedCCG, FuncTy,
1817                           CallTy>::handleCallsitesWithMultipleTargets() {
1818   // Look for and workaround callsites that call multiple functions.
1819   // This can happen for indirect calls, which needs better handling, and in
1820   // more rare cases (e.g. macro expansion).
1821   // TODO: To fix this for indirect calls we will want to perform speculative
1822   // devirtualization using either the normal PGO info with ICP, or using the
1823   // information in the profiled MemProf contexts. We can do this prior to
1824   // this transformation for regular LTO, and for ThinLTO we can simulate that
1825   // effect in the summary and perform the actual speculative devirtualization
1826   // while cloning in the ThinLTO backend.
1827 
1828   // Keep track of the new nodes synthesized for discovered tail calls missing
1829   // from the profiled contexts.
1830   MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
1831 
1832   for (auto &Entry : NonAllocationCallToContextNodeMap) {
1833     auto *Node = Entry.second;
1834     assert(Node->Clones.empty());
1835     // Check all node callees and see if in the same function.
1836     auto Call = Node->Call.call();
1837     for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
1838          ++EI) {
1839       auto Edge = *EI;
1840       if (!Edge->Callee->hasCall())
1841         continue;
1842       assert(NodeToCallingFunc.count(Edge->Callee));
1843       // Check if the called function matches that of the callee node.
1844       if (calleesMatch(Call, EI, TailCallToContextNodeMap))
1845         continue;
1846       RemovedEdgesWithMismatchedCallees++;
1847       // Work around by setting Node to have a null call, so it gets
1848       // skipped during cloning. Otherwise assignFunctions will assert
1849       // because its data structures are not designed to handle this case.
1850       Node->setCall(CallInfo());
1851       break;
1852     }
1853   }
1854 
1855   // Remove all mismatched nodes identified in the above loop from the node map
1856   // (checking whether they have a null call which is set above). For a
1857   // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
1858   // to do the removal via remove_if than by individually erasing entries above.
1859   NonAllocationCallToContextNodeMap.remove_if(
1860       [](const auto &it) { return !it.second->hasCall(); });
1861 
1862   // Add the new nodes after the above loop so that the iteration is not
1863   // invalidated.
1864   for (auto &[Call, Node] : TailCallToContextNodeMap)
1865     NonAllocationCallToContextNodeMap[Call] = Node;
1866 }
1867 
1868 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1869   // In the Module (IR) case this is already the Id.
1870   return IdOrIndex;
1871 }
1872 
1873 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1874   // In the Index case this is an index into the stack id list in the summary
1875   // index, convert it to an Id.
1876   return Index.getStackIdAtIndex(IdOrIndex);
1877 }
1878 
1879 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1880 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
1881     CallTy Call, EdgeIter &EI,
1882     MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
1883   auto Edge = *EI;
1884   const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
1885   const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
1886   // Will be populated in order of callee to caller if we find a chain of tail
1887   // calls between the profiled caller and callee.
1888   std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
1889   if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
1890                          FoundCalleeChain))
1891     return false;
1892 
1893   // The usual case where the profiled callee matches that of the IR/summary.
1894   if (FoundCalleeChain.empty())
1895     return true;
1896 
1897   auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
1898     auto *CurEdge = Callee->findEdgeFromCaller(Caller);
1899     // If there is already an edge between these nodes, simply update it and
1900     // return.
1901     if (CurEdge) {
1902       CurEdge->ContextIds.insert(Edge->ContextIds.begin(),
1903                                  Edge->ContextIds.end());
1904       CurEdge->AllocTypes |= Edge->AllocTypes;
1905       return;
1906     }
1907     // Otherwise, create a new edge and insert it into the caller and callee
1908     // lists.
1909     auto NewEdge = std::make_shared<ContextEdge>(
1910         Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
1911     Callee->CallerEdges.push_back(NewEdge);
1912     if (Caller == Edge->Caller) {
1913       // If we are inserting the new edge into the current edge's caller, insert
1914       // the new edge before the current iterator position, and then increment
1915       // back to the current edge.
1916       EI = Caller->CalleeEdges.insert(EI, NewEdge);
1917       ++EI;
1918       assert(*EI == Edge &&
1919              "Iterator position not restored after insert and increment");
1920     } else
1921       Caller->CalleeEdges.push_back(NewEdge);
1922   };
1923 
1924   // Create new nodes for each found callee and connect in between the profiled
1925   // caller and callee.
1926   auto *CurCalleeNode = Edge->Callee;
1927   for (auto &[NewCall, Func] : FoundCalleeChain) {
1928     ContextNode *NewNode = nullptr;
1929     // First check if we have already synthesized a node for this tail call.
1930     if (TailCallToContextNodeMap.count(NewCall)) {
1931       NewNode = TailCallToContextNodeMap[NewCall];
1932       NewNode->AllocTypes |= Edge->AllocTypes;
1933     } else {
1934       FuncToCallsWithMetadata[Func].push_back({NewCall});
1935       // Create Node and record node info.
1936       NodeOwner.push_back(
1937           std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall));
1938       NewNode = NodeOwner.back().get();
1939       NodeToCallingFunc[NewNode] = Func;
1940       TailCallToContextNodeMap[NewCall] = NewNode;
1941       NewNode->AllocTypes = Edge->AllocTypes;
1942     }
1943 
1944     // Hook up node to its callee node
1945     AddEdge(NewNode, CurCalleeNode);
1946 
1947     CurCalleeNode = NewNode;
1948   }
1949 
1950   // Hook up edge's original caller to new callee node.
1951   AddEdge(Edge->Caller, CurCalleeNode);
1952 
1953   // Remove old edge
1954   Edge->Callee->eraseCallerEdge(Edge.get());
1955   EI = Edge->Caller->CalleeEdges.erase(EI);
1956 
1957   // To simplify the increment of EI in the caller, subtract one from EI.
1958   // In the final AddEdge call we would have either added a new callee edge,
1959   // to Edge->Caller, or found an existing one. Either way we are guaranteed
1960   // that there is at least one callee edge.
1961   assert(!Edge->Caller->CalleeEdges.empty());
1962   --EI;
1963 
1964   return true;
1965 }
1966 
1967 bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
1968     const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
1969     std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
1970     bool &FoundMultipleCalleeChains) {
1971   // Stop recursive search if we have already explored the maximum specified
1972   // depth.
1973   if (Depth > TailCallSearchDepth)
1974     return false;
1975 
1976   auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
1977     FoundCalleeChain.push_back({Callsite, F});
1978   };
1979 
1980   auto *CalleeFunc = dyn_cast<Function>(CurCallee);
1981   if (!CalleeFunc) {
1982     auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
1983     assert(Alias);
1984     CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
1985     assert(CalleeFunc);
1986   }
1987 
1988   // Look for tail calls in this function, and check if they either call the
1989   // profiled callee directly, or indirectly (via a recursive search).
1990   // Only succeed if there is a single unique tail call chain found between the
1991   // profiled caller and callee, otherwise we could perform incorrect cloning.
1992   bool FoundSingleCalleeChain = false;
1993   for (auto &BB : *CalleeFunc) {
1994     for (auto &I : BB) {
1995       auto *CB = dyn_cast<CallBase>(&I);
1996       if (!CB || !CB->isTailCall())
1997         continue;
1998       auto *CalledValue = CB->getCalledOperand();
1999       auto *CalledFunction = CB->getCalledFunction();
2000       if (CalledValue && !CalledFunction) {
2001         CalledValue = CalledValue->stripPointerCasts();
2002         // Stripping pointer casts can reveal a called function.
2003         CalledFunction = dyn_cast<Function>(CalledValue);
2004       }
2005       // Check if this is an alias to a function. If so, get the
2006       // called aliasee for the checks below.
2007       if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2008         assert(!CalledFunction &&
2009                "Expected null called function in callsite for alias");
2010         CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2011       }
2012       if (!CalledFunction)
2013         continue;
2014       if (CalledFunction == ProfiledCallee) {
2015         if (FoundSingleCalleeChain) {
2016           FoundMultipleCalleeChains = true;
2017           return false;
2018         }
2019         FoundSingleCalleeChain = true;
2020         FoundProfiledCalleeCount++;
2021         FoundProfiledCalleeDepth += Depth;
2022         if (Depth > FoundProfiledCalleeMaxDepth)
2023           FoundProfiledCalleeMaxDepth = Depth;
2024         SaveCallsiteInfo(&I, CalleeFunc);
2025       } else if (findProfiledCalleeThroughTailCalls(
2026                      ProfiledCallee, CalledFunction, Depth + 1,
2027                      FoundCalleeChain, FoundMultipleCalleeChains)) {
2028         // findProfiledCalleeThroughTailCalls should not have returned
2029         // true if FoundMultipleCalleeChains.
2030         assert(!FoundMultipleCalleeChains);
2031         if (FoundSingleCalleeChain) {
2032           FoundMultipleCalleeChains = true;
2033           return false;
2034         }
2035         FoundSingleCalleeChain = true;
2036         SaveCallsiteInfo(&I, CalleeFunc);
2037       } else if (FoundMultipleCalleeChains)
2038         return false;
2039     }
2040   }
2041 
2042   return FoundSingleCalleeChain;
2043 }
2044 
2045 bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2046     Instruction *Call, const Function *Func, const Function *CallerFunc,
2047     std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2048   auto *CB = dyn_cast<CallBase>(Call);
2049   if (!CB->getCalledOperand())
2050     return false;
2051   auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2052   auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2053   if (CalleeFunc == Func)
2054     return true;
2055   auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2056   if (Alias && Alias->getAliasee() == Func)
2057     return true;
2058 
2059   // Recursively search for the profiled callee through tail calls starting with
2060   // the actual Callee. The discovered tail call chain is saved in
2061   // FoundCalleeChain, and we will fixup the graph to include these callsites
2062   // after returning.
2063   // FIXME: We will currently redo the same recursive walk if we find the same
2064   // mismatched callee from another callsite. We can improve this with more
2065   // bookkeeping of the created chain of new nodes for each mismatch.
2066   unsigned Depth = 1;
2067   bool FoundMultipleCalleeChains = false;
2068   if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2069                                           FoundCalleeChain,
2070                                           FoundMultipleCalleeChains)) {
2071     LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2072                       << Func->getName() << " from " << CallerFunc->getName()
2073                       << " that actually called " << CalleeVal->getName()
2074                       << (FoundMultipleCalleeChains
2075                               ? " (found multiple possible chains)"
2076                               : "")
2077                       << "\n");
2078     if (FoundMultipleCalleeChains)
2079       FoundProfiledCalleeNonUniquelyCount++;
2080     return false;
2081   }
2082 
2083   return true;
2084 }
2085 
2086 bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2087     ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2088     std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2089     bool &FoundMultipleCalleeChains) {
2090   // Stop recursive search if we have already explored the maximum specified
2091   // depth.
2092   if (Depth > TailCallSearchDepth)
2093     return false;
2094 
2095   auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2096     // Make a CallsiteInfo for each discovered callee, if one hasn't already
2097     // been synthesized.
2098     if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2099         !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2100       // StackIds is empty (we don't have debug info available in the index for
2101       // these callsites)
2102       FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2103           std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2104     CallsiteInfo *NewCallsiteInfo =
2105         FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2106     FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2107   };
2108 
2109   // Look for tail calls in this function, and check if they either call the
2110   // profiled callee directly, or indirectly (via a recursive search).
2111   // Only succeed if there is a single unique tail call chain found between the
2112   // profiled caller and callee, otherwise we could perform incorrect cloning.
2113   bool FoundSingleCalleeChain = false;
2114   for (auto &S : CurCallee.getSummaryList()) {
2115     if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2116         !isPrevailing(CurCallee.getGUID(), S.get()))
2117       continue;
2118     auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2119     if (!FS)
2120       continue;
2121     auto FSVI = CurCallee;
2122     auto *AS = dyn_cast<AliasSummary>(S.get());
2123     if (AS)
2124       FSVI = AS->getAliaseeVI();
2125     for (auto &CallEdge : FS->calls()) {
2126       if (!CallEdge.second.hasTailCall())
2127         continue;
2128       if (CallEdge.first == ProfiledCallee) {
2129         if (FoundSingleCalleeChain) {
2130           FoundMultipleCalleeChains = true;
2131           return false;
2132         }
2133         FoundSingleCalleeChain = true;
2134         FoundProfiledCalleeCount++;
2135         FoundProfiledCalleeDepth += Depth;
2136         if (Depth > FoundProfiledCalleeMaxDepth)
2137           FoundProfiledCalleeMaxDepth = Depth;
2138         CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2139         // Add FS to FSToVIMap  in case it isn't already there.
2140         assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2141         FSToVIMap[FS] = FSVI;
2142       } else if (findProfiledCalleeThroughTailCalls(
2143                      ProfiledCallee, CallEdge.first, Depth + 1,
2144                      FoundCalleeChain, FoundMultipleCalleeChains)) {
2145         // findProfiledCalleeThroughTailCalls should not have returned
2146         // true if FoundMultipleCalleeChains.
2147         assert(!FoundMultipleCalleeChains);
2148         if (FoundSingleCalleeChain) {
2149           FoundMultipleCalleeChains = true;
2150           return false;
2151         }
2152         FoundSingleCalleeChain = true;
2153         CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2154         // Add FS to FSToVIMap  in case it isn't already there.
2155         assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2156         FSToVIMap[FS] = FSVI;
2157       } else if (FoundMultipleCalleeChains)
2158         return false;
2159     }
2160   }
2161 
2162   return FoundSingleCalleeChain;
2163 }
2164 
2165 bool IndexCallsiteContextGraph::calleeMatchesFunc(
2166     IndexCall &Call, const FunctionSummary *Func,
2167     const FunctionSummary *CallerFunc,
2168     std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2169   ValueInfo Callee =
2170       dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
2171   // If there is no summary list then this is a call to an externally defined
2172   // symbol.
2173   AliasSummary *Alias =
2174       Callee.getSummaryList().empty()
2175           ? nullptr
2176           : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2177   assert(FSToVIMap.count(Func));
2178   auto FuncVI = FSToVIMap[Func];
2179   if (Callee == FuncVI ||
2180       // If callee is an alias, check the aliasee, since only function
2181       // summary base objects will contain the stack node summaries and thus
2182       // get a context node.
2183       (Alias && Alias->getAliaseeVI() == FuncVI))
2184     return true;
2185 
2186   // Recursively search for the profiled callee through tail calls starting with
2187   // the actual Callee. The discovered tail call chain is saved in
2188   // FoundCalleeChain, and we will fixup the graph to include these callsites
2189   // after returning.
2190   // FIXME: We will currently redo the same recursive walk if we find the same
2191   // mismatched callee from another callsite. We can improve this with more
2192   // bookkeeping of the created chain of new nodes for each mismatch.
2193   unsigned Depth = 1;
2194   bool FoundMultipleCalleeChains = false;
2195   if (!findProfiledCalleeThroughTailCalls(
2196           FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2197     LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2198                       << " from " << FSToVIMap[CallerFunc]
2199                       << " that actually called " << Callee
2200                       << (FoundMultipleCalleeChains
2201                               ? " (found multiple possible chains)"
2202                               : "")
2203                       << "\n");
2204     if (FoundMultipleCalleeChains)
2205       FoundProfiledCalleeNonUniquelyCount++;
2206     return false;
2207   }
2208 
2209   return true;
2210 }
2211 
2212 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2213 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2214     const {
2215   print(dbgs());
2216   dbgs() << "\n";
2217 }
2218 
2219 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2220 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2221     raw_ostream &OS) const {
2222   OS << "Node " << this << "\n";
2223   OS << "\t";
2224   printCall(OS);
2225   if (Recursive)
2226     OS << " (recursive)";
2227   OS << "\n";
2228   OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2229   OS << "\tContextIds:";
2230   // Make a copy of the computed context ids that we can sort for stability.
2231   auto ContextIds = getContextIds();
2232   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2233   std::sort(SortedIds.begin(), SortedIds.end());
2234   for (auto Id : SortedIds)
2235     OS << " " << Id;
2236   OS << "\n";
2237   OS << "\tCalleeEdges:\n";
2238   for (auto &Edge : CalleeEdges)
2239     OS << "\t\t" << *Edge << "\n";
2240   OS << "\tCallerEdges:\n";
2241   for (auto &Edge : CallerEdges)
2242     OS << "\t\t" << *Edge << "\n";
2243   if (!Clones.empty()) {
2244     OS << "\tClones: ";
2245     FieldSeparator FS;
2246     for (auto *Clone : Clones)
2247       OS << FS << Clone;
2248     OS << "\n";
2249   } else if (CloneOf) {
2250     OS << "\tClone of " << CloneOf << "\n";
2251   }
2252 }
2253 
2254 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2255 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2256     const {
2257   print(dbgs());
2258   dbgs() << "\n";
2259 }
2260 
2261 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2262 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2263     raw_ostream &OS) const {
2264   OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2265      << " AllocTypes: " << getAllocTypeString(AllocTypes);
2266   OS << " ContextIds:";
2267   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2268   std::sort(SortedIds.begin(), SortedIds.end());
2269   for (auto Id : SortedIds)
2270     OS << " " << Id;
2271 }
2272 
2273 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2274 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
2275   print(dbgs());
2276 }
2277 
2278 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2279 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
2280     raw_ostream &OS) const {
2281   OS << "Callsite Context Graph:\n";
2282   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2283   for (const auto Node : nodes<GraphType>(this)) {
2284     if (Node->isRemoved())
2285       continue;
2286     Node->print(OS);
2287     OS << "\n";
2288   }
2289 }
2290 
2291 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2292 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
2293     raw_ostream &OS) const {
2294   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2295   for (const auto Node : nodes<GraphType>(this)) {
2296     if (Node->isRemoved())
2297       continue;
2298     if (!Node->IsAllocation)
2299       continue;
2300     DenseSet<uint32_t> ContextIds = Node->getContextIds();
2301     std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2302     std::sort(SortedIds.begin(), SortedIds.end());
2303     for (auto Id : SortedIds) {
2304       auto SizeI = ContextIdToTotalSize.find(Id);
2305       assert(SizeI != ContextIdToTotalSize.end());
2306       auto TypeI = ContextIdToAllocationType.find(Id);
2307       assert(TypeI != ContextIdToAllocationType.end());
2308       OS << getAllocTypeString((uint8_t)TypeI->second) << " context " << Id
2309          << " with total size " << SizeI->second << " is "
2310          << getAllocTypeString(Node->AllocTypes) << " after cloning\n";
2311     }
2312   }
2313 }
2314 
2315 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2316 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
2317   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2318   for (const auto Node : nodes<GraphType>(this)) {
2319     checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2320     for (auto &Edge : Node->CallerEdges)
2321       checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
2322   }
2323 }
2324 
2325 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2326 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
2327   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2328   using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
2329 
2330   using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
2331   static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
2332 
2333   using nodes_iterator =
2334       mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
2335                       decltype(&getNode)>;
2336 
2337   static nodes_iterator nodes_begin(GraphType G) {
2338     return nodes_iterator(G->NodeOwner.begin(), &getNode);
2339   }
2340 
2341   static nodes_iterator nodes_end(GraphType G) {
2342     return nodes_iterator(G->NodeOwner.end(), &getNode);
2343   }
2344 
2345   static NodeRef getEntryNode(GraphType G) {
2346     return G->NodeOwner.begin()->get();
2347   }
2348 
2349   using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
2350   static const ContextNode<DerivedCCG, FuncTy, CallTy> *
2351   GetCallee(const EdgePtrTy &P) {
2352     return P->Callee;
2353   }
2354 
2355   using ChildIteratorType =
2356       mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
2357                           DerivedCCG, FuncTy, CallTy>>>::const_iterator,
2358                       decltype(&GetCallee)>;
2359 
2360   static ChildIteratorType child_begin(NodeRef N) {
2361     return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
2362   }
2363 
2364   static ChildIteratorType child_end(NodeRef N) {
2365     return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
2366   }
2367 };
2368 
2369 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2370 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
2371     : public DefaultDOTGraphTraits {
2372   DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
2373 
2374   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
2375   using GTraits = GraphTraits<GraphType>;
2376   using NodeRef = typename GTraits::NodeRef;
2377   using ChildIteratorType = typename GTraits::ChildIteratorType;
2378 
2379   static std::string getNodeLabel(NodeRef Node, GraphType G) {
2380     std::string LabelString =
2381         (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
2382          Twine(Node->OrigStackOrAllocId))
2383             .str();
2384     LabelString += "\n";
2385     if (Node->hasCall()) {
2386       auto Func = G->NodeToCallingFunc.find(Node);
2387       assert(Func != G->NodeToCallingFunc.end());
2388       LabelString +=
2389           G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
2390     } else {
2391       LabelString += "null call";
2392       if (Node->Recursive)
2393         LabelString += " (recursive)";
2394       else
2395         LabelString += " (external)";
2396     }
2397     return LabelString;
2398   }
2399 
2400   static std::string getNodeAttributes(NodeRef Node, GraphType) {
2401     std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
2402                                    getContextIds(Node->getContextIds()) + "\"")
2403                                       .str();
2404     AttributeString +=
2405         (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
2406     AttributeString += ",style=\"filled\"";
2407     if (Node->CloneOf) {
2408       AttributeString += ",color=\"blue\"";
2409       AttributeString += ",style=\"filled,bold,dashed\"";
2410     } else
2411       AttributeString += ",style=\"filled\"";
2412     return AttributeString;
2413   }
2414 
2415   static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
2416                                        GraphType) {
2417     auto &Edge = *(ChildIter.getCurrent());
2418     return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
2419             Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
2420         .str();
2421   }
2422 
2423   // Since the NodeOwners list includes nodes that are no longer connected to
2424   // the graph, skip them here.
2425   static bool isNodeHidden(NodeRef Node, GraphType) {
2426     return Node->isRemoved();
2427   }
2428 
2429 private:
2430   static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
2431     std::string IdString = "ContextIds:";
2432     if (ContextIds.size() < 100) {
2433       std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2434       std::sort(SortedIds.begin(), SortedIds.end());
2435       for (auto Id : SortedIds)
2436         IdString += (" " + Twine(Id)).str();
2437     } else {
2438       IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
2439     }
2440     return IdString;
2441   }
2442 
2443   static std::string getColor(uint8_t AllocTypes) {
2444     if (AllocTypes == (uint8_t)AllocationType::NotCold)
2445       // Color "brown1" actually looks like a lighter red.
2446       return "brown1";
2447     if (AllocTypes == (uint8_t)AllocationType::Cold)
2448       return "cyan";
2449     if (AllocTypes ==
2450         ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
2451       // Lighter purple.
2452       return "mediumorchid1";
2453     return "gray";
2454   }
2455 
2456   static std::string getNodeId(NodeRef Node) {
2457     std::stringstream SStream;
2458     SStream << std::hex << "N0x" << (unsigned long long)Node;
2459     std::string Result = SStream.str();
2460     return Result;
2461   }
2462 };
2463 
2464 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2465 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
2466     std::string Label) const {
2467   WriteGraph(this, "", false, Label,
2468              DotFilePathPrefix + "ccg." + Label + ".dot");
2469 }
2470 
2471 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2472 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
2473 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
2474     const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI,
2475     DenseSet<uint32_t> ContextIdsToMove) {
2476   ContextNode *Node = Edge->Callee;
2477   NodeOwner.push_back(
2478       std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
2479   ContextNode *Clone = NodeOwner.back().get();
2480   Node->addClone(Clone);
2481   assert(NodeToCallingFunc.count(Node));
2482   NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
2483   moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true,
2484                                 ContextIdsToMove);
2485   return Clone;
2486 }
2487 
2488 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2489 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2490     moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
2491                                   ContextNode *NewCallee, EdgeIter *CallerEdgeI,
2492                                   bool NewClone,
2493                                   DenseSet<uint32_t> ContextIdsToMove) {
2494   // NewCallee and Edge's current callee must be clones of the same original
2495   // node (Edge's current callee may be the original node too).
2496   assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
2497 
2498   ContextNode *OldCallee = Edge->Callee;
2499 
2500   // We might already have an edge to the new callee from earlier cloning for a
2501   // different allocation. If one exists we will reuse it.
2502   auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
2503 
2504   // Callers will pass an empty ContextIdsToMove set when they want to move the
2505   // edge. Copy in Edge's ids for simplicity.
2506   if (ContextIdsToMove.empty())
2507     ContextIdsToMove = Edge->getContextIds();
2508 
2509   // If we are moving all of Edge's ids, then just move the whole Edge.
2510   // Otherwise only move the specified subset, to a new edge if needed.
2511   if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
2512     // Moving the whole Edge.
2513     if (CallerEdgeI)
2514       *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
2515     else
2516       OldCallee->eraseCallerEdge(Edge.get());
2517     if (ExistingEdgeToNewCallee) {
2518       // Since we already have an edge to NewCallee, simply move the ids
2519       // onto it, and remove the existing Edge.
2520       ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2521                                                       ContextIdsToMove.end());
2522       ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
2523       assert(Edge->ContextIds == ContextIdsToMove);
2524       Edge->ContextIds.clear();
2525       Edge->AllocTypes = (uint8_t)AllocationType::None;
2526       Edge->Caller->eraseCalleeEdge(Edge.get());
2527     } else {
2528       // Otherwise just reconnect Edge to NewCallee.
2529       Edge->Callee = NewCallee;
2530       NewCallee->CallerEdges.push_back(Edge);
2531       // Don't need to update Edge's context ids since we are simply
2532       // reconnecting it.
2533     }
2534     // In either case, need to update the alloc types on New Callee.
2535     NewCallee->AllocTypes |= Edge->AllocTypes;
2536   } else {
2537     // Only moving a subset of Edge's ids.
2538     if (CallerEdgeI)
2539       ++CallerEdgeI;
2540     // Compute the alloc type of the subset of ids being moved.
2541     auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
2542     if (ExistingEdgeToNewCallee) {
2543       // Since we already have an edge to NewCallee, simply move the ids
2544       // onto it.
2545       ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(),
2546                                                       ContextIdsToMove.end());
2547       ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
2548     } else {
2549       // Otherwise, create a new edge to NewCallee for the ids being moved.
2550       auto NewEdge = std::make_shared<ContextEdge>(
2551           NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
2552       Edge->Caller->CalleeEdges.push_back(NewEdge);
2553       NewCallee->CallerEdges.push_back(NewEdge);
2554     }
2555     // In either case, need to update the alloc types on NewCallee, and remove
2556     // those ids and update the alloc type on the original Edge.
2557     NewCallee->AllocTypes |= CallerEdgeAllocType;
2558     set_subtract(Edge->ContextIds, ContextIdsToMove);
2559     Edge->AllocTypes = computeAllocType(Edge->ContextIds);
2560   }
2561   // Now walk the old callee node's callee edges and move Edge's context ids
2562   // over to the corresponding edge into the clone (which is created here if
2563   // this is a newly created clone).
2564   for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2565     // The context ids moving to the new callee are the subset of this edge's
2566     // context ids and the context ids on the caller edge being moved.
2567     DenseSet<uint32_t> EdgeContextIdsToMove =
2568         set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
2569     set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2570     OldCalleeEdge->AllocTypes =
2571         computeAllocType(OldCalleeEdge->getContextIds());
2572     if (!NewClone) {
2573       // Update context ids / alloc type on corresponding edge to NewCallee.
2574       // There is a chance this may not exist if we are reusing an existing
2575       // clone, specifically during function assignment, where we would have
2576       // removed none type edges after creating the clone. If we can't find
2577       // a corresponding edge there, fall through to the cloning below.
2578       if (auto *NewCalleeEdge =
2579               NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2580         NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2581                                               EdgeContextIdsToMove.end());
2582         NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2583         continue;
2584       }
2585     }
2586     auto NewEdge = std::make_shared<ContextEdge>(
2587         OldCalleeEdge->Callee, NewCallee,
2588         computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2589     NewCallee->CalleeEdges.push_back(NewEdge);
2590     NewEdge->Callee->CallerEdges.push_back(NewEdge);
2591   }
2592   // Recompute the node alloc type now that its callee edges have been
2593   // updated (since we will compute from those edges).
2594   OldCallee->AllocTypes = OldCallee->computeAllocType();
2595   // OldCallee alloc type should be None iff its context id set is now empty.
2596   assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2597          OldCallee->emptyContextIds());
2598   if (VerifyCCG) {
2599     checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2600     checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2601     for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2602       checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2603                                             /*CheckEdges=*/false);
2604     for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2605       checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2606                                             /*CheckEdges=*/false);
2607   }
2608 }
2609 
2610 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2611 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2612     recursivelyRemoveNoneTypeCalleeEdges(
2613         ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2614   auto Inserted = Visited.insert(Node);
2615   if (!Inserted.second)
2616     return;
2617 
2618   removeNoneTypeCalleeEdges(Node);
2619 
2620   for (auto *Clone : Node->Clones)
2621     recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
2622 
2623   // The recursive call may remove some of this Node's caller edges.
2624   // Iterate over a copy and skip any that were removed.
2625   auto CallerEdges = Node->CallerEdges;
2626   for (auto &Edge : CallerEdges) {
2627     // Skip any that have been removed by an earlier recursive call.
2628     if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2629       assert(!is_contained(Node->CallerEdges, Edge));
2630       continue;
2631     }
2632     recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
2633   }
2634 }
2635 
2636 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2637 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2638   DenseSet<const ContextNode *> Visited;
2639   for (auto &Entry : AllocationCallToContextNodeMap) {
2640     Visited.clear();
2641     identifyClones(Entry.second, Visited, Entry.second->getContextIds());
2642   }
2643   Visited.clear();
2644   for (auto &Entry : AllocationCallToContextNodeMap)
2645     recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
2646   if (VerifyCCG)
2647     check();
2648 }
2649 
2650 // helper function to check an AllocType is cold or notcold or both.
2651 bool checkColdOrNotCold(uint8_t AllocType) {
2652   return (AllocType == (uint8_t)AllocationType::Cold) ||
2653          (AllocType == (uint8_t)AllocationType::NotCold) ||
2654          (AllocType ==
2655           ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2656 }
2657 
2658 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2659 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2660     ContextNode *Node, DenseSet<const ContextNode *> &Visited,
2661     const DenseSet<uint32_t> &AllocContextIds) {
2662   if (VerifyNodes)
2663     checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2664   assert(!Node->CloneOf);
2665 
2666   // If Node as a null call, then either it wasn't found in the module (regular
2667   // LTO) or summary index (ThinLTO), or there were other conditions blocking
2668   // cloning (e.g. recursion, calls multiple targets, etc).
2669   // Do this here so that we don't try to recursively clone callers below, which
2670   // isn't useful at least for this node.
2671   if (!Node->hasCall())
2672     return;
2673 
2674 #ifndef NDEBUG
2675   auto Insert =
2676 #endif
2677       Visited.insert(Node);
2678   // We should not have visited this node yet.
2679   assert(Insert.second);
2680   // The recursive call to identifyClones may delete the current edge from the
2681   // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2682   // in an iterator and having recursive call erase from it. Other edges may
2683   // also get removed during the recursion, which will have null Callee and
2684   // Caller pointers (and are deleted later), so we skip those below.
2685   {
2686     auto CallerEdges = Node->CallerEdges;
2687     for (auto &Edge : CallerEdges) {
2688       // Skip any that have been removed by an earlier recursive call.
2689       if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2690         assert(!llvm::count(Node->CallerEdges, Edge));
2691         continue;
2692       }
2693       // Ignore any caller we previously visited via another edge.
2694       if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2695         identifyClones(Edge->Caller, Visited, AllocContextIds);
2696       }
2697     }
2698   }
2699 
2700   // Check if we reached an unambiguous call or have have only a single caller.
2701   if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2702     return;
2703 
2704   // We need to clone.
2705 
2706   // Try to keep the original version as alloc type NotCold. This will make
2707   // cases with indirect calls or any other situation with an unknown call to
2708   // the original function get the default behavior. We do this by sorting the
2709   // CallerEdges of the Node we will clone by alloc type.
2710   //
2711   // Give NotCold edge the lowest sort priority so those edges are at the end of
2712   // the caller edges vector, and stay on the original version (since the below
2713   // code clones greedily until it finds all remaining edges have the same type
2714   // and leaves the remaining ones on the original Node).
2715   //
2716   // We shouldn't actually have any None type edges, so the sorting priority for
2717   // that is arbitrary, and we assert in that case below.
2718   const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2719                                                /*Cold*/ 1,
2720                                                /*NotColdCold*/ 2};
2721   std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2722                    [&](const std::shared_ptr<ContextEdge> &A,
2723                        const std::shared_ptr<ContextEdge> &B) {
2724                      // Nodes with non-empty context ids should be sorted before
2725                      // those with empty context ids.
2726                      if (A->ContextIds.empty())
2727                        // Either B ContextIds are non-empty (in which case we
2728                        // should return false because B < A), or B ContextIds
2729                        // are empty, in which case they are equal, and we should
2730                        // maintain the original relative ordering.
2731                        return false;
2732                      if (B->ContextIds.empty())
2733                        return true;
2734 
2735                      if (A->AllocTypes == B->AllocTypes)
2736                        // Use the first context id for each edge as a
2737                        // tie-breaker.
2738                        return *A->ContextIds.begin() < *B->ContextIds.begin();
2739                      return AllocTypeCloningPriority[A->AllocTypes] <
2740                             AllocTypeCloningPriority[B->AllocTypes];
2741                    });
2742 
2743   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2744 
2745   // Iterate until we find no more opportunities for disambiguating the alloc
2746   // types via cloning. In most cases this loop will terminate once the Node
2747   // has a single allocation type, in which case no more cloning is needed.
2748   // We need to be able to remove Edge from CallerEdges, so need to adjust
2749   // iterator inside the loop.
2750   for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2751     auto CallerEdge = *EI;
2752 
2753     // See if cloning the prior caller edge left this node with a single alloc
2754     // type or a single caller. In that case no more cloning of Node is needed.
2755     if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2756       break;
2757 
2758     // Only need to process the ids along this edge pertaining to the given
2759     // allocation.
2760     auto CallerEdgeContextsForAlloc =
2761         set_intersection(CallerEdge->getContextIds(), AllocContextIds);
2762     if (CallerEdgeContextsForAlloc.empty()) {
2763       ++EI;
2764       continue;
2765     }
2766     auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
2767 
2768     // Compute the node callee edge alloc types corresponding to the context ids
2769     // for this caller edge.
2770     std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2771     CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2772     for (auto &CalleeEdge : Node->CalleeEdges)
2773       CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2774           CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
2775 
2776     // Don't clone if doing so will not disambiguate any alloc types amongst
2777     // caller edges (including the callee edges that would be cloned).
2778     // Otherwise we will simply move all edges to the clone.
2779     //
2780     // First check if by cloning we will disambiguate the caller allocation
2781     // type from node's allocation type. Query allocTypeToUse so that we don't
2782     // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2783     // neither of these should be None type.
2784     //
2785     // Then check if by cloning node at least one of the callee edges will be
2786     // disambiguated by splitting out different context ids.
2787     assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2788     assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2789     if (allocTypeToUse(CallerAllocTypeForAlloc) ==
2790             allocTypeToUse(Node->AllocTypes) &&
2791         allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2792             CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2793       ++EI;
2794       continue;
2795     }
2796 
2797     // First see if we can use an existing clone. Check each clone and its
2798     // callee edges for matching alloc types.
2799     ContextNode *Clone = nullptr;
2800     for (auto *CurClone : Node->Clones) {
2801       if (allocTypeToUse(CurClone->AllocTypes) !=
2802           allocTypeToUse(CallerAllocTypeForAlloc))
2803         continue;
2804 
2805       if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2806               CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2807         continue;
2808       Clone = CurClone;
2809       break;
2810     }
2811 
2812     // The edge iterator is adjusted when we move the CallerEdge to the clone.
2813     if (Clone)
2814       moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false,
2815                                     CallerEdgeContextsForAlloc);
2816     else
2817       Clone =
2818           moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc);
2819 
2820     assert(EI == Node->CallerEdges.end() ||
2821            Node->AllocTypes != (uint8_t)AllocationType::None);
2822     // Sanity check that no alloc types on clone or its edges are None.
2823     assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2824   }
2825 
2826   // We should still have some context ids on the original Node.
2827   assert(!Node->emptyContextIds());
2828 
2829   // Sanity check that no alloc types on node or edges are None.
2830   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2831 
2832   if (VerifyNodes)
2833     checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2834 }
2835 
2836 void ModuleCallsiteContextGraph::updateAllocationCall(
2837     CallInfo &Call, AllocationType AllocType) {
2838   std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2839   auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2840                                 "memprof", AllocTypeString);
2841   cast<CallBase>(Call.call())->addFnAttr(A);
2842   OREGetter(Call.call()->getFunction())
2843       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2844             << ore::NV("AllocationCall", Call.call()) << " in clone "
2845             << ore::NV("Caller", Call.call()->getFunction())
2846             << " marked with memprof allocation attribute "
2847             << ore::NV("Attribute", AllocTypeString));
2848 }
2849 
2850 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2851                                                      AllocationType AllocType) {
2852   auto *AI = Call.call().dyn_cast<AllocInfo *>();
2853   assert(AI);
2854   assert(AI->Versions.size() > Call.cloneNo());
2855   AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2856 }
2857 
2858 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2859                                             FuncInfo CalleeFunc) {
2860   if (CalleeFunc.cloneNo() > 0)
2861     cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2862   OREGetter(CallerCall.call()->getFunction())
2863       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2864             << ore::NV("Call", CallerCall.call()) << " in clone "
2865             << ore::NV("Caller", CallerCall.call()->getFunction())
2866             << " assigned to call function clone "
2867             << ore::NV("Callee", CalleeFunc.func()));
2868 }
2869 
2870 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2871                                            FuncInfo CalleeFunc) {
2872   auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2873   assert(CI &&
2874          "Caller cannot be an allocation which should not have profiled calls");
2875   assert(CI->Clones.size() > CallerCall.cloneNo());
2876   CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2877 }
2878 
2879 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2880                      Instruction *>::FuncInfo
2881 ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2882     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2883     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2884   // Use existing LLVM facilities for cloning and obtaining Call in clone
2885   ValueToValueMapTy VMap;
2886   auto *NewFunc = CloneFunction(Func.func(), VMap);
2887   std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2888   assert(!Func.func()->getParent()->getFunction(Name));
2889   NewFunc->setName(Name);
2890   for (auto &Inst : CallsWithMetadataInFunc) {
2891     // This map always has the initial version in it.
2892     assert(Inst.cloneNo() == 0);
2893     CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2894   }
2895   OREGetter(Func.func())
2896       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2897             << "created clone " << ore::NV("NewFunction", NewFunc));
2898   return {NewFunc, CloneNo};
2899 }
2900 
2901 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2902                      IndexCall>::FuncInfo
2903 IndexCallsiteContextGraph::cloneFunctionForCallsite(
2904     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2905     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2906   // Check how many clones we have of Call (and therefore function).
2907   // The next clone number is the current size of versions array.
2908   // Confirm this matches the CloneNo provided by the caller, which is based on
2909   // the number of function clones we have.
2910   assert(CloneNo ==
2911          (Call.call().is<AllocInfo *>()
2912               ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2913               : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2914   // Walk all the instructions in this function. Create a new version for
2915   // each (by adding an entry to the Versions/Clones summary array), and copy
2916   // over the version being called for the function clone being cloned here.
2917   // Additionally, add an entry to the CallMap for the new function clone,
2918   // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2919   // to the new call clone.
2920   for (auto &Inst : CallsWithMetadataInFunc) {
2921     // This map always has the initial version in it.
2922     assert(Inst.cloneNo() == 0);
2923     if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2924       assert(AI->Versions.size() == CloneNo);
2925       // We assign the allocation type later (in updateAllocationCall), just add
2926       // an entry for it here.
2927       AI->Versions.push_back(0);
2928     } else {
2929       auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2930       assert(CI && CI->Clones.size() == CloneNo);
2931       // We assign the clone number later (in updateCall), just add an entry for
2932       // it here.
2933       CI->Clones.push_back(0);
2934     }
2935     CallMap[Inst] = {Inst.call(), CloneNo};
2936   }
2937   return {Func.func(), CloneNo};
2938 }
2939 
2940 // This method assigns cloned callsites to functions, cloning the functions as
2941 // needed. The assignment is greedy and proceeds roughly as follows:
2942 //
2943 // For each function Func:
2944 //   For each call with graph Node having clones:
2945 //     Initialize ClonesWorklist to Node and its clones
2946 //     Initialize NodeCloneCount to 0
2947 //     While ClonesWorklist is not empty:
2948 //        Clone = pop front ClonesWorklist
2949 //        NodeCloneCount++
2950 //        If Func has been cloned less than NodeCloneCount times:
2951 //           If NodeCloneCount is 1:
2952 //             Assign Clone to original Func
2953 //             Continue
2954 //           Create a new function clone
2955 //           If other callers not assigned to call a function clone yet:
2956 //              Assign them to call new function clone
2957 //              Continue
2958 //           Assign any other caller calling the cloned version to new clone
2959 //
2960 //        For each caller of Clone:
2961 //           If caller is assigned to call a specific function clone:
2962 //             If we cannot assign Clone to that function clone:
2963 //               Create new callsite Clone NewClone
2964 //               Add NewClone to ClonesWorklist
2965 //               Continue
2966 //             Assign Clone to existing caller's called function clone
2967 //           Else:
2968 //             If Clone not already assigned to a function clone:
2969 //                Assign to first function clone without assignment
2970 //             Assign caller to selected function clone
2971 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2972 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2973   bool Changed = false;
2974 
2975   // Keep track of the assignment of nodes (callsites) to function clones they
2976   // call.
2977   DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2978 
2979   // Update caller node to call function version CalleeFunc, by recording the
2980   // assignment in CallsiteToCalleeFuncCloneMap.
2981   auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2982                                         const FuncInfo &CalleeFunc) {
2983     assert(Caller->hasCall());
2984     CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2985   };
2986 
2987   // Walk all functions for which we saw calls with memprof metadata, and handle
2988   // cloning for each of its calls.
2989   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2990     FuncInfo OrigFunc(Func);
2991     // Map from each clone of OrigFunc to a map of remappings of each call of
2992     // interest (from original uncloned call to the corresponding cloned call in
2993     // that function clone).
2994     std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2995     for (auto &Call : CallsWithMetadata) {
2996       ContextNode *Node = getNodeForInst(Call);
2997       // Skip call if we do not have a node for it (all uses of its stack ids
2998       // were either on inlined chains or pruned from the MIBs), or if we did
2999       // not create any clones for it.
3000       if (!Node || Node->Clones.empty())
3001         continue;
3002       assert(Node->hasCall() &&
3003              "Not having a call should have prevented cloning");
3004 
3005       // Track the assignment of function clones to clones of the current
3006       // callsite Node being handled.
3007       std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
3008 
3009       // Assign callsite version CallsiteClone to function version FuncClone,
3010       // and also assign (possibly cloned) Call to CallsiteClone.
3011       auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
3012                                                 CallInfo &Call,
3013                                                 ContextNode *CallsiteClone,
3014                                                 bool IsAlloc) {
3015         // Record the clone of callsite node assigned to this function clone.
3016         FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
3017 
3018         assert(FuncClonesToCallMap.count(FuncClone));
3019         std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
3020         CallInfo CallClone(Call);
3021         if (CallMap.count(Call))
3022           CallClone = CallMap[Call];
3023         CallsiteClone->setCall(CallClone);
3024       };
3025 
3026       // Keep track of the clones of callsite Node that need to be assigned to
3027       // function clones. This list may be expanded in the loop body below if we
3028       // find additional cloning is required.
3029       std::deque<ContextNode *> ClonesWorklist;
3030       // Ignore original Node if we moved all of its contexts to clones.
3031       if (!Node->emptyContextIds())
3032         ClonesWorklist.push_back(Node);
3033       ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
3034                             Node->Clones.end());
3035 
3036       // Now walk through all of the clones of this callsite Node that we need,
3037       // and determine the assignment to a corresponding clone of the current
3038       // function (creating new function clones as needed).
3039       unsigned NodeCloneCount = 0;
3040       while (!ClonesWorklist.empty()) {
3041         ContextNode *Clone = ClonesWorklist.front();
3042         ClonesWorklist.pop_front();
3043         NodeCloneCount++;
3044         if (VerifyNodes)
3045           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3046 
3047         // Need to create a new function clone if we have more callsite clones
3048         // than existing function clones, which would have been assigned to an
3049         // earlier clone in the list (we assign callsite clones to function
3050         // clones greedily).
3051         if (FuncClonesToCallMap.size() < NodeCloneCount) {
3052           // If this is the first callsite copy, assign to original function.
3053           if (NodeCloneCount == 1) {
3054             // Since FuncClonesToCallMap is empty in this case, no clones have
3055             // been created for this function yet, and no callers should have
3056             // been assigned a function clone for this callee node yet.
3057             assert(llvm::none_of(
3058                 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3059                   return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3060                 }));
3061             // Initialize with empty call map, assign Clone to original function
3062             // and its callers, and skip to the next clone.
3063             FuncClonesToCallMap[OrigFunc] = {};
3064             AssignCallsiteCloneToFuncClone(
3065                 OrigFunc, Call, Clone,
3066                 AllocationCallToContextNodeMap.count(Call));
3067             for (auto &CE : Clone->CallerEdges) {
3068               // Ignore any caller that does not have a recorded callsite Call.
3069               if (!CE->Caller->hasCall())
3070                 continue;
3071               RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
3072             }
3073             continue;
3074           }
3075 
3076           // First locate which copy of OrigFunc to clone again. If a caller
3077           // of this callsite clone was already assigned to call a particular
3078           // function clone, we need to redirect all of those callers to the
3079           // new function clone, and update their other callees within this
3080           // function.
3081           FuncInfo PreviousAssignedFuncClone;
3082           auto EI = llvm::find_if(
3083               Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
3084                 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
3085               });
3086           bool CallerAssignedToCloneOfFunc = false;
3087           if (EI != Clone->CallerEdges.end()) {
3088             const std::shared_ptr<ContextEdge> &Edge = *EI;
3089             PreviousAssignedFuncClone =
3090                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3091             CallerAssignedToCloneOfFunc = true;
3092           }
3093 
3094           // Clone function and save it along with the CallInfo map created
3095           // during cloning in the FuncClonesToCallMap.
3096           std::map<CallInfo, CallInfo> NewCallMap;
3097           unsigned CloneNo = FuncClonesToCallMap.size();
3098           assert(CloneNo > 0 && "Clone 0 is the original function, which "
3099                                 "should already exist in the map");
3100           FuncInfo NewFuncClone = cloneFunctionForCallsite(
3101               OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
3102           FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
3103           FunctionClonesAnalysis++;
3104           Changed = true;
3105 
3106           // If no caller callsites were already assigned to a clone of this
3107           // function, we can simply assign this clone to the new func clone
3108           // and update all callers to it, then skip to the next clone.
3109           if (!CallerAssignedToCloneOfFunc) {
3110             AssignCallsiteCloneToFuncClone(
3111                 NewFuncClone, Call, Clone,
3112                 AllocationCallToContextNodeMap.count(Call));
3113             for (auto &CE : Clone->CallerEdges) {
3114               // Ignore any caller that does not have a recorded callsite Call.
3115               if (!CE->Caller->hasCall())
3116                 continue;
3117               RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3118             }
3119             continue;
3120           }
3121 
3122           // We may need to do additional node cloning in this case.
3123           // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
3124           // that were previously assigned to call PreviousAssignedFuncClone,
3125           // to record that they now call NewFuncClone.
3126           for (auto CE : Clone->CallerEdges) {
3127             // Skip any that have been removed on an earlier iteration.
3128             if (!CE)
3129               continue;
3130             // Ignore any caller that does not have a recorded callsite Call.
3131             if (!CE->Caller->hasCall())
3132               continue;
3133 
3134             if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
3135                 // We subsequently fall through to later handling that
3136                 // will perform any additional cloning required for
3137                 // callers that were calling other function clones.
3138                 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
3139                     PreviousAssignedFuncClone)
3140               continue;
3141 
3142             RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
3143 
3144             // If we are cloning a function that was already assigned to some
3145             // callers, then essentially we are creating new callsite clones
3146             // of the other callsites in that function that are reached by those
3147             // callers. Clone the other callees of the current callsite's caller
3148             // that were already assigned to PreviousAssignedFuncClone
3149             // accordingly. This is important since we subsequently update the
3150             // calls from the nodes in the graph and their assignments to callee
3151             // functions recorded in CallsiteToCalleeFuncCloneMap.
3152             for (auto CalleeEdge : CE->Caller->CalleeEdges) {
3153               // Skip any that have been removed on an earlier iteration when
3154               // cleaning up newly None type callee edges.
3155               if (!CalleeEdge)
3156                 continue;
3157               ContextNode *Callee = CalleeEdge->Callee;
3158               // Skip the current callsite, we are looking for other
3159               // callsites Caller calls, as well as any that does not have a
3160               // recorded callsite Call.
3161               if (Callee == Clone || !Callee->hasCall())
3162                 continue;
3163               ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
3164               removeNoneTypeCalleeEdges(NewClone);
3165               // Moving the edge may have resulted in some none type
3166               // callee edges on the original Callee.
3167               removeNoneTypeCalleeEdges(Callee);
3168               assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3169               // If the Callee node was already assigned to call a specific
3170               // function version, make sure its new clone is assigned to call
3171               // that same function clone.
3172               if (CallsiteToCalleeFuncCloneMap.count(Callee))
3173                 RecordCalleeFuncOfCallsite(
3174                     NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
3175               // Update NewClone with the new Call clone of this callsite's Call
3176               // created for the new function clone created earlier.
3177               // Recall that we have already ensured when building the graph
3178               // that each caller can only call callsites within the same
3179               // function, so we are guaranteed that Callee Call is in the
3180               // current OrigFunc.
3181               // CallMap is set up as indexed by original Call at clone 0.
3182               CallInfo OrigCall(Callee->getOrigNode()->Call);
3183               OrigCall.setCloneNo(0);
3184               std::map<CallInfo, CallInfo> &CallMap =
3185                   FuncClonesToCallMap[NewFuncClone];
3186               assert(CallMap.count(OrigCall));
3187               CallInfo NewCall(CallMap[OrigCall]);
3188               assert(NewCall);
3189               NewClone->setCall(NewCall);
3190             }
3191           }
3192           // Fall through to handling below to perform the recording of the
3193           // function for this callsite clone. This enables handling of cases
3194           // where the callers were assigned to different clones of a function.
3195         }
3196 
3197         // See if we can use existing function clone. Walk through
3198         // all caller edges to see if any have already been assigned to
3199         // a clone of this callsite's function. If we can use it, do so. If not,
3200         // because that function clone is already assigned to a different clone
3201         // of this callsite, then we need to clone again.
3202         // Basically, this checking is needed to handle the case where different
3203         // caller functions/callsites may need versions of this function
3204         // containing different mixes of callsite clones across the different
3205         // callsites within the function. If that happens, we need to create
3206         // additional function clones to handle the various combinations.
3207         //
3208         // Keep track of any new clones of this callsite created by the
3209         // following loop, as well as any existing clone that we decided to
3210         // assign this clone to.
3211         std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
3212         FuncInfo FuncCloneAssignedToCurCallsiteClone;
3213         // We need to be able to remove Edge from CallerEdges, so need to adjust
3214         // iterator in the loop.
3215         for (auto EI = Clone->CallerEdges.begin();
3216              EI != Clone->CallerEdges.end();) {
3217           auto Edge = *EI;
3218           // Ignore any caller that does not have a recorded callsite Call.
3219           if (!Edge->Caller->hasCall()) {
3220             EI++;
3221             continue;
3222           }
3223           // If this caller already assigned to call a version of OrigFunc, need
3224           // to ensure we can assign this callsite clone to that function clone.
3225           if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
3226             FuncInfo FuncCloneCalledByCaller =
3227                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
3228             // First we need to confirm that this function clone is available
3229             // for use by this callsite node clone.
3230             //
3231             // While FuncCloneToCurNodeCloneMap is built only for this Node and
3232             // its callsite clones, one of those callsite clones X could have
3233             // been assigned to the same function clone called by Edge's caller
3234             // - if Edge's caller calls another callsite within Node's original
3235             // function, and that callsite has another caller reaching clone X.
3236             // We need to clone Node again in this case.
3237             if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
3238                  FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
3239                      Clone) ||
3240                 // Detect when we have multiple callers of this callsite that
3241                 // have already been assigned to specific, and different, clones
3242                 // of OrigFunc (due to other unrelated callsites in Func they
3243                 // reach via call contexts). Is this Clone of callsite Node
3244                 // assigned to a different clone of OrigFunc? If so, clone Node
3245                 // again.
3246                 (FuncCloneAssignedToCurCallsiteClone &&
3247                  FuncCloneAssignedToCurCallsiteClone !=
3248                      FuncCloneCalledByCaller)) {
3249               // We need to use a different newly created callsite clone, in
3250               // order to assign it to another new function clone on a
3251               // subsequent iteration over the Clones array (adjusted below).
3252               // Note we specifically do not reset the
3253               // CallsiteToCalleeFuncCloneMap entry for this caller, so that
3254               // when this new clone is processed later we know which version of
3255               // the function to copy (so that other callsite clones we have
3256               // assigned to that function clone are properly cloned over). See
3257               // comments in the function cloning handling earlier.
3258 
3259               // Check if we already have cloned this callsite again while
3260               // walking through caller edges, for a caller calling the same
3261               // function clone. If so, we can move this edge to that new clone
3262               // rather than creating yet another new clone.
3263               if (FuncCloneToNewCallsiteCloneMap.count(
3264                       FuncCloneCalledByCaller)) {
3265                 ContextNode *NewClone =
3266                     FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
3267                 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
3268                 // Cleanup any none type edges cloned over.
3269                 removeNoneTypeCalleeEdges(NewClone);
3270               } else {
3271                 // Create a new callsite clone.
3272                 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
3273                 removeNoneTypeCalleeEdges(NewClone);
3274                 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
3275                     NewClone;
3276                 // Add to list of clones and process later.
3277                 ClonesWorklist.push_back(NewClone);
3278                 assert(EI == Clone->CallerEdges.end() ||
3279                        Clone->AllocTypes != (uint8_t)AllocationType::None);
3280                 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
3281               }
3282               // Moving the caller edge may have resulted in some none type
3283               // callee edges.
3284               removeNoneTypeCalleeEdges(Clone);
3285               // We will handle the newly created callsite clone in a subsequent
3286               // iteration over this Node's Clones. Continue here since we
3287               // already adjusted iterator EI while moving the edge.
3288               continue;
3289             }
3290 
3291             // Otherwise, we can use the function clone already assigned to this
3292             // caller.
3293             if (!FuncCloneAssignedToCurCallsiteClone) {
3294               FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
3295               // Assign Clone to FuncCloneCalledByCaller
3296               AssignCallsiteCloneToFuncClone(
3297                   FuncCloneCalledByCaller, Call, Clone,
3298                   AllocationCallToContextNodeMap.count(Call));
3299             } else
3300               // Don't need to do anything - callsite is already calling this
3301               // function clone.
3302               assert(FuncCloneAssignedToCurCallsiteClone ==
3303                      FuncCloneCalledByCaller);
3304 
3305           } else {
3306             // We have not already assigned this caller to a version of
3307             // OrigFunc. Do the assignment now.
3308 
3309             // First check if we have already assigned this callsite clone to a
3310             // clone of OrigFunc for another caller during this iteration over
3311             // its caller edges.
3312             if (!FuncCloneAssignedToCurCallsiteClone) {
3313               // Find first function in FuncClonesToCallMap without an assigned
3314               // clone of this callsite Node. We should always have one
3315               // available at this point due to the earlier cloning when the
3316               // FuncClonesToCallMap size was smaller than the clone number.
3317               for (auto &CF : FuncClonesToCallMap) {
3318                 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
3319                   FuncCloneAssignedToCurCallsiteClone = CF.first;
3320                   break;
3321                 }
3322               }
3323               assert(FuncCloneAssignedToCurCallsiteClone);
3324               // Assign Clone to FuncCloneAssignedToCurCallsiteClone
3325               AssignCallsiteCloneToFuncClone(
3326                   FuncCloneAssignedToCurCallsiteClone, Call, Clone,
3327                   AllocationCallToContextNodeMap.count(Call));
3328             } else
3329               assert(FuncCloneToCurNodeCloneMap
3330                          [FuncCloneAssignedToCurCallsiteClone] == Clone);
3331             // Update callers to record function version called.
3332             RecordCalleeFuncOfCallsite(Edge->Caller,
3333                                        FuncCloneAssignedToCurCallsiteClone);
3334           }
3335 
3336           EI++;
3337         }
3338       }
3339       if (VerifyCCG) {
3340         checkNode<DerivedCCG, FuncTy, CallTy>(Node);
3341         for (const auto &PE : Node->CalleeEdges)
3342           checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3343         for (const auto &CE : Node->CallerEdges)
3344           checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3345         for (auto *Clone : Node->Clones) {
3346           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
3347           for (const auto &PE : Clone->CalleeEdges)
3348             checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
3349           for (const auto &CE : Clone->CallerEdges)
3350             checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
3351         }
3352       }
3353     }
3354   }
3355 
3356   auto UpdateCalls = [&](ContextNode *Node,
3357                          DenseSet<const ContextNode *> &Visited,
3358                          auto &&UpdateCalls) {
3359     auto Inserted = Visited.insert(Node);
3360     if (!Inserted.second)
3361       return;
3362 
3363     for (auto *Clone : Node->Clones)
3364       UpdateCalls(Clone, Visited, UpdateCalls);
3365 
3366     for (auto &Edge : Node->CallerEdges)
3367       UpdateCalls(Edge->Caller, Visited, UpdateCalls);
3368 
3369     // Skip if either no call to update, or if we ended up with no context ids
3370     // (we moved all edges onto other clones).
3371     if (!Node->hasCall() || Node->emptyContextIds())
3372       return;
3373 
3374     if (Node->IsAllocation) {
3375       updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
3376       return;
3377     }
3378 
3379     if (!CallsiteToCalleeFuncCloneMap.count(Node))
3380       return;
3381 
3382     auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
3383     updateCall(Node->Call, CalleeFunc);
3384   };
3385 
3386   // Performs DFS traversal starting from allocation nodes to update calls to
3387   // reflect cloning decisions recorded earlier. For regular LTO this will
3388   // update the actual calls in the IR to call the appropriate function clone
3389   // (and add attributes to allocation calls), whereas for ThinLTO the decisions
3390   // are recorded in the summary entries.
3391   DenseSet<const ContextNode *> Visited;
3392   for (auto &Entry : AllocationCallToContextNodeMap)
3393     UpdateCalls(Entry.second, Visited, UpdateCalls);
3394 
3395   return Changed;
3396 }
3397 
3398 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
3399     Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
3400     std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3401         &FuncToAliasMap) {
3402   // The first "clone" is the original copy, we should only call this if we
3403   // needed to create new clones.
3404   assert(NumClones > 1);
3405   SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3406   VMaps.reserve(NumClones - 1);
3407   FunctionsClonedThinBackend++;
3408   for (unsigned I = 1; I < NumClones; I++) {
3409     VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
3410     auto *NewF = CloneFunction(&F, *VMaps.back());
3411     FunctionClonesThinBackend++;
3412     // Strip memprof and callsite metadata from clone as they are no longer
3413     // needed.
3414     for (auto &BB : *NewF) {
3415       for (auto &Inst : BB) {
3416         Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
3417         Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
3418       }
3419     }
3420     std::string Name = getMemProfFuncName(F.getName(), I);
3421     auto *PrevF = M.getFunction(Name);
3422     if (PrevF) {
3423       // We might have created this when adjusting callsite in another
3424       // function. It should be a declaration.
3425       assert(PrevF->isDeclaration());
3426       NewF->takeName(PrevF);
3427       PrevF->replaceAllUsesWith(NewF);
3428       PrevF->eraseFromParent();
3429     } else
3430       NewF->setName(Name);
3431     ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
3432              << "created clone " << ore::NV("NewFunction", NewF));
3433 
3434     // Now handle aliases to this function, and clone those as well.
3435     if (!FuncToAliasMap.count(&F))
3436       continue;
3437     for (auto *A : FuncToAliasMap[&F]) {
3438       std::string Name = getMemProfFuncName(A->getName(), I);
3439       auto *PrevA = M.getNamedAlias(Name);
3440       auto *NewA = GlobalAlias::create(A->getValueType(),
3441                                        A->getType()->getPointerAddressSpace(),
3442                                        A->getLinkage(), Name, NewF);
3443       NewA->copyAttributesFrom(A);
3444       if (PrevA) {
3445         // We might have created this when adjusting callsite in another
3446         // function. It should be a declaration.
3447         assert(PrevA->isDeclaration());
3448         NewA->takeName(PrevA);
3449         PrevA->replaceAllUsesWith(NewA);
3450         PrevA->eraseFromParent();
3451       }
3452     }
3453   }
3454   return VMaps;
3455 }
3456 
3457 // Locate the summary for F. This is complicated by the fact that it might
3458 // have been internalized or promoted.
3459 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
3460                                       const ModuleSummaryIndex *ImportSummary) {
3461   // FIXME: Ideally we would retain the original GUID in some fashion on the
3462   // function (e.g. as metadata), but for now do our best to locate the
3463   // summary without that information.
3464   ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
3465   if (!TheFnVI)
3466     // See if theFn was internalized, by checking index directly with
3467     // original name (this avoids the name adjustment done by getGUID() for
3468     // internal symbols).
3469     TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
3470   if (TheFnVI)
3471     return TheFnVI;
3472   // Now query with the original name before any promotion was performed.
3473   StringRef OrigName =
3474       ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
3475   std::string OrigId = GlobalValue::getGlobalIdentifier(
3476       OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
3477   TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
3478   if (TheFnVI)
3479     return TheFnVI;
3480   // Could be a promoted local imported from another module. We need to pass
3481   // down more info here to find the original module id. For now, try with
3482   // the OrigName which might have been stored in the OidGuidMap in the
3483   // index. This would not work if there were same-named locals in multiple
3484   // modules, however.
3485   auto OrigGUID =
3486       ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
3487   if (OrigGUID)
3488     TheFnVI = ImportSummary->getValueInfo(OrigGUID);
3489   return TheFnVI;
3490 }
3491 
3492 bool MemProfContextDisambiguation::applyImport(Module &M) {
3493   assert(ImportSummary);
3494   bool Changed = false;
3495 
3496   auto IsMemProfClone = [](const Function &F) {
3497     return F.getName().contains(MemProfCloneSuffix);
3498   };
3499 
3500   // We also need to clone any aliases that reference cloned functions, because
3501   // the modified callsites may invoke via the alias. Keep track of the aliases
3502   // for each function.
3503   std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3504       FuncToAliasMap;
3505   for (auto &A : M.aliases()) {
3506     auto *Aliasee = A.getAliaseeObject();
3507     if (auto *F = dyn_cast<Function>(Aliasee))
3508       FuncToAliasMap[F].insert(&A);
3509   }
3510 
3511   for (auto &F : M) {
3512     if (F.isDeclaration() || IsMemProfClone(F))
3513       continue;
3514 
3515     OptimizationRemarkEmitter ORE(&F);
3516 
3517     SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3518     bool ClonesCreated = false;
3519     unsigned NumClonesCreated = 0;
3520     auto CloneFuncIfNeeded = [&](unsigned NumClones) {
3521       // We should at least have version 0 which is the original copy.
3522       assert(NumClones > 0);
3523       // If only one copy needed use original.
3524       if (NumClones == 1)
3525         return;
3526       // If we already performed cloning of this function, confirm that the
3527       // requested number of clones matches (the thin link should ensure the
3528       // number of clones for each constituent callsite is consistent within
3529       // each function), before returning.
3530       if (ClonesCreated) {
3531         assert(NumClonesCreated == NumClones);
3532         return;
3533       }
3534       VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
3535       // The first "clone" is the original copy, which doesn't have a VMap.
3536       assert(VMaps.size() == NumClones - 1);
3537       Changed = true;
3538       ClonesCreated = true;
3539       NumClonesCreated = NumClones;
3540     };
3541 
3542     auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
3543                              Function *CalledFunction) {
3544       // Perform cloning if not yet done.
3545       CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3546 
3547       // Should have skipped indirect calls via mayHaveMemprofSummary.
3548       assert(CalledFunction);
3549       assert(!IsMemProfClone(*CalledFunction));
3550 
3551       // Update the calls per the summary info.
3552       // Save orig name since it gets updated in the first iteration
3553       // below.
3554       auto CalleeOrigName = CalledFunction->getName();
3555       for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3556         // Do nothing if this version calls the original version of its
3557         // callee.
3558         if (!StackNode.Clones[J])
3559           continue;
3560         auto NewF = M.getOrInsertFunction(
3561             getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3562             CalledFunction->getFunctionType());
3563         CallBase *CBClone;
3564         // Copy 0 is the original function.
3565         if (!J)
3566           CBClone = CB;
3567         else
3568           CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3569         CBClone->setCalledFunction(NewF);
3570         ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3571                  << ore::NV("Call", CBClone) << " in clone "
3572                  << ore::NV("Caller", CBClone->getFunction())
3573                  << " assigned to call function clone "
3574                  << ore::NV("Callee", NewF.getCallee()));
3575       }
3576     };
3577 
3578     // Locate the summary for F.
3579     ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
3580     // If not found, this could be an imported local (see comment in
3581     // findValueInfoForFunc). Skip for now as it will be cloned in its original
3582     // module (where it would have been promoted to global scope so should
3583     // satisfy any reference in this module).
3584     if (!TheFnVI)
3585       continue;
3586 
3587     auto *GVSummary =
3588         ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3589     if (!GVSummary) {
3590       // Must have been imported, use the summary which matches the definition。
3591       // (might be multiple if this was a linkonce_odr).
3592       auto SrcModuleMD = F.getMetadata("thinlto_src_module");
3593       assert(SrcModuleMD &&
3594              "enable-import-metadata is needed to emit thinlto_src_module");
3595       StringRef SrcModule =
3596           dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
3597       for (auto &GVS : TheFnVI.getSummaryList()) {
3598         if (GVS->modulePath() == SrcModule) {
3599           GVSummary = GVS.get();
3600           break;
3601         }
3602       }
3603       assert(GVSummary && GVSummary->modulePath() == SrcModule);
3604     }
3605 
3606     // If this was an imported alias skip it as we won't have the function
3607     // summary, and it should be cloned in the original module.
3608     if (isa<AliasSummary>(GVSummary))
3609       continue;
3610 
3611     auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
3612 
3613     if (FS->allocs().empty() && FS->callsites().empty())
3614       continue;
3615 
3616     auto SI = FS->callsites().begin();
3617     auto AI = FS->allocs().begin();
3618 
3619     // To handle callsite infos synthesized for tail calls which have missing
3620     // frames in the profiled context, map callee VI to the synthesized callsite
3621     // info.
3622     DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
3623     // Iterate the callsites for this function in reverse, since we place all
3624     // those synthesized for tail calls at the end.
3625     for (auto CallsiteIt = FS->callsites().rbegin();
3626          CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
3627       auto &Callsite = *CallsiteIt;
3628       // Stop as soon as we see a non-synthesized callsite info (see comment
3629       // above loop). All the entries added for discovered tail calls have empty
3630       // stack ids.
3631       if (!Callsite.StackIdIndices.empty())
3632         break;
3633       MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
3634     }
3635 
3636     // Assume for now that the instructions are in the exact same order
3637     // as when the summary was created, but confirm this is correct by
3638     // matching the stack ids.
3639     for (auto &BB : F) {
3640       for (auto &I : BB) {
3641         auto *CB = dyn_cast<CallBase>(&I);
3642         // Same handling as when creating module summary.
3643         if (!mayHaveMemprofSummary(CB))
3644           continue;
3645 
3646         auto *CalledValue = CB->getCalledOperand();
3647         auto *CalledFunction = CB->getCalledFunction();
3648         if (CalledValue && !CalledFunction) {
3649           CalledValue = CalledValue->stripPointerCasts();
3650           // Stripping pointer casts can reveal a called function.
3651           CalledFunction = dyn_cast<Function>(CalledValue);
3652         }
3653         // Check if this is an alias to a function. If so, get the
3654         // called aliasee for the checks below.
3655         if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
3656           assert(!CalledFunction &&
3657                  "Expected null called function in callsite for alias");
3658           CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
3659         }
3660 
3661         CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
3662             I.getMetadata(LLVMContext::MD_callsite));
3663         auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
3664 
3665         // Include allocs that were already assigned a memprof function
3666         // attribute in the statistics.
3667         if (CB->getAttributes().hasFnAttr("memprof")) {
3668           assert(!MemProfMD);
3669           CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3670               ? AllocTypeColdThinBackend++
3671               : AllocTypeNotColdThinBackend++;
3672           OrigAllocsThinBackend++;
3673           AllocVersionsThinBackend++;
3674           if (!MaxAllocVersionsThinBackend)
3675             MaxAllocVersionsThinBackend = 1;
3676           // Remove any remaining callsite metadata and we can skip the rest of
3677           // the handling for this instruction, since no cloning needed.
3678           I.setMetadata(LLVMContext::MD_callsite, nullptr);
3679           continue;
3680         }
3681 
3682         if (MemProfMD) {
3683           // Consult the next alloc node.
3684           assert(AI != FS->allocs().end());
3685           auto &AllocNode = *(AI++);
3686 
3687           // Sanity check that the MIB stack ids match between the summary and
3688           // instruction metadata.
3689           auto MIBIter = AllocNode.MIBs.begin();
3690           for (auto &MDOp : MemProfMD->operands()) {
3691             assert(MIBIter != AllocNode.MIBs.end());
3692             LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3693                 MIBIter->StackIdIndices.begin();
3694             auto *MIBMD = cast<const MDNode>(MDOp);
3695             MDNode *StackMDNode = getMIBStackNode(MIBMD);
3696             assert(StackMDNode);
3697             CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
3698             auto ContextIterBegin =
3699                 StackContext.beginAfterSharedPrefix(CallsiteContext);
3700             // Skip the checking on the first iteration.
3701             uint64_t LastStackContextId =
3702                 (ContextIterBegin != StackContext.end() &&
3703                  *ContextIterBegin == 0)
3704                     ? 1
3705                     : 0;
3706             for (auto ContextIter = ContextIterBegin;
3707                  ContextIter != StackContext.end(); ++ContextIter) {
3708               // If this is a direct recursion, simply skip the duplicate
3709               // entries, to be consistent with how the summary ids were
3710               // generated during ModuleSummaryAnalysis.
3711               if (LastStackContextId == *ContextIter)
3712                 continue;
3713               LastStackContextId = *ContextIter;
3714               assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3715               assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3716                      *ContextIter);
3717               StackIdIndexIter++;
3718             }
3719             MIBIter++;
3720           }
3721 
3722           // Perform cloning if not yet done.
3723           CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3724 
3725           OrigAllocsThinBackend++;
3726           AllocVersionsThinBackend += AllocNode.Versions.size();
3727           if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3728             MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3729 
3730           // If there is only one version that means we didn't end up
3731           // considering this function for cloning, and in that case the alloc
3732           // will still be none type or should have gotten the default NotCold.
3733           // Skip that after calling clone helper since that does some sanity
3734           // checks that confirm we haven't decided yet that we need cloning.
3735           if (AllocNode.Versions.size() == 1) {
3736             assert((AllocationType)AllocNode.Versions[0] ==
3737                        AllocationType::NotCold ||
3738                    (AllocationType)AllocNode.Versions[0] ==
3739                        AllocationType::None);
3740             UnclonableAllocsThinBackend++;
3741             continue;
3742           }
3743 
3744           // All versions should have a singular allocation type.
3745           assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3746             return Type == ((uint8_t)AllocationType::NotCold |
3747                             (uint8_t)AllocationType::Cold);
3748           }));
3749 
3750           // Update the allocation types per the summary info.
3751           for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3752             // Ignore any that didn't get an assigned allocation type.
3753             if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3754               continue;
3755             AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3756             AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3757                                             : AllocTypeNotColdThinBackend++;
3758             std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3759             auto A = llvm::Attribute::get(F.getContext(), "memprof",
3760                                           AllocTypeString);
3761             CallBase *CBClone;
3762             // Copy 0 is the original function.
3763             if (!J)
3764               CBClone = CB;
3765             else
3766               // Since VMaps are only created for new clones, we index with
3767               // clone J-1 (J==0 is the original clone and does not have a VMaps
3768               // entry).
3769               CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3770             CBClone->addFnAttr(A);
3771             ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3772                      << ore::NV("AllocationCall", CBClone) << " in clone "
3773                      << ore::NV("Caller", CBClone->getFunction())
3774                      << " marked with memprof allocation attribute "
3775                      << ore::NV("Attribute", AllocTypeString));
3776           }
3777         } else if (!CallsiteContext.empty()) {
3778           // Consult the next callsite node.
3779           assert(SI != FS->callsites().end());
3780           auto &StackNode = *(SI++);
3781 
3782 #ifndef NDEBUG
3783           // Sanity check that the stack ids match between the summary and
3784           // instruction metadata.
3785           auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3786           for (auto StackId : CallsiteContext) {
3787             assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3788             assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3789                    StackId);
3790             StackIdIndexIter++;
3791           }
3792 #endif
3793 
3794           CloneCallsite(StackNode, CB, CalledFunction);
3795         } else if (CB->isTailCall()) {
3796           // Locate the synthesized callsite info for the callee VI, if any was
3797           // created, and use that for cloning.
3798           ValueInfo CalleeVI =
3799               findValueInfoForFunc(*CalledFunction, M, ImportSummary);
3800           if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
3801             auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
3802             assert(Callsite != MapTailCallCalleeVIToCallsite.end());
3803             CloneCallsite(Callsite->second, CB, CalledFunction);
3804           }
3805         }
3806         // Memprof and callsite metadata on memory allocations no longer needed.
3807         I.setMetadata(LLVMContext::MD_memprof, nullptr);
3808         I.setMetadata(LLVMContext::MD_callsite, nullptr);
3809       }
3810     }
3811   }
3812 
3813   return Changed;
3814 }
3815 
3816 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3817 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3818   if (DumpCCG) {
3819     dbgs() << "CCG before cloning:\n";
3820     dbgs() << *this;
3821   }
3822   if (ExportToDot)
3823     exportToDot("postbuild");
3824 
3825   if (VerifyCCG) {
3826     check();
3827   }
3828 
3829   identifyClones();
3830 
3831   if (VerifyCCG) {
3832     check();
3833   }
3834 
3835   if (DumpCCG) {
3836     dbgs() << "CCG after cloning:\n";
3837     dbgs() << *this;
3838   }
3839   if (ExportToDot)
3840     exportToDot("cloned");
3841 
3842   bool Changed = assignFunctions();
3843 
3844   if (DumpCCG) {
3845     dbgs() << "CCG after assigning function clones:\n";
3846     dbgs() << *this;
3847   }
3848   if (ExportToDot)
3849     exportToDot("clonefuncassign");
3850 
3851   if (MemProfReportHintedSizes)
3852     printTotalSizes(errs());
3853 
3854   return Changed;
3855 }
3856 
3857 bool MemProfContextDisambiguation::processModule(
3858     Module &M,
3859     llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3860 
3861   // If we have an import summary, then the cloning decisions were made during
3862   // the thin link on the index. Apply them and return.
3863   if (ImportSummary)
3864     return applyImport(M);
3865 
3866   // TODO: If/when other types of memprof cloning are enabled beyond just for
3867   // hot and cold, we will need to change this to individually control the
3868   // AllocationType passed to addStackNodesForMIB during CCG construction.
3869   // Note that we specifically check this after applying imports above, so that
3870   // the option isn't needed to be passed to distributed ThinLTO backend
3871   // clang processes, which won't necessarily have visibility into the linker
3872   // dependences. Instead the information is communicated from the LTO link to
3873   // the backends via the combined summary index.
3874   if (!SupportsHotColdNew)
3875     return false;
3876 
3877   ModuleCallsiteContextGraph CCG(M, OREGetter);
3878   return CCG.process();
3879 }
3880 
3881 MemProfContextDisambiguation::MemProfContextDisambiguation(
3882     const ModuleSummaryIndex *Summary)
3883     : ImportSummary(Summary) {
3884   if (ImportSummary) {
3885     // The MemProfImportSummary should only be used for testing ThinLTO
3886     // distributed backend handling via opt, in which case we don't have a
3887     // summary from the pass pipeline.
3888     assert(MemProfImportSummary.empty());
3889     return;
3890   }
3891   if (MemProfImportSummary.empty())
3892     return;
3893 
3894   auto ReadSummaryFile =
3895       errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3896   if (!ReadSummaryFile) {
3897     logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3898                           "Error loading file '" + MemProfImportSummary +
3899                               "': ");
3900     return;
3901   }
3902   auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3903   if (!ImportSummaryForTestingOrErr) {
3904     logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3905                           "Error parsing file '" + MemProfImportSummary +
3906                               "': ");
3907     return;
3908   }
3909   ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
3910   ImportSummary = ImportSummaryForTesting.get();
3911 }
3912 
3913 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
3914                                                     ModuleAnalysisManager &AM) {
3915   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
3916   auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
3917     return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
3918   };
3919   if (!processModule(M, OREGetter))
3920     return PreservedAnalyses::all();
3921   return PreservedAnalyses::none();
3922 }
3923 
3924 void MemProfContextDisambiguation::run(
3925     ModuleSummaryIndex &Index,
3926     llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
3927         isPrevailing) {
3928   // TODO: If/when other types of memprof cloning are enabled beyond just for
3929   // hot and cold, we will need to change this to individually control the
3930   // AllocationType passed to addStackNodesForMIB during CCG construction.
3931   // The index was set from the option, so these should be in sync.
3932   assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
3933   if (!SupportsHotColdNew)
3934     return;
3935 
3936   IndexCallsiteContextGraph CCG(Index, isPrevailing);
3937   CCG.process();
3938 }
3939