xref: /freebsd/contrib/llvm-project/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp (revision 8311bc5f17dec348749f763b82dfe2737bc53cd7)
1 //==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements support for context disambiguation of allocation
10 // calls for profile guided heap optimization. Specifically, it uses Memprof
11 // profiles which indicate context specific allocation behavior (currently
12 // distinguishing cold vs hot memory allocations). Cloning is performed to
13 // expose the cold allocation call contexts, and the allocation calls are
14 // subsequently annotated with an attribute for later transformation.
15 //
16 // The transformations can be performed either directly on IR (regular LTO), or
17 // on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
18 // Both types of LTO operate on a the same base graph representation, which
19 // uses CRTP to support either IR or Index formats.
20 //
21 //===----------------------------------------------------------------------===//
22 
23 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
24 #include "llvm/ADT/DenseMap.h"
25 #include "llvm/ADT/DenseSet.h"
26 #include "llvm/ADT/MapVector.h"
27 #include "llvm/ADT/SetOperations.h"
28 #include "llvm/ADT/SmallPtrSet.h"
29 #include "llvm/ADT/SmallSet.h"
30 #include "llvm/ADT/SmallVector.h"
31 #include "llvm/ADT/Statistic.h"
32 #include "llvm/Analysis/MemoryProfileInfo.h"
33 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
34 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
35 #include "llvm/Bitcode/BitcodeReader.h"
36 #include "llvm/IR/Constants.h"
37 #include "llvm/IR/Instructions.h"
38 #include "llvm/IR/Module.h"
39 #include "llvm/IR/ModuleSummaryIndex.h"
40 #include "llvm/Pass.h"
41 #include "llvm/Support/CommandLine.h"
42 #include "llvm/Support/FileSystem.h"
43 #include "llvm/Support/GraphWriter.h"
44 #include "llvm/Support/raw_ostream.h"
45 #include "llvm/Transforms/IPO.h"
46 #include "llvm/Transforms/Utils/Cloning.h"
47 #include <sstream>
48 #include <vector>
49 using namespace llvm;
50 using namespace llvm::memprof;
51 
52 #define DEBUG_TYPE "memprof-context-disambiguation"
53 
54 STATISTIC(FunctionClonesAnalysis,
55           "Number of function clones created during whole program analysis");
56 STATISTIC(FunctionClonesThinBackend,
57           "Number of function clones created during ThinLTO backend");
58 STATISTIC(FunctionsClonedThinBackend,
59           "Number of functions that had clones created during ThinLTO backend");
60 STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
61                             "cloned) during whole program analysis");
62 STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
63                          "during whole program analysis");
64 STATISTIC(AllocTypeNotColdThinBackend,
65           "Number of not cold static allocations (possibly cloned) during "
66           "ThinLTO backend");
67 STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
68                                     "(possibly cloned) during ThinLTO backend");
69 STATISTIC(OrigAllocsThinBackend,
70           "Number of original (not cloned) allocations with memprof profiles "
71           "during ThinLTO backend");
72 STATISTIC(
73     AllocVersionsThinBackend,
74     "Number of allocation versions (including clones) during ThinLTO backend");
75 STATISTIC(MaxAllocVersionsThinBackend,
76           "Maximum number of allocation versions created for an original "
77           "allocation during ThinLTO backend");
78 STATISTIC(UnclonableAllocsThinBackend,
79           "Number of unclonable ambigous allocations during ThinLTO backend");
80 
81 static cl::opt<std::string> DotFilePathPrefix(
82     "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
83     cl::value_desc("filename"),
84     cl::desc("Specify the path prefix of the MemProf dot files."));
85 
86 static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
87                                  cl::Hidden,
88                                  cl::desc("Export graph to dot files."));
89 
90 static cl::opt<bool>
91     DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
92             cl::desc("Dump CallingContextGraph to stdout after each stage."));
93 
94 static cl::opt<bool>
95     VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
96               cl::desc("Perform verification checks on CallingContextGraph."));
97 
98 static cl::opt<bool>
99     VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
100                 cl::desc("Perform frequent verification checks on nodes."));
101 
102 static cl::opt<std::string> MemProfImportSummary(
103     "memprof-import-summary",
104     cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
105     cl::Hidden);
106 
107 // Indicate we are linking with an allocator that supports hot/cold operator
108 // new interfaces.
109 cl::opt<bool> SupportsHotColdNew(
110     "supports-hot-cold-new", cl::init(false), cl::Hidden,
111     cl::desc("Linking with hot/cold operator new interfaces"));
112 
113 namespace {
114 /// CRTP base for graphs built from either IR or ThinLTO summary index.
115 ///
116 /// The graph represents the call contexts in all memprof metadata on allocation
117 /// calls, with nodes for the allocations themselves, as well as for the calls
118 /// in each context. The graph is initially built from the allocation memprof
119 /// metadata (or summary) MIBs. It is then updated to match calls with callsite
120 /// metadata onto the nodes, updating it to reflect any inlining performed on
121 /// those calls.
122 ///
123 /// Each MIB (representing an allocation's call context with allocation
124 /// behavior) is assigned a unique context id during the graph build. The edges
125 /// and nodes in the graph are decorated with the context ids they carry. This
126 /// is used to correctly update the graph when cloning is performed so that we
127 /// can uniquify the context for a single (possibly cloned) allocation.
128 template <typename DerivedCCG, typename FuncTy, typename CallTy>
129 class CallsiteContextGraph {
130 public:
131   CallsiteContextGraph() = default;
132   CallsiteContextGraph(const CallsiteContextGraph &) = default;
133   CallsiteContextGraph(CallsiteContextGraph &&) = default;
134 
135   /// Main entry point to perform analysis and transformations on graph.
136   bool process();
137 
138   /// Perform cloning on the graph necessary to uniquely identify the allocation
139   /// behavior of an allocation based on its context.
140   void identifyClones();
141 
142   /// Assign callsite clones to functions, cloning functions as needed to
143   /// accommodate the combinations of their callsite clones reached by callers.
144   /// For regular LTO this clones functions and callsites in the IR, but for
145   /// ThinLTO the cloning decisions are noted in the summaries and later applied
146   /// in applyImport.
147   bool assignFunctions();
148 
149   void dump() const;
150   void print(raw_ostream &OS) const;
151 
152   friend raw_ostream &operator<<(raw_ostream &OS,
153                                  const CallsiteContextGraph &CCG) {
154     CCG.print(OS);
155     return OS;
156   }
157 
158   friend struct GraphTraits<
159       const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
160   friend struct DOTGraphTraits<
161       const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
162 
163   void exportToDot(std::string Label) const;
164 
165   /// Represents a function clone via FuncTy pointer and clone number pair.
166   struct FuncInfo final
167       : public std::pair<FuncTy *, unsigned /*Clone number*/> {
168     using Base = std::pair<FuncTy *, unsigned>;
169     FuncInfo(const Base &B) : Base(B) {}
170     FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
171     explicit operator bool() const { return this->first != nullptr; }
172     FuncTy *func() const { return this->first; }
173     unsigned cloneNo() const { return this->second; }
174   };
175 
176   /// Represents a callsite clone via CallTy and clone number pair.
177   struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
178     using Base = std::pair<CallTy, unsigned>;
179     CallInfo(const Base &B) : Base(B) {}
180     CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
181         : Base(Call, CloneNo) {}
182     explicit operator bool() const { return (bool)this->first; }
183     CallTy call() const { return this->first; }
184     unsigned cloneNo() const { return this->second; }
185     void setCloneNo(unsigned N) { this->second = N; }
186     void print(raw_ostream &OS) const {
187       if (!operator bool()) {
188         assert(!cloneNo());
189         OS << "null Call";
190         return;
191       }
192       call()->print(OS);
193       OS << "\t(clone " << cloneNo() << ")";
194     }
195     void dump() const {
196       print(dbgs());
197       dbgs() << "\n";
198     }
199     friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
200       Call.print(OS);
201       return OS;
202     }
203   };
204 
205   struct ContextEdge;
206 
207   /// Node in the Callsite Context Graph
208   struct ContextNode {
209     // Keep this for now since in the IR case where we have an Instruction* it
210     // is not as immediately discoverable. Used for printing richer information
211     // when dumping graph.
212     bool IsAllocation;
213 
214     // Keeps track of when the Call was reset to null because there was
215     // recursion.
216     bool Recursive = false;
217 
218     // The corresponding allocation or interior call.
219     CallInfo Call;
220 
221     // For alloc nodes this is a unique id assigned when constructed, and for
222     // callsite stack nodes it is the original stack id when the node is
223     // constructed from the memprof MIB metadata on the alloc nodes. Note that
224     // this is only used when matching callsite metadata onto the stack nodes
225     // created when processing the allocation memprof MIBs, and for labeling
226     // nodes in the dot graph. Therefore we don't bother to assign a value for
227     // clones.
228     uint64_t OrigStackOrAllocId = 0;
229 
230     // This will be formed by ORing together the AllocationType enum values
231     // for contexts including this node.
232     uint8_t AllocTypes = 0;
233 
234     // Edges to all callees in the profiled call stacks.
235     // TODO: Should this be a map (from Callee node) for more efficient lookup?
236     std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;
237 
238     // Edges to all callers in the profiled call stacks.
239     // TODO: Should this be a map (from Caller node) for more efficient lookup?
240     std::vector<std::shared_ptr<ContextEdge>> CallerEdges;
241 
242     // The set of IDs for contexts including this node.
243     DenseSet<uint32_t> ContextIds;
244 
245     // List of clones of this ContextNode, initially empty.
246     std::vector<ContextNode *> Clones;
247 
248     // If a clone, points to the original uncloned node.
249     ContextNode *CloneOf = nullptr;
250 
251     ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}
252 
253     ContextNode(bool IsAllocation, CallInfo C)
254         : IsAllocation(IsAllocation), Call(C) {}
255 
256     void addClone(ContextNode *Clone) {
257       if (CloneOf) {
258         CloneOf->Clones.push_back(Clone);
259         Clone->CloneOf = CloneOf;
260       } else {
261         Clones.push_back(Clone);
262         assert(!Clone->CloneOf);
263         Clone->CloneOf = this;
264       }
265     }
266 
267     ContextNode *getOrigNode() {
268       if (!CloneOf)
269         return this;
270       return CloneOf;
271     }
272 
273     void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
274                                unsigned int ContextId);
275 
276     ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
277     ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
278     void eraseCalleeEdge(const ContextEdge *Edge);
279     void eraseCallerEdge(const ContextEdge *Edge);
280 
281     void setCall(CallInfo C) { Call = C; }
282 
283     bool hasCall() const { return (bool)Call.call(); }
284 
285     void printCall(raw_ostream &OS) const { Call.print(OS); }
286 
287     // True if this node was effectively removed from the graph, in which case
288     // its context id set, caller edges, and callee edges should all be empty.
289     bool isRemoved() const {
290       assert(ContextIds.empty() ==
291              (CalleeEdges.empty() && CallerEdges.empty()));
292       return ContextIds.empty();
293     }
294 
295     void dump() const;
296     void print(raw_ostream &OS) const;
297 
298     friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
299       Node.print(OS);
300       return OS;
301     }
302   };
303 
304   /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
305   /// callee.
306   struct ContextEdge {
307     ContextNode *Callee;
308     ContextNode *Caller;
309 
310     // This will be formed by ORing together the AllocationType enum values
311     // for contexts including this edge.
312     uint8_t AllocTypes = 0;
313 
314     // The set of IDs for contexts including this edge.
315     DenseSet<uint32_t> ContextIds;
316 
317     ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
318                 DenseSet<uint32_t> ContextIds)
319         : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
320           ContextIds(ContextIds) {}
321 
322     DenseSet<uint32_t> &getContextIds() { return ContextIds; }
323 
324     void dump() const;
325     void print(raw_ostream &OS) const;
326 
327     friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
328       Edge.print(OS);
329       return OS;
330     }
331   };
332 
333   /// Helper to remove callee edges that have allocation type None (due to not
334   /// carrying any context ids) after transformations.
335   void removeNoneTypeCalleeEdges(ContextNode *Node);
336 
337 protected:
338   /// Get a list of nodes corresponding to the stack ids in the given callsite
339   /// context.
340   template <class NodeT, class IteratorT>
341   std::vector<uint64_t>
342   getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);
343 
344   /// Adds nodes for the given allocation and any stack ids on its memprof MIB
345   /// metadata (or summary).
346   ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);
347 
348   /// Adds nodes for the given MIB stack ids.
349   template <class NodeT, class IteratorT>
350   void addStackNodesForMIB(ContextNode *AllocNode,
351                            CallStack<NodeT, IteratorT> &StackContext,
352                            CallStack<NodeT, IteratorT> &CallsiteContext,
353                            AllocationType AllocType);
354 
355   /// Matches all callsite metadata (or summary) to the nodes created for
356   /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
357   /// inlining performed on those callsite instructions.
358   void updateStackNodes();
359 
360   /// Update graph to conservatively handle any callsite stack nodes that target
361   /// multiple different callee target functions.
362   void handleCallsitesWithMultipleTargets();
363 
364   /// Save lists of calls with MemProf metadata in each function, for faster
365   /// iteration.
366   std::vector<std::pair<FuncTy *, std::vector<CallInfo>>>
367       FuncToCallsWithMetadata;
368 
369   /// Map from callsite node to the enclosing caller function.
370   std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;
371 
372 private:
373   using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;
374 
375   using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
376                                      const FuncTy *, DenseSet<uint32_t>>;
377 
378   /// Assigns the given Node to calls at or inlined into the location with
379   /// the Node's stack id, after post order traversing and processing its
380   /// caller nodes. Uses the call information recorded in the given
381   /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
382   /// as needed. Called by updateStackNodes which sets up the given
383   /// StackIdToMatchingCalls map.
384   void assignStackNodesPostOrder(
385       ContextNode *Node, DenseSet<const ContextNode *> &Visited,
386       DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);
387 
388   /// Duplicates the given set of context ids, updating the provided
389   /// map from each original id with the newly generated context ids,
390   /// and returning the new duplicated id set.
391   DenseSet<uint32_t> duplicateContextIds(
392       const DenseSet<uint32_t> &StackSequenceContextIds,
393       DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
394 
395   /// Propagates all duplicated context ids across the graph.
396   void propagateDuplicateContextIds(
397       const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);
398 
399   /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
400   /// else to its callers. Also updates OrigNode's edges to remove any context
401   /// ids moved to the newly created edge.
402   void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
403                       bool TowardsCallee);
404 
405   /// Get the stack id corresponding to the given Id or Index (for IR this will
406   /// return itself, for a summary index this will return the id recorded in the
407   /// index for that stack id index value).
408   uint64_t getStackId(uint64_t IdOrIndex) const {
409     return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
410   }
411 
412   /// Returns true if the given call targets the given function.
413   bool calleeMatchesFunc(CallTy Call, const FuncTy *Func) {
414     return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(Call, Func);
415   }
416 
417   /// Get a list of nodes corresponding to the stack ids in the given
418   /// callsite's context.
419   std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
420     return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
421         Call);
422   }
423 
424   /// Get the last stack id in the context for callsite.
425   uint64_t getLastStackId(CallTy Call) {
426     return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
427   }
428 
429   /// Update the allocation call to record type of allocated memory.
430   void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
431     AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
432     static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
433   }
434 
435   /// Update non-allocation call to invoke (possibly cloned) function
436   /// CalleeFunc.
437   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
438     static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
439   }
440 
441   /// Clone the given function for the given callsite, recording mapping of all
442   /// of the functions tracked calls to their new versions in the CallMap.
443   /// Assigns new clones to clone number CloneNo.
444   FuncInfo cloneFunctionForCallsite(
445       FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
446       std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
447     return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
448         Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
449   }
450 
451   /// Gets a label to use in the dot graph for the given call clone in the given
452   /// function.
453   std::string getLabel(const FuncTy *Func, const CallTy Call,
454                        unsigned CloneNo) const {
455     return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
456   }
457 
458   /// Helpers to find the node corresponding to the given call or stackid.
459   ContextNode *getNodeForInst(const CallInfo &C);
460   ContextNode *getNodeForAlloc(const CallInfo &C);
461   ContextNode *getNodeForStackId(uint64_t StackId);
462 
463   /// Removes the node information recorded for the given call.
464   void unsetNodeForInst(const CallInfo &C);
465 
466   /// Computes the alloc type corresponding to the given context ids, by
467   /// unioning their recorded alloc types.
468   uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
469 
470   /// Returns the alloction type of the intersection of the contexts of two
471   /// nodes (based on their provided context id sets), optimized for the case
472   /// when Node1Ids is smaller than Node2Ids.
473   uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
474                                   const DenseSet<uint32_t> &Node2Ids);
475 
476   /// Returns the alloction type of the intersection of the contexts of two
477   /// nodes (based on their provided context id sets).
478   uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
479                               const DenseSet<uint32_t> &Node2Ids);
480 
481   /// Create a clone of Edge's callee and move Edge to that new callee node,
482   /// performing the necessary context id and allocation type updates.
483   /// If callee's caller edge iterator is supplied, it is updated when removing
484   /// the edge from that list.
485   ContextNode *
486   moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
487                            EdgeIter *CallerEdgeI = nullptr);
488 
489   /// Change the callee of Edge to existing callee clone NewCallee, performing
490   /// the necessary context id and allocation type updates.
491   /// If callee's caller edge iterator is supplied, it is updated when removing
492   /// the edge from that list.
493   void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
494                                      ContextNode *NewCallee,
495                                      EdgeIter *CallerEdgeI = nullptr,
496                                      bool NewClone = false);
497 
498   /// Recursively perform cloning on the graph for the given Node and its
499   /// callers, in order to uniquely identify the allocation behavior of an
500   /// allocation given its context.
501   void identifyClones(ContextNode *Node,
502                       DenseSet<const ContextNode *> &Visited);
503 
504   /// Map from each context ID to the AllocationType assigned to that context.
505   std::map<uint32_t, AllocationType> ContextIdToAllocationType;
506 
507   /// Identifies the context node created for a stack id when adding the MIB
508   /// contexts to the graph. This is used to locate the context nodes when
509   /// trying to assign the corresponding callsites with those stack ids to these
510   /// nodes.
511   std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;
512 
513   /// Maps to track the calls to their corresponding nodes in the graph.
514   MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
515   MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;
516 
517   /// Owner of all ContextNode unique_ptrs.
518   std::vector<std::unique_ptr<ContextNode>> NodeOwner;
519 
520   /// Perform sanity checks on graph when requested.
521   void check() const;
522 
523   /// Keeps track of the last unique context id assigned.
524   unsigned int LastContextId = 0;
525 };
526 
527 template <typename DerivedCCG, typename FuncTy, typename CallTy>
528 using ContextNode =
529     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
530 template <typename DerivedCCG, typename FuncTy, typename CallTy>
531 using ContextEdge =
532     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
533 template <typename DerivedCCG, typename FuncTy, typename CallTy>
534 using FuncInfo =
535     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
536 template <typename DerivedCCG, typename FuncTy, typename CallTy>
537 using CallInfo =
538     typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;
539 
540 /// CRTP derived class for graphs built from IR (regular LTO).
541 class ModuleCallsiteContextGraph
542     : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
543                                   Instruction *> {
544 public:
545   ModuleCallsiteContextGraph(
546       Module &M,
547       function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);
548 
549 private:
550   friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
551                               Instruction *>;
552 
553   uint64_t getStackId(uint64_t IdOrIndex) const;
554   bool calleeMatchesFunc(Instruction *Call, const Function *Func);
555   uint64_t getLastStackId(Instruction *Call);
556   std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
557   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
558   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
559   CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
560                        Instruction *>::FuncInfo
561   cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
562                            std::map<CallInfo, CallInfo> &CallMap,
563                            std::vector<CallInfo> &CallsWithMetadataInFunc,
564                            unsigned CloneNo);
565   std::string getLabel(const Function *Func, const Instruction *Call,
566                        unsigned CloneNo) const;
567 
568   const Module &Mod;
569   function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
570 };
571 
572 /// Represents a call in the summary index graph, which can either be an
573 /// allocation or an interior callsite node in an allocation's context.
574 /// Holds a pointer to the corresponding data structure in the index.
575 struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
576   IndexCall() : PointerUnion() {}
577   IndexCall(std::nullptr_t) : IndexCall() {}
578   IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
579   IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
580   IndexCall(PointerUnion PT) : PointerUnion(PT) {}
581 
582   IndexCall *operator->() { return this; }
583 
584   PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }
585 
586   void print(raw_ostream &OS) const {
587     if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
588       OS << *AI;
589     } else {
590       auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
591       assert(CI);
592       OS << *CI;
593     }
594   }
595 };
596 
597 /// CRTP derived class for graphs built from summary index (ThinLTO).
598 class IndexCallsiteContextGraph
599     : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
600                                   IndexCall> {
601 public:
602   IndexCallsiteContextGraph(
603       ModuleSummaryIndex &Index,
604       function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
605           isPrevailing);
606 
607 private:
608   friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
609                               IndexCall>;
610 
611   uint64_t getStackId(uint64_t IdOrIndex) const;
612   bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func);
613   uint64_t getLastStackId(IndexCall &Call);
614   std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
615   void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
616   void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
617   CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
618                        IndexCall>::FuncInfo
619   cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
620                            std::map<CallInfo, CallInfo> &CallMap,
621                            std::vector<CallInfo> &CallsWithMetadataInFunc,
622                            unsigned CloneNo);
623   std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
624                        unsigned CloneNo) const;
625 
626   // Saves mapping from function summaries containing memprof records back to
627   // its VI, for use in checking and debugging.
628   std::map<const FunctionSummary *, ValueInfo> FSToVIMap;
629 
630   const ModuleSummaryIndex &Index;
631 };
632 } // namespace
633 
634 namespace llvm {
635 template <>
636 struct DenseMapInfo<typename CallsiteContextGraph<
637     ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
638     : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
639 template <>
640 struct DenseMapInfo<typename CallsiteContextGraph<
641     IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
642     : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
643 template <>
644 struct DenseMapInfo<IndexCall>
645     : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
646 } // end namespace llvm
647 
648 namespace {
649 
650 struct FieldSeparator {
651   bool Skip = true;
652   const char *Sep;
653 
654   FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
655 };
656 
657 raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
658   if (FS.Skip) {
659     FS.Skip = false;
660     return OS;
661   }
662   return OS << FS.Sep;
663 }
664 
665 // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
666 // type we should actually use on the corresponding allocation.
667 // If we can't clone a node that has NotCold+Cold alloc type, we will fall
668 // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
669 // from NotCold.
670 AllocationType allocTypeToUse(uint8_t AllocTypes) {
671   assert(AllocTypes != (uint8_t)AllocationType::None);
672   if (AllocTypes ==
673       ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
674     return AllocationType::NotCold;
675   else
676     return (AllocationType)AllocTypes;
677 }
678 
679 // Helper to check if the alloc types for all edges recorded in the
680 // InAllocTypes vector match the alloc types for all edges in the Edges
681 // vector.
682 template <typename DerivedCCG, typename FuncTy, typename CallTy>
683 bool allocTypesMatch(
684     const std::vector<uint8_t> &InAllocTypes,
685     const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
686         &Edges) {
687   return std::equal(
688       InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
689       [](const uint8_t &l,
690          const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
691         // Can share if one of the edges is None type - don't
692         // care about the type along that edge as it doesn't
693         // exist for those context ids.
694         if (l == (uint8_t)AllocationType::None ||
695             r->AllocTypes == (uint8_t)AllocationType::None)
696           return true;
697         return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
698       });
699 }
700 
701 } // end anonymous namespace
702 
703 template <typename DerivedCCG, typename FuncTy, typename CallTy>
704 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
705 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
706     const CallInfo &C) {
707   ContextNode *Node = getNodeForAlloc(C);
708   if (Node)
709     return Node;
710 
711   return NonAllocationCallToContextNodeMap.lookup(C);
712 }
713 
714 template <typename DerivedCCG, typename FuncTy, typename CallTy>
715 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
716 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
717     const CallInfo &C) {
718   return AllocationCallToContextNodeMap.lookup(C);
719 }
720 
721 template <typename DerivedCCG, typename FuncTy, typename CallTy>
722 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
723 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
724     uint64_t StackId) {
725   auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
726   if (StackEntryNode != StackEntryIdToContextNodeMap.end())
727     return StackEntryNode->second;
728   return nullptr;
729 }
730 
731 template <typename DerivedCCG, typename FuncTy, typename CallTy>
732 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst(
733     const CallInfo &C) {
734   AllocationCallToContextNodeMap.erase(C) ||
735       NonAllocationCallToContextNodeMap.erase(C);
736   assert(!AllocationCallToContextNodeMap.count(C) &&
737          !NonAllocationCallToContextNodeMap.count(C));
738 }
739 
740 template <typename DerivedCCG, typename FuncTy, typename CallTy>
741 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
742     addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
743                           unsigned int ContextId) {
744   for (auto &Edge : CallerEdges) {
745     if (Edge->Caller == Caller) {
746       Edge->AllocTypes |= (uint8_t)AllocType;
747       Edge->getContextIds().insert(ContextId);
748       return;
749     }
750   }
751   std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
752       this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
753   CallerEdges.push_back(Edge);
754   Caller->CalleeEdges.push_back(Edge);
755 }
756 
757 template <typename DerivedCCG, typename FuncTy, typename CallTy>
758 void CallsiteContextGraph<
759     DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
760   for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
761     auto Edge = *EI;
762     if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
763       assert(Edge->ContextIds.empty());
764       Edge->Callee->eraseCallerEdge(Edge.get());
765       EI = Node->CalleeEdges.erase(EI);
766     } else
767       ++EI;
768   }
769 }
770 
771 template <typename DerivedCCG, typename FuncTy, typename CallTy>
772 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
773 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
774     findEdgeFromCallee(const ContextNode *Callee) {
775   for (const auto &Edge : CalleeEdges)
776     if (Edge->Callee == Callee)
777       return Edge.get();
778   return nullptr;
779 }
780 
781 template <typename DerivedCCG, typename FuncTy, typename CallTy>
782 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
783 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
784     findEdgeFromCaller(const ContextNode *Caller) {
785   for (const auto &Edge : CallerEdges)
786     if (Edge->Caller == Caller)
787       return Edge.get();
788   return nullptr;
789 }
790 
791 template <typename DerivedCCG, typename FuncTy, typename CallTy>
792 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
793     eraseCalleeEdge(const ContextEdge *Edge) {
794   auto EI =
795       std::find_if(CalleeEdges.begin(), CalleeEdges.end(),
796                    [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
797                      return CalleeEdge.get() == Edge;
798                    });
799   assert(EI != CalleeEdges.end());
800   CalleeEdges.erase(EI);
801 }
802 
803 template <typename DerivedCCG, typename FuncTy, typename CallTy>
804 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
805     eraseCallerEdge(const ContextEdge *Edge) {
806   auto EI =
807       std::find_if(CallerEdges.begin(), CallerEdges.end(),
808                    [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
809                      return CallerEdge.get() == Edge;
810                    });
811   assert(EI != CallerEdges.end());
812   CallerEdges.erase(EI);
813 }
814 
815 template <typename DerivedCCG, typename FuncTy, typename CallTy>
816 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
817     DenseSet<uint32_t> &ContextIds) {
818   uint8_t BothTypes =
819       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
820   uint8_t AllocType = (uint8_t)AllocationType::None;
821   for (auto Id : ContextIds) {
822     AllocType |= (uint8_t)ContextIdToAllocationType[Id];
823     // Bail early if alloc type reached both, no further refinement.
824     if (AllocType == BothTypes)
825       return AllocType;
826   }
827   return AllocType;
828 }
829 
830 template <typename DerivedCCG, typename FuncTy, typename CallTy>
831 uint8_t
832 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
833     const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
834   uint8_t BothTypes =
835       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
836   uint8_t AllocType = (uint8_t)AllocationType::None;
837   for (auto Id : Node1Ids) {
838     if (!Node2Ids.count(Id))
839       continue;
840     AllocType |= (uint8_t)ContextIdToAllocationType[Id];
841     // Bail early if alloc type reached both, no further refinement.
842     if (AllocType == BothTypes)
843       return AllocType;
844   }
845   return AllocType;
846 }
847 
848 template <typename DerivedCCG, typename FuncTy, typename CallTy>
849 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
850     const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
851   if (Node1Ids.size() < Node2Ids.size())
852     return intersectAllocTypesImpl(Node1Ids, Node2Ids);
853   else
854     return intersectAllocTypesImpl(Node2Ids, Node1Ids);
855 }
856 
857 template <typename DerivedCCG, typename FuncTy, typename CallTy>
858 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
859 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
860     CallInfo Call, const FuncTy *F) {
861   assert(!getNodeForAlloc(Call));
862   NodeOwner.push_back(
863       std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
864   ContextNode *AllocNode = NodeOwner.back().get();
865   AllocationCallToContextNodeMap[Call] = AllocNode;
866   NodeToCallingFunc[AllocNode] = F;
867   // Use LastContextId as a uniq id for MIB allocation nodes.
868   AllocNode->OrigStackOrAllocId = LastContextId;
869   // Alloc type should be updated as we add in the MIBs. We should assert
870   // afterwards that it is not still None.
871   AllocNode->AllocTypes = (uint8_t)AllocationType::None;
872 
873   return AllocNode;
874 }
875 
876 template <typename DerivedCCG, typename FuncTy, typename CallTy>
877 template <class NodeT, class IteratorT>
878 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
879     ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
880     CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
881   // Treating the hot alloc type as NotCold before the disambiguation for "hot"
882   // is done.
883   if (AllocType == AllocationType::Hot)
884     AllocType = AllocationType::NotCold;
885 
886   ContextIdToAllocationType[++LastContextId] = AllocType;
887 
888   // Update alloc type and context ids for this MIB.
889   AllocNode->AllocTypes |= (uint8_t)AllocType;
890   AllocNode->ContextIds.insert(LastContextId);
891 
892   // Now add or update nodes for each stack id in alloc's context.
893   // Later when processing the stack ids on non-alloc callsites we will adjust
894   // for any inlining in the context.
895   ContextNode *PrevNode = AllocNode;
896   // Look for recursion (direct recursion should have been collapsed by
897   // module summary analysis, here we should just be detecting mutual
898   // recursion). Mark these nodes so we don't try to clone.
899   SmallSet<uint64_t, 8> StackIdSet;
900   // Skip any on the allocation call (inlining).
901   for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
902        ContextIter != StackContext.end(); ++ContextIter) {
903     auto StackId = getStackId(*ContextIter);
904     ContextNode *StackNode = getNodeForStackId(StackId);
905     if (!StackNode) {
906       NodeOwner.push_back(
907           std::make_unique<ContextNode>(/*IsAllocation=*/false));
908       StackNode = NodeOwner.back().get();
909       StackEntryIdToContextNodeMap[StackId] = StackNode;
910       StackNode->OrigStackOrAllocId = StackId;
911     }
912     auto Ins = StackIdSet.insert(StackId);
913     if (!Ins.second)
914       StackNode->Recursive = true;
915     StackNode->ContextIds.insert(LastContextId);
916     StackNode->AllocTypes |= (uint8_t)AllocType;
917     PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
918     PrevNode = StackNode;
919   }
920 }
921 
922 template <typename DerivedCCG, typename FuncTy, typename CallTy>
923 DenseSet<uint32_t>
924 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
925     const DenseSet<uint32_t> &StackSequenceContextIds,
926     DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
927   DenseSet<uint32_t> NewContextIds;
928   for (auto OldId : StackSequenceContextIds) {
929     NewContextIds.insert(++LastContextId);
930     OldToNewContextIds[OldId].insert(LastContextId);
931     assert(ContextIdToAllocationType.count(OldId));
932     // The new context has the same allocation type as original.
933     ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
934   }
935   return NewContextIds;
936 }
937 
938 template <typename DerivedCCG, typename FuncTy, typename CallTy>
939 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
940     propagateDuplicateContextIds(
941         const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
942   // Build a set of duplicated context ids corresponding to the input id set.
943   auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
944     DenseSet<uint32_t> NewIds;
945     for (auto Id : ContextIds)
946       if (auto NewId = OldToNewContextIds.find(Id);
947           NewId != OldToNewContextIds.end())
948         NewIds.insert(NewId->second.begin(), NewId->second.end());
949     return NewIds;
950   };
951 
952   // Recursively update context ids sets along caller edges.
953   auto UpdateCallers = [&](ContextNode *Node,
954                            DenseSet<const ContextEdge *> &Visited,
955                            auto &&UpdateCallers) -> void {
956     for (const auto &Edge : Node->CallerEdges) {
957       auto Inserted = Visited.insert(Edge.get());
958       if (!Inserted.second)
959         continue;
960       ContextNode *NextNode = Edge->Caller;
961       DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
962       // Only need to recursively iterate to NextNode via this caller edge if
963       // it resulted in any added ids to NextNode.
964       if (!NewIdsToAdd.empty()) {
965         Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
966         NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
967         UpdateCallers(NextNode, Visited, UpdateCallers);
968       }
969     }
970   };
971 
972   DenseSet<const ContextEdge *> Visited;
973   for (auto &Entry : AllocationCallToContextNodeMap) {
974     auto *Node = Entry.second;
975     // Update ids on the allocation nodes before calling the recursive
976     // update along caller edges, since this simplifies the logic during
977     // that traversal.
978     DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
979     Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
980     UpdateCallers(Node, Visited, UpdateCallers);
981   }
982 }
983 
984 template <typename DerivedCCG, typename FuncTy, typename CallTy>
985 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
986     ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
987   // Make a copy of the context ids, since this will be adjusted below as they
988   // are moved.
989   DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
990   auto &OrigEdges =
991       TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
992   // Increment iterator in loop so that we can remove edges as needed.
993   for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
994     auto Edge = *EI;
995     // Remove any matching context ids from Edge, return set that were found and
996     // removed, these are the new edge's context ids. Also update the remaining
997     // (not found ids).
998     DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
999     set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1000                  NotFoundContextIds);
1001     RemainingContextIds.swap(NotFoundContextIds);
1002     // If no matching context ids for this edge, skip it.
1003     if (NewEdgeContextIds.empty()) {
1004       ++EI;
1005       continue;
1006     }
1007     if (TowardsCallee) {
1008       auto NewEdge = std::make_shared<ContextEdge>(
1009           Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
1010           NewEdgeContextIds);
1011       NewNode->CalleeEdges.push_back(NewEdge);
1012       NewEdge->Callee->CallerEdges.push_back(NewEdge);
1013     } else {
1014       auto NewEdge = std::make_shared<ContextEdge>(
1015           NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
1016           NewEdgeContextIds);
1017       NewNode->CallerEdges.push_back(NewEdge);
1018       NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1019     }
1020     // Remove old edge if context ids empty.
1021     if (Edge->getContextIds().empty()) {
1022       if (TowardsCallee) {
1023         Edge->Callee->eraseCallerEdge(Edge.get());
1024         EI = OrigNode->CalleeEdges.erase(EI);
1025       } else {
1026         Edge->Caller->eraseCalleeEdge(Edge.get());
1027         EI = OrigNode->CallerEdges.erase(EI);
1028       }
1029       continue;
1030     }
1031     ++EI;
1032   }
1033 }
1034 
1035 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1036 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1037     assignStackNodesPostOrder(ContextNode *Node,
1038                               DenseSet<const ContextNode *> &Visited,
1039                               DenseMap<uint64_t, std::vector<CallContextInfo>>
1040                                   &StackIdToMatchingCalls) {
1041   auto Inserted = Visited.insert(Node);
1042   if (!Inserted.second)
1043     return;
1044   // Post order traversal. Iterate over a copy since we may add nodes and
1045   // therefore new callers during the recursive call, invalidating any
1046   // iterator over the original edge vector. We don't need to process these
1047   // new nodes as they were already processed on creation.
1048   auto CallerEdges = Node->CallerEdges;
1049   for (auto &Edge : CallerEdges) {
1050     // Skip any that have been removed during the recursion.
1051     if (!Edge)
1052       continue;
1053     assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
1054   }
1055 
1056   // If this node's stack id is in the map, update the graph to contain new
1057   // nodes representing any inlining at interior callsites. Note we move the
1058   // associated context ids over to the new nodes.
1059 
1060   // Ignore this node if it is for an allocation or we didn't record any
1061   // stack id lists ending at it.
1062   if (Node->IsAllocation ||
1063       !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1064     return;
1065 
1066   auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1067   // Handle the simple case first. A single call with a single stack id.
1068   // In this case there is no need to create any new context nodes, simply
1069   // assign the context node for stack id to this Call.
1070   if (Calls.size() == 1) {
1071     auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1072     if (Ids.size() == 1) {
1073       assert(SavedContextIds.empty());
1074       // It should be this Node
1075       assert(Node == getNodeForStackId(Ids[0]));
1076       if (Node->Recursive)
1077         return;
1078       Node->setCall(Call);
1079       NonAllocationCallToContextNodeMap[Call] = Node;
1080       NodeToCallingFunc[Node] = Func;
1081       return;
1082     }
1083   }
1084 
1085   // Find the node for the last stack id, which should be the same
1086   // across all calls recorded for this id, and is this node's id.
1087   uint64_t LastId = Node->OrigStackOrAllocId;
1088   ContextNode *LastNode = getNodeForStackId(LastId);
1089   // We should only have kept stack ids that had nodes.
1090   assert(LastNode);
1091 
1092   for (unsigned I = 0; I < Calls.size(); I++) {
1093     auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1094     // Skip any for which we didn't assign any ids, these don't get a node in
1095     // the graph.
1096     if (SavedContextIds.empty())
1097       continue;
1098 
1099     assert(LastId == Ids.back());
1100 
1101     ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1102     assert(FirstNode);
1103 
1104     // Recompute the context ids for this stack id sequence (the
1105     // intersection of the context ids of the corresponding nodes).
1106     // Start with the ids we saved in the map for this call, which could be
1107     // duplicated context ids. We have to recompute as we might have overlap
1108     // overlap between the saved context ids for different last nodes, and
1109     // removed them already during the post order traversal.
1110     set_intersect(SavedContextIds, FirstNode->ContextIds);
1111     ContextNode *PrevNode = nullptr;
1112     for (auto Id : Ids) {
1113       ContextNode *CurNode = getNodeForStackId(Id);
1114       // We should only have kept stack ids that had nodes and weren't
1115       // recursive.
1116       assert(CurNode);
1117       assert(!CurNode->Recursive);
1118       if (!PrevNode) {
1119         PrevNode = CurNode;
1120         continue;
1121       }
1122       auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
1123       if (!Edge) {
1124         SavedContextIds.clear();
1125         break;
1126       }
1127       PrevNode = CurNode;
1128       set_intersect(SavedContextIds, Edge->getContextIds());
1129 
1130       // If we now have no context ids for clone, skip this call.
1131       if (SavedContextIds.empty())
1132         break;
1133     }
1134     if (SavedContextIds.empty())
1135       continue;
1136 
1137     // Create new context node.
1138     NodeOwner.push_back(
1139         std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
1140     ContextNode *NewNode = NodeOwner.back().get();
1141     NodeToCallingFunc[NewNode] = Func;
1142     NonAllocationCallToContextNodeMap[Call] = NewNode;
1143     NewNode->ContextIds = SavedContextIds;
1144     NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);
1145 
1146     // Connect to callees of innermost stack frame in inlined call chain.
1147     // This updates context ids for FirstNode's callee's to reflect those
1148     // moved to NewNode.
1149     connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);
1150 
1151     // Connect to callers of outermost stack frame in inlined call chain.
1152     // This updates context ids for FirstNode's caller's to reflect those
1153     // moved to NewNode.
1154     connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);
1155 
1156     // Now we need to remove context ids from edges/nodes between First and
1157     // Last Node.
1158     PrevNode = nullptr;
1159     for (auto Id : Ids) {
1160       ContextNode *CurNode = getNodeForStackId(Id);
1161       // We should only have kept stack ids that had nodes.
1162       assert(CurNode);
1163 
1164       // Remove the context ids moved to NewNode from CurNode, and the
1165       // edge from the prior node.
1166       set_subtract(CurNode->ContextIds, NewNode->ContextIds);
1167       if (PrevNode) {
1168         auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1169         assert(PrevEdge);
1170         set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
1171         if (PrevEdge->getContextIds().empty()) {
1172           PrevNode->eraseCallerEdge(PrevEdge);
1173           CurNode->eraseCalleeEdge(PrevEdge);
1174         }
1175       }
1176       PrevNode = CurNode;
1177     }
1178   }
1179 }
1180 
1181 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1182 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1183   // Map of stack id to all calls with that as the last (outermost caller)
1184   // callsite id that has a context node (some might not due to pruning
1185   // performed during matching of the allocation profile contexts).
1186   // The CallContextInfo contains the Call and a list of its stack ids with
1187   // ContextNodes, the function containing Call, and the set of context ids
1188   // the analysis will eventually identify for use in any new node created
1189   // for that callsite.
1190   DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1191   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1192     for (auto &Call : CallsWithMetadata) {
1193       // Ignore allocations, already handled.
1194       if (AllocationCallToContextNodeMap.count(Call))
1195         continue;
1196       auto StackIdsWithContextNodes =
1197           getStackIdsWithContextNodesForCall(Call.call());
1198       // If there were no nodes created for MIBs on allocs (maybe this was in
1199       // the unambiguous part of the MIB stack that was pruned), ignore.
1200       if (StackIdsWithContextNodes.empty())
1201         continue;
1202       // Otherwise, record this Call along with the list of ids for the last
1203       // (outermost caller) stack id with a node.
1204       StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1205           {Call.call(), StackIdsWithContextNodes, Func, {}});
1206     }
1207   }
1208 
1209   // First make a pass through all stack ids that correspond to a call,
1210   // as identified in the above loop. Compute the context ids corresponding to
1211   // each of these calls when they correspond to multiple stack ids due to
1212   // due to inlining. Perform any duplication of context ids required when
1213   // there is more than one call with the same stack ids. Their (possibly newly
1214   // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1215   DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1216   for (auto &It : StackIdToMatchingCalls) {
1217     auto &Calls = It.getSecond();
1218     // Skip single calls with a single stack id. These don't need a new node.
1219     if (Calls.size() == 1) {
1220       auto &Ids = std::get<1>(Calls[0]);
1221       if (Ids.size() == 1)
1222         continue;
1223     }
1224     // In order to do the best and maximal matching of inlined calls to context
1225     // node sequences we will sort the vectors of stack ids in descending order
1226     // of length, and within each length, lexicographically by stack id. The
1227     // latter is so that we can specially handle calls that have identical stack
1228     // id sequences (either due to cloning or artificially because of the MIB
1229     // context pruning).
1230     std::stable_sort(Calls.begin(), Calls.end(),
1231                      [](const CallContextInfo &A, const CallContextInfo &B) {
1232                        auto &IdsA = std::get<1>(A);
1233                        auto &IdsB = std::get<1>(B);
1234                        return IdsA.size() > IdsB.size() ||
1235                               (IdsA.size() == IdsB.size() && IdsA < IdsB);
1236                      });
1237 
1238     // Find the node for the last stack id, which should be the same
1239     // across all calls recorded for this id, and is the id for this
1240     // entry in the StackIdToMatchingCalls map.
1241     uint64_t LastId = It.getFirst();
1242     ContextNode *LastNode = getNodeForStackId(LastId);
1243     // We should only have kept stack ids that had nodes.
1244     assert(LastNode);
1245 
1246     if (LastNode->Recursive)
1247       continue;
1248 
1249     // Initialize the context ids with the last node's. We will subsequently
1250     // refine the context ids by computing the intersection along all edges.
1251     DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
1252     assert(!LastNodeContextIds.empty());
1253 
1254     for (unsigned I = 0; I < Calls.size(); I++) {
1255       auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1256       assert(SavedContextIds.empty());
1257       assert(LastId == Ids.back());
1258 
1259       // First compute the context ids for this stack id sequence (the
1260       // intersection of the context ids of the corresponding nodes).
1261       // Start with the remaining saved ids for the last node.
1262       assert(!LastNodeContextIds.empty());
1263       DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1264 
1265       ContextNode *PrevNode = LastNode;
1266       ContextNode *CurNode = LastNode;
1267       bool Skip = false;
1268 
1269       // Iterate backwards through the stack Ids, starting after the last Id
1270       // in the list, which was handled once outside for all Calls.
1271       for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1272         auto Id = *IdIter;
1273         CurNode = getNodeForStackId(Id);
1274         // We should only have kept stack ids that had nodes.
1275         assert(CurNode);
1276 
1277         if (CurNode->Recursive) {
1278           Skip = true;
1279           break;
1280         }
1281 
1282         auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1283         // If there is no edge then the nodes belong to different MIB contexts,
1284         // and we should skip this inlined context sequence. For example, this
1285         // particular inlined context may include stack ids A->B, and we may
1286         // indeed have nodes for both A and B, but it is possible that they were
1287         // never profiled in sequence in a single MIB for any allocation (i.e.
1288         // we might have profiled an allocation that involves the callsite A,
1289         // but through a different one of its callee callsites, and we might
1290         // have profiled an allocation that involves callsite B, but reached
1291         // from a different caller callsite).
1292         if (!Edge) {
1293           Skip = true;
1294           break;
1295         }
1296         PrevNode = CurNode;
1297 
1298         // Update the context ids, which is the intersection of the ids along
1299         // all edges in the sequence.
1300         set_intersect(StackSequenceContextIds, Edge->getContextIds());
1301 
1302         // If we now have no context ids for clone, skip this call.
1303         if (StackSequenceContextIds.empty()) {
1304           Skip = true;
1305           break;
1306         }
1307       }
1308       if (Skip)
1309         continue;
1310 
1311       // If some of this call's stack ids did not have corresponding nodes (due
1312       // to pruning), don't include any context ids for contexts that extend
1313       // beyond these nodes. Otherwise we would be matching part of unrelated /
1314       // not fully matching stack contexts. To do this, subtract any context ids
1315       // found in caller nodes of the last node found above.
1316       if (Ids.back() != getLastStackId(Call)) {
1317         for (const auto &PE : CurNode->CallerEdges) {
1318           set_subtract(StackSequenceContextIds, PE->getContextIds());
1319           if (StackSequenceContextIds.empty())
1320             break;
1321         }
1322         // If we now have no context ids for clone, skip this call.
1323         if (StackSequenceContextIds.empty())
1324           continue;
1325       }
1326 
1327       // Check if the next set of stack ids is the same (since the Calls vector
1328       // of tuples is sorted by the stack ids we can just look at the next one).
1329       bool DuplicateContextIds = false;
1330       if (I + 1 < Calls.size()) {
1331         auto NextIds = std::get<1>(Calls[I + 1]);
1332         DuplicateContextIds = Ids == NextIds;
1333       }
1334 
1335       // If we don't have duplicate context ids, then we can assign all the
1336       // context ids computed for the original node sequence to this call.
1337       // If there are duplicate calls with the same stack ids then we synthesize
1338       // new context ids that are duplicates of the originals. These are
1339       // assigned to SavedContextIds, which is a reference into the map entry
1340       // for this call, allowing us to access these ids later on.
1341       OldToNewContextIds.reserve(OldToNewContextIds.size() +
1342                                  StackSequenceContextIds.size());
1343       SavedContextIds =
1344           DuplicateContextIds
1345               ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
1346               : StackSequenceContextIds;
1347       assert(!SavedContextIds.empty());
1348 
1349       if (!DuplicateContextIds) {
1350         // Update saved last node's context ids to remove those that are
1351         // assigned to other calls, so that it is ready for the next call at
1352         // this stack id.
1353         set_subtract(LastNodeContextIds, StackSequenceContextIds);
1354         if (LastNodeContextIds.empty())
1355           break;
1356       }
1357     }
1358   }
1359 
1360   // Propagate the duplicate context ids over the graph.
1361   propagateDuplicateContextIds(OldToNewContextIds);
1362 
1363   if (VerifyCCG)
1364     check();
1365 
1366   // Now perform a post-order traversal over the graph, starting with the
1367   // allocation nodes, essentially processing nodes from callers to callees.
1368   // For any that contains an id in the map, update the graph to contain new
1369   // nodes representing any inlining at interior callsites. Note we move the
1370   // associated context ids over to the new nodes.
1371   DenseSet<const ContextNode *> Visited;
1372   for (auto &Entry : AllocationCallToContextNodeMap)
1373     assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
1374 }
1375 
1376 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
1377   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1378       Call->getMetadata(LLVMContext::MD_callsite));
1379   return CallsiteContext.back();
1380 }
1381 
1382 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
1383   assert(isa<CallsiteInfo *>(Call.getBase()));
1384   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1385       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1386   // Need to convert index into stack id.
1387   return Index.getStackIdAtIndex(CallsiteContext.back());
1388 }
1389 
1390 static const std::string MemProfCloneSuffix = ".memprof.";
1391 
1392 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
1393   // We use CloneNo == 0 to refer to the original version, which doesn't get
1394   // renamed with a suffix.
1395   if (!CloneNo)
1396     return Base.str();
1397   return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
1398 }
1399 
1400 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
1401                                                  const Instruction *Call,
1402                                                  unsigned CloneNo) const {
1403   return (Twine(Call->getFunction()->getName()) + " -> " +
1404           cast<CallBase>(Call)->getCalledFunction()->getName())
1405       .str();
1406 }
1407 
1408 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
1409                                                 const IndexCall &Call,
1410                                                 unsigned CloneNo) const {
1411   auto VI = FSToVIMap.find(Func);
1412   assert(VI != FSToVIMap.end());
1413   if (isa<AllocInfo *>(Call.getBase()))
1414     return (VI->second.name() + " -> alloc").str();
1415   else {
1416     auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase());
1417     return (VI->second.name() + " -> " +
1418             getMemProfFuncName(Callsite->Callee.name(),
1419                                Callsite->Clones[CloneNo]))
1420         .str();
1421   }
1422 }
1423 
1424 std::vector<uint64_t>
1425 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
1426     Instruction *Call) {
1427   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
1428       Call->getMetadata(LLVMContext::MD_callsite));
1429   return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
1430       CallsiteContext);
1431 }
1432 
1433 std::vector<uint64_t>
1434 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
1435   assert(isa<CallsiteInfo *>(Call.getBase()));
1436   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
1437       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
1438   return getStackIdsWithContextNodes<CallsiteInfo,
1439                                      SmallVector<unsigned>::const_iterator>(
1440       CallsiteContext);
1441 }
1442 
1443 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1444 template <class NodeT, class IteratorT>
1445 std::vector<uint64_t>
1446 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
1447     CallStack<NodeT, IteratorT> &CallsiteContext) {
1448   std::vector<uint64_t> StackIds;
1449   for (auto IdOrIndex : CallsiteContext) {
1450     auto StackId = getStackId(IdOrIndex);
1451     ContextNode *Node = getNodeForStackId(StackId);
1452     if (!Node)
1453       break;
1454     StackIds.push_back(StackId);
1455   }
1456   return StackIds;
1457 }
1458 
1459 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
1460     Module &M, function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
1461     : Mod(M), OREGetter(OREGetter) {
1462   for (auto &F : M) {
1463     std::vector<CallInfo> CallsWithMetadata;
1464     for (auto &BB : F) {
1465       for (auto &I : BB) {
1466         if (!isa<CallBase>(I))
1467           continue;
1468         if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
1469           CallsWithMetadata.push_back(&I);
1470           auto *AllocNode = addAllocNode(&I, &F);
1471           auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
1472           assert(CallsiteMD);
1473           CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
1474           // Add all of the MIBs and their stack nodes.
1475           for (auto &MDOp : MemProfMD->operands()) {
1476             auto *MIBMD = cast<const MDNode>(MDOp);
1477             MDNode *StackNode = getMIBStackNode(MIBMD);
1478             assert(StackNode);
1479             CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
1480             addStackNodesForMIB<MDNode, MDNode::op_iterator>(
1481                 AllocNode, StackContext, CallsiteContext,
1482                 getMIBAllocType(MIBMD));
1483           }
1484           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1485           // Memprof and callsite metadata on memory allocations no longer
1486           // needed.
1487           I.setMetadata(LLVMContext::MD_memprof, nullptr);
1488           I.setMetadata(LLVMContext::MD_callsite, nullptr);
1489         }
1490         // For callsite metadata, add to list for this function for later use.
1491         else if (I.getMetadata(LLVMContext::MD_callsite))
1492           CallsWithMetadata.push_back(&I);
1493       }
1494     }
1495     if (!CallsWithMetadata.empty())
1496       FuncToCallsWithMetadata.push_back({&F, CallsWithMetadata});
1497   }
1498 
1499   if (DumpCCG) {
1500     dbgs() << "CCG before updating call stack chains:\n";
1501     dbgs() << *this;
1502   }
1503 
1504   if (ExportToDot)
1505     exportToDot("prestackupdate");
1506 
1507   updateStackNodes();
1508 
1509   handleCallsitesWithMultipleTargets();
1510 
1511   // Strip off remaining callsite metadata, no longer needed.
1512   for (auto &FuncEntry : FuncToCallsWithMetadata)
1513     for (auto &Call : FuncEntry.second)
1514       Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
1515 }
1516 
1517 IndexCallsiteContextGraph::IndexCallsiteContextGraph(
1518     ModuleSummaryIndex &Index,
1519     function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
1520         isPrevailing)
1521     : Index(Index) {
1522   for (auto &I : Index) {
1523     auto VI = Index.getValueInfo(I);
1524     for (auto &S : VI.getSummaryList()) {
1525       // We should only add the prevailing nodes. Otherwise we may try to clone
1526       // in a weak copy that won't be linked (and may be different than the
1527       // prevailing version).
1528       // We only keep the memprof summary on the prevailing copy now when
1529       // building the combined index, as a space optimization, however don't
1530       // rely on this optimization. The linker doesn't resolve local linkage
1531       // values so don't check whether those are prevailing.
1532       if (!GlobalValue::isLocalLinkage(S->linkage()) &&
1533           !isPrevailing(VI.getGUID(), S.get()))
1534         continue;
1535       auto *FS = dyn_cast<FunctionSummary>(S.get());
1536       if (!FS)
1537         continue;
1538       std::vector<CallInfo> CallsWithMetadata;
1539       if (!FS->allocs().empty()) {
1540         for (auto &AN : FS->mutableAllocs()) {
1541           // This can happen because of recursion elimination handling that
1542           // currently exists in ModuleSummaryAnalysis. Skip these for now.
1543           // We still added them to the summary because we need to be able to
1544           // correlate properly in applyImport in the backends.
1545           if (AN.MIBs.empty())
1546             continue;
1547           CallsWithMetadata.push_back({&AN});
1548           auto *AllocNode = addAllocNode({&AN}, FS);
1549           // Pass an empty CallStack to the CallsiteContext (second)
1550           // parameter, since for ThinLTO we already collapsed out the inlined
1551           // stack ids on the allocation call during ModuleSummaryAnalysis.
1552           CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1553               EmptyContext;
1554           // Now add all of the MIBs and their stack nodes.
1555           for (auto &MIB : AN.MIBs) {
1556             CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
1557                 StackContext(&MIB);
1558             addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
1559                 AllocNode, StackContext, EmptyContext, MIB.AllocType);
1560           }
1561           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
1562           // Initialize version 0 on the summary alloc node to the current alloc
1563           // type, unless it has both types in which case make it default, so
1564           // that in the case where we aren't able to clone the original version
1565           // always ends up with the default allocation behavior.
1566           AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
1567         }
1568       }
1569       // For callsite metadata, add to list for this function for later use.
1570       if (!FS->callsites().empty())
1571         for (auto &SN : FS->mutableCallsites())
1572           CallsWithMetadata.push_back({&SN});
1573 
1574       if (!CallsWithMetadata.empty())
1575         FuncToCallsWithMetadata.push_back({FS, CallsWithMetadata});
1576 
1577       if (!FS->allocs().empty() || !FS->callsites().empty())
1578         FSToVIMap[FS] = VI;
1579     }
1580   }
1581 
1582   if (DumpCCG) {
1583     dbgs() << "CCG before updating call stack chains:\n";
1584     dbgs() << *this;
1585   }
1586 
1587   if (ExportToDot)
1588     exportToDot("prestackupdate");
1589 
1590   updateStackNodes();
1591 
1592   handleCallsitesWithMultipleTargets();
1593 }
1594 
1595 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1596 void CallsiteContextGraph<DerivedCCG, FuncTy,
1597                           CallTy>::handleCallsitesWithMultipleTargets() {
1598   // Look for and workaround callsites that call multiple functions.
1599   // This can happen for indirect calls, which needs better handling, and in
1600   // more rare cases (e.g. macro expansion).
1601   // TODO: To fix this for indirect calls we will want to perform speculative
1602   // devirtualization using either the normal PGO info with ICP, or using the
1603   // information in the profiled MemProf contexts. We can do this prior to
1604   // this transformation for regular LTO, and for ThinLTO we can simulate that
1605   // effect in the summary and perform the actual speculative devirtualization
1606   // while cloning in the ThinLTO backend.
1607   for (auto Entry = NonAllocationCallToContextNodeMap.begin();
1608        Entry != NonAllocationCallToContextNodeMap.end();) {
1609     auto *Node = Entry->second;
1610     assert(Node->Clones.empty());
1611     // Check all node callees and see if in the same function.
1612     bool Removed = false;
1613     auto Call = Node->Call.call();
1614     for (auto &Edge : Node->CalleeEdges) {
1615       if (!Edge->Callee->hasCall())
1616         continue;
1617       assert(NodeToCallingFunc.count(Edge->Callee));
1618       // Check if the called function matches that of the callee node.
1619       if (calleeMatchesFunc(Call, NodeToCallingFunc[Edge->Callee]))
1620         continue;
1621       // Work around by setting Node to have a null call, so it gets
1622       // skipped during cloning. Otherwise assignFunctions will assert
1623       // because its data structures are not designed to handle this case.
1624       Entry = NonAllocationCallToContextNodeMap.erase(Entry);
1625       Node->setCall(CallInfo());
1626       Removed = true;
1627       break;
1628     }
1629     if (!Removed)
1630       Entry++;
1631   }
1632 }
1633 
1634 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1635   // In the Module (IR) case this is already the Id.
1636   return IdOrIndex;
1637 }
1638 
1639 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1640   // In the Index case this is an index into the stack id list in the summary
1641   // index, convert it to an Id.
1642   return Index.getStackIdAtIndex(IdOrIndex);
1643 }
1644 
1645 bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call,
1646                                                    const Function *Func) {
1647   auto *CB = dyn_cast<CallBase>(Call);
1648   if (!CB->getCalledOperand())
1649     return false;
1650   auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
1651   auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
1652   if (CalleeFunc == Func)
1653     return true;
1654   auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
1655   return Alias && Alias->getAliasee() == Func;
1656 }
1657 
1658 bool IndexCallsiteContextGraph::calleeMatchesFunc(IndexCall &Call,
1659                                                   const FunctionSummary *Func) {
1660   ValueInfo Callee =
1661       dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee;
1662   // If there is no summary list then this is a call to an externally defined
1663   // symbol.
1664   AliasSummary *Alias =
1665       Callee.getSummaryList().empty()
1666           ? nullptr
1667           : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
1668   assert(FSToVIMap.count(Func));
1669   return Callee == FSToVIMap[Func] ||
1670          // If callee is an alias, check the aliasee, since only function
1671          // summary base objects will contain the stack node summaries and thus
1672          // get a context node.
1673          (Alias && Alias->getAliaseeVI() == FSToVIMap[Func]);
1674 }
1675 
1676 static std::string getAllocTypeString(uint8_t AllocTypes) {
1677   if (!AllocTypes)
1678     return "None";
1679   std::string Str;
1680   if (AllocTypes & (uint8_t)AllocationType::NotCold)
1681     Str += "NotCold";
1682   if (AllocTypes & (uint8_t)AllocationType::Cold)
1683     Str += "Cold";
1684   return Str;
1685 }
1686 
1687 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1688 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
1689     const {
1690   print(dbgs());
1691   dbgs() << "\n";
1692 }
1693 
1694 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1695 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
1696     raw_ostream &OS) const {
1697   OS << "Node " << this << "\n";
1698   OS << "\t";
1699   printCall(OS);
1700   if (Recursive)
1701     OS << " (recursive)";
1702   OS << "\n";
1703   OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
1704   OS << "\tContextIds:";
1705   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
1706   std::sort(SortedIds.begin(), SortedIds.end());
1707   for (auto Id : SortedIds)
1708     OS << " " << Id;
1709   OS << "\n";
1710   OS << "\tCalleeEdges:\n";
1711   for (auto &Edge : CalleeEdges)
1712     OS << "\t\t" << *Edge << "\n";
1713   OS << "\tCallerEdges:\n";
1714   for (auto &Edge : CallerEdges)
1715     OS << "\t\t" << *Edge << "\n";
1716   if (!Clones.empty()) {
1717     OS << "\tClones: ";
1718     FieldSeparator FS;
1719     for (auto *Clone : Clones)
1720       OS << FS << Clone;
1721     OS << "\n";
1722   } else if (CloneOf) {
1723     OS << "\tClone of " << CloneOf << "\n";
1724   }
1725 }
1726 
1727 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1728 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
1729     const {
1730   print(dbgs());
1731   dbgs() << "\n";
1732 }
1733 
1734 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1735 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
1736     raw_ostream &OS) const {
1737   OS << "Edge from Callee " << Callee << " to Caller: " << Caller
1738      << " AllocTypes: " << getAllocTypeString(AllocTypes);
1739   OS << " ContextIds:";
1740   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
1741   std::sort(SortedIds.begin(), SortedIds.end());
1742   for (auto Id : SortedIds)
1743     OS << " " << Id;
1744 }
1745 
1746 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1747 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
1748   print(dbgs());
1749 }
1750 
1751 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1752 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
1753     raw_ostream &OS) const {
1754   OS << "Callsite Context Graph:\n";
1755   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1756   for (const auto Node : nodes<GraphType>(this)) {
1757     if (Node->isRemoved())
1758       continue;
1759     Node->print(OS);
1760     OS << "\n";
1761   }
1762 }
1763 
1764 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1765 static void checkEdge(
1766     const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1767   // Confirm that alloc type is not None and that we have at least one context
1768   // id.
1769   assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1770   assert(!Edge->ContextIds.empty());
1771 }
1772 
1773 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1774 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1775                       bool CheckEdges = true) {
1776   if (Node->isRemoved())
1777     return;
1778   // Node's context ids should be the union of both its callee and caller edge
1779   // context ids.
1780   if (Node->CallerEdges.size()) {
1781     auto EI = Node->CallerEdges.begin();
1782     auto &FirstEdge = *EI;
1783     EI++;
1784     DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds);
1785     for (; EI != Node->CallerEdges.end(); EI++) {
1786       const auto &Edge = *EI;
1787       if (CheckEdges)
1788         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1789       set_union(CallerEdgeContextIds, Edge->ContextIds);
1790     }
1791     // Node can have more context ids than callers if some contexts terminate at
1792     // node and some are longer.
1793     assert(Node->ContextIds == CallerEdgeContextIds ||
1794            set_is_subset(CallerEdgeContextIds, Node->ContextIds));
1795   }
1796   if (Node->CalleeEdges.size()) {
1797     auto EI = Node->CalleeEdges.begin();
1798     auto &FirstEdge = *EI;
1799     EI++;
1800     DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds);
1801     for (; EI != Node->CalleeEdges.end(); EI++) {
1802       const auto &Edge = *EI;
1803       if (CheckEdges)
1804         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1805       set_union(CalleeEdgeContextIds, Edge->ContextIds);
1806     }
1807     assert(Node->ContextIds == CalleeEdgeContextIds);
1808   }
1809 }
1810 
1811 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1812 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
1813   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1814   for (const auto Node : nodes<GraphType>(this)) {
1815     checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
1816     for (auto &Edge : Node->CallerEdges)
1817       checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1818   }
1819 }
1820 
1821 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1822 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
1823   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1824   using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
1825 
1826   using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
1827   static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
1828 
1829   using nodes_iterator =
1830       mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
1831                       decltype(&getNode)>;
1832 
1833   static nodes_iterator nodes_begin(GraphType G) {
1834     return nodes_iterator(G->NodeOwner.begin(), &getNode);
1835   }
1836 
1837   static nodes_iterator nodes_end(GraphType G) {
1838     return nodes_iterator(G->NodeOwner.end(), &getNode);
1839   }
1840 
1841   static NodeRef getEntryNode(GraphType G) {
1842     return G->NodeOwner.begin()->get();
1843   }
1844 
1845   using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
1846   static const ContextNode<DerivedCCG, FuncTy, CallTy> *
1847   GetCallee(const EdgePtrTy &P) {
1848     return P->Callee;
1849   }
1850 
1851   using ChildIteratorType =
1852       mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
1853                           DerivedCCG, FuncTy, CallTy>>>::const_iterator,
1854                       decltype(&GetCallee)>;
1855 
1856   static ChildIteratorType child_begin(NodeRef N) {
1857     return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
1858   }
1859 
1860   static ChildIteratorType child_end(NodeRef N) {
1861     return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
1862   }
1863 };
1864 
1865 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1866 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
1867     : public DefaultDOTGraphTraits {
1868   DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {}
1869 
1870   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
1871   using GTraits = GraphTraits<GraphType>;
1872   using NodeRef = typename GTraits::NodeRef;
1873   using ChildIteratorType = typename GTraits::ChildIteratorType;
1874 
1875   static std::string getNodeLabel(NodeRef Node, GraphType G) {
1876     std::string LabelString =
1877         (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
1878          Twine(Node->OrigStackOrAllocId))
1879             .str();
1880     LabelString += "\n";
1881     if (Node->hasCall()) {
1882       auto Func = G->NodeToCallingFunc.find(Node);
1883       assert(Func != G->NodeToCallingFunc.end());
1884       LabelString +=
1885           G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
1886     } else {
1887       LabelString += "null call";
1888       if (Node->Recursive)
1889         LabelString += " (recursive)";
1890       else
1891         LabelString += " (external)";
1892     }
1893     return LabelString;
1894   }
1895 
1896   static std::string getNodeAttributes(NodeRef Node, GraphType) {
1897     std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
1898                                    getContextIds(Node->ContextIds) + "\"")
1899                                       .str();
1900     AttributeString +=
1901         (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str();
1902     AttributeString += ",style=\"filled\"";
1903     if (Node->CloneOf) {
1904       AttributeString += ",color=\"blue\"";
1905       AttributeString += ",style=\"filled,bold,dashed\"";
1906     } else
1907       AttributeString += ",style=\"filled\"";
1908     return AttributeString;
1909   }
1910 
1911   static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
1912                                        GraphType) {
1913     auto &Edge = *(ChildIter.getCurrent());
1914     return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
1915             Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"")
1916         .str();
1917   }
1918 
1919   // Since the NodeOwners list includes nodes that are no longer connected to
1920   // the graph, skip them here.
1921   static bool isNodeHidden(NodeRef Node, GraphType) {
1922     return Node->isRemoved();
1923   }
1924 
1925 private:
1926   static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
1927     std::string IdString = "ContextIds:";
1928     if (ContextIds.size() < 100) {
1929       std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
1930       std::sort(SortedIds.begin(), SortedIds.end());
1931       for (auto Id : SortedIds)
1932         IdString += (" " + Twine(Id)).str();
1933     } else {
1934       IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
1935     }
1936     return IdString;
1937   }
1938 
1939   static std::string getColor(uint8_t AllocTypes) {
1940     if (AllocTypes == (uint8_t)AllocationType::NotCold)
1941       // Color "brown1" actually looks like a lighter red.
1942       return "brown1";
1943     if (AllocTypes == (uint8_t)AllocationType::Cold)
1944       return "cyan";
1945     if (AllocTypes ==
1946         ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
1947       // Lighter purple.
1948       return "mediumorchid1";
1949     return "gray";
1950   }
1951 
1952   static std::string getNodeId(NodeRef Node) {
1953     std::stringstream SStream;
1954     SStream << std::hex << "N0x" << (unsigned long long)Node;
1955     std::string Result = SStream.str();
1956     return Result;
1957   }
1958 };
1959 
1960 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1961 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
1962     std::string Label) const {
1963   WriteGraph(this, "", false, Label,
1964              DotFilePathPrefix + "ccg." + Label + ".dot");
1965 }
1966 
1967 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1968 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1969 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
1970     const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI) {
1971   ContextNode *Node = Edge->Callee;
1972   NodeOwner.push_back(
1973       std::make_unique<ContextNode>(Node->IsAllocation, Node->Call));
1974   ContextNode *Clone = NodeOwner.back().get();
1975   Node->addClone(Clone);
1976   assert(NodeToCallingFunc.count(Node));
1977   NodeToCallingFunc[Clone] = NodeToCallingFunc[Node];
1978   moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true);
1979   return Clone;
1980 }
1981 
1982 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1983 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1984     moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
1985                                   ContextNode *NewCallee, EdgeIter *CallerEdgeI,
1986                                   bool NewClone) {
1987   // NewCallee and Edge's current callee must be clones of the same original
1988   // node (Edge's current callee may be the original node too).
1989   assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
1990   auto &EdgeContextIds = Edge->getContextIds();
1991   ContextNode *OldCallee = Edge->Callee;
1992   if (CallerEdgeI)
1993     *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI);
1994   else
1995     OldCallee->eraseCallerEdge(Edge.get());
1996   Edge->Callee = NewCallee;
1997   NewCallee->CallerEdges.push_back(Edge);
1998   // Don't need to update Edge's context ids since we are simply reconnecting
1999   // it.
2000   set_subtract(OldCallee->ContextIds, EdgeContextIds);
2001   NewCallee->ContextIds.insert(EdgeContextIds.begin(), EdgeContextIds.end());
2002   NewCallee->AllocTypes |= Edge->AllocTypes;
2003   OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds);
2004   // OldCallee alloc type should be None iff its context id set is now empty.
2005   assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2006          OldCallee->ContextIds.empty());
2007   // Now walk the old callee node's callee edges and move Edge's context ids
2008   // over to the corresponding edge into the clone (which is created here if
2009   // this is a newly created clone).
2010   for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2011     // The context ids moving to the new callee are the subset of this edge's
2012     // context ids and the context ids on the caller edge being moved.
2013     DenseSet<uint32_t> EdgeContextIdsToMove =
2014         set_intersection(OldCalleeEdge->getContextIds(), EdgeContextIds);
2015     set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2016     OldCalleeEdge->AllocTypes =
2017         computeAllocType(OldCalleeEdge->getContextIds());
2018     if (!NewClone) {
2019       // Update context ids / alloc type on corresponding edge to NewCallee.
2020       // There is a chance this may not exist if we are reusing an existing
2021       // clone, specifically during function assignment, where we would have
2022       // removed none type edges after creating the clone. If we can't find
2023       // a corresponding edge there, fall through to the cloning below.
2024       if (auto *NewCalleeEdge =
2025               NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2026         NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2027                                               EdgeContextIdsToMove.end());
2028         NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2029         continue;
2030       }
2031     }
2032     auto NewEdge = std::make_shared<ContextEdge>(
2033         OldCalleeEdge->Callee, NewCallee,
2034         computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2035     NewCallee->CalleeEdges.push_back(NewEdge);
2036     NewEdge->Callee->CallerEdges.push_back(NewEdge);
2037   }
2038   if (VerifyCCG) {
2039     checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2040     checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2041     for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2042       checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2043                                             /*CheckEdges=*/false);
2044     for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2045       checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2046                                             /*CheckEdges=*/false);
2047   }
2048 }
2049 
2050 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2051 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2052   DenseSet<const ContextNode *> Visited;
2053   for (auto &Entry : AllocationCallToContextNodeMap)
2054     identifyClones(Entry.second, Visited);
2055 }
2056 
2057 // helper function to check an AllocType is cold or notcold or both.
2058 bool checkColdOrNotCold(uint8_t AllocType) {
2059   return (AllocType == (uint8_t)AllocationType::Cold) ||
2060          (AllocType == (uint8_t)AllocationType::NotCold) ||
2061          (AllocType ==
2062           ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2063 }
2064 
2065 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2066 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2067     ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2068   if (VerifyNodes)
2069     checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2070   assert(!Node->CloneOf);
2071 
2072   // If Node as a null call, then either it wasn't found in the module (regular
2073   // LTO) or summary index (ThinLTO), or there were other conditions blocking
2074   // cloning (e.g. recursion, calls multiple targets, etc).
2075   // Do this here so that we don't try to recursively clone callers below, which
2076   // isn't useful at least for this node.
2077   if (!Node->hasCall())
2078     return;
2079 
2080 #ifndef NDEBUG
2081   auto Insert =
2082 #endif
2083       Visited.insert(Node);
2084   // We should not have visited this node yet.
2085   assert(Insert.second);
2086   // The recursive call to identifyClones may delete the current edge from the
2087   // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2088   // in an iterator and having recursive call erase from it. Other edges may
2089   // also get removed during the recursion, which will have null Callee and
2090   // Caller pointers (and are deleted later), so we skip those below.
2091   {
2092     auto CallerEdges = Node->CallerEdges;
2093     for (auto &Edge : CallerEdges) {
2094       // Skip any that have been removed by an earlier recursive call.
2095       if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2096         assert(!std::count(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2097                            Edge));
2098         continue;
2099       }
2100       // Ignore any caller we previously visited via another edge.
2101       if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2102         identifyClones(Edge->Caller, Visited);
2103       }
2104     }
2105   }
2106 
2107   // Check if we reached an unambiguous call or have have only a single caller.
2108   if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2109     return;
2110 
2111   // We need to clone.
2112 
2113   // Try to keep the original version as alloc type NotCold. This will make
2114   // cases with indirect calls or any other situation with an unknown call to
2115   // the original function get the default behavior. We do this by sorting the
2116   // CallerEdges of the Node we will clone by alloc type.
2117   //
2118   // Give NotCold edge the lowest sort priority so those edges are at the end of
2119   // the caller edges vector, and stay on the original version (since the below
2120   // code clones greedily until it finds all remaining edges have the same type
2121   // and leaves the remaining ones on the original Node).
2122   //
2123   // We shouldn't actually have any None type edges, so the sorting priority for
2124   // that is arbitrary, and we assert in that case below.
2125   const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2126                                                /*Cold*/ 1,
2127                                                /*NotColdCold*/ 2};
2128   std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2129                    [&](const std::shared_ptr<ContextEdge> &A,
2130                        const std::shared_ptr<ContextEdge> &B) {
2131                      assert(checkColdOrNotCold(A->AllocTypes) &&
2132                             checkColdOrNotCold(B->AllocTypes));
2133 
2134                      if (A->AllocTypes == B->AllocTypes)
2135                        // Use the first context id for each edge as a
2136                        // tie-breaker.
2137                        return *A->ContextIds.begin() < *B->ContextIds.begin();
2138                      return AllocTypeCloningPriority[A->AllocTypes] <
2139                             AllocTypeCloningPriority[B->AllocTypes];
2140                    });
2141 
2142   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2143 
2144   // Iterate until we find no more opportunities for disambiguating the alloc
2145   // types via cloning. In most cases this loop will terminate once the Node
2146   // has a single allocation type, in which case no more cloning is needed.
2147   // We need to be able to remove Edge from CallerEdges, so need to adjust
2148   // iterator inside the loop.
2149   for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2150     auto CallerEdge = *EI;
2151 
2152     // See if cloning the prior caller edge left this node with a single alloc
2153     // type or a single caller. In that case no more cloning of Node is needed.
2154     if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2155       break;
2156 
2157     // Compute the node callee edge alloc types corresponding to the context ids
2158     // for this caller edge.
2159     std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2160     CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2161     for (auto &CalleeEdge : Node->CalleeEdges)
2162       CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2163           CalleeEdge->getContextIds(), CallerEdge->getContextIds()));
2164 
2165     // Don't clone if doing so will not disambiguate any alloc types amongst
2166     // caller edges (including the callee edges that would be cloned).
2167     // Otherwise we will simply move all edges to the clone.
2168     //
2169     // First check if by cloning we will disambiguate the caller allocation
2170     // type from node's allocation type. Query allocTypeToUse so that we don't
2171     // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2172     // neither of these should be None type.
2173     //
2174     // Then check if by cloning node at least one of the callee edges will be
2175     // disambiguated by splitting out different context ids.
2176     assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
2177     assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2178     if (allocTypeToUse(CallerEdge->AllocTypes) ==
2179             allocTypeToUse(Node->AllocTypes) &&
2180         allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2181             CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
2182       ++EI;
2183       continue;
2184     }
2185 
2186     // First see if we can use an existing clone. Check each clone and its
2187     // callee edges for matching alloc types.
2188     ContextNode *Clone = nullptr;
2189     for (auto *CurClone : Node->Clones) {
2190       if (allocTypeToUse(CurClone->AllocTypes) !=
2191           allocTypeToUse(CallerEdge->AllocTypes))
2192         continue;
2193 
2194       if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
2195               CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges))
2196         continue;
2197       Clone = CurClone;
2198       break;
2199     }
2200 
2201     // The edge iterator is adjusted when we move the CallerEdge to the clone.
2202     if (Clone)
2203       moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI);
2204     else
2205       Clone = moveEdgeToNewCalleeClone(CallerEdge, &EI);
2206 
2207     assert(EI == Node->CallerEdges.end() ||
2208            Node->AllocTypes != (uint8_t)AllocationType::None);
2209     // Sanity check that no alloc types on clone or its edges are None.
2210     assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
2211     assert(llvm::none_of(
2212         Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2213           return E->AllocTypes == (uint8_t)AllocationType::None;
2214         }));
2215   }
2216 
2217   // Cloning may have resulted in some cloned callee edges with type None,
2218   // because they aren't carrying any contexts. Remove those edges.
2219   for (auto *Clone : Node->Clones) {
2220     removeNoneTypeCalleeEdges(Clone);
2221     if (VerifyNodes)
2222       checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2223   }
2224   // We should still have some context ids on the original Node.
2225   assert(!Node->ContextIds.empty());
2226 
2227   // Remove any callee edges that ended up with alloc type None after creating
2228   // clones and updating callee edges.
2229   removeNoneTypeCalleeEdges(Node);
2230 
2231   // Sanity check that no alloc types on node or edges are None.
2232   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2233   assert(llvm::none_of(Node->CalleeEdges,
2234                        [&](const std::shared_ptr<ContextEdge> &E) {
2235                          return E->AllocTypes == (uint8_t)AllocationType::None;
2236                        }));
2237   assert(llvm::none_of(Node->CallerEdges,
2238                        [&](const std::shared_ptr<ContextEdge> &E) {
2239                          return E->AllocTypes == (uint8_t)AllocationType::None;
2240                        }));
2241 
2242   if (VerifyNodes)
2243     checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2244 }
2245 
2246 void ModuleCallsiteContextGraph::updateAllocationCall(
2247     CallInfo &Call, AllocationType AllocType) {
2248   std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
2249   auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
2250                                 "memprof", AllocTypeString);
2251   cast<CallBase>(Call.call())->addFnAttr(A);
2252   OREGetter(Call.call()->getFunction())
2253       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
2254             << ore::NV("AllocationCall", Call.call()) << " in clone "
2255             << ore::NV("Caller", Call.call()->getFunction())
2256             << " marked with memprof allocation attribute "
2257             << ore::NV("Attribute", AllocTypeString));
2258 }
2259 
2260 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
2261                                                      AllocationType AllocType) {
2262   auto *AI = Call.call().dyn_cast<AllocInfo *>();
2263   assert(AI);
2264   assert(AI->Versions.size() > Call.cloneNo());
2265   AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
2266 }
2267 
2268 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2269                                             FuncInfo CalleeFunc) {
2270   if (CalleeFunc.cloneNo() > 0)
2271     cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
2272   OREGetter(CallerCall.call()->getFunction())
2273       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
2274             << ore::NV("Call", CallerCall.call()) << " in clone "
2275             << ore::NV("Caller", CallerCall.call()->getFunction())
2276             << " assigned to call function clone "
2277             << ore::NV("Callee", CalleeFunc.func()));
2278 }
2279 
2280 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
2281                                            FuncInfo CalleeFunc) {
2282   auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>();
2283   assert(CI &&
2284          "Caller cannot be an allocation which should not have profiled calls");
2285   assert(CI->Clones.size() > CallerCall.cloneNo());
2286   CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
2287 }
2288 
2289 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
2290                      Instruction *>::FuncInfo
2291 ModuleCallsiteContextGraph::cloneFunctionForCallsite(
2292     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2293     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2294   // Use existing LLVM facilities for cloning and obtaining Call in clone
2295   ValueToValueMapTy VMap;
2296   auto *NewFunc = CloneFunction(Func.func(), VMap);
2297   std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
2298   assert(!Func.func()->getParent()->getFunction(Name));
2299   NewFunc->setName(Name);
2300   for (auto &Inst : CallsWithMetadataInFunc) {
2301     // This map always has the initial version in it.
2302     assert(Inst.cloneNo() == 0);
2303     CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
2304   }
2305   OREGetter(Func.func())
2306       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
2307             << "created clone " << ore::NV("NewFunction", NewFunc));
2308   return {NewFunc, CloneNo};
2309 }
2310 
2311 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
2312                      IndexCall>::FuncInfo
2313 IndexCallsiteContextGraph::cloneFunctionForCallsite(
2314     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
2315     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
2316   // Check how many clones we have of Call (and therefore function).
2317   // The next clone number is the current size of versions array.
2318   // Confirm this matches the CloneNo provided by the caller, which is based on
2319   // the number of function clones we have.
2320   assert(CloneNo ==
2321          (Call.call().is<AllocInfo *>()
2322               ? Call.call().dyn_cast<AllocInfo *>()->Versions.size()
2323               : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size()));
2324   // Walk all the instructions in this function. Create a new version for
2325   // each (by adding an entry to the Versions/Clones summary array), and copy
2326   // over the version being called for the function clone being cloned here.
2327   // Additionally, add an entry to the CallMap for the new function clone,
2328   // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
2329   // to the new call clone.
2330   for (auto &Inst : CallsWithMetadataInFunc) {
2331     // This map always has the initial version in it.
2332     assert(Inst.cloneNo() == 0);
2333     if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) {
2334       assert(AI->Versions.size() == CloneNo);
2335       // We assign the allocation type later (in updateAllocationCall), just add
2336       // an entry for it here.
2337       AI->Versions.push_back(0);
2338     } else {
2339       auto *CI = Inst.call().dyn_cast<CallsiteInfo *>();
2340       assert(CI && CI->Clones.size() == CloneNo);
2341       // We assign the clone number later (in updateCall), just add an entry for
2342       // it here.
2343       CI->Clones.push_back(0);
2344     }
2345     CallMap[Inst] = {Inst.call(), CloneNo};
2346   }
2347   return {Func.func(), CloneNo};
2348 }
2349 
2350 // This method assigns cloned callsites to functions, cloning the functions as
2351 // needed. The assignment is greedy and proceeds roughly as follows:
2352 //
2353 // For each function Func:
2354 //   For each call with graph Node having clones:
2355 //     Initialize ClonesWorklist to Node and its clones
2356 //     Initialize NodeCloneCount to 0
2357 //     While ClonesWorklist is not empty:
2358 //        Clone = pop front ClonesWorklist
2359 //        NodeCloneCount++
2360 //        If Func has been cloned less than NodeCloneCount times:
2361 //           If NodeCloneCount is 1:
2362 //             Assign Clone to original Func
2363 //             Continue
2364 //           Create a new function clone
2365 //           If other callers not assigned to call a function clone yet:
2366 //              Assign them to call new function clone
2367 //              Continue
2368 //           Assign any other caller calling the cloned version to new clone
2369 //
2370 //        For each caller of Clone:
2371 //           If caller is assigned to call a specific function clone:
2372 //             If we cannot assign Clone to that function clone:
2373 //               Create new callsite Clone NewClone
2374 //               Add NewClone to ClonesWorklist
2375 //               Continue
2376 //             Assign Clone to existing caller's called function clone
2377 //           Else:
2378 //             If Clone not already assigned to a function clone:
2379 //                Assign to first function clone without assignment
2380 //             Assign caller to selected function clone
2381 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2382 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
2383   bool Changed = false;
2384 
2385   // Keep track of the assignment of nodes (callsites) to function clones they
2386   // call.
2387   DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
2388 
2389   // Update caller node to call function version CalleeFunc, by recording the
2390   // assignment in CallsiteToCalleeFuncCloneMap.
2391   auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
2392                                         const FuncInfo &CalleeFunc) {
2393     assert(Caller->hasCall());
2394     CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
2395   };
2396 
2397   // Walk all functions for which we saw calls with memprof metadata, and handle
2398   // cloning for each of its calls.
2399   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
2400     FuncInfo OrigFunc(Func);
2401     // Map from each clone of OrigFunc to a map of remappings of each call of
2402     // interest (from original uncloned call to the corresponding cloned call in
2403     // that function clone).
2404     std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
2405     for (auto &Call : CallsWithMetadata) {
2406       ContextNode *Node = getNodeForInst(Call);
2407       // Skip call if we do not have a node for it (all uses of its stack ids
2408       // were either on inlined chains or pruned from the MIBs), or if we did
2409       // not create any clones for it.
2410       if (!Node || Node->Clones.empty())
2411         continue;
2412       assert(Node->hasCall() &&
2413              "Not having a call should have prevented cloning");
2414 
2415       // Track the assignment of function clones to clones of the current
2416       // callsite Node being handled.
2417       std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
2418 
2419       // Assign callsite version CallsiteClone to function version FuncClone,
2420       // and also assign (possibly cloned) Call to CallsiteClone.
2421       auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
2422                                                 CallInfo &Call,
2423                                                 ContextNode *CallsiteClone,
2424                                                 bool IsAlloc) {
2425         // Record the clone of callsite node assigned to this function clone.
2426         FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
2427 
2428         assert(FuncClonesToCallMap.count(FuncClone));
2429         std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
2430         CallInfo CallClone(Call);
2431         if (CallMap.count(Call))
2432           CallClone = CallMap[Call];
2433         CallsiteClone->setCall(CallClone);
2434       };
2435 
2436       // Keep track of the clones of callsite Node that need to be assigned to
2437       // function clones. This list may be expanded in the loop body below if we
2438       // find additional cloning is required.
2439       std::deque<ContextNode *> ClonesWorklist;
2440       // Ignore original Node if we moved all of its contexts to clones.
2441       if (!Node->ContextIds.empty())
2442         ClonesWorklist.push_back(Node);
2443       ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(),
2444                             Node->Clones.end());
2445 
2446       // Now walk through all of the clones of this callsite Node that we need,
2447       // and determine the assignment to a corresponding clone of the current
2448       // function (creating new function clones as needed).
2449       unsigned NodeCloneCount = 0;
2450       while (!ClonesWorklist.empty()) {
2451         ContextNode *Clone = ClonesWorklist.front();
2452         ClonesWorklist.pop_front();
2453         NodeCloneCount++;
2454         if (VerifyNodes)
2455           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2456 
2457         // Need to create a new function clone if we have more callsite clones
2458         // than existing function clones, which would have been assigned to an
2459         // earlier clone in the list (we assign callsite clones to function
2460         // clones greedily).
2461         if (FuncClonesToCallMap.size() < NodeCloneCount) {
2462           // If this is the first callsite copy, assign to original function.
2463           if (NodeCloneCount == 1) {
2464             // Since FuncClonesToCallMap is empty in this case, no clones have
2465             // been created for this function yet, and no callers should have
2466             // been assigned a function clone for this callee node yet.
2467             assert(llvm::none_of(
2468                 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2469                   return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2470                 }));
2471             // Initialize with empty call map, assign Clone to original function
2472             // and its callers, and skip to the next clone.
2473             FuncClonesToCallMap[OrigFunc] = {};
2474             AssignCallsiteCloneToFuncClone(
2475                 OrigFunc, Call, Clone,
2476                 AllocationCallToContextNodeMap.count(Call));
2477             for (auto &CE : Clone->CallerEdges) {
2478               // Ignore any caller that does not have a recorded callsite Call.
2479               if (!CE->Caller->hasCall())
2480                 continue;
2481               RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
2482             }
2483             continue;
2484           }
2485 
2486           // First locate which copy of OrigFunc to clone again. If a caller
2487           // of this callsite clone was already assigned to call a particular
2488           // function clone, we need to redirect all of those callers to the
2489           // new function clone, and update their other callees within this
2490           // function.
2491           FuncInfo PreviousAssignedFuncClone;
2492           auto EI = llvm::find_if(
2493               Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
2494                 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
2495               });
2496           bool CallerAssignedToCloneOfFunc = false;
2497           if (EI != Clone->CallerEdges.end()) {
2498             const std::shared_ptr<ContextEdge> &Edge = *EI;
2499             PreviousAssignedFuncClone =
2500                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
2501             CallerAssignedToCloneOfFunc = true;
2502           }
2503 
2504           // Clone function and save it along with the CallInfo map created
2505           // during cloning in the FuncClonesToCallMap.
2506           std::map<CallInfo, CallInfo> NewCallMap;
2507           unsigned CloneNo = FuncClonesToCallMap.size();
2508           assert(CloneNo > 0 && "Clone 0 is the original function, which "
2509                                 "should already exist in the map");
2510           FuncInfo NewFuncClone = cloneFunctionForCallsite(
2511               OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
2512           FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
2513           FunctionClonesAnalysis++;
2514           Changed = true;
2515 
2516           // If no caller callsites were already assigned to a clone of this
2517           // function, we can simply assign this clone to the new func clone
2518           // and update all callers to it, then skip to the next clone.
2519           if (!CallerAssignedToCloneOfFunc) {
2520             AssignCallsiteCloneToFuncClone(
2521                 NewFuncClone, Call, Clone,
2522                 AllocationCallToContextNodeMap.count(Call));
2523             for (auto &CE : Clone->CallerEdges) {
2524               // Ignore any caller that does not have a recorded callsite Call.
2525               if (!CE->Caller->hasCall())
2526                 continue;
2527               RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2528             }
2529             continue;
2530           }
2531 
2532           // We may need to do additional node cloning in this case.
2533           // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
2534           // that were previously assigned to call PreviousAssignedFuncClone,
2535           // to record that they now call NewFuncClone.
2536           for (auto CE : Clone->CallerEdges) {
2537             // Ignore any caller that does not have a recorded callsite Call.
2538             if (!CE->Caller->hasCall())
2539               continue;
2540 
2541             if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
2542                 // We subsequently fall through to later handling that
2543                 // will perform any additional cloning required for
2544                 // callers that were calling other function clones.
2545                 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
2546                     PreviousAssignedFuncClone)
2547               continue;
2548 
2549             RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
2550 
2551             // If we are cloning a function that was already assigned to some
2552             // callers, then essentially we are creating new callsite clones
2553             // of the other callsites in that function that are reached by those
2554             // callers. Clone the other callees of the current callsite's caller
2555             // that were already assigned to PreviousAssignedFuncClone
2556             // accordingly. This is important since we subsequently update the
2557             // calls from the nodes in the graph and their assignments to callee
2558             // functions recorded in CallsiteToCalleeFuncCloneMap.
2559             for (auto CalleeEdge : CE->Caller->CalleeEdges) {
2560               // Skip any that have been removed on an earlier iteration when
2561               // cleaning up newly None type callee edges.
2562               if (!CalleeEdge)
2563                 continue;
2564               ContextNode *Callee = CalleeEdge->Callee;
2565               // Skip the current callsite, we are looking for other
2566               // callsites Caller calls, as well as any that does not have a
2567               // recorded callsite Call.
2568               if (Callee == Clone || !Callee->hasCall())
2569                 continue;
2570               ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
2571               removeNoneTypeCalleeEdges(NewClone);
2572               // Moving the edge may have resulted in some none type
2573               // callee edges on the original Callee.
2574               removeNoneTypeCalleeEdges(Callee);
2575               assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
2576               // If the Callee node was already assigned to call a specific
2577               // function version, make sure its new clone is assigned to call
2578               // that same function clone.
2579               if (CallsiteToCalleeFuncCloneMap.count(Callee))
2580                 RecordCalleeFuncOfCallsite(
2581                     NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
2582               // Update NewClone with the new Call clone of this callsite's Call
2583               // created for the new function clone created earlier.
2584               // Recall that we have already ensured when building the graph
2585               // that each caller can only call callsites within the same
2586               // function, so we are guaranteed that Callee Call is in the
2587               // current OrigFunc.
2588               // CallMap is set up as indexed by original Call at clone 0.
2589               CallInfo OrigCall(Callee->getOrigNode()->Call);
2590               OrigCall.setCloneNo(0);
2591               std::map<CallInfo, CallInfo> &CallMap =
2592                   FuncClonesToCallMap[NewFuncClone];
2593               assert(CallMap.count(OrigCall));
2594               CallInfo NewCall(CallMap[OrigCall]);
2595               assert(NewCall);
2596               NewClone->setCall(NewCall);
2597             }
2598           }
2599           // Fall through to handling below to perform the recording of the
2600           // function for this callsite clone. This enables handling of cases
2601           // where the callers were assigned to different clones of a function.
2602         }
2603 
2604         // See if we can use existing function clone. Walk through
2605         // all caller edges to see if any have already been assigned to
2606         // a clone of this callsite's function. If we can use it, do so. If not,
2607         // because that function clone is already assigned to a different clone
2608         // of this callsite, then we need to clone again.
2609         // Basically, this checking is needed to handle the case where different
2610         // caller functions/callsites may need versions of this function
2611         // containing different mixes of callsite clones across the different
2612         // callsites within the function. If that happens, we need to create
2613         // additional function clones to handle the various combinations.
2614         //
2615         // Keep track of any new clones of this callsite created by the
2616         // following loop, as well as any existing clone that we decided to
2617         // assign this clone to.
2618         std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
2619         FuncInfo FuncCloneAssignedToCurCallsiteClone;
2620         // We need to be able to remove Edge from CallerEdges, so need to adjust
2621         // iterator in the loop.
2622         for (auto EI = Clone->CallerEdges.begin();
2623              EI != Clone->CallerEdges.end();) {
2624           auto Edge = *EI;
2625           // Ignore any caller that does not have a recorded callsite Call.
2626           if (!Edge->Caller->hasCall()) {
2627             EI++;
2628             continue;
2629           }
2630           // If this caller already assigned to call a version of OrigFunc, need
2631           // to ensure we can assign this callsite clone to that function clone.
2632           if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
2633             FuncInfo FuncCloneCalledByCaller =
2634                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
2635             // First we need to confirm that this function clone is available
2636             // for use by this callsite node clone.
2637             //
2638             // While FuncCloneToCurNodeCloneMap is built only for this Node and
2639             // its callsite clones, one of those callsite clones X could have
2640             // been assigned to the same function clone called by Edge's caller
2641             // - if Edge's caller calls another callsite within Node's original
2642             // function, and that callsite has another caller reaching clone X.
2643             // We need to clone Node again in this case.
2644             if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
2645                  FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
2646                      Clone) ||
2647                 // Detect when we have multiple callers of this callsite that
2648                 // have already been assigned to specific, and different, clones
2649                 // of OrigFunc (due to other unrelated callsites in Func they
2650                 // reach via call contexts). Is this Clone of callsite Node
2651                 // assigned to a different clone of OrigFunc? If so, clone Node
2652                 // again.
2653                 (FuncCloneAssignedToCurCallsiteClone &&
2654                  FuncCloneAssignedToCurCallsiteClone !=
2655                      FuncCloneCalledByCaller)) {
2656               // We need to use a different newly created callsite clone, in
2657               // order to assign it to another new function clone on a
2658               // subsequent iteration over the Clones array (adjusted below).
2659               // Note we specifically do not reset the
2660               // CallsiteToCalleeFuncCloneMap entry for this caller, so that
2661               // when this new clone is processed later we know which version of
2662               // the function to copy (so that other callsite clones we have
2663               // assigned to that function clone are properly cloned over). See
2664               // comments in the function cloning handling earlier.
2665 
2666               // Check if we already have cloned this callsite again while
2667               // walking through caller edges, for a caller calling the same
2668               // function clone. If so, we can move this edge to that new clone
2669               // rather than creating yet another new clone.
2670               if (FuncCloneToNewCallsiteCloneMap.count(
2671                       FuncCloneCalledByCaller)) {
2672                 ContextNode *NewClone =
2673                     FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
2674                 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI);
2675                 // Cleanup any none type edges cloned over.
2676                 removeNoneTypeCalleeEdges(NewClone);
2677               } else {
2678                 // Create a new callsite clone.
2679                 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI);
2680                 removeNoneTypeCalleeEdges(NewClone);
2681                 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
2682                     NewClone;
2683                 // Add to list of clones and process later.
2684                 ClonesWorklist.push_back(NewClone);
2685                 assert(EI == Clone->CallerEdges.end() ||
2686                        Clone->AllocTypes != (uint8_t)AllocationType::None);
2687                 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
2688               }
2689               // Moving the caller edge may have resulted in some none type
2690               // callee edges.
2691               removeNoneTypeCalleeEdges(Clone);
2692               // We will handle the newly created callsite clone in a subsequent
2693               // iteration over this Node's Clones. Continue here since we
2694               // already adjusted iterator EI while moving the edge.
2695               continue;
2696             }
2697 
2698             // Otherwise, we can use the function clone already assigned to this
2699             // caller.
2700             if (!FuncCloneAssignedToCurCallsiteClone) {
2701               FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
2702               // Assign Clone to FuncCloneCalledByCaller
2703               AssignCallsiteCloneToFuncClone(
2704                   FuncCloneCalledByCaller, Call, Clone,
2705                   AllocationCallToContextNodeMap.count(Call));
2706             } else
2707               // Don't need to do anything - callsite is already calling this
2708               // function clone.
2709               assert(FuncCloneAssignedToCurCallsiteClone ==
2710                      FuncCloneCalledByCaller);
2711 
2712           } else {
2713             // We have not already assigned this caller to a version of
2714             // OrigFunc. Do the assignment now.
2715 
2716             // First check if we have already assigned this callsite clone to a
2717             // clone of OrigFunc for another caller during this iteration over
2718             // its caller edges.
2719             if (!FuncCloneAssignedToCurCallsiteClone) {
2720               // Find first function in FuncClonesToCallMap without an assigned
2721               // clone of this callsite Node. We should always have one
2722               // available at this point due to the earlier cloning when the
2723               // FuncClonesToCallMap size was smaller than the clone number.
2724               for (auto &CF : FuncClonesToCallMap) {
2725                 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
2726                   FuncCloneAssignedToCurCallsiteClone = CF.first;
2727                   break;
2728                 }
2729               }
2730               assert(FuncCloneAssignedToCurCallsiteClone);
2731               // Assign Clone to FuncCloneAssignedToCurCallsiteClone
2732               AssignCallsiteCloneToFuncClone(
2733                   FuncCloneAssignedToCurCallsiteClone, Call, Clone,
2734                   AllocationCallToContextNodeMap.count(Call));
2735             } else
2736               assert(FuncCloneToCurNodeCloneMap
2737                          [FuncCloneAssignedToCurCallsiteClone] == Clone);
2738             // Update callers to record function version called.
2739             RecordCalleeFuncOfCallsite(Edge->Caller,
2740                                        FuncCloneAssignedToCurCallsiteClone);
2741           }
2742 
2743           EI++;
2744         }
2745       }
2746       if (VerifyCCG) {
2747         checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2748         for (const auto &PE : Node->CalleeEdges)
2749           checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
2750         for (const auto &CE : Node->CallerEdges)
2751           checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
2752         for (auto *Clone : Node->Clones) {
2753           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
2754           for (const auto &PE : Clone->CalleeEdges)
2755             checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
2756           for (const auto &CE : Clone->CallerEdges)
2757             checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
2758         }
2759       }
2760     }
2761   }
2762 
2763   auto UpdateCalls = [&](ContextNode *Node,
2764                          DenseSet<const ContextNode *> &Visited,
2765                          auto &&UpdateCalls) {
2766     auto Inserted = Visited.insert(Node);
2767     if (!Inserted.second)
2768       return;
2769 
2770     for (auto *Clone : Node->Clones)
2771       UpdateCalls(Clone, Visited, UpdateCalls);
2772 
2773     for (auto &Edge : Node->CallerEdges)
2774       UpdateCalls(Edge->Caller, Visited, UpdateCalls);
2775 
2776     // Skip if either no call to update, or if we ended up with no context ids
2777     // (we moved all edges onto other clones).
2778     if (!Node->hasCall() || Node->ContextIds.empty())
2779       return;
2780 
2781     if (Node->IsAllocation) {
2782       updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes));
2783       return;
2784     }
2785 
2786     if (!CallsiteToCalleeFuncCloneMap.count(Node))
2787       return;
2788 
2789     auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
2790     updateCall(Node->Call, CalleeFunc);
2791   };
2792 
2793   // Performs DFS traversal starting from allocation nodes to update calls to
2794   // reflect cloning decisions recorded earlier. For regular LTO this will
2795   // update the actual calls in the IR to call the appropriate function clone
2796   // (and add attributes to allocation calls), whereas for ThinLTO the decisions
2797   // are recorded in the summary entries.
2798   DenseSet<const ContextNode *> Visited;
2799   for (auto &Entry : AllocationCallToContextNodeMap)
2800     UpdateCalls(Entry.second, Visited, UpdateCalls);
2801 
2802   return Changed;
2803 }
2804 
2805 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
2806     Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
2807     std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
2808         &FuncToAliasMap) {
2809   // The first "clone" is the original copy, we should only call this if we
2810   // needed to create new clones.
2811   assert(NumClones > 1);
2812   SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
2813   VMaps.reserve(NumClones - 1);
2814   FunctionsClonedThinBackend++;
2815   for (unsigned I = 1; I < NumClones; I++) {
2816     VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
2817     auto *NewF = CloneFunction(&F, *VMaps.back());
2818     FunctionClonesThinBackend++;
2819     // Strip memprof and callsite metadata from clone as they are no longer
2820     // needed.
2821     for (auto &BB : *NewF) {
2822       for (auto &Inst : BB) {
2823         Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
2824         Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
2825       }
2826     }
2827     std::string Name = getMemProfFuncName(F.getName(), I);
2828     auto *PrevF = M.getFunction(Name);
2829     if (PrevF) {
2830       // We might have created this when adjusting callsite in another
2831       // function. It should be a declaration.
2832       assert(PrevF->isDeclaration());
2833       NewF->takeName(PrevF);
2834       PrevF->replaceAllUsesWith(NewF);
2835       PrevF->eraseFromParent();
2836     } else
2837       NewF->setName(Name);
2838     ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
2839              << "created clone " << ore::NV("NewFunction", NewF));
2840 
2841     // Now handle aliases to this function, and clone those as well.
2842     if (!FuncToAliasMap.count(&F))
2843       continue;
2844     for (auto *A : FuncToAliasMap[&F]) {
2845       std::string Name = getMemProfFuncName(A->getName(), I);
2846       auto *PrevA = M.getNamedAlias(Name);
2847       auto *NewA = GlobalAlias::create(A->getValueType(),
2848                                        A->getType()->getPointerAddressSpace(),
2849                                        A->getLinkage(), Name, NewF);
2850       NewA->copyAttributesFrom(A);
2851       if (PrevA) {
2852         // We might have created this when adjusting callsite in another
2853         // function. It should be a declaration.
2854         assert(PrevA->isDeclaration());
2855         NewA->takeName(PrevA);
2856         PrevA->replaceAllUsesWith(NewA);
2857         PrevA->eraseFromParent();
2858       }
2859     }
2860   }
2861   return VMaps;
2862 }
2863 
2864 // Locate the summary for F. This is complicated by the fact that it might
2865 // have been internalized or promoted.
2866 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
2867                                       const ModuleSummaryIndex *ImportSummary) {
2868   // FIXME: Ideally we would retain the original GUID in some fashion on the
2869   // function (e.g. as metadata), but for now do our best to locate the
2870   // summary without that information.
2871   ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
2872   if (!TheFnVI)
2873     // See if theFn was internalized, by checking index directly with
2874     // original name (this avoids the name adjustment done by getGUID() for
2875     // internal symbols).
2876     TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
2877   if (TheFnVI)
2878     return TheFnVI;
2879   // Now query with the original name before any promotion was performed.
2880   StringRef OrigName =
2881       ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
2882   std::string OrigId = GlobalValue::getGlobalIdentifier(
2883       OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
2884   TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
2885   if (TheFnVI)
2886     return TheFnVI;
2887   // Could be a promoted local imported from another module. We need to pass
2888   // down more info here to find the original module id. For now, try with
2889   // the OrigName which might have been stored in the OidGuidMap in the
2890   // index. This would not work if there were same-named locals in multiple
2891   // modules, however.
2892   auto OrigGUID =
2893       ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName));
2894   if (OrigGUID)
2895     TheFnVI = ImportSummary->getValueInfo(OrigGUID);
2896   return TheFnVI;
2897 }
2898 
2899 bool MemProfContextDisambiguation::applyImport(Module &M) {
2900   assert(ImportSummary);
2901   bool Changed = false;
2902 
2903   auto IsMemProfClone = [](const Function &F) {
2904     return F.getName().contains(MemProfCloneSuffix);
2905   };
2906 
2907   // We also need to clone any aliases that reference cloned functions, because
2908   // the modified callsites may invoke via the alias. Keep track of the aliases
2909   // for each function.
2910   std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
2911       FuncToAliasMap;
2912   for (auto &A : M.aliases()) {
2913     auto *Aliasee = A.getAliaseeObject();
2914     if (auto *F = dyn_cast<Function>(Aliasee))
2915       FuncToAliasMap[F].insert(&A);
2916   }
2917 
2918   for (auto &F : M) {
2919     if (F.isDeclaration() || IsMemProfClone(F))
2920       continue;
2921 
2922     OptimizationRemarkEmitter ORE(&F);
2923 
2924     SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
2925     bool ClonesCreated = false;
2926     unsigned NumClonesCreated = 0;
2927     auto CloneFuncIfNeeded = [&](unsigned NumClones) {
2928       // We should at least have version 0 which is the original copy.
2929       assert(NumClones > 0);
2930       // If only one copy needed use original.
2931       if (NumClones == 1)
2932         return;
2933       // If we already performed cloning of this function, confirm that the
2934       // requested number of clones matches (the thin link should ensure the
2935       // number of clones for each constituent callsite is consistent within
2936       // each function), before returning.
2937       if (ClonesCreated) {
2938         assert(NumClonesCreated == NumClones);
2939         return;
2940       }
2941       VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
2942       // The first "clone" is the original copy, which doesn't have a VMap.
2943       assert(VMaps.size() == NumClones - 1);
2944       Changed = true;
2945       ClonesCreated = true;
2946       NumClonesCreated = NumClones;
2947     };
2948 
2949     // Locate the summary for F.
2950     ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
2951     // If not found, this could be an imported local (see comment in
2952     // findValueInfoForFunc). Skip for now as it will be cloned in its original
2953     // module (where it would have been promoted to global scope so should
2954     // satisfy any reference in this module).
2955     if (!TheFnVI)
2956       continue;
2957 
2958     auto *GVSummary =
2959         ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
2960     if (!GVSummary)
2961       // Must have been imported, use the first summary (might be multiple if
2962       // this was a linkonce_odr).
2963       GVSummary = TheFnVI.getSummaryList().front().get();
2964 
2965     // If this was an imported alias skip it as we won't have the function
2966     // summary, and it should be cloned in the original module.
2967     if (isa<AliasSummary>(GVSummary))
2968       continue;
2969 
2970     auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
2971 
2972     if (FS->allocs().empty() && FS->callsites().empty())
2973       continue;
2974 
2975     auto SI = FS->callsites().begin();
2976     auto AI = FS->allocs().begin();
2977 
2978     // Assume for now that the instructions are in the exact same order
2979     // as when the summary was created, but confirm this is correct by
2980     // matching the stack ids.
2981     for (auto &BB : F) {
2982       for (auto &I : BB) {
2983         auto *CB = dyn_cast<CallBase>(&I);
2984         // Same handling as when creating module summary.
2985         if (!mayHaveMemprofSummary(CB))
2986           continue;
2987 
2988         CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2989             I.getMetadata(LLVMContext::MD_callsite));
2990         auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
2991 
2992         // Include allocs that were already assigned a memprof function
2993         // attribute in the statistics.
2994         if (CB->getAttributes().hasFnAttr("memprof")) {
2995           assert(!MemProfMD);
2996           CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
2997               ? AllocTypeColdThinBackend++
2998               : AllocTypeNotColdThinBackend++;
2999           OrigAllocsThinBackend++;
3000           AllocVersionsThinBackend++;
3001           if (!MaxAllocVersionsThinBackend)
3002             MaxAllocVersionsThinBackend = 1;
3003           // Remove any remaining callsite metadata and we can skip the rest of
3004           // the handling for this instruction, since no cloning needed.
3005           I.setMetadata(LLVMContext::MD_callsite, nullptr);
3006           continue;
3007         }
3008 
3009         if (MemProfMD) {
3010           // Consult the next alloc node.
3011           assert(AI != FS->allocs().end());
3012           auto &AllocNode = *(AI++);
3013 
3014           // Sanity check that the MIB stack ids match between the summary and
3015           // instruction metadata.
3016           auto MIBIter = AllocNode.MIBs.begin();
3017           for (auto &MDOp : MemProfMD->operands()) {
3018             assert(MIBIter != AllocNode.MIBs.end());
3019             LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter =
3020                 MIBIter->StackIdIndices.begin();
3021             auto *MIBMD = cast<const MDNode>(MDOp);
3022             MDNode *StackMDNode = getMIBStackNode(MIBMD);
3023             assert(StackMDNode);
3024             SmallVector<unsigned> StackIdsFromMetadata;
3025             CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
3026             for (auto ContextIter =
3027                      StackContext.beginAfterSharedPrefix(CallsiteContext);
3028                  ContextIter != StackContext.end(); ++ContextIter) {
3029               // If this is a direct recursion, simply skip the duplicate
3030               // entries, to be consistent with how the summary ids were
3031               // generated during ModuleSummaryAnalysis.
3032               if (!StackIdsFromMetadata.empty() &&
3033                   StackIdsFromMetadata.back() == *ContextIter)
3034                 continue;
3035               assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
3036               assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3037                      *ContextIter);
3038               StackIdIndexIter++;
3039             }
3040             MIBIter++;
3041           }
3042 
3043           // Perform cloning if not yet done.
3044           CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
3045 
3046           OrigAllocsThinBackend++;
3047           AllocVersionsThinBackend += AllocNode.Versions.size();
3048           if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
3049             MaxAllocVersionsThinBackend = AllocNode.Versions.size();
3050 
3051           // If there is only one version that means we didn't end up
3052           // considering this function for cloning, and in that case the alloc
3053           // will still be none type or should have gotten the default NotCold.
3054           // Skip that after calling clone helper since that does some sanity
3055           // checks that confirm we haven't decided yet that we need cloning.
3056           if (AllocNode.Versions.size() == 1) {
3057             assert((AllocationType)AllocNode.Versions[0] ==
3058                        AllocationType::NotCold ||
3059                    (AllocationType)AllocNode.Versions[0] ==
3060                        AllocationType::None);
3061             UnclonableAllocsThinBackend++;
3062             continue;
3063           }
3064 
3065           // All versions should have a singular allocation type.
3066           assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
3067             return Type == ((uint8_t)AllocationType::NotCold |
3068                             (uint8_t)AllocationType::Cold);
3069           }));
3070 
3071           // Update the allocation types per the summary info.
3072           for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
3073             // Ignore any that didn't get an assigned allocation type.
3074             if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
3075               continue;
3076             AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
3077             AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
3078                                             : AllocTypeNotColdThinBackend++;
3079             std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
3080             auto A = llvm::Attribute::get(F.getContext(), "memprof",
3081                                           AllocTypeString);
3082             CallBase *CBClone;
3083             // Copy 0 is the original function.
3084             if (!J)
3085               CBClone = CB;
3086             else
3087               // Since VMaps are only created for new clones, we index with
3088               // clone J-1 (J==0 is the original clone and does not have a VMaps
3089               // entry).
3090               CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3091             CBClone->addFnAttr(A);
3092             ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
3093                      << ore::NV("AllocationCall", CBClone) << " in clone "
3094                      << ore::NV("Caller", CBClone->getFunction())
3095                      << " marked with memprof allocation attribute "
3096                      << ore::NV("Attribute", AllocTypeString));
3097           }
3098         } else if (!CallsiteContext.empty()) {
3099           // Consult the next callsite node.
3100           assert(SI != FS->callsites().end());
3101           auto &StackNode = *(SI++);
3102 
3103 #ifndef NDEBUG
3104           // Sanity check that the stack ids match between the summary and
3105           // instruction metadata.
3106           auto StackIdIndexIter = StackNode.StackIdIndices.begin();
3107           for (auto StackId : CallsiteContext) {
3108             assert(StackIdIndexIter != StackNode.StackIdIndices.end());
3109             assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
3110                    StackId);
3111             StackIdIndexIter++;
3112           }
3113 #endif
3114 
3115           // Perform cloning if not yet done.
3116           CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3117 
3118           // Should have skipped indirect calls via mayHaveMemprofSummary.
3119           assert(CB->getCalledFunction());
3120           assert(!IsMemProfClone(*CB->getCalledFunction()));
3121 
3122           // Update the calls per the summary info.
3123           // Save orig name since it gets updated in the first iteration
3124           // below.
3125           auto CalleeOrigName = CB->getCalledFunction()->getName();
3126           for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3127             // Do nothing if this version calls the original version of its
3128             // callee.
3129             if (!StackNode.Clones[J])
3130               continue;
3131             auto NewF = M.getOrInsertFunction(
3132                 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3133                 CB->getCalledFunction()->getFunctionType());
3134             CallBase *CBClone;
3135             // Copy 0 is the original function.
3136             if (!J)
3137               CBClone = CB;
3138             else
3139               CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3140             CBClone->setCalledFunction(NewF);
3141             ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3142                      << ore::NV("Call", CBClone) << " in clone "
3143                      << ore::NV("Caller", CBClone->getFunction())
3144                      << " assigned to call function clone "
3145                      << ore::NV("Callee", NewF.getCallee()));
3146           }
3147         }
3148         // Memprof and callsite metadata on memory allocations no longer needed.
3149         I.setMetadata(LLVMContext::MD_memprof, nullptr);
3150         I.setMetadata(LLVMContext::MD_callsite, nullptr);
3151       }
3152     }
3153   }
3154 
3155   return Changed;
3156 }
3157 
3158 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3159 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
3160   if (DumpCCG) {
3161     dbgs() << "CCG before cloning:\n";
3162     dbgs() << *this;
3163   }
3164   if (ExportToDot)
3165     exportToDot("postbuild");
3166 
3167   if (VerifyCCG) {
3168     check();
3169   }
3170 
3171   identifyClones();
3172 
3173   if (VerifyCCG) {
3174     check();
3175   }
3176 
3177   if (DumpCCG) {
3178     dbgs() << "CCG after cloning:\n";
3179     dbgs() << *this;
3180   }
3181   if (ExportToDot)
3182     exportToDot("cloned");
3183 
3184   bool Changed = assignFunctions();
3185 
3186   if (DumpCCG) {
3187     dbgs() << "CCG after assigning function clones:\n";
3188     dbgs() << *this;
3189   }
3190   if (ExportToDot)
3191     exportToDot("clonefuncassign");
3192 
3193   return Changed;
3194 }
3195 
3196 bool MemProfContextDisambiguation::processModule(
3197     Module &M,
3198     function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
3199 
3200   // If we have an import summary, then the cloning decisions were made during
3201   // the thin link on the index. Apply them and return.
3202   if (ImportSummary)
3203     return applyImport(M);
3204 
3205   // TODO: If/when other types of memprof cloning are enabled beyond just for
3206   // hot and cold, we will need to change this to individually control the
3207   // AllocationType passed to addStackNodesForMIB during CCG construction.
3208   // Note that we specifically check this after applying imports above, so that
3209   // the option isn't needed to be passed to distributed ThinLTO backend
3210   // clang processes, which won't necessarily have visibility into the linker
3211   // dependences. Instead the information is communicated from the LTO link to
3212   // the backends via the combined summary index.
3213   if (!SupportsHotColdNew)
3214     return false;
3215 
3216   ModuleCallsiteContextGraph CCG(M, OREGetter);
3217   return CCG.process();
3218 }
3219 
3220 MemProfContextDisambiguation::MemProfContextDisambiguation(
3221     const ModuleSummaryIndex *Summary)
3222     : ImportSummary(Summary) {
3223   if (ImportSummary) {
3224     // The MemProfImportSummary should only be used for testing ThinLTO
3225     // distributed backend handling via opt, in which case we don't have a
3226     // summary from the pass pipeline.
3227     assert(MemProfImportSummary.empty());
3228     return;
3229   }
3230   if (MemProfImportSummary.empty())
3231     return;
3232 
3233   auto ReadSummaryFile =
3234       errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
3235   if (!ReadSummaryFile) {
3236     logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
3237                           "Error loading file '" + MemProfImportSummary +
3238                               "': ");
3239     return;
3240   }
3241   auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
3242   if (!ImportSummaryForTestingOrErr) {
3243     logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
3244                           "Error parsing file '" + MemProfImportSummary +
3245                               "': ");
3246     return;
3247   }
3248   ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
3249   ImportSummary = ImportSummaryForTesting.get();
3250 }
3251 
3252 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
3253                                                     ModuleAnalysisManager &AM) {
3254   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
3255   auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
3256     return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
3257   };
3258   if (!processModule(M, OREGetter))
3259     return PreservedAnalyses::all();
3260   return PreservedAnalyses::none();
3261 }
3262 
3263 void MemProfContextDisambiguation::run(
3264     ModuleSummaryIndex &Index,
3265     function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
3266         isPrevailing) {
3267   // TODO: If/when other types of memprof cloning are enabled beyond just for
3268   // hot and cold, we will need to change this to individually control the
3269   // AllocationType passed to addStackNodesForMIB during CCG construction.
3270   // The index was set from the option, so these should be in sync.
3271   assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
3272   if (!SupportsHotColdNew)
3273     return;
3274 
3275   IndexCallsiteContextGraph CCG(Index, isPrevailing);
3276   CCG.process();
3277 }
3278