//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InterleavedRange.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Instrumentation.h"
#include <deque>
#include <sstream>
#include <unordered_map>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;

#define DEBUG_TYPE "memprof-context-disambiguation"

STATISTIC(FunctionClonesAnalysis,
          "Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
          "Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
          "Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                            "cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                         "during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
          "Number of not cold static allocations (possibly cloned) during "
          "ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
                                    "(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
          "Number of original (not cloned) allocations with memprof profiles "
          "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
          "Maximum number of allocation versions created for an original "
          "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
82           "Number of unclonable ambigous allocations during ThinLTO backend");
STATISTIC(RemovedEdgesWithMismatchedCallees,
          "Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
          "Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
          "Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
          "Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
          "Number of profiled callees found via multiple tail call chains");
STATISTIC(DeferredBackedges, "Number of backedges with deferred cloning");
STATISTIC(NewMergedNodes, "Number of new nodes created during merging");
STATISTIC(NonNewMergedNodes, "Number of non-new nodes used during merging");
STATISTIC(MissingAllocForContextId,
          "Number of missing alloc nodes for context ids");
STATISTIC(SkippedCallsCloning,
          "Number of calls skipped during cloning due to unexpected operand");

static cl::opt<std::string> DotFilePathPrefix(
    "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
    cl::value_desc("filename"),
    cl::desc("Specify the path prefix of the MemProf dot files."));

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
                                 cl::Hidden,
                                 cl::desc("Export graph to dot files."));

// How much of the graph to export to dot.
enum DotScope {
  All,     // The full CCG graph.
  Alloc,   // Only contexts for the specified allocation.
  Context, // Only the specified context.
};

static cl::opt<DotScope> DotGraphScope(
    "memprof-dot-scope", cl::desc("Scope of graph to export to dot"),
    cl::Hidden, cl::init(DotScope::All),
    cl::values(
        clEnumValN(DotScope::All, "all", "Export full callsite graph"),
        clEnumValN(DotScope::Alloc, "alloc",
                   "Export only nodes with contexts feeding given "
                   "-memprof-dot-alloc-id"),
        clEnumValN(DotScope::Context, "context",
                   "Export only nodes with given -memprof-dot-context-id")));

static cl::opt<unsigned>
    AllocIdForDot("memprof-dot-alloc-id", cl::init(0), cl::Hidden,
                  cl::desc("Id of alloc to export if -memprof-dot-scope=alloc "
                           "or to highlight if -memprof-dot-scope=all"));

static cl::opt<unsigned> ContextIdForDot(
    "memprof-dot-context-id", cl::init(0), cl::Hidden,
    cl::desc("Id of context to export if -memprof-dot-scope=context or to "
             "highlight otherwise"));

static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
            cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
              cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
    "memprof-import-summary",
    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
    cl::Hidden);

static cl::opt<unsigned>
    TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
                        cl::Hidden,
                        cl::desc("Max depth to recursively search for missing "
                                 "frames through tail calls."));

// Optionally enable cloning of callsites involved with recursive cycles
static cl::opt<bool> AllowRecursiveCallsites(
    "memprof-allow-recursive-callsites", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of callsites involved in recursive cycles"));

static cl::opt<bool> CloneRecursiveContexts(
    "memprof-clone-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts through recursive cycles"));

// Generally this is needed for correct assignment of allocation clones to
// function clones; however, allow it to be disabled for debugging while the
// functionality is new and being tested more widely.
static cl::opt<bool>
    MergeClones("memprof-merge-clones", cl::init(true), cl::Hidden,
                cl::desc("Merge clones before assigning functions"));

// When disabled, try to detect and prevent cloning of recursive contexts.
// This is only necessary until we support cloning through recursive cycles.
// Leave on by default for now, as disabling adds a little bit of compile
// time overhead and doesn't affect correctness; it will just inflate the cold
// hinted bytes reporting a bit when -memprof-report-hinted-sizes is enabled.
static cl::opt<bool> AllowRecursiveContexts(
    "memprof-allow-recursive-contexts", cl::init(true), cl::Hidden,
    cl::desc("Allow cloning of contexts having recursive cycles"));

// Set the minimum absolute count threshold for allowing inlining of indirect
// calls promoted during cloning.
static cl::opt<unsigned> MemProfICPNoInlineThreshold(
    "memprof-icp-noinline-threshold", cl::init(2), cl::Hidden,
    cl::desc("Minimum absolute count for promoted target to be inlinable"));

namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));

// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));

static cl::opt<bool> MemProfRequireDefinitionForPromotion(
    "memprof-require-definition-for-promotion", cl::init(false), cl::Hidden,
    cl::desc(
        "Require target function definition when promoting indirect calls"));
} // namespace llvm

extern cl::opt<bool> MemProfReportHintedSizes;
extern cl::opt<unsigned> MinClonedColdBytePercent;

namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, updating it to reflect any inlining performed on
/// those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
class CallsiteContextGraph {
public:
  CallsiteContextGraph() = default;
  CallsiteContextGraph(const CallsiteContextGraph &) = default;
  CallsiteContextGraph(CallsiteContextGraph &&) = default;

  /// Main entry point to perform analysis and transformations on graph.
  bool process();

  /// Perform cloning on the graph necessary to uniquely identify the allocation
  /// behavior of an allocation based on its context.
  void identifyClones();

  /// Assign callsite clones to functions, cloning functions as needed to
  /// accommodate the combinations of their callsite clones reached by callers.
  /// For regular LTO this clones functions and callsites in the IR, but for
  /// ThinLTO the cloning decisions are noted in the summaries and later applied
  /// in applyImport.
  bool assignFunctions();

  void dump() const;
  void print(raw_ostream &OS) const;
  void printTotalSizes(raw_ostream &OS) const;

  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const CallsiteContextGraph &CCG) {
    CCG.print(OS);
    return OS;
  }

  friend struct GraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
  friend struct DOTGraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;

  void exportToDot(std::string Label) const;

  /// Represents a function clone via FuncTy pointer and clone number pair.
  struct FuncInfo final
      : public std::pair<FuncTy *, unsigned /*Clone number*/> {
    using Base = std::pair<FuncTy *, unsigned>;
    FuncInfo(const Base &B) : Base(B) {}
    FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
    explicit operator bool() const { return this->first != nullptr; }
    FuncTy *func() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
  };

  /// Represents a callsite clone via CallTy and clone number pair.
  struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
    using Base = std::pair<CallTy, unsigned>;
    CallInfo(const Base &B) : Base(B) {}
    CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
        : Base(Call, CloneNo) {}
    explicit operator bool() const { return (bool)this->first; }
    CallTy call() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
    void setCloneNo(unsigned N) { this->second = N; }
    void print(raw_ostream &OS) const {
      if (!operator bool()) {
        assert(!cloneNo());
        OS << "null Call";
        return;
      }
      call()->print(OS);
      OS << "\t(clone " << cloneNo() << ")";
    }
    void dump() const {
      print(dbgs());
      dbgs() << "\n";
    }
    friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
      Call.print(OS);
      return OS;
    }
  };

  struct ContextEdge;

  /// Node in the Callsite Context Graph
  struct ContextNode {
    // Keep this for now since in the IR case where we have an Instruction* it
    // is not as immediately discoverable. Used for printing richer information
    // when dumping graph.
    bool IsAllocation;

    // Keeps track of when the Call was reset to null because there was
    // recursion.
    bool Recursive = false;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this node.
    uint8_t AllocTypes = 0;

    // The corresponding allocation or interior call. This is the primary call
    // for which we have created this node.
    CallInfo Call;

    // List of other calls that can be treated the same as the primary call
    // through cloning. I.e. located in the same function and have the same
    // (possibly pruned) stack ids. They will be updated the same way as the
    // primary call when assigning to function clones.
    SmallVector<CallInfo, 0> MatchingCalls;

    // For alloc nodes this is a unique id assigned when constructed, and for
    // callsite stack nodes it is the original stack id when the node is
    // constructed from the memprof MIB metadata on the alloc nodes. Note that
    // this is only used when matching callsite metadata onto the stack nodes
    // created when processing the allocation memprof MIBs, and for labeling
    // nodes in the dot graph. Therefore we don't bother to assign a value for
    // clones.
    uint64_t OrigStackOrAllocId = 0;

    // Edges to all callees in the profiled call stacks.
    // TODO: Should this be a map (from Callee node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;

    // Edges to all callers in the profiled call stacks.
    // TODO: Should this be a map (from Caller node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CallerEdges;

    // Returns true if we also need to look at the caller edges when
    // determining the node context ids and allocation type.
    bool useCallerEdgesForContextInfo() const {
      // Typically if the callee edges are empty either the caller edges are
      // also empty, or this is an allocation (leaf node). However, if we are
      // allowing recursive callsites and contexts this will be violated for
      // incompletely cloned recursive cycles.
      assert(!CalleeEdges.empty() || CallerEdges.empty() || IsAllocation ||
             (AllowRecursiveCallsites && AllowRecursiveContexts));
      // When cloning for a recursive context, during cloning we might be in the
      // midst of cloning for a recurrence and have moved context ids off of a
      // caller edge onto the clone but not yet off of the incoming caller
      // (back) edge. If we don't look at those we miss the fact that this node
      // still has context ids of interest.
      return IsAllocation || CloneRecursiveContexts;
    }

    // Compute the context ids for this node from the union of its edge context
    // ids.
    DenseSet<uint32_t> getContextIds() const {
      unsigned Count = 0;
      // Compute the number of ids for reserve below. In general we only need to
      // look at one set of edges, typically the callee edges, since other than
      // allocations and in some cases during recursion cloning, all the context
      // ids on the callers should also flow out via callee edges.
      for (auto &Edge : CalleeEdges.empty() ? CallerEdges : CalleeEdges)
        Count += Edge->getContextIds().size();
      DenseSet<uint32_t> ContextIds;
      ContextIds.reserve(Count);
      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
          CalleeEdges, useCallerEdgesForContextInfo()
                           ? CallerEdges
                           : std::vector<std::shared_ptr<ContextEdge>>());
      for (const auto &Edge : Edges)
        ContextIds.insert_range(Edge->getContextIds());
      return ContextIds;
    }

    // Compute the allocation type for this node from the OR of its edge
    // allocation types.
    uint8_t computeAllocType() const {
      uint8_t BothTypes =
          (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
      uint8_t AllocType = (uint8_t)AllocationType::None;
      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
          CalleeEdges, useCallerEdgesForContextInfo()
                           ? CallerEdges
                           : std::vector<std::shared_ptr<ContextEdge>>());
      for (const auto &Edge : Edges) {
        AllocType |= Edge->AllocTypes;
        // Bail early if alloc type reached both, no further refinement.
        if (AllocType == BothTypes)
          return AllocType;
      }
      return AllocType;
    }

    // The context ids set for this node is empty if its edge context ids are
    // also all empty.
    bool emptyContextIds() const {
      auto Edges = llvm::concat<const std::shared_ptr<ContextEdge>>(
          CalleeEdges, useCallerEdgesForContextInfo()
                           ? CallerEdges
                           : std::vector<std::shared_ptr<ContextEdge>>());
      for (const auto &Edge : Edges) {
        if (!Edge->getContextIds().empty())
          return false;
      }
      return true;
    }

    // List of clones of this ContextNode, initially empty.
    std::vector<ContextNode *> Clones;

    // If a clone, points to the original uncloned node.
    ContextNode *CloneOf = nullptr;

    ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}

    ContextNode(bool IsAllocation, CallInfo C)
        : IsAllocation(IsAllocation), Call(C) {}

    void addClone(ContextNode *Clone) {
      if (CloneOf) {
        CloneOf->Clones.push_back(Clone);
        Clone->CloneOf = CloneOf;
      } else {
        Clones.push_back(Clone);
        assert(!Clone->CloneOf);
        Clone->CloneOf = this;
      }
    }

    ContextNode *getOrigNode() {
      if (!CloneOf)
        return this;
      return CloneOf;
    }

    void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                               unsigned int ContextId);

    ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
    ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
    void eraseCalleeEdge(const ContextEdge *Edge);
    void eraseCallerEdge(const ContextEdge *Edge);

    void setCall(CallInfo C) { Call = C; }

    bool hasCall() const { return (bool)Call.call(); }

    void printCall(raw_ostream &OS) const { Call.print(OS); }

    // True if this node was effectively removed from the graph, in which case
    // it should have an allocation type of None and empty context ids.
    bool isRemoved() const {
      // Typically if the callee edges are empty either the caller edges are
      // also empty, or this is an allocation (leaf node). However, if we are
      // allowing recursive callsites and contexts this will be violated for
      // incompletely cloned recursive cycles.
      assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
             (AllocTypes == (uint8_t)AllocationType::None) ==
                 emptyContextIds());
      return AllocTypes == (uint8_t)AllocationType::None;
    }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
      Node.print(OS);
      return OS;
    }
  };
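
  // For example (illustrative): a ContextNode reached by one cold and one
  // not-cold context ends up with AllocTypes ==
  // ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold), the
  // combined mask that computeAllocType() above would also return.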

  /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
  /// callee.
  struct ContextEdge {
    ContextNode *Callee;
    ContextNode *Caller;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this edge.
    uint8_t AllocTypes = 0;

    // Set just before initiating cloning when cloning of recursive contexts is
    // enabled. Used to defer cloning of backedges until we have done cloning of
    // the callee node for non-backedge caller edges. This exposes cloning
    // opportunities through the backedge of the cycle.
    // TODO: Note that this is not updated during cloning, and it is unclear
    // whether that would be needed.
    bool IsBackedge = false;

    // The set of IDs for contexts including this edge.
    DenseSet<uint32_t> ContextIds;

    ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
                DenseSet<uint32_t> ContextIds)
        : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
          ContextIds(std::move(ContextIds)) {}

    DenseSet<uint32_t> &getContextIds() { return ContextIds; }

    // Helper to clear the fields of this edge when we are removing it from the
    // graph.
    inline void clear() {
      ContextIds.clear();
      AllocTypes = (uint8_t)AllocationType::None;
      Caller = nullptr;
      Callee = nullptr;
    }

    // Check if edge was removed from the graph. This is useful while iterating
    // over a copy of edge lists when performing operations that mutate the
    // graph in ways that might remove one of the edges.
    inline bool isRemoved() const {
      if (Callee || Caller)
        return false;
      // Any edges that have been removed from the graph but are still in a
      // shared_ptr somewhere should have all fields null'ed out by clear()
      // above.
      assert(AllocTypes == (uint8_t)AllocationType::None);
      assert(ContextIds.empty());
      return true;
    }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
      Edge.print(OS);
      return OS;
    }
  };

  /// Helpers to remove edges that have allocation type None (due to not
  /// carrying any context ids) after transformations.
  void removeNoneTypeCalleeEdges(ContextNode *Node);
  void removeNoneTypeCallerEdges(ContextNode *Node);
  void
  recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
                                       DenseSet<const ContextNode *> &Visited);

protected:
  /// Get a list of nodes corresponding to the stack ids in the given callsite
  /// context.
  template <class NodeT, class IteratorT>
  std::vector<uint64_t>
  getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);

  /// Adds nodes for the given allocation and any stack ids on its memprof MIB
  /// metadata (or summary).
  ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);

  /// Adds nodes for the given MIB stack ids.
  template <class NodeT, class IteratorT>
  void addStackNodesForMIB(ContextNode *AllocNode,
                           CallStack<NodeT, IteratorT> &StackContext,
                           CallStack<NodeT, IteratorT> &CallsiteContext,
                           AllocationType AllocType,
                           ArrayRef<ContextTotalSize> ContextSizeInfo);

  /// Matches all callsite metadata (or summary) to the nodes created for
  /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
  /// inlining performed on those callsite instructions.
  void updateStackNodes();

  /// Update graph to conservatively handle any callsite stack nodes that target
  /// multiple different callee target functions.
  void handleCallsitesWithMultipleTargets();

  /// Mark backedges via the standard DFS based backedge algorithm.
  void markBackedges();

  /// Merge clones generated during cloning for different allocations but that
  /// are called by the same caller node, to ensure proper function assignment.
  void mergeClones();

  // Try to partition calls on the given node (already placed into the AllCalls
  // array) by callee function, creating new copies of Node as needed to hold
  // calls with different callees, and moving the callee edges appropriately.
  // Returns true if partitioning was successful.
  bool partitionCallsByCallee(
      ContextNode *Node, ArrayRef<CallInfo> AllCalls,
      std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode);

  /// Save lists of calls with MemProf metadata in each function, for faster
  /// iteration.
  MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;

  /// Map from callsite node to the enclosing caller function.
  std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;

  // When exporting to dot, and an allocation id is specified, contains the
  // context ids on that allocation.
  DenseSet<uint32_t> DotAllocContextIds;

private:
  using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;

  // Structure to keep track of information for each call as we are matching
  // non-allocation callsites onto context nodes created from the allocation
  // call metadata / summary contexts.
  struct CallContextInfo {
    // The callsite we're trying to match.
    CallTy Call;
    // The callsites stack ids that have a context node in the graph.
    std::vector<uint64_t> StackIds;
    // The function containing this callsite.
    const FuncTy *Func;
    // Initially empty, if needed this will be updated to contain the context
    // ids for use in a new context node created for this callsite.
    DenseSet<uint32_t> ContextIds;
  };

  /// Helper to remove edge from graph, updating edge iterator if it is provided
  /// (in which case CalleeIter indicates which edge list is being iterated).
  /// This will also perform the necessary clearing of the ContextEdge members
  /// to enable later checking if the edge has been removed (since we may have
  /// other copies of the shared_ptr in existence, and in fact rely on this to
  /// enable removal while iterating over a copy of a node's edge list).
  void removeEdgeFromGraph(ContextEdge *Edge, EdgeIter *EI = nullptr,
                           bool CalleeIter = true);

  /// Assigns the given Node to calls at or inlined into the location with
  /// the Node's stack id, after post order traversing and processing its
  /// caller nodes. Uses the call information recorded in the given
  /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
  /// as needed. Called by updateStackNodes which sets up the given
  /// StackIdToMatchingCalls map.
  void assignStackNodesPostOrder(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls,
      DenseMap<CallInfo, CallInfo> &CallToMatchingCall);

  /// Duplicates the given set of context ids, updating the provided
  /// map from each original id with the newly generated context ids,
  /// and returning the new duplicated id set.
  DenseSet<uint32_t> duplicateContextIds(
      const DenseSet<uint32_t> &StackSequenceContextIds,
      DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Propagates all duplicated context ids across the graph.
  void propagateDuplicateContextIds(
      const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
  /// else to its callers. Also updates OrigNode's edges to remove any context
  /// ids moved to the newly created edge.
  void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
                      bool TowardsCallee,
                      DenseSet<uint32_t> RemainingContextIds);

  /// Get the stack id corresponding to the given Id or Index (for IR this will
  /// return itself, for a summary index this will return the id recorded in the
  /// index for that stack id index value).
  uint64_t getStackId(uint64_t IdOrIndex) const {
    return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
  }

  /// Returns true if the given call targets the callee of the given edge, or if
  /// we were able to identify the call chain through intermediate tail calls.
  /// In the latter case new context nodes are added to the graph for the
  /// identified tail calls, and their synthesized nodes are added to
  /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
  /// the updated edges and to prepare it for an increment in the caller.
  bool
  calleesMatch(CallTy Call, EdgeIter &EI,
               MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);

  // Return the callee function of the given call, or nullptr if it can't be
  // determined
  const FuncTy *getCalleeFunc(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getCalleeFunc(Call);
  }

  /// Returns true if the given call targets the given function, or if we were
  /// able to identify the call chain through intermediate tail calls (in which
  /// case FoundCalleeChain will be populated).
  bool calleeMatchesFunc(
      CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
      std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
    return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
        Call, Func, CallerFunc, FoundCalleeChain);
  }

  /// Returns true if both call instructions have the same callee.
  bool sameCallee(CallTy Call1, CallTy Call2) {
    return static_cast<DerivedCCG *>(this)->sameCallee(Call1, Call2);
  }

  /// Get a list of nodes corresponding to the stack ids in the given
  /// callsite's context.
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
        Call);
  }

  /// Get the last stack id in the context for callsite.
  uint64_t getLastStackId(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
  }

  /// Update the allocation call to record type of allocated memory.
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
    AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
    static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
  }

  /// Get the AllocationType assigned to the given allocation instruction clone.
  AllocationType getAllocationCallType(const CallInfo &Call) const {
    return static_cast<const DerivedCCG *>(this)->getAllocationCallType(Call);
  }

  /// Update non-allocation call to invoke (possibly cloned) function
  /// CalleeFunc.
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
  }

  /// Clone the given function for the given callsite, recording the mapping of
  /// all of the function's tracked calls to their new versions in the CallMap.
  /// Assigns new clones to clone number CloneNo.
  FuncInfo cloneFunctionForCallsite(
      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
  }

  /// Gets a label to use in the dot graph for the given call clone in the given
  /// function.
  std::string getLabel(const FuncTy *Func, const CallTy Call,
                       unsigned CloneNo) const {
    return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
  }

  // Create and return a new ContextNode.
  ContextNode *createNewNode(bool IsAllocation, const FuncTy *F = nullptr,
                             CallInfo C = CallInfo()) {
    NodeOwner.push_back(std::make_unique<ContextNode>(IsAllocation, C));
    auto *NewNode = NodeOwner.back().get();
    if (F)
      NodeToCallingFunc[NewNode] = F;
    return NewNode;
  }

  /// Helpers to find the node corresponding to the given call or stackid.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids) const;

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids) const;

  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If ContextIdsToMove is non-empty, only that subset of Edge's ids are
  /// moved to an edge to the new callee.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     bool NewClone = false,
                                     DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the caller of the edge at the given callee edge iterator to be
  /// NewCaller, performing the necessary context id and allocation type
  /// updates. This is similar to the above moveEdgeToExistingCalleeClone, but
  /// a simplified version of it as we always move the given edge and all of its
  /// context ids.
  void moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
                                 ContextNode *NewCaller);

  /// Recursive helper for marking backedges via DFS.
  void markBackedges(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                     DenseSet<const ContextNode *> &CurrentStack);

  /// Recursive helper for merging clones.
  void
  mergeClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
              DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
  /// Main worker for merging callee clones for a given node.
  void mergeNodeCalleeClones(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode);
  /// Helper to find other callers of the given set of callee edges that can
  /// share the same callee merge node.
  void findOtherCallersToShareMerge(
      ContextNode *Node, std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
      DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
      DenseSet<ContextNode *> &OtherCallersToShareMerge);

  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context. The context ids of the allocation being
  /// processed are given in AllocContextIds.
  void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                      const DenseSet<uint32_t> &AllocContextIds);

  /// Map from each context ID to the AllocationType assigned to that context.
  DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Map from each contextID to the profiled full contexts and their total
  /// sizes (there may be more than one due to context trimming),
  /// optionally populated when requested (via MemProfReportHintedSizes or
  /// MinClonedColdBytePercent).
  DenseMap<uint32_t, std::vector<ContextTotalSize>> ContextIdToContextSizeInfos;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Maps to track the calls to their corresponding nodes in the graph.
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
  unsigned int LastContextId = 0;
};

template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;

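// A minimal sketch of the CRTP dispatch pattern used by the derived classes
// below (illustrative only, not part of the pass): the base class casts
// 'this' to the derived type, so the format-specific hooks are resolved at
// compile time without virtual dispatch.
//
//   template <typename Derived> struct GraphBase {
//     uint64_t getStackId(uint64_t Id) const {
//       return static_cast<const Derived *>(this)->getStackId(Id);
//     }
//   };
//   struct IRGraph : GraphBase<IRGraph> {
//     uint64_t getStackId(uint64_t Id) const { return Id; } // IR: identity.
//   };
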
/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  const Function *getCalleeFunc(Instruction *Call);
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool sameCallee(Instruction *Call1, Instruction *Call2);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  const Module &Mod;
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};

/// Represents a call in the summary index graph, which can either be an
/// allocation or an interior callsite node in an allocation's context.
/// Holds a pointer to the corresponding data structure in the index.
struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
  IndexCall() : PointerUnion() {}
  IndexCall(std::nullptr_t) : IndexCall() {}
  IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
  IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
  IndexCall(PointerUnion PT) : PointerUnion(PT) {}

  IndexCall *operator->() { return this; }

  void print(raw_ostream &OS) const {
    PointerUnion<CallsiteInfo *, AllocInfo *> Base = *this;
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(Base)) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(Base);
      assert(CI);
      OS << *CI;
    }
  }
};
} // namespace

namespace llvm {
template <> struct simplify_type<IndexCall> {
  using SimpleType = PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(IndexCall &Val) { return Val; }
};
template <> struct simplify_type<const IndexCall> {
  using SimpleType = const PointerUnion<CallsiteInfo *, AllocInfo *>;
  static SimpleType getSimplifiedValue(const IndexCall &Val) { return Val; }
};
} // namespace llvm

namespace {
/// CRTP derived class for graphs built from summary index (ThinLTO).
class IndexCallsiteContextGraph
    : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                                  IndexCall> {
public:
  IndexCallsiteContextGraph(
      ModuleSummaryIndex &Index,
      llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
          isPrevailing);

  ~IndexCallsiteContextGraph() {
    // Now that we are done with the graph it is safe to add the new
    // CallsiteInfo structs to the function summary vectors. The graph nodes
    // point into locations within these vectors, so we don't want to add them
    // any earlier.
    for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
      auto *FS = I.first;
      for (auto &Callsite : I.second)
        FS->addCallsite(*Callsite.second);
    }
  }

private:
  friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                              IndexCall>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  const FunctionSummary *getCalleeFunc(IndexCall &Call);
  bool calleeMatchesFunc(
      IndexCall &Call, const FunctionSummary *Func,
      const FunctionSummary *CallerFunc,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
  bool sameCallee(IndexCall &Call1, IndexCall &Call2);
  bool findProfiledCalleeThroughTailCalls(
      ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(IndexCall &Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  AllocationType getAllocationCallType(const CallInfo &Call) const;
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                       IndexCall>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                       unsigned CloneNo) const;

  // Saves a mapping from function summaries containing memprof records back
  // to their VI, for use in checking and debugging.
  std::map<const FunctionSummary *, ValueInfo> FSToVIMap;

  const ModuleSummaryIndex &Index;
  llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
      isPrevailing;

  // Saves/owns the callsite info structures synthesized for missing tail call
  // frames that we discover while building the graph.
  // It maps from the summary of the function making the tail call, to a map
  // of callee ValueInfo to corresponding synthesized callsite info.
  std::unordered_map<FunctionSummary *,
                     std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
      FunctionCalleesToSynthesizedCallsiteInfos;
};
} // namespace

namespace llvm {
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
} // end namespace llvm

namespace {

// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
// type we should actually use on the corresponding allocation.
// If we can't clone a node that has NotCold+Cold alloc type, we will fall
// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
// from NotCold.
AllocationType allocTypeToUse(uint8_t AllocTypes) {
  assert(AllocTypes != (uint8_t)AllocationType::None);
  if (AllocTypes ==
      ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
    return AllocationType::NotCold;
  else
    return (AllocationType)AllocTypes;
}
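
// For example (illustrative values): a pure Cold mask maps to
// AllocationType::Cold, while the mixed NotCold|Cold mask conservatively maps
// to AllocationType::NotCold, since per the comment above we won't clone just
// to separate those two behaviors.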

// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for all edges in the Edges
// vector.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatch(
    const std::vector<uint8_t> &InAllocTypes,
    const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
        &Edges) {
  // This should be called only when the InAllocTypes vector was computed for
  // this set of Edges. Make sure the sizes are the same.
  assert(InAllocTypes.size() == Edges.size());
  return std::equal(
      InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), Edges.end(),
      [](const uint8_t &l,
         const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
        // Can share if one of the edges is None type - don't
        // care about the type along that edge as it doesn't
        // exist for those context ids.
        if (l == (uint8_t)AllocationType::None ||
            r->AllocTypes == (uint8_t)AllocationType::None)
          return true;
        return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
      });
}

// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for callee edges in the given
// clone. Because the InAllocTypes were computed from the original node's callee
// edges, and other cloning could have happened after this clone was created, we
// need to find the matching clone callee edge, which may or may not exist.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatchClone(
    const std::vector<uint8_t> &InAllocTypes,
    const ContextNode<DerivedCCG, FuncTy, CallTy> *Clone) {
  const ContextNode<DerivedCCG, FuncTy, CallTy> *Node = Clone->CloneOf;
  assert(Node);
  // InAllocTypes should have been computed for the original node's callee
  // edges.
  assert(InAllocTypes.size() == Node->CalleeEdges.size());
  // First create a map of the clone callee edge callees to the edge alloc type.
  DenseMap<const ContextNode<DerivedCCG, FuncTy, CallTy> *, uint8_t>
      EdgeCalleeMap;
  for (const auto &E : Clone->CalleeEdges) {
    assert(!EdgeCalleeMap.contains(E->Callee));
    EdgeCalleeMap[E->Callee] = E->AllocTypes;
  }
  // Next, walk the original node's callees, and look for the corresponding
  // clone edge to that callee.
  for (unsigned I = 0; I < Node->CalleeEdges.size(); I++) {
    auto Iter = EdgeCalleeMap.find(Node->CalleeEdges[I]->Callee);
1096     // Not found is ok, we will simply add an edge if we use this clone.
1097     if (Iter == EdgeCalleeMap.end())
1098       continue;
1099     // Can share if one of the edges is None type - don't
1100     // care about the type along that edge as it doesn't
1101     // exist for those context ids.
1102     if (InAllocTypes[I] == (uint8_t)AllocationType::None ||
1103         Iter->second == (uint8_t)AllocationType::None)
1104       continue;
1105     if (allocTypeToUse(Iter->second) != allocTypeToUse(InAllocTypes[I]))
1106       return false;
1107   }
1108   return true;
1109 }
1110 
1111 } // end anonymous namespace
1112 
1113 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1114 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1115 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
1116     const CallInfo &C) {
1117   ContextNode *Node = getNodeForAlloc(C);
1118   if (Node)
1119     return Node;
1120 
1121   return NonAllocationCallToContextNodeMap.lookup(C);
1122 }
1123 
1124 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1125 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1126 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
1127     const CallInfo &C) {
1128   return AllocationCallToContextNodeMap.lookup(C);
1129 }
1130 
1131 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1132 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1133 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
1134     uint64_t StackId) {
1135   auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
1136   if (StackEntryNode != StackEntryIdToContextNodeMap.end())
1137     return StackEntryNode->second;
1138   return nullptr;
1139 }
1140 
1141 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1142 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1143     addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
1144                           unsigned int ContextId) {
1145   for (auto &Edge : CallerEdges) {
1146     if (Edge->Caller == Caller) {
1147       Edge->AllocTypes |= (uint8_t)AllocType;
1148       Edge->getContextIds().insert(ContextId);
1149       return;
1150     }
1151   }
1152   std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
1153       this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
1154   CallerEdges.push_back(Edge);
1155   Caller->CalleeEdges.push_back(Edge);
1156 }
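// Note that repeated calls for the same Caller widen the one existing edge
// (ORing in the alloc type and inserting the context id) rather than creating
// duplicate parallel edges; the shared_ptr is what lets a single ContextEdge
// live on both the callee's CallerEdges and the caller's CalleeEdges lists.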
1157 
1158 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1159 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::removeEdgeFromGraph(
1160     ContextEdge *Edge, EdgeIter *EI, bool CalleeIter) {
1161   assert(!EI || (*EI)->get() == Edge);
1162   assert(!Edge->isRemoved());
1163   // Save the Caller and Callee pointers so we can erase Edge from their edge
1164   // lists after clearing Edge below. We do the clearing first in case it is
1165   // destructed after removing from the edge lists (if those were the last
1166   // shared_ptr references to Edge).
1167   auto *Callee = Edge->Callee;
1168   auto *Caller = Edge->Caller;
1169 
1170   // Make sure the edge fields are cleared out so we can properly detect
1171   // removed edges if Edge is not destructed because there is still a shared_ptr
1172   // reference.
1173   Edge->clear();
1174 
1175 #ifndef NDEBUG
1176   auto CalleeCallerCount = Callee->CallerEdges.size();
1177   auto CallerCalleeCount = Caller->CalleeEdges.size();
1178 #endif
1179   if (!EI) {
1180     Callee->eraseCallerEdge(Edge);
1181     Caller->eraseCalleeEdge(Edge);
1182   } else if (CalleeIter) {
1183     Callee->eraseCallerEdge(Edge);
1184     *EI = Caller->CalleeEdges.erase(*EI);
1185   } else {
1186     Caller->eraseCalleeEdge(Edge);
1187     *EI = Callee->CallerEdges.erase(*EI);
1188   }
1189   assert(Callee->CallerEdges.size() < CalleeCallerCount);
1190   assert(Caller->CalleeEdges.size() < CallerCalleeCount);
1191 }
1192 
1193 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1194 void CallsiteContextGraph<
1195     DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
1196   for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1197     auto Edge = *EI;
1198     if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1199       assert(Edge->ContextIds.empty());
1200       removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
1201     } else
1202       ++EI;
1203   }
1204 }
1205 
1206 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1207 void CallsiteContextGraph<
1208     DerivedCCG, FuncTy, CallTy>::removeNoneTypeCallerEdges(ContextNode *Node) {
1209   for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
1210     auto Edge = *EI;
1211     if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
1212       assert(Edge->ContextIds.empty());
1213       Edge->Caller->eraseCalleeEdge(Edge.get());
1214       EI = Node->CallerEdges.erase(EI);
1215     } else
1216       ++EI;
1217   }
1218 }
1219 
1220 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1221 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1222 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1223     findEdgeFromCallee(const ContextNode *Callee) {
1224   for (const auto &Edge : CalleeEdges)
1225     if (Edge->Callee == Callee)
1226       return Edge.get();
1227   return nullptr;
1228 }
1229 
1230 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1231 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
1232 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1233     findEdgeFromCaller(const ContextNode *Caller) {
1234   for (const auto &Edge : CallerEdges)
1235     if (Edge->Caller == Caller)
1236       return Edge.get();
1237   return nullptr;
1238 }
1239 
1240 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1241 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1242     eraseCalleeEdge(const ContextEdge *Edge) {
1243   auto EI = llvm::find_if(
1244       CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
1245         return CalleeEdge.get() == Edge;
1246       });
1247   assert(EI != CalleeEdges.end());
1248   CalleeEdges.erase(EI);
1249 }
1250 
1251 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1252 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
1253     eraseCallerEdge(const ContextEdge *Edge) {
1254   auto EI = llvm::find_if(
1255       CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
1256         return CallerEdge.get() == Edge;
1257       });
1258   assert(EI != CallerEdges.end());
1259   CallerEdges.erase(EI);
1260 }
1261 
1262 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1263 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
1264     DenseSet<uint32_t> &ContextIds) const {
1265   uint8_t BothTypes =
1266       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1267   uint8_t AllocType = (uint8_t)AllocationType::None;
1268   for (auto Id : ContextIds) {
1269     AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1270     // Bail early if alloc type reached both, no further refinement.
1271     if (AllocType == BothTypes)
1272       return AllocType;
1273   }
1274   return AllocType;
1275 }
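// For example, with hypothetical ids where ContextIdToAllocationType maps
// {1 -> NotCold, 2 -> Cold}, computeAllocType({1, 2}) returns NotCold|Cold,
// stopping the scan as soon as both bits have been set.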
1276 
1277 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1278 uint8_t
1279 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
1280     const DenseSet<uint32_t> &Node1Ids,
1281     const DenseSet<uint32_t> &Node2Ids) const {
1282   uint8_t BothTypes =
1283       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
1284   uint8_t AllocType = (uint8_t)AllocationType::None;
1285   for (auto Id : Node1Ids) {
1286     if (!Node2Ids.count(Id))
1287       continue;
1288     AllocType |= (uint8_t)ContextIdToAllocationType.at(Id);
1289     // Bail early if alloc type reached both, no further refinement.
1290     if (AllocType == BothTypes)
1291       return AllocType;
1292   }
1293   return AllocType;
1294 }
1295 
1296 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1297 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
1298     const DenseSet<uint32_t> &Node1Ids,
1299     const DenseSet<uint32_t> &Node2Ids) const {
1300   if (Node1Ids.size() < Node2Ids.size())
1301     return intersectAllocTypesImpl(Node1Ids, Node2Ids);
1302   else
1303     return intersectAllocTypesImpl(Node2Ids, Node1Ids);
1304 }
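// Dispatching on set size means the implementation iterates the smaller id
// set and probes the larger one, bounding the work by the smaller set's size.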
1305 
1306 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1307 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
1308 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
1309     CallInfo Call, const FuncTy *F) {
1310   assert(!getNodeForAlloc(Call));
1311   ContextNode *AllocNode = createNewNode(/*IsAllocation=*/true, F, Call);
1312   AllocationCallToContextNodeMap[Call] = AllocNode;
1313   // Use LastContextId as a unique id for MIB allocation nodes.
1314   AllocNode->OrigStackOrAllocId = LastContextId;
1315   // Alloc type should be updated as we add in the MIBs. We should assert
1316   // afterwards that it is not still None.
1317   AllocNode->AllocTypes = (uint8_t)AllocationType::None;
1318 
1319   return AllocNode;
1320 }
1321 
1322 static std::string getAllocTypeString(uint8_t AllocTypes) {
1323   if (!AllocTypes)
1324     return "None";
1325   std::string Str;
1326   if (AllocTypes & (uint8_t)AllocationType::NotCold)
1327     Str += "NotCold";
1328   if (AllocTypes & (uint8_t)AllocationType::Cold)
1329     Str += "Cold";
1330   return Str;
1331 }
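// Sample outputs: 0 -> "None", NotCold -> "NotCold", Cold -> "Cold", and
// NotCold|Cold -> "NotColdCold" (the names of the set bits are concatenated).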
1332 
1333 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1334 template <class NodeT, class IteratorT>
1335 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
1336     ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
1337     CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
1338     ArrayRef<ContextTotalSize> ContextSizeInfo) {
1339   // Treat the hot alloc type as NotCold until the disambiguation for "hot"
1340   // is done.
1341   if (AllocType == AllocationType::Hot)
1342     AllocType = AllocationType::NotCold;
1343 
1344   ContextIdToAllocationType[++LastContextId] = AllocType;
1345 
1346   if (!ContextSizeInfo.empty()) {
1347     auto &Entry = ContextIdToContextSizeInfos[LastContextId];
1348     Entry.insert(Entry.begin(), ContextSizeInfo.begin(), ContextSizeInfo.end());
1349   }
1350 
1351   // Update alloc type and context ids for this MIB.
1352   AllocNode->AllocTypes |= (uint8_t)AllocType;
1353 
1354   // Now add or update nodes for each stack id in alloc's context.
1355   // Later when processing the stack ids on non-alloc callsites we will adjust
1356   // for any inlining in the context.
1357   ContextNode *PrevNode = AllocNode;
1358   // Look for recursion (direct recursion should have been collapsed by
1359   // module summary analysis, here we should just be detecting mutual
1360   // recursion). Mark these nodes so we don't try to clone.
1361   SmallSet<uint64_t, 8> StackIdSet;
1362   // Skip any stack ids already on the allocation call itself (due to inlining).
1363   for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
1364        ContextIter != StackContext.end(); ++ContextIter) {
1365     auto StackId = getStackId(*ContextIter);
1366     ContextNode *StackNode = getNodeForStackId(StackId);
1367     if (!StackNode) {
1368       StackNode = createNewNode(/*IsAllocation=*/false);
1369       StackEntryIdToContextNodeMap[StackId] = StackNode;
1370       StackNode->OrigStackOrAllocId = StackId;
1371     }
1372     // Marking a node recursive will prevent its cloning completely, even for
1373     // non-recursive contexts flowing through it.
1374     if (!AllowRecursiveCallsites) {
1375       auto Ins = StackIdSet.insert(StackId);
1376       if (!Ins.second)
1377         StackNode->Recursive = true;
1378     }
1379     StackNode->AllocTypes |= (uint8_t)AllocType;
1380     PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
1381     PrevNode = StackNode;
1382   }
1383 }
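// Illustrative walk-through (hypothetical frames): for an MIB whose stack
// context after the shared callsite prefix is [A, B, C] (innermost to
// outermost), this allocates one new context id and builds caller edges
// AllocNode <- A <- B <- C, ORing AllocType into each node and edge along the
// way. A stack id repeated within the same context marks its node Recursive
// (unless AllowRecursiveCallsites), which disables cloning through it.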
1384 
1385 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1386 DenseSet<uint32_t>
1387 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
1388     const DenseSet<uint32_t> &StackSequenceContextIds,
1389     DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1390   DenseSet<uint32_t> NewContextIds;
1391   for (auto OldId : StackSequenceContextIds) {
1392     NewContextIds.insert(++LastContextId);
1393     OldToNewContextIds[OldId].insert(LastContextId);
1394     assert(ContextIdToAllocationType.count(OldId));
1395     // The new context has the same allocation type as original.
1396     ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
1397     if (DotAllocContextIds.contains(OldId))
1398       DotAllocContextIds.insert(LastContextId);
1399   }
1400   return NewContextIds;
1401 }
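// Sketch with hypothetical ids: duplicating {5, 9} when LastContextId is 20
// creates fresh ids 21 and 22, each inheriting the alloc type of the old id
// it was cloned from, and records the mapping (e.g. 5 -> {21}) in
// OldToNewContextIds; the map values are sets because the same old id may be
// duplicated more than once across calls.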
1402 
1403 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1404 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1405     propagateDuplicateContextIds(
1406         const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
1407   // Build a set of duplicated context ids corresponding to the input id set.
1408   auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
1409     DenseSet<uint32_t> NewIds;
1410     for (auto Id : ContextIds)
1411       if (auto NewId = OldToNewContextIds.find(Id);
1412           NewId != OldToNewContextIds.end())
1413         NewIds.insert_range(NewId->second);
1414     return NewIds;
1415   };
1416 
1417   // Recursively update context ids sets along caller edges.
1418   auto UpdateCallers = [&](ContextNode *Node,
1419                            DenseSet<const ContextEdge *> &Visited,
1420                            auto &&UpdateCallers) -> void {
1421     for (const auto &Edge : Node->CallerEdges) {
1422       auto Inserted = Visited.insert(Edge.get());
1423       if (!Inserted.second)
1424         continue;
1425       ContextNode *NextNode = Edge->Caller;
1426       DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
1427       // Only need to recursively iterate to NextNode via this caller edge if
1428       // it resulted in any added ids to NextNode.
1429       if (!NewIdsToAdd.empty()) {
1430         Edge->getContextIds().insert_range(NewIdsToAdd);
1431         UpdateCallers(NextNode, Visited, UpdateCallers);
1432       }
1433     }
1434   };
1435 
1436   DenseSet<const ContextEdge *> Visited;
1437   for (auto &Entry : AllocationCallToContextNodeMap) {
1438     auto *Node = Entry.second;
1439     UpdateCallers(Node, Visited, UpdateCallers);
1440   }
1441 }
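// In effect this is a DFS from each allocation node along caller edges:
// wherever an edge's id set contains an old id, the corresponding duplicated
// ids are added to that edge, and the walk continues past an edge only if it
// actually gained new ids.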
1442 
1443 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1444 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
1445     ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
1446     // This must be passed by value to make a copy since it will be adjusted
1447     // as ids are moved.
1448     DenseSet<uint32_t> RemainingContextIds) {
1449   auto &OrigEdges =
1450       TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
1451   DenseSet<uint32_t> RecursiveContextIds;
1452   DenseSet<uint32_t> AllCallerContextIds;
1453   if (AllowRecursiveCallsites) {
1454     // Identify which context ids are recursive, which is needed to properly
1455     // update the RemainingContextIds set. The relevant recursive context ids
1456     // are those that are in multiple edges.
1457     for (auto &CE : OrigEdges) {
1458       AllCallerContextIds.reserve(CE->getContextIds().size());
1459       for (auto Id : CE->getContextIds())
1460         if (!AllCallerContextIds.insert(Id).second)
1461           RecursiveContextIds.insert(Id);
1462     }
1463   }
1464   // Increment iterator in loop so that we can remove edges as needed.
1465   for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
1466     auto Edge = *EI;
1467     DenseSet<uint32_t> NewEdgeContextIds;
1468     DenseSet<uint32_t> NotFoundContextIds;
1469     // Remove any matching context ids from Edge, returning the set that was
1470     // found and removed; these become the new edge's context ids. Also update
1471     // the remaining (not found) ids.
1472     set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
1473                  NotFoundContextIds);
1474     // Update the remaining context ids set for the later edges. This is a
1475     // compile time optimization.
1476     if (RecursiveContextIds.empty()) {
1477       // No recursive ids, so all of the previously remaining context ids that
1478       // were not seen on this edge are the new remaining set.
1479       RemainingContextIds.swap(NotFoundContextIds);
1480     } else {
1481       // Keep the recursive ids in the remaining set as we expect to see those
1482       // on another edge. We can remove the non-recursive remaining ids that
1483       // were seen on this edge, however. We already have the set of remaining
1484       // ids that were on this edge (in NewEdgeContextIds). Figure out which are
1485       // non-recursive and only remove those. Note that despite the higher
1486       // overhead of updating the remaining context ids set when recursion
1487       // handling is enabled, it was found to be at worst performance neutral
1488       // and in one case a clear win.
1489       DenseSet<uint32_t> NonRecursiveRemainingCurEdgeIds =
1490           set_difference(NewEdgeContextIds, RecursiveContextIds);
1491       set_subtract(RemainingContextIds, NonRecursiveRemainingCurEdgeIds);
1492     }
1493     // If no matching context ids for this edge, skip it.
1494     if (NewEdgeContextIds.empty()) {
1495       ++EI;
1496       continue;
1497     }
1498     if (TowardsCallee) {
1499       uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1500       auto NewEdge = std::make_shared<ContextEdge>(
1501           Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
1502       NewNode->CalleeEdges.push_back(NewEdge);
1503       NewEdge->Callee->CallerEdges.push_back(NewEdge);
1504     } else {
1505       uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
1506       auto NewEdge = std::make_shared<ContextEdge>(
1507           NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
1508       NewNode->CallerEdges.push_back(NewEdge);
1509       NewEdge->Caller->CalleeEdges.push_back(NewEdge);
1510     }
1511     // Remove old edge if context ids empty.
1512     if (Edge->getContextIds().empty()) {
1513       removeEdgeFromGraph(Edge.get(), &EI, TowardsCallee);
1514       continue;
1515     }
1516     ++EI;
1517   }
1518 }
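// Conceptually, connectNewNode splits OrigNode's edge set: any original edge
// whose context ids intersect RemainingContextIds has those ids moved onto a
// parallel edge attached to NewNode (toward the same callee or caller), and
// an original edge is removed entirely once its id set becomes empty.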
1519 
1520 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1521 static void checkEdge(
1522     const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
1523   // Confirm that alloc type is not None and that we have at least one context
1524   // id.
1525   assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
1526   assert(!Edge->ContextIds.empty());
1527 }
1528 
1529 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1530 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
1531                       bool CheckEdges = true) {
1532   if (Node->isRemoved())
1533     return;
1534 #ifndef NDEBUG
1535   // Compute node's context ids once for use in asserts.
1536   auto NodeContextIds = Node->getContextIds();
1537 #endif
1538   // Node's context ids should be the union of both its callee and caller edge
1539   // context ids.
1540   if (Node->CallerEdges.size()) {
1541     DenseSet<uint32_t> CallerEdgeContextIds(
1542         Node->CallerEdges.front()->ContextIds);
1543     for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
1544       if (CheckEdges)
1545         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1546       set_union(CallerEdgeContextIds, Edge->ContextIds);
1547     }
1548     // Node can have more context ids than callers if some contexts terminate at
1549     // node and some are longer. If we are allowing recursive callsites and
1550     // contexts this will be violated for incompletely cloned recursive cycles,
1551     // so skip the checking in that case.
1552     assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
1553            NodeContextIds == CallerEdgeContextIds ||
1554            set_is_subset(CallerEdgeContextIds, NodeContextIds));
1555   }
1556   if (Node->CalleeEdges.size()) {
1557     DenseSet<uint32_t> CalleeEdgeContextIds(
1558         Node->CalleeEdges.front()->ContextIds);
1559     for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
1560       if (CheckEdges)
1561         checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
1562       set_union(CalleeEdgeContextIds, Edge->getContextIds());
1563     }
1564     // If we are allowing recursive callsites and contexts this will be violated
1565     // for incompletely cloned recursive cycles, so skip the checking in that
1566     // case.
1567     assert((AllowRecursiveCallsites && AllowRecursiveContexts) ||
1568            NodeContextIds == CalleeEdgeContextIds);
1569   }
1570   // FIXME: Since this checking is only invoked under an option, we should
1571   // change the error checking from using assert to something that will trigger
1572   // an error on a release build.
1573 #ifndef NDEBUG
1574   // Make sure we don't end up with duplicate edges between the same caller and
1575   // callee.
1576   DenseSet<ContextNode<DerivedCCG, FuncTy, CallTy> *> NodeSet;
1577   for (const auto &E : Node->CalleeEdges)
1578     NodeSet.insert(E->Callee);
1579   assert(NodeSet.size() == Node->CalleeEdges.size());
1580 #endif
1581 }
1582 
1583 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1584 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
1585     assignStackNodesPostOrder(
1586         ContextNode *Node, DenseSet<const ContextNode *> &Visited,
1587         DenseMap<uint64_t, std::vector<CallContextInfo>>
1588             &StackIdToMatchingCalls,
1589         DenseMap<CallInfo, CallInfo> &CallToMatchingCall) {
1590   auto Inserted = Visited.insert(Node);
1591   if (!Inserted.second)
1592     return;
1593   // Post order traversal. Iterate over a copy since we may add nodes and
1594   // therefore new callers during the recursive call, invalidating any
1595   // iterator over the original edge vector. We don't need to process these
1596   // new nodes as they were already processed on creation.
1597   auto CallerEdges = Node->CallerEdges;
1598   for (auto &Edge : CallerEdges) {
1599     // Skip any that have been removed during the recursion.
1600     if (Edge->isRemoved()) {
1601       assert(!is_contained(Node->CallerEdges, Edge));
1602       continue;
1603     }
1604     assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls,
1605                               CallToMatchingCall);
1606   }
1607 
1608   // If this node's stack id is in the map, update the graph to contain new
1609   // nodes representing any inlining at interior callsites. Note we move the
1610   // associated context ids over to the new nodes.
1611 
1612   // Ignore this node if it is for an allocation or we didn't record any
1613   // stack id lists ending at it.
1614   if (Node->IsAllocation ||
1615       !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
1616     return;
1617 
1618   auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
1619   // Handle the simple case first. A single call with a single stack id.
1620   // In this case there is no need to create any new context nodes, simply
1621   // assign the context node for stack id to this Call.
1622   if (Calls.size() == 1) {
1623     auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
1624     if (Ids.size() == 1) {
1625       assert(SavedContextIds.empty());
1626       // It should be this Node.
1627       assert(Node == getNodeForStackId(Ids[0]));
1628       if (Node->Recursive)
1629         return;
1630       Node->setCall(Call);
1631       NonAllocationCallToContextNodeMap[Call] = Node;
1632       NodeToCallingFunc[Node] = Func;
1633       return;
1634     }
1635   }
1636 
1637 #ifndef NDEBUG
1638   // Find the node for the last stack id, which should be the same
1639   // across all calls recorded for this id, and is this node's id.
1640   uint64_t LastId = Node->OrigStackOrAllocId;
1641   ContextNode *LastNode = getNodeForStackId(LastId);
1642   // We should only have kept stack ids that had nodes.
1643   assert(LastNode);
1644   assert(LastNode == Node);
1645 #else
1646   ContextNode *LastNode = Node;
1647 #endif
1648 
1649   // Compute the last node's context ids once, as it is shared by all calls in
1650   // this entry.
1651   DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1652 
1653   [[maybe_unused]] bool PrevIterCreatedNode = false;
1654   bool CreatedNode = false;
1655   for (unsigned I = 0; I < Calls.size();
1656        I++, PrevIterCreatedNode = CreatedNode) {
1657     CreatedNode = false;
1658     auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1659     // Skip any for which we didn't assign any ids; these don't get a node in
1660     // the graph.
1661     if (SavedContextIds.empty()) {
1662       // If this call has a matching call (located in the same function and
1663       // having the same stack ids), simply add it to the context node created
1664       // for its matching call earlier. These can be treated the same through
1665       // cloning and get updated at the same time.
1666       if (!CallToMatchingCall.contains(Call))
1667         continue;
1668       auto MatchingCall = CallToMatchingCall[Call];
1669       if (!NonAllocationCallToContextNodeMap.contains(MatchingCall)) {
1670         // This should only happen if we had a prior iteration, and it didn't
1671         // create a node because of the below recomputation of context ids
1672         // finding none remaining and continuing early.
1673         assert(I > 0 && !PrevIterCreatedNode);
1674         continue;
1675       }
1676       NonAllocationCallToContextNodeMap[MatchingCall]->MatchingCalls.push_back(
1677           Call);
1678       continue;
1679     }
1680 
1681     assert(LastId == Ids.back());
1682 
1683     // Recompute the context ids for this stack id sequence (the
1684     // intersection of the context ids of the corresponding nodes).
1685     // Start with the ids we saved in the map for this call, which could be
1686     // duplicated context ids. We have to recompute as we might have overlap
1687     // between the saved context ids for different last nodes, and may have
1688     // already removed some of them during the post order traversal.
1689     set_intersect(SavedContextIds, LastNodeContextIds);
1690     ContextNode *PrevNode = LastNode;
1691     bool Skip = false;
1692     // Iterate backwards through the stack Ids, starting after the last Id
1693     // in the list, which was handled once outside for all Calls.
1694     for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1695       auto Id = *IdIter;
1696       ContextNode *CurNode = getNodeForStackId(Id);
1697       // We should only have kept stack ids that had nodes and weren't
1698       // recursive.
1699       assert(CurNode);
1700       assert(!CurNode->Recursive);
1701 
1702       auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1703       if (!Edge) {
1704         Skip = true;
1705         break;
1706       }
1707       PrevNode = CurNode;
1708 
1709       // Update the context ids, which is the intersection of the ids along
1710       // all edges in the sequence.
1711       set_intersect(SavedContextIds, Edge->getContextIds());
1712 
1713       // If we now have no context ids for the clone, skip this call.
1714       if (SavedContextIds.empty()) {
1715         Skip = true;
1716         break;
1717       }
1718     }
1719     if (Skip)
1720       continue;
1721 
1722     // Create new context node.
1723     ContextNode *NewNode = createNewNode(/*IsAllocation=*/false, Func, Call);
1724     NonAllocationCallToContextNodeMap[Call] = NewNode;
1725     CreatedNode = true;
1726     NewNode->AllocTypes = computeAllocType(SavedContextIds);
1727 
1728     ContextNode *FirstNode = getNodeForStackId(Ids[0]);
1729     assert(FirstNode);
1730 
1731     // Connect to callees of innermost stack frame in inlined call chain.
1732     // This updates the context ids for FirstNode's callees to reflect those
1733     // moved to NewNode.
1734     connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);
1735 
1736     // Connect to callers of outermost stack frame in inlined call chain.
1737     // This updates the context ids for LastNode's callers to reflect those
1738     // moved to NewNode.
1739     connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);
1740 
1741     // Now we need to remove context ids from edges/nodes between First and
1742     // Last Node.
1743     PrevNode = nullptr;
1744     for (auto Id : Ids) {
1745       ContextNode *CurNode = getNodeForStackId(Id);
1746       // We should only have kept stack ids that had nodes.
1747       assert(CurNode);
1748 
1749       // Remove the context ids moved to NewNode from CurNode, and the
1750       // edge from the prior node.
1751       if (PrevNode) {
1752         auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
1753         // If the sequence contained recursion, we might have already removed
1754         // some edges during the connectNewNode calls above.
1755         if (!PrevEdge) {
1756           PrevNode = CurNode;
1757           continue;
1758         }
1759         set_subtract(PrevEdge->getContextIds(), SavedContextIds);
1760         if (PrevEdge->getContextIds().empty())
1761           removeEdgeFromGraph(PrevEdge);
1762       }
1763       // Since we update the edges from leaf to tail, only look at the callee
1764       // edges. This isn't an alloc node, so if there are no callee edges, the
1765       // alloc type is None.
1766       CurNode->AllocTypes = CurNode->CalleeEdges.empty()
1767                                 ? (uint8_t)AllocationType::None
1768                                 : CurNode->computeAllocType();
1769       PrevNode = CurNode;
1770     }
1771     if (VerifyNodes) {
1772       checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
1773       for (auto Id : Ids) {
1774         ContextNode *CurNode = getNodeForStackId(Id);
1775         // We should only have kept stack ids that had nodes.
1776         assert(CurNode);
1777         checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
1778       }
1779     }
1780   }
1781 }
1782 
1783 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1784 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
1785   // Map of stack id to all calls with that as the last (outermost caller)
1786   // callsite id that has a context node (some might not due to pruning
1787   // performed during matching of the allocation profile contexts).
1788   // The CallContextInfo contains the Call and a list of its stack ids with
1789   // ContextNodes, the function containing Call, and the set of context ids
1790   // the analysis will eventually identify for use in any new node created
1791   // for that callsite.
1792   DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
1793   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
1794     for (auto &Call : CallsWithMetadata) {
1795       // Ignore allocations, already handled.
1796       if (AllocationCallToContextNodeMap.count(Call))
1797         continue;
1798       auto StackIdsWithContextNodes =
1799           getStackIdsWithContextNodesForCall(Call.call());
1800       // If there were no nodes created for MIBs on allocs (maybe this was in
1801       // the unambiguous part of the MIB stack that was pruned), ignore.
1802       if (StackIdsWithContextNodes.empty())
1803         continue;
1804       // Otherwise, record this Call along with the list of ids for the last
1805       // (outermost caller) stack id with a node.
1806       StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
1807           {Call.call(), StackIdsWithContextNodes, Func, {}});
1808     }
1809   }
1810 
1811   // First make a pass through all stack ids that correspond to a call,
1812   // as identified in the above loop. Compute the context ids corresponding to
1813   // each of these calls when they correspond to multiple stack ids due to
1814   // inlining. Perform any duplication of context ids required when
1815   // there is more than one call with the same stack ids. Their (possibly newly
1816   // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1817   DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
1818   // Save a map from each call to any that are found to match it. I.e. located
1819   // in the same function and have the same (possibly pruned) stack ids. We use
1820   // this to avoid creating extra graph nodes as they can be treated the same.
1821   DenseMap<CallInfo, CallInfo> CallToMatchingCall;
1822   for (auto &It : StackIdToMatchingCalls) {
1823     auto &Calls = It.getSecond();
1824     // Skip single calls with a single stack id. These don't need a new node.
1825     if (Calls.size() == 1) {
1826       auto &Ids = Calls[0].StackIds;
1827       if (Ids.size() == 1)
1828         continue;
1829     }
1830     // In order to do the best and maximal matching of inlined calls to context
1831     // node sequences we will sort the vectors of stack ids in descending order
1832     // of length, and within each length, lexicographically by stack id. The
1833     // latter is so that we can specially handle calls that have identical stack
1834     // id sequences (either due to cloning or artificially because of the MIB
1835     // context pruning). Those with the same Ids are then sorted by function to
1836     // facilitate efficiently mapping them to the same context node.
1837     // Because the functions are pointers, to ensure a stable sort first assign
1838     // each function pointer to its first index in the Calls array, and then use
1839     // that to sort by.
1840     DenseMap<const FuncTy *, unsigned> FuncToIndex;
1841     for (const auto &[Idx, CallCtxInfo] : enumerate(Calls))
1842       FuncToIndex.insert({CallCtxInfo.Func, Idx});
1843     llvm::stable_sort(
1844         Calls,
1845         [&FuncToIndex](const CallContextInfo &A, const CallContextInfo &B) {
1846           return A.StackIds.size() > B.StackIds.size() ||
1847                  (A.StackIds.size() == B.StackIds.size() &&
1848                   (A.StackIds < B.StackIds ||
1849                    (A.StackIds == B.StackIds &&
1850                     FuncToIndex[A.Func] < FuncToIndex[B.Func])));
1851         });
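    // Illustrative ordering (hypothetical stack ids): [1, 2, 3] sorts before
    // [1, 2] (longer sequences first), and [1, 2] sorts before [1, 3] (same
    // length, lexicographic). Calls with identical id lists are then ordered
    // by the first index of their function in Calls, so that all matching
    // calls from the same function end up adjacent.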
1852 
1853     // Find the node for the last stack id, which should be the same
1854     // across all calls recorded for this id, and is the id for this
1855     // entry in the StackIdToMatchingCalls map.
1856     uint64_t LastId = It.getFirst();
1857     ContextNode *LastNode = getNodeForStackId(LastId);
1858     // We should only have kept stack ids that had nodes.
1859     assert(LastNode);
1860 
1861     if (LastNode->Recursive)
1862       continue;
1863 
1864     // Initialize the context ids with the last node's. We will subsequently
1865     // refine the context ids by computing the intersection along all edges.
1866     DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds();
1867     assert(!LastNodeContextIds.empty());
1868 
1869 #ifndef NDEBUG
1870     // Save the set of functions seen for a particular set of the same stack
1871     // ids. This is used to ensure that they have been correctly sorted to be
1872     // adjacent in the Calls list, since we rely on that to efficiently place
1873     // all such matching calls onto the same context node.
1874     DenseSet<const FuncTy *> MatchingIdsFuncSet;
1875 #endif
1876 
1877     for (unsigned I = 0; I < Calls.size(); I++) {
1878       auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
1879       assert(SavedContextIds.empty());
1880       assert(LastId == Ids.back());
1881 
1882 #ifndef NDEBUG
1883       // If this call has a different set of ids than the last one, clear the
1884       // set used to ensure they are sorted properly.
1885       if (I > 0 && Ids != Calls[I - 1].StackIds)
1886         MatchingIdsFuncSet.clear();
1887 #endif
1888 
1889       // First compute the context ids for this stack id sequence (the
1890       // intersection of the context ids of the corresponding nodes).
1891       // Start with the remaining saved ids for the last node.
1892       assert(!LastNodeContextIds.empty());
1893       DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;
1894 
1895       ContextNode *PrevNode = LastNode;
1896       ContextNode *CurNode = LastNode;
1897       bool Skip = false;
1898 
1899       // Iterate backwards through the stack Ids, starting after the last Id
1900       // in the list, which was handled once outside for all Calls.
1901       for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
1902         auto Id = *IdIter;
1903         CurNode = getNodeForStackId(Id);
1904         // We should only have kept stack ids that had nodes.
1905         assert(CurNode);
1906 
1907         if (CurNode->Recursive) {
1908           Skip = true;
1909           break;
1910         }
1911 
1912         auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
1913         // If there is no edge then the nodes belong to different MIB contexts,
1914         // and we should skip this inlined context sequence. For example, this
1915         // particular inlined context may include stack ids A->B, and we may
1916         // indeed have nodes for both A and B, but it is possible that they were
1917         // never profiled in sequence in a single MIB for any allocation (i.e.
1918         // we might have profiled an allocation that involves the callsite A,
1919         // but through a different one of its callee callsites, and we might
1920         // have profiled an allocation that involves callsite B, but reached
1921         // from a different caller callsite).
1922         if (!Edge) {
1923           Skip = true;
1924           break;
1925         }
1926         PrevNode = CurNode;
1927 
1928         // Update the context ids, which is the intersection of the ids along
1929         // all edges in the sequence.
1930         set_intersect(StackSequenceContextIds, Edge->getContextIds());
1931 
1932         // If we now have no context ids for the clone, skip this call.
1933         if (StackSequenceContextIds.empty()) {
1934           Skip = true;
1935           break;
1936         }
1937       }
1938       if (Skip)
1939         continue;
1940 
1941       // If some of this call's stack ids did not have corresponding nodes (due
1942       // to pruning), don't include any context ids for contexts that extend
1943       // beyond these nodes. Otherwise we would be matching part of unrelated /
1944       // not fully matching stack contexts. To do this, subtract any context ids
1945       // found in caller nodes of the last node found above.
1946       if (Ids.back() != getLastStackId(Call)) {
1947         for (const auto &PE : LastNode->CallerEdges) {
1948           set_subtract(StackSequenceContextIds, PE->getContextIds());
1949           if (StackSequenceContextIds.empty())
1950             break;
1951         }
1952         // If we now have no context ids for the clone, skip this call.
1953         if (StackSequenceContextIds.empty())
1954           continue;
1955       }
1956 
1957 #ifndef NDEBUG
1958       // If the prior call had the same stack ids this set would not be empty.
1959       // Check if we already have a call that "matches" because it is located
1960       // in the same function. If the Calls list was sorted properly we should
1961       // not encounter this situation as all such entries should be adjacent
1962       // and processed in bulk further below.
1963       assert(!MatchingIdsFuncSet.contains(Func));
1964 
1965       MatchingIdsFuncSet.insert(Func);
1966 #endif
1967 
1968       // Check if the next set of stack ids is the same (since the Calls vector
1969       // of tuples is sorted by the stack ids we can just look at the next one).
1970       // If so, save them in the CallToMatchingCall map so that they get
1971       // assigned to the same context node, and skip them.
1972       bool DuplicateContextIds = false;
1973       for (unsigned J = I + 1; J < Calls.size(); J++) {
1974         auto &CallCtxInfo = Calls[J];
1975         auto &NextIds = CallCtxInfo.StackIds;
1976         if (NextIds != Ids)
1977           break;
1978         auto *NextFunc = CallCtxInfo.Func;
1979         if (NextFunc != Func) {
1980           // We have another Call with the same ids but that cannot share this
1981           // node, so we must duplicate ids for it.
1982           DuplicateContextIds = true;
1983           break;
1984         }
1985         auto &NextCall = CallCtxInfo.Call;
1986         CallToMatchingCall[NextCall] = Call;
1987         // Update I so that it gets incremented correctly to skip this call.
1988         I = J;
1989       }
1990 
1991       // If we don't have duplicate context ids, then we can assign all the
1992       // context ids computed for the original node sequence to this call.
1993       // If there are duplicate calls with the same stack ids then we synthesize
1994       // new context ids that are duplicates of the originals. These are
1995       // assigned to SavedContextIds, which is a reference into the map entry
1996       // for this call, allowing us to access these ids later on.
1997       OldToNewContextIds.reserve(OldToNewContextIds.size() +
1998                                  StackSequenceContextIds.size());
1999       SavedContextIds =
2000           DuplicateContextIds
2001               ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
2002               : StackSequenceContextIds;
2003       assert(!SavedContextIds.empty());
2004 
2005       if (!DuplicateContextIds) {
2006         // Update saved last node's context ids to remove those that are
2007         // assigned to other calls, so that it is ready for the next call at
2008         // this stack id.
2009         set_subtract(LastNodeContextIds, StackSequenceContextIds);
2010         if (LastNodeContextIds.empty())
2011           break;
2012       }
2013     }
2014   }
2015 
2016   // Propagate the duplicate context ids over the graph.
2017   propagateDuplicateContextIds(OldToNewContextIds);
2018 
2019   if (VerifyCCG)
2020     check();
2021 
2022   // Now perform a post-order traversal over the graph, starting with the
2023   // allocation nodes, essentially processing nodes from callers to callees.
2024   // For any that contains an id in the map, update the graph to contain new
2025   // nodes representing any inlining at interior callsites. Note we move the
2026   // associated context ids over to the new nodes.
2027   DenseSet<const ContextNode *> Visited;
2028   for (auto &Entry : AllocationCallToContextNodeMap)
2029     assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls,
2030                               CallToMatchingCall);
2031   if (VerifyCCG)
2032     check();
2033 }
2034 
2035 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
2036   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2037       Call->getMetadata(LLVMContext::MD_callsite));
2038   return CallsiteContext.back();
2039 }
2040 
2041 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
2042   assert(isa<CallsiteInfo *>(Call));
2043   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2044       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2045   // Need to convert index into stack id.
2046   return Index.getStackIdAtIndex(CallsiteContext.back());
2047 }
2048 
2049 static const std::string MemProfCloneSuffix = ".memprof.";
2050 
2051 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) {
2052   // We use CloneNo == 0 to refer to the original version, which doesn't get
2053   // renamed with a suffix.
2054   if (!CloneNo)
2055     return Base.str();
2056   return (Base + MemProfCloneSuffix + Twine(CloneNo)).str();
2057 }
2058 
2059 static bool isMemProfClone(const Function &F) {
2060   return F.getName().contains(MemProfCloneSuffix);
2061 }
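// For example, getMemProfFuncName("foo", 0) returns "foo" (the original copy
// is never renamed), while getMemProfFuncName("foo", 2) returns
// "foo.memprof.2"; isMemProfClone then identifies the latter by its
// ".memprof." infix.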
2062 
2063 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func,
2064                                                  const Instruction *Call,
2065                                                  unsigned CloneNo) const {
2066   return (Twine(Call->getFunction()->getName()) + " -> " +
2067           cast<CallBase>(Call)->getCalledFunction()->getName())
2068       .str();
2069 }
2070 
2071 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func,
2072                                                 const IndexCall &Call,
2073                                                 unsigned CloneNo) const {
2074   auto VI = FSToVIMap.find(Func);
2075   assert(VI != FSToVIMap.end());
2076   if (isa<AllocInfo *>(Call))
2077     return (VI->second.name() + " -> alloc").str();
2078   else {
2079     auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call);
2080     return (VI->second.name() + " -> " +
2081             getMemProfFuncName(Callsite->Callee.name(),
2082                                Callsite->Clones[CloneNo]))
2083         .str();
2084   }
2085 }
2086 
2087 std::vector<uint64_t>
2088 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall(
2089     Instruction *Call) {
2090   CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
2091       Call->getMetadata(LLVMContext::MD_callsite));
2092   return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>(
2093       CallsiteContext);
2094 }
2095 
2096 std::vector<uint64_t>
2097 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) {
2098   assert(isa<CallsiteInfo *>(Call));
2099   CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
2100       CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call));
2101   return getStackIdsWithContextNodes<CallsiteInfo,
2102                                      SmallVector<unsigned>::const_iterator>(
2103       CallsiteContext);
2104 }
2105 
2106 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2107 template <class NodeT, class IteratorT>
2108 std::vector<uint64_t>
2109 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes(
2110     CallStack<NodeT, IteratorT> &CallsiteContext) {
2111   std::vector<uint64_t> StackIds;
2112   for (auto IdOrIndex : CallsiteContext) {
2113     auto StackId = getStackId(IdOrIndex);
2114     ContextNode *Node = getNodeForStackId(StackId);
2115     if (!Node)
2116       break;
2117     StackIds.push_back(StackId);
2118   }
2119   return StackIds;
2120 }
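// Note the early break above: the result is the maximal leading subsequence
// of the call's stack ids (innermost frame first) that still has context
// nodes, so ids pruned during allocation context matching simply truncate
// the returned list.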
2121 
2122 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph(
2123     Module &M,
2124     llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter)
2125     : Mod(M), OREGetter(OREGetter) {
2126   for (auto &F : M) {
2127     std::vector<CallInfo> CallsWithMetadata;
2128     for (auto &BB : F) {
2129       for (auto &I : BB) {
2130         if (!isa<CallBase>(I))
2131           continue;
2132         if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) {
2133           CallsWithMetadata.push_back(&I);
2134           auto *AllocNode = addAllocNode(&I, &F);
2135           auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite);
2136           assert(CallsiteMD);
2137           CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD);
2138           // Add all of the MIBs and their stack nodes.
2139           for (auto &MDOp : MemProfMD->operands()) {
2140             auto *MIBMD = cast<const MDNode>(MDOp);
2141             std::vector<ContextTotalSize> ContextSizeInfo;
2142             // Collect the context size information if it exists.
2143             if (MIBMD->getNumOperands() > 2) {
2144               for (unsigned I = 2; I < MIBMD->getNumOperands(); I++) {
2145                 MDNode *ContextSizePair =
2146                     dyn_cast<MDNode>(MIBMD->getOperand(I));
2147                 assert(ContextSizePair->getNumOperands() == 2);
2148                 uint64_t FullStackId = mdconst::dyn_extract<ConstantInt>(
2149                                            ContextSizePair->getOperand(0))
2150                                            ->getZExtValue();
2151                 uint64_t TotalSize = mdconst::dyn_extract<ConstantInt>(
2152                                          ContextSizePair->getOperand(1))
2153                                          ->getZExtValue();
2154                 ContextSizeInfo.push_back({FullStackId, TotalSize});
2155               }
2156             }
2157             MDNode *StackNode = getMIBStackNode(MIBMD);
2158             assert(StackNode);
2159             CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode);
2160             addStackNodesForMIB<MDNode, MDNode::op_iterator>(
2161                 AllocNode, StackContext, CallsiteContext,
2162                 getMIBAllocType(MIBMD), ContextSizeInfo);
2163           }
2164           // If exporting the graph to dot and an allocation id of interest was
2165           // specified, record all the context ids for this allocation node.
2166           if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2167             DotAllocContextIds = AllocNode->getContextIds();
2168           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2169           // Memprof and callsite metadata on memory allocations no longer
2170           // needed.
2171           I.setMetadata(LLVMContext::MD_memprof, nullptr);
2172           I.setMetadata(LLVMContext::MD_callsite, nullptr);
2173         }
2174         // For callsite metadata, add to list for this function for later use.
2175         else if (I.getMetadata(LLVMContext::MD_callsite)) {
2176           CallsWithMetadata.push_back(&I);
2177         }
2178       }
2179     }
2180     if (!CallsWithMetadata.empty())
2181       FuncToCallsWithMetadata[&F] = CallsWithMetadata;
2182   }
2183 
2184   if (DumpCCG) {
2185     dbgs() << "CCG before updating call stack chains:\n";
2186     dbgs() << *this;
2187   }
2188 
2189   if (ExportToDot)
2190     exportToDot("prestackupdate");
2191 
2192   updateStackNodes();
2193 
2194   if (ExportToDot)
2195     exportToDot("poststackupdate");
2196 
2197   handleCallsitesWithMultipleTargets();
2198 
2199   markBackedges();
2200 
2201   // Strip off remaining callsite metadata, no longer needed.
2202   for (auto &FuncEntry : FuncToCallsWithMetadata)
2203     for (auto &Call : FuncEntry.second)
2204       Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr);
2205 }
2206 
2207 IndexCallsiteContextGraph::IndexCallsiteContextGraph(
2208     ModuleSummaryIndex &Index,
2209     llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
2210         isPrevailing)
2211     : Index(Index), isPrevailing(isPrevailing) {
2212   for (auto &I : Index) {
2213     auto VI = Index.getValueInfo(I);
2214     for (auto &S : VI.getSummaryList()) {
2215       // We should only add the prevailing nodes. Otherwise we may try to clone
2216       // in a weak copy that won't be linked (and may be different from the
2217       // prevailing version).
2218       // We only keep the memprof summary on the prevailing copy now when
2219       // building the combined index, as a space optimization; however, don't
2220       // rely on this optimization. The linker doesn't resolve local linkage
2221       // values so don't check whether those are prevailing.
2222       if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2223           !isPrevailing(VI.getGUID(), S.get()))
2224         continue;
2225       auto *FS = dyn_cast<FunctionSummary>(S.get());
2226       if (!FS)
2227         continue;
2228       std::vector<CallInfo> CallsWithMetadata;
2229       if (!FS->allocs().empty()) {
2230         for (auto &AN : FS->mutableAllocs()) {
2231           // This can happen because of recursion elimination handling that
2232           // currently exists in ModuleSummaryAnalysis. Skip these for now.
2233           // We still added them to the summary because we need to be able to
2234           // correlate properly in applyImport in the backends.
2235           if (AN.MIBs.empty())
2236             continue;
2237           IndexCall AllocCall(&AN);
2238           CallsWithMetadata.push_back(AllocCall);
2239           auto *AllocNode = addAllocNode(AllocCall, FS);
2240           // Pass an empty CallStack to the CallsiteContext (second)
2241           // parameter, since for ThinLTO we already collapsed out the inlined
2242           // stack ids on the allocation call during ModuleSummaryAnalysis.
2243           CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2244               EmptyContext;
2245           unsigned I = 0;
2246           assert(!metadataMayIncludeContextSizeInfo() ||
2247                  AN.ContextSizeInfos.size() == AN.MIBs.size());
2248           // Now add all of the MIBs and their stack nodes.
2249           for (auto &MIB : AN.MIBs) {
2250             CallStack<MIBInfo, SmallVector<unsigned>::const_iterator>
2251                 StackContext(&MIB);
2252             std::vector<ContextTotalSize> ContextSizeInfo;
2253             if (!AN.ContextSizeInfos.empty()) {
2254               for (auto [FullStackId, TotalSize] : AN.ContextSizeInfos[I])
2255                 ContextSizeInfo.push_back({FullStackId, TotalSize});
2256             }
2257             addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>(
2258                 AllocNode, StackContext, EmptyContext, MIB.AllocType,
2259                 ContextSizeInfo);
2260             I++;
2261           }
2262           // If exporting the graph to dot and an allocation id of interest was
2263           // specified, record all the context ids for this allocation node.
2264           if (ExportToDot && AllocNode->OrigStackOrAllocId == AllocIdForDot)
2265             DotAllocContextIds = AllocNode->getContextIds();
2266           assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None);
2267           // Initialize version 0 on the summary alloc node to the current alloc
2268           // type, unless it has both types, in which case make it default. That
2269           // way, if we aren't able to clone, the original version always ends up
2270           // with the default allocation behavior.
2271           AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes);
2272         }
2273       }
2274       // For callsite metadata, add to list for this function for later use.
2275       if (!FS->callsites().empty())
2276         for (auto &SN : FS->mutableCallsites()) {
2277           IndexCall StackNodeCall(&SN);
2278           CallsWithMetadata.push_back(StackNodeCall);
2279         }
2280 
2281       if (!CallsWithMetadata.empty())
2282         FuncToCallsWithMetadata[FS] = CallsWithMetadata;
2283 
2284       if (!FS->allocs().empty() || !FS->callsites().empty())
2285         FSToVIMap[FS] = VI;
2286     }
2287   }
2288 
2289   if (DumpCCG) {
2290     dbgs() << "CCG before updating call stack chains:\n";
2291     dbgs() << *this;
2292   }
2293 
2294   if (ExportToDot)
2295     exportToDot("prestackupdate");
2296 
2297   updateStackNodes();
2298 
2299   if (ExportToDot)
2300     exportToDot("poststackupdate");
2301 
2302   handleCallsitesWithMultipleTargets();
2303 
2304   markBackedges();
2305 }
2306 
2307 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2308 void CallsiteContextGraph<DerivedCCG, FuncTy,
2309                           CallTy>::handleCallsitesWithMultipleTargets() {
2310   // Look for and work around callsites that call multiple functions.
2311   // This can happen for indirect calls, which need better handling, and in
2312   // rarer cases (e.g. macro expansion).
2313   // TODO: To fix this for indirect calls we will want to perform speculative
2314   // devirtualization using either the normal PGO info with ICP, or using the
2315   // information in the profiled MemProf contexts. We can do this prior to
2316   // this transformation for regular LTO, and for ThinLTO we can simulate that
2317   // effect in the summary and perform the actual speculative devirtualization
2318   // while cloning in the ThinLTO backend.
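       // For example (hypothetical): an indirect callsite observed in the
       // profile calling both A::foo() and B::foo() gets one synthesized
       // CallsiteInfo record per target in the summary. All of those records
       // share the same callsite stack ids, so they all initially map to a
       // single context node here until partitioned below.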
2319 
2320   // Keep track of the new nodes synthesized for discovered tail calls missing
2321   // from the profiled contexts.
2322   MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
2323 
2324   std::vector<std::pair<CallInfo, ContextNode *>> NewCallToNode;
2325   for (auto &Entry : NonAllocationCallToContextNodeMap) {
2326     auto *Node = Entry.second;
2327     assert(Node->Clones.empty());
2328     // Check all node callees and see if in the same function.
2329     // We need to check all of the calls recorded in this Node, because in some
2330     // cases we may have had multiple calls with the same debug info calling
2331     // different callees. This can happen, for example, when an object is
2332   // constructed in the parameter list - the destructor call of the object has
2333     // the same debug info (line/col) as the call the object was passed to.
2334     // Here we will prune any that don't match all callee nodes.
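         // A hypothetical source-level example of the parameter list case:
         //   foo(Bar());
         // The implicit call to Bar's destructor can share the same line and
         // column in the debug info as the call to foo itself.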
2335     std::vector<CallInfo> AllCalls;
2336     AllCalls.reserve(Node->MatchingCalls.size() + 1);
2337     AllCalls.push_back(Node->Call);
2338     llvm::append_range(AllCalls, Node->MatchingCalls);
2339 
2340     // First see if we can partition the calls by callee function, creating new
2341     // nodes to host each set of calls calling the same callees. This is
2342   // necessary to support indirect calls with ThinLTO, for which we
2343     // synthesized CallsiteInfo records for each target. They will all have the
2344     // same callsite stack ids and would be sharing a context node at this
2345     // point. We need to perform separate cloning for each, which will be
2346     // applied along with speculative devirtualization in the ThinLTO backends
2347     // as needed. Note this does not currently support looking through tail
2348   // calls; it is unclear if we need that for indirect call targets.
2349   // First partition the calls by callee function. The map is indexed by
2350   // function; each value holds the matching calls and the assigned node.
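         // For instance (hypothetical), calls {c1 -> F, c2 -> G, c3 -> F}
         // sharing this node would partition into {F: [c1, c3]} and
         // {G: [c2]}, with each group getting its own context node.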
2351     if (partitionCallsByCallee(Node, AllCalls, NewCallToNode))
2352       continue;
2353 
2354     auto It = AllCalls.begin();
2355     // Iterate through the calls until we find the first that matches.
2356     for (; It != AllCalls.end(); ++It) {
2357       auto ThisCall = *It;
2358       bool Match = true;
2359       for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();
2360            ++EI) {
2361         auto Edge = *EI;
2362         if (!Edge->Callee->hasCall())
2363           continue;
2364         assert(NodeToCallingFunc.count(Edge->Callee));
2365         // Check if the called function matches that of the callee node.
2366         if (!calleesMatch(ThisCall.call(), EI, TailCallToContextNodeMap)) {
2367           Match = false;
2368           break;
2369         }
2370       }
2371       // Found a call that matches the callee nodes; we can quit now.
2372       if (Match) {
2373         // If the first match is not the primary call on the Node, update it
2374         // now. We will update the list of matching calls further below.
2375         if (Node->Call != ThisCall) {
2376           Node->setCall(ThisCall);
2377           // We need to update the NonAllocationCallToContextNodeMap, but don't
2378           // want to do this during iteration over that map, so save the calls
2379           // that need updated entries.
2380           NewCallToNode.push_back({ThisCall, Node});
2381         }
2382         break;
2383       }
2384     }
2385     // We will update this list below (or leave it cleared if there was no
2386     // match found above).
2387     Node->MatchingCalls.clear();
2388     // If we hit the end of the AllCalls vector, no call matching the callee
2389     // nodes was found, clear the call information in the node.
2390     if (It == AllCalls.end()) {
2391       RemovedEdgesWithMismatchedCallees++;
2392       // Work around by setting Node to have a null call, so it gets
2393       // skipped during cloning. Otherwise assignFunctions will assert
2394       // because its data structures are not designed to handle this case.
2395       Node->setCall(CallInfo());
2396       continue;
2397     }
2398     // Now add back any matching calls that call the same function as the
2399     // matching primary call on Node.
2400     for (++It; It != AllCalls.end(); ++It) {
2401       auto ThisCall = *It;
2402       if (!sameCallee(Node->Call.call(), ThisCall.call()))
2403         continue;
2404       Node->MatchingCalls.push_back(ThisCall);
2405     }
2406   }
2407 
2408   // Remove all mismatched nodes identified in the above loop from the node map
2409   // (checking whether they have a null call which is set above). For a
2410   // MapVector like NonAllocationCallToContextNodeMap it is much more efficient
2411   // to do the removal via remove_if than by individually erasing entries above.
2412   // Also remove any entries if we updated the node's primary call above.
2413   NonAllocationCallToContextNodeMap.remove_if([](const auto &it) {
2414     return !it.second->hasCall() || it.second->Call != it.first;
2415   });
2416 
2417   // Add entries for any new primary calls recorded above.
2418   for (auto &[Call, Node] : NewCallToNode)
2419     NonAllocationCallToContextNodeMap[Call] = Node;
2420 
2421   // Add the new nodes after the above loop so that the iteration is not
2422   // invalidated.
2423   for (auto &[Call, Node] : TailCallToContextNodeMap)
2424     NonAllocationCallToContextNodeMap[Call] = Node;
2425 }
2426 
2427 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2428 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::partitionCallsByCallee(
2429     ContextNode *Node, ArrayRef<CallInfo> AllCalls,
2430     std::vector<std::pair<CallInfo, ContextNode *>> &NewCallToNode) {
2431   // Struct to keep track of all the calls having the same callee function,
2432   // and the node we eventually assign to them. Eventually we will record the
2433   // context node assigned to this group of calls.
2434   struct CallsWithSameCallee {
2435     std::vector<CallInfo> Calls;
2436     ContextNode *Node = nullptr;
2437   };
2438 
2439   // First partition calls by callee function. Build map from each function
2440   // to the list of matching calls.
2441   DenseMap<const FuncTy *, CallsWithSameCallee> CalleeFuncToCallInfo;
2442   for (auto ThisCall : AllCalls) {
2443     auto *F = getCalleeFunc(ThisCall.call());
2444     if (F)
2445       CalleeFuncToCallInfo[F].Calls.push_back(ThisCall);
2446   }
2447 
2448   // Next, walk through all callee edges. For each callee node, get its
2449   // containing function and see if it was recorded in the above map (meaning we
2450   // have at least one matching call). Build another map from each callee node
2451   // with a matching call to the structure instance created above containing all
2452   // the calls.
2453   DenseMap<ContextNode *, CallsWithSameCallee *> CalleeNodeToCallInfo;
2454   for (const auto &Edge : Node->CalleeEdges) {
2455     if (!Edge->Callee->hasCall())
2456       continue;
2457     const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2458     if (CalleeFuncToCallInfo.contains(ProfiledCalleeFunc))
2459       CalleeNodeToCallInfo[Edge->Callee] =
2460           &CalleeFuncToCallInfo[ProfiledCalleeFunc];
2461   }
2462 
2463   // If there are no entries in the second map, then there were no matching
2464   // calls/callees; nothing to do here. Return so we can go to the handling that
2465   // looks through tail calls.
2466   if (CalleeNodeToCallInfo.empty())
2467     return false;
2468 
2469   // Walk through all callee edges again. Any and all callee edges that didn't
2470   // match any calls (callee not in the CalleeNodeToCallInfo map) are moved to a
2471   // new caller node (UnmatchedCalleesNode) which gets a null call so that it is
2472   // ignored during cloning. If it is in the map, then we use the node recorded
2473   // in that entry (creating it if needed), and move the callee edge to it.
2474   // The first callee will use the original node instead of creating a new one.
2475   // Note that any of the original calls on this node (in AllCalls) that didn't
2476   // have a callee function automatically get dropped from the node as part of
2477   // this process.
2478   ContextNode *UnmatchedCalleesNode = nullptr;
2479   // Track whether we already assigned original node to a callee.
2480   bool UsedOrigNode = false;
2481   assert(NodeToCallingFunc[Node]);
2482   // Iterate over a copy of Node's callee edges, since we may need to remove
2483   // edges in moveCalleeEdgeToNewCaller, and this simplifies the handling and
2484   // makes it less error-prone.
2485   auto CalleeEdges = Node->CalleeEdges;
2486   for (auto &Edge : CalleeEdges) {
2487     if (!Edge->Callee->hasCall())
2488       continue;
2489 
2490     // Will be updated below to point to whatever (caller) node this callee edge
2491     // should be moved to.
2492     ContextNode *CallerNodeToUse = nullptr;
2493 
2494     // Handle the case where there were no matching calls first. Move this
2495     // callee edge to the UnmatchedCalleesNode, creating it if needed.
2496     if (!CalleeNodeToCallInfo.contains(Edge->Callee)) {
2497       if (!UnmatchedCalleesNode)
2498         UnmatchedCalleesNode =
2499             createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2500       CallerNodeToUse = UnmatchedCalleesNode;
2501     } else {
2502       // Look up the information recorded for this callee node, and use the
2503       // recorded caller node (creating it if needed).
2504       auto *Info = CalleeNodeToCallInfo[Edge->Callee];
2505       if (!Info->Node) {
2506         // If we haven't assigned any callees to the original node use it.
2507         if (!UsedOrigNode) {
2508           Info->Node = Node;
2509           // Clear the set of matching calls which will be updated below.
2510           Node->MatchingCalls.clear();
2511           UsedOrigNode = true;
2512         } else
2513           Info->Node =
2514               createNewNode(/*IsAllocation=*/false, NodeToCallingFunc[Node]);
2515         assert(!Info->Calls.empty());
2516         // The first call becomes the primary call for this caller node, and the
2517         // rest go in the matching calls list.
2518         Info->Node->setCall(Info->Calls.front());
2519         llvm::append_range(Info->Node->MatchingCalls,
2520                            llvm::drop_begin(Info->Calls));
2521         // Save the primary call to node correspondence so that we can update
2522         // the NonAllocationCallToContextNodeMap, which is being iterated in the
2523         // caller of this function.
2524         NewCallToNode.push_back({Info->Node->Call, Info->Node});
2525       }
2526       CallerNodeToUse = Info->Node;
2527     }
2528 
2529     // Don't need to move the edge if we are using the original node.
2530     if (CallerNodeToUse == Node)
2531       continue;
2532 
2533     moveCalleeEdgeToNewCaller(Edge, CallerNodeToUse);
2534   }
2535   // Now that we are done moving edges, clean up any caller edges that ended
2536   // up with no type or context ids. During moveCalleeEdgeToNewCaller all
2537   // caller edges from Node are replicated onto the new callers, and it
2538   // simplifies the handling to leave them until we have moved all
2539   // edges/context ids.
2540   for (auto &I : CalleeNodeToCallInfo)
2541     removeNoneTypeCallerEdges(I.second->Node);
2542   if (UnmatchedCalleesNode)
2543     removeNoneTypeCallerEdges(UnmatchedCalleesNode);
2544   removeNoneTypeCallerEdges(Node);
2545 
2546   return true;
2547 }
2548 
2549 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2550   // In the Module (IR) case this is already the Id.
2551   return IdOrIndex;
2552 }
2553 
2554 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
2555   // In the Index case this is an index into the stack id list in the summary
2556   // index, convert it to an Id.
2557   return Index.getStackIdAtIndex(IdOrIndex);
2558 }
2559 
2560 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2561 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch(
2562     CallTy Call, EdgeIter &EI,
2563     MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) {
2564   auto Edge = *EI;
2565   const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee];
2566   const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller];
2567   // Will be populated in order of callee to caller if we find a chain of tail
2568   // calls between the profiled caller and callee.
2569   std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain;
2570   if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc,
2571                          FoundCalleeChain))
2572     return false;
2573 
2574   // The usual case where the profiled callee matches that of the IR/summary.
2575   if (FoundCalleeChain.empty())
2576     return true;
2577 
2578   auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) {
2579     auto *CurEdge = Callee->findEdgeFromCaller(Caller);
2580     // If there is already an edge between these nodes, simply update it and
2581     // return.
2582     if (CurEdge) {
2583       CurEdge->ContextIds.insert_range(Edge->ContextIds);
2584       CurEdge->AllocTypes |= Edge->AllocTypes;
2585       return;
2586     }
2587     // Otherwise, create a new edge and insert it into the caller and callee
2588     // lists.
2589     auto NewEdge = std::make_shared<ContextEdge>(
2590         Callee, Caller, Edge->AllocTypes, Edge->ContextIds);
2591     Callee->CallerEdges.push_back(NewEdge);
2592     if (Caller == Edge->Caller) {
2593       // If we are inserting the new edge into the current edge's caller, insert
2594       // the new edge before the current iterator position, and then increment
2595       // back to the current edge.
2596       EI = Caller->CalleeEdges.insert(EI, NewEdge);
2597       ++EI;
2598       assert(*EI == Edge &&
2599              "Iterator position not restored after insert and increment");
2600     } else
2601       Caller->CalleeEdges.push_back(NewEdge);
2602   };
2603 
2604   // Create new nodes for each found callee and connect in between the profiled
2605   // caller and callee.
2606   auto *CurCalleeNode = Edge->Callee;
2607   for (auto &[NewCall, Func] : FoundCalleeChain) {
2608     ContextNode *NewNode = nullptr;
2609     // First check if we have already synthesized a node for this tail call.
2610     if (TailCallToContextNodeMap.count(NewCall)) {
2611       NewNode = TailCallToContextNodeMap[NewCall];
2612       NewNode->AllocTypes |= Edge->AllocTypes;
2613     } else {
2614       FuncToCallsWithMetadata[Func].push_back({NewCall});
2615       // Create Node and record node info.
2616       NewNode = createNewNode(/*IsAllocation=*/false, Func, NewCall);
2617       TailCallToContextNodeMap[NewCall] = NewNode;
2618       NewNode->AllocTypes = Edge->AllocTypes;
2619     }
2620 
2621     // Hook up node to its callee node
2622     AddEdge(NewNode, CurCalleeNode);
2623 
2624     CurCalleeNode = NewNode;
2625   }
2626 
2627   // Hook up edge's original caller to new callee node.
2628   AddEdge(Edge->Caller, CurCalleeNode);
2629 
2630 #ifndef NDEBUG
2631   // Save this because Edge's fields get cleared below when removed.
2632   auto *Caller = Edge->Caller;
2633 #endif
2634 
2635   // Remove old edge
2636   removeEdgeFromGraph(Edge.get(), &EI, /*CalleeIter=*/true);
2637 
2638   // To simplify the increment of EI in the caller, subtract one from EI.
2639   // In the final AddEdge call we would have either added a new callee edge
2640   // to Edge->Caller, or found an existing one. Either way we are guaranteed
2641   // that there is at least one callee edge.
2642   assert(!Caller->CalleeEdges.empty());
2643   --EI;
2644 
2645   return true;
2646 }
2647 
2648 bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2649     const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
2650     std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
2651     bool &FoundMultipleCalleeChains) {
2652   // Stop recursive search if we have already explored the maximum specified
2653   // depth.
2654   if (Depth > TailCallSearchDepth)
2655     return false;
2656 
2657   auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) {
2658     FoundCalleeChain.push_back({Callsite, F});
2659   };
2660 
2661   auto *CalleeFunc = dyn_cast<Function>(CurCallee);
2662   if (!CalleeFunc) {
2663     auto *Alias = dyn_cast<GlobalAlias>(CurCallee);
2664     assert(Alias);
2665     CalleeFunc = dyn_cast<Function>(Alias->getAliasee());
2666     assert(CalleeFunc);
2667   }
2668 
2669   // Look for tail calls in this function, and check if they either call the
2670   // profiled callee directly, or indirectly (via a recursive search).
2671   // Only succeed if there is a single unique tail call chain found between the
2672   // profiled caller and callee; otherwise we could perform incorrect cloning.
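       // E.g. (hypothetical): the profile records caller A reaching the
       // profiled callee C directly, but in the IR A calls B, which tail
       // calls C. The search below discovers the unique chain through B and
       // records it in FoundCalleeChain, so that the caller can fix up the
       // graph with a node for B's callsite.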
2673   bool FoundSingleCalleeChain = false;
2674   for (auto &BB : *CalleeFunc) {
2675     for (auto &I : BB) {
2676       auto *CB = dyn_cast<CallBase>(&I);
2677       if (!CB || !CB->isTailCall())
2678         continue;
2679       auto *CalledValue = CB->getCalledOperand();
2680       auto *CalledFunction = CB->getCalledFunction();
2681       if (CalledValue && !CalledFunction) {
2682         CalledValue = CalledValue->stripPointerCasts();
2683         // Stripping pointer casts can reveal a called function.
2684         CalledFunction = dyn_cast<Function>(CalledValue);
2685       }
2686       // Check if this is an alias to a function. If so, get the
2687       // called aliasee for the checks below.
2688       if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
2689         assert(!CalledFunction &&
2690                "Expected null called function in callsite for alias");
2691         CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
2692       }
2693       if (!CalledFunction)
2694         continue;
2695       if (CalledFunction == ProfiledCallee) {
2696         if (FoundSingleCalleeChain) {
2697           FoundMultipleCalleeChains = true;
2698           return false;
2699         }
2700         FoundSingleCalleeChain = true;
2701         FoundProfiledCalleeCount++;
2702         FoundProfiledCalleeDepth += Depth;
2703         if (Depth > FoundProfiledCalleeMaxDepth)
2704           FoundProfiledCalleeMaxDepth = Depth;
2705         SaveCallsiteInfo(&I, CalleeFunc);
2706       } else if (findProfiledCalleeThroughTailCalls(
2707                      ProfiledCallee, CalledFunction, Depth + 1,
2708                      FoundCalleeChain, FoundMultipleCalleeChains)) {
2709         // findProfiledCalleeThroughTailCalls should not have returned
2710         // true if FoundMultipleCalleeChains.
2711         assert(!FoundMultipleCalleeChains);
2712         if (FoundSingleCalleeChain) {
2713           FoundMultipleCalleeChains = true;
2714           return false;
2715         }
2716         FoundSingleCalleeChain = true;
2717         SaveCallsiteInfo(&I, CalleeFunc);
2718       } else if (FoundMultipleCalleeChains)
2719         return false;
2720     }
2721   }
2722 
2723   return FoundSingleCalleeChain;
2724 }
2725 
2726 const Function *ModuleCallsiteContextGraph::getCalleeFunc(Instruction *Call) {
2727   auto *CB = dyn_cast<CallBase>(Call);
2728   if (!CB->getCalledOperand() || CB->isIndirectCall())
2729     return nullptr;
2730   auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2731   auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2732   if (Alias)
2733     return dyn_cast<Function>(Alias->getAliasee());
2734   return dyn_cast<Function>(CalleeVal);
2735 }
2736 
2737 bool ModuleCallsiteContextGraph::calleeMatchesFunc(
2738     Instruction *Call, const Function *Func, const Function *CallerFunc,
2739     std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) {
2740   auto *CB = dyn_cast<CallBase>(Call);
2741   if (!CB->getCalledOperand() || CB->isIndirectCall())
2742     return false;
2743   auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts();
2744   auto *CalleeFunc = dyn_cast<Function>(CalleeVal);
2745   if (CalleeFunc == Func)
2746     return true;
2747   auto *Alias = dyn_cast<GlobalAlias>(CalleeVal);
2748   if (Alias && Alias->getAliasee() == Func)
2749     return true;
2750 
2751   // Recursively search for the profiled callee through tail calls starting with
2752   // the actual Callee. The discovered tail call chain is saved in
2753   // FoundCalleeChain, and we will fix up the graph to include these callsites
2754   // after returning.
2755   // FIXME: We will currently redo the same recursive walk if we find the same
2756   // mismatched callee from another callsite. We can improve this with more
2757   // bookkeeping of the created chain of new nodes for each mismatch.
2758   unsigned Depth = 1;
2759   bool FoundMultipleCalleeChains = false;
2760   if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth,
2761                                           FoundCalleeChain,
2762                                           FoundMultipleCalleeChains)) {
2763     LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: "
2764                       << Func->getName() << " from " << CallerFunc->getName()
2765                       << " that actually called " << CalleeVal->getName()
2766                       << (FoundMultipleCalleeChains
2767                               ? " (found multiple possible chains)"
2768                               : "")
2769                       << "\n");
2770     if (FoundMultipleCalleeChains)
2771       FoundProfiledCalleeNonUniquelyCount++;
2772     return false;
2773   }
2774 
2775   return true;
2776 }
2777 
2778 bool ModuleCallsiteContextGraph::sameCallee(Instruction *Call1,
2779                                             Instruction *Call2) {
2780   auto *CB1 = cast<CallBase>(Call1);
2781   if (!CB1->getCalledOperand() || CB1->isIndirectCall())
2782     return false;
2783   auto *CalleeVal1 = CB1->getCalledOperand()->stripPointerCasts();
2784   auto *CalleeFunc1 = dyn_cast<Function>(CalleeVal1);
2785   auto *CB2 = cast<CallBase>(Call2);
2786   if (!CB2->getCalledOperand() || CB2->isIndirectCall())
2787     return false;
2788   auto *CalleeVal2 = CB2->getCalledOperand()->stripPointerCasts();
2789   auto *CalleeFunc2 = dyn_cast<Function>(CalleeVal2);
2790   return CalleeFunc1 == CalleeFunc2;
2791 }
2792 
2793 bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls(
2794     ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
2795     std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
2796     bool &FoundMultipleCalleeChains) {
2797   // Stop recursive search if we have already explored the maximum specified
2798   // depth.
2799   if (Depth > TailCallSearchDepth)
2800     return false;
2801 
2802   auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) {
2803     // Make a CallsiteInfo for each discovered callee, if one hasn't already
2804     // been synthesized.
2805     if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) ||
2806         !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee))
2807       // StackIds is empty (we don't have debug info available in the index for
2808       // these callsites)
2809       FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] =
2810           std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>());
2811     CallsiteInfo *NewCallsiteInfo =
2812         FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get();
2813     FoundCalleeChain.push_back({NewCallsiteInfo, FS});
2814   };
2815 
2816   // Look for tail calls in this function, and check if they either call the
2817   // profiled callee directly, or indirectly (via a recursive search).
2818   // Only succeed if there is a single unique tail call chain found between the
2819   // profiled caller and callee; otherwise we could perform incorrect cloning.
2820   bool FoundSingleCalleeChain = false;
2821   for (auto &S : CurCallee.getSummaryList()) {
2822     if (!GlobalValue::isLocalLinkage(S->linkage()) &&
2823         !isPrevailing(CurCallee.getGUID(), S.get()))
2824       continue;
2825     auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject());
2826     if (!FS)
2827       continue;
2828     auto FSVI = CurCallee;
2829     auto *AS = dyn_cast<AliasSummary>(S.get());
2830     if (AS)
2831       FSVI = AS->getAliaseeVI();
2832     for (auto &CallEdge : FS->calls()) {
2833       if (!CallEdge.second.hasTailCall())
2834         continue;
2835       if (CallEdge.first == ProfiledCallee) {
2836         if (FoundSingleCalleeChain) {
2837           FoundMultipleCalleeChains = true;
2838           return false;
2839         }
2840         FoundSingleCalleeChain = true;
2841         FoundProfiledCalleeCount++;
2842         FoundProfiledCalleeDepth += Depth;
2843         if (Depth > FoundProfiledCalleeMaxDepth)
2844           FoundProfiledCalleeMaxDepth = Depth;
2845         CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2846         // Add FS to FSToVIMap in case it isn't already there.
2847         assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2848         FSToVIMap[FS] = FSVI;
2849       } else if (findProfiledCalleeThroughTailCalls(
2850                      ProfiledCallee, CallEdge.first, Depth + 1,
2851                      FoundCalleeChain, FoundMultipleCalleeChains)) {
2852         // findProfiledCalleeThroughTailCalls should not have returned
2853         // true if FoundMultipleCalleeChains.
2854         assert(!FoundMultipleCalleeChains);
2855         if (FoundSingleCalleeChain) {
2856           FoundMultipleCalleeChains = true;
2857           return false;
2858         }
2859         FoundSingleCalleeChain = true;
2860         CreateAndSaveCallsiteInfo(CallEdge.first, FS);
2861         // Add FS to FSToVIMap in case it isn't already there.
2862         assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI);
2863         FSToVIMap[FS] = FSVI;
2864       } else if (FoundMultipleCalleeChains)
2865         return false;
2866     }
2867   }
2868 
2869   return FoundSingleCalleeChain;
2870 }
2871 
2872 const FunctionSummary *
2873 IndexCallsiteContextGraph::getCalleeFunc(IndexCall &Call) {
2874   ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2875   if (Callee.getSummaryList().empty())
2876     return nullptr;
2877   return dyn_cast<FunctionSummary>(Callee.getSummaryList()[0]->getBaseObject());
2878 }
2879 
2880 bool IndexCallsiteContextGraph::calleeMatchesFunc(
2881     IndexCall &Call, const FunctionSummary *Func,
2882     const FunctionSummary *CallerFunc,
2883     std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) {
2884   ValueInfo Callee = dyn_cast_if_present<CallsiteInfo *>(Call)->Callee;
2885   // If there is no summary list then this is a call to an externally defined
2886   // symbol.
2887   AliasSummary *Alias =
2888       Callee.getSummaryList().empty()
2889           ? nullptr
2890           : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get());
2891   assert(FSToVIMap.count(Func));
2892   auto FuncVI = FSToVIMap[Func];
2893   if (Callee == FuncVI ||
2894       // If callee is an alias, check the aliasee, since only function
2895       // summary base objects will contain the stack node summaries and thus
2896       // get a context node.
2897       (Alias && Alias->getAliaseeVI() == FuncVI))
2898     return true;
2899 
2900   // Recursively search for the profiled callee through tail calls starting with
2901   // the actual Callee. The discovered tail call chain is saved in
2902   // FoundCalleeChain, and we will fix up the graph to include these callsites
2903   // after returning.
2904   // FIXME: We will currently redo the same recursive walk if we find the same
2905   // mismatched callee from another callsite. We can improve this with more
2906   // bookkeeping of the created chain of new nodes for each mismatch.
2907   unsigned Depth = 1;
2908   bool FoundMultipleCalleeChains = false;
2909   if (!findProfiledCalleeThroughTailCalls(
2910           FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) {
2911     LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI
2912                       << " from " << FSToVIMap[CallerFunc]
2913                       << " that actually called " << Callee
2914                       << (FoundMultipleCalleeChains
2915                               ? " (found multiple possible chains)"
2916                               : "")
2917                       << "\n");
2918     if (FoundMultipleCalleeChains)
2919       FoundProfiledCalleeNonUniquelyCount++;
2920     return false;
2921   }
2922 
2923   return true;
2924 }
2925 
2926 bool IndexCallsiteContextGraph::sameCallee(IndexCall &Call1, IndexCall &Call2) {
2927   ValueInfo Callee1 = dyn_cast_if_present<CallsiteInfo *>(Call1)->Callee;
2928   ValueInfo Callee2 = dyn_cast_if_present<CallsiteInfo *>(Call2)->Callee;
2929   return Callee1 == Callee2;
2930 }
2931 
2932 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2933 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump()
2934     const {
2935   print(dbgs());
2936   dbgs() << "\n";
2937 }
2938 
2939 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2940 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print(
2941     raw_ostream &OS) const {
2942   OS << "Node " << this << "\n";
2943   OS << "\t";
2944   printCall(OS);
2945   if (Recursive)
2946     OS << " (recursive)";
2947   OS << "\n";
2948   if (!MatchingCalls.empty()) {
2949     OS << "\tMatchingCalls:\n";
2950     for (auto &MatchingCall : MatchingCalls) {
2951       OS << "\t";
2952       MatchingCall.print(OS);
2953       OS << "\n";
2954     }
2955   }
2956   OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n";
2957   OS << "\tContextIds:";
2958   // Make a copy of the computed context ids that we can sort for stability.
2959   auto ContextIds = getContextIds();
2960   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2961   std::sort(SortedIds.begin(), SortedIds.end());
2962   for (auto Id : SortedIds)
2963     OS << " " << Id;
2964   OS << "\n";
2965   OS << "\tCalleeEdges:\n";
2966   for (auto &Edge : CalleeEdges)
2967     OS << "\t\t" << *Edge << "\n";
2968   OS << "\tCallerEdges:\n";
2969   for (auto &Edge : CallerEdges)
2970     OS << "\t\t" << *Edge << "\n";
2971   if (!Clones.empty()) {
2972     OS << "\tClones: " << llvm::interleaved(Clones) << "\n";
2973   } else if (CloneOf) {
2974     OS << "\tClone of " << CloneOf << "\n";
2975   }
2976 }
2977 
2978 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2979 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump()
2980     const {
2981   print(dbgs());
2982   dbgs() << "\n";
2983 }
2984 
2985 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2986 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print(
2987     raw_ostream &OS) const {
2988   OS << "Edge from Callee " << Callee << " to Caller: " << Caller
2989      << (IsBackedge ? " (BE)" : "")
2990      << " AllocTypes: " << getAllocTypeString(AllocTypes);
2991   OS << " ContextIds:";
2992   std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
2993   std::sort(SortedIds.begin(), SortedIds.end());
2994   for (auto Id : SortedIds)
2995     OS << " " << Id;
2996 }
2997 
2998 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2999 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const {
3000   print(dbgs());
3001 }
3002 
3003 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3004 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print(
3005     raw_ostream &OS) const {
3006   OS << "Callsite Context Graph:\n";
3007   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3008   for (const auto Node : nodes<GraphType>(this)) {
3009     if (Node->isRemoved())
3010       continue;
3011     Node->print(OS);
3012     OS << "\n";
3013   }
3014 }
3015 
3016 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3017 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes(
3018     raw_ostream &OS) const {
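       // Emits one line per recorded context size info of each allocation
       // node, e.g. (hypothetical values):
       //   MemProf hinting: Cold full allocation context 1234 with total
       //   size 200 is Cold after cloning (context id 7)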
3019   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3020   for (const auto Node : nodes<GraphType>(this)) {
3021     if (Node->isRemoved())
3022       continue;
3023     if (!Node->IsAllocation)
3024       continue;
3025     DenseSet<uint32_t> ContextIds = Node->getContextIds();
3026     auto AllocTypeFromCall = getAllocationCallType(Node->Call);
3027     std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3028     std::sort(SortedIds.begin(), SortedIds.end());
3029     for (auto Id : SortedIds) {
3030       auto TypeI = ContextIdToAllocationType.find(Id);
3031       assert(TypeI != ContextIdToAllocationType.end());
3032       auto CSI = ContextIdToContextSizeInfos.find(Id);
3033       if (CSI != ContextIdToContextSizeInfos.end()) {
3034         for (auto &Info : CSI->second) {
3035           OS << "MemProf hinting: "
3036              << getAllocTypeString((uint8_t)TypeI->second)
3037              << " full allocation context " << Info.FullStackId
3038              << " with total size " << Info.TotalSize << " is "
3039              << getAllocTypeString(Node->AllocTypes) << " after cloning";
3040           if (allocTypeToUse(Node->AllocTypes) != AllocTypeFromCall)
3041             OS << " marked " << getAllocTypeString((uint8_t)AllocTypeFromCall)
3042                << " due to cold byte percent";
3043           // Print the internal context id to aid debugging and visualization.
3044           OS << " (context id " << Id << ")";
3045           OS << "\n";
3046         }
3047       }
3048     }
3049   }
3050 }
3051 
3052 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3053 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const {
3054   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3055   for (const auto Node : nodes<GraphType>(this)) {
3056     checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3057     for (auto &Edge : Node->CallerEdges)
3058       checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
3059   }
3060 }
3061 
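     // GraphTraits specialization so that the generic graph algorithms and
     // the WriteGraph/DOT machinery (used by exportToDot below) can traverse
     // the CCG. A minimal usage sketch, assuming a built graph CCG:
     //   using GraphType =
     //       const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
     //   for (auto *N : nodes<GraphType>(&CCG)) { /* visit each node */ }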
3062 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3063 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> {
3064   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3065   using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *;
3066 
3067   using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>;
3068   static NodeRef getNode(const NodePtrTy &P) { return P.get(); }
3069 
3070   using nodes_iterator =
3071       mapped_iterator<typename std::vector<NodePtrTy>::const_iterator,
3072                       decltype(&getNode)>;
3073 
3074   static nodes_iterator nodes_begin(GraphType G) {
3075     return nodes_iterator(G->NodeOwner.begin(), &getNode);
3076   }
3077 
3078   static nodes_iterator nodes_end(GraphType G) {
3079     return nodes_iterator(G->NodeOwner.end(), &getNode);
3080   }
3081 
3082   static NodeRef getEntryNode(GraphType G) {
3083     return G->NodeOwner.begin()->get();
3084   }
3085 
3086   using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>;
3087   static const ContextNode<DerivedCCG, FuncTy, CallTy> *
3088   GetCallee(const EdgePtrTy &P) {
3089     return P->Callee;
3090   }
3091 
3092   using ChildIteratorType =
3093       mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge<
3094                           DerivedCCG, FuncTy, CallTy>>>::const_iterator,
3095                       decltype(&GetCallee)>;
3096 
3097   static ChildIteratorType child_begin(NodeRef N) {
3098     return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee);
3099   }
3100 
3101   static ChildIteratorType child_end(NodeRef N) {
3102     return ChildIteratorType(N->CalleeEdges.end(), &GetCallee);
3103   }
3104 };
3105 
3106 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3107 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>
3108     : public DefaultDOTGraphTraits {
3109   DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {
3110     // If the user requested the full graph to be exported, but provided an
3111     // allocation id, or if the user gave a context id and requested more than
3112     // just a specific context to be exported, enable highlighting of the
3113     // nodes and edges containing the contexts of interest.
3114     DoHighlight =
3115         (AllocIdForDot.getNumOccurrences() && DotGraphScope == DotScope::All) ||
3116         (ContextIdForDot.getNumOccurrences() &&
3117          DotGraphScope != DotScope::Context);
3118   }
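       // Summarizing the condition above, for reference:
       //   alloc id given, scope == All       -> highlight that alloc's contexts
       //   context id given, scope != Context -> highlight that context
       // When the requested scope is already limited to the ids of interest,
       // everything visible would be highlighted, so it stays disabled.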
3119 
3120   using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *;
3121   using GTraits = GraphTraits<GraphType>;
3122   using NodeRef = typename GTraits::NodeRef;
3123   using ChildIteratorType = typename GTraits::ChildIteratorType;
3124 
3125   static std::string getNodeLabel(NodeRef Node, GraphType G) {
3126     std::string LabelString =
3127         (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") +
3128          Twine(Node->OrigStackOrAllocId))
3129             .str();
3130     LabelString += "\n";
3131     if (Node->hasCall()) {
3132       auto Func = G->NodeToCallingFunc.find(Node);
3133       assert(Func != G->NodeToCallingFunc.end());
3134       LabelString +=
3135           G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo());
3136     } else {
3137       LabelString += "null call";
3138       if (Node->Recursive)
3139         LabelString += " (recursive)";
3140       else
3141         LabelString += " (external)";
3142     }
3143     return LabelString;
3144   }
3145 
3146   static std::string getNodeAttributes(NodeRef Node, GraphType G) {
3147     auto ContextIds = Node->getContextIds();
3148     // If highlighting enabled, see if this node contains any of the context ids
3149     // of interest. If so, it will use a different color and a larger fontsize
3150     // (which makes the node larger as well).
3151     bool Highlight = false;
3152     if (DoHighlight) {
3153       assert(ContextIdForDot.getNumOccurrences() ||
3154              AllocIdForDot.getNumOccurrences());
3155       if (ContextIdForDot.getNumOccurrences())
3156         Highlight = ContextIds.contains(ContextIdForDot);
3157       else
3158         Highlight = set_intersects(ContextIds, G->DotAllocContextIds);
3159     }
3160     std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " +
3161                                    getContextIds(ContextIds) + "\"")
3162                                       .str();
3163     // Default fontsize is 14
3164     if (Highlight)
3165       AttributeString += ",fontsize=\"30\"";
3166     AttributeString +=
3167         (Twine(",fillcolor=\"") + getColor(Node->AllocTypes, Highlight) + "\"")
3168             .str();
3169     if (Node->CloneOf) {
3170       AttributeString += ",color=\"blue\"";
3171       AttributeString += ",style=\"filled,bold,dashed\"";
3172     } else
3173       AttributeString += ",style=\"filled\"";
3174     return AttributeString;
3175   }
3176 
3177   static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter,
3178                                        GraphType G) {
3179     auto &Edge = *(ChildIter.getCurrent());
3180     // If highlighting enabled, see if this edge contains any of the context ids
3181     // of interest. If so, it will use a different color and a heavier arrow
3182     // size and weight (the larger weight makes the highlighted path
3183     // straighter).
3184     bool Highlight = false;
3185     if (DoHighlight) {
3186       assert(ContextIdForDot.getNumOccurrences() ||
3187              AllocIdForDot.getNumOccurrences());
3188       if (ContextIdForDot.getNumOccurrences())
3189         Highlight = Edge->ContextIds.contains(ContextIdForDot);
3190       else
3191         Highlight = set_intersects(Edge->ContextIds, G->DotAllocContextIds);
3192     }
3193     auto Color = getColor(Edge->AllocTypes, Highlight);
3194     std::string AttributeString =
3195         (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" +
3196          // fillcolor is the arrow head and color is the line
3197          Twine(",fillcolor=\"") + Color + "\"" + Twine(",color=\"") + Color +
3198          "\"")
3199             .str();
3200     if (Edge->IsBackedge)
3201       AttributeString += ",style=\"dotted\"";
3202     // Default penwidth and weight are both 1.
3203     if (Highlight)
3204       AttributeString += ",penwidth=\"2.0\",weight=\"2\"";
3205     return AttributeString;
3206   }
3207 
3208   // Since the NodeOwner list includes nodes that are no longer connected to
3209   // the graph, skip them here.
3210   static bool isNodeHidden(NodeRef Node, GraphType G) {
3211     if (Node->isRemoved())
3212       return true;
3213     // If a scope smaller than the full graph was requested, see if this node
3214     // contains any of the context ids of interest.
3215     if (DotGraphScope == DotScope::Alloc)
3216       return !set_intersects(Node->getContextIds(), G->DotAllocContextIds);
3217     if (DotGraphScope == DotScope::Context)
3218       return !Node->getContextIds().contains(ContextIdForDot);
3219     return false;
3220   }
3221 
3222 private:
3223   static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) {
3224     std::string IdString = "ContextIds:";
3225     if (ContextIds.size() < 100) {
3226       std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end());
3227       std::sort(SortedIds.begin(), SortedIds.end());
3228       for (auto Id : SortedIds)
3229         IdString += (" " + Twine(Id)).str();
3230     } else {
3231       IdString += (" (" + Twine(ContextIds.size()) + " ids)").str();
3232     }
3233     return IdString;
3234   }
3235 
3236   static std::string getColor(uint8_t AllocTypes, bool Highlight) {
3237     // If DoHighlight is not enabled, we want to use the highlight colors for
3238     // NotCold and Cold, and the non-highlight color for NotCold+Cold. This
3239     // keeps compatibility with the color scheme used before highlighting was
3240     // supported, and for the NotCold+Cold color the non-highlight color is a
3241     // bit more readable.
3242     if (AllocTypes == (uint8_t)AllocationType::NotCold)
3243       // Color "brown1" actually looks like a lighter red.
3244       return !DoHighlight || Highlight ? "brown1" : "lightpink";
3245     if (AllocTypes == (uint8_t)AllocationType::Cold)
3246       return !DoHighlight || Highlight ? "cyan" : "lightskyblue";
3247     if (AllocTypes ==
3248         ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
3249       return Highlight ? "magenta" : "mediumorchid1";
3250     return "gray";
3251   }
3252 
3253   static std::string getNodeId(NodeRef Node) {
3254     std::stringstream SStream;
3255     SStream << std::hex << "N0x" << (unsigned long long)Node;
3256     std::string Result = SStream.str();
3257     return Result;
3258   }
3259 
3260   // True if we should highlight a specific context or allocation's contexts in
3261   // the emitted graph.
3262   static bool DoHighlight;
3263 };
3264 
3265 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3266 bool DOTGraphTraits<
3267     const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>::DoHighlight =
3268     false;
3269 
3270 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3271 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot(
3272     std::string Label) const {
3273   WriteGraph(this, "", false, Label,
3274              DotFilePathPrefix + "ccg." + Label + ".dot");
3275 }
3276 
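     // Clones Edge's callee node and moves Edge (or the specified subset of
     // its context ids) onto the new clone. For example (hypothetical ids):
     // cloning along cold context {2} from an edge carrying {1, 2} yields a
     // clone reached only by context 2, while context 1 stays on the original
     // callee.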
3277 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3278 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
3279 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone(
3280     const std::shared_ptr<ContextEdge> &Edge,
3281     DenseSet<uint32_t> ContextIdsToMove) {
3282   ContextNode *Node = Edge->Callee;
3283   assert(NodeToCallingFunc.count(Node));
3284   ContextNode *Clone =
3285       createNewNode(Node->IsAllocation, NodeToCallingFunc[Node], Node->Call);
3286   Node->addClone(Clone);
3287   Clone->MatchingCalls = Node->MatchingCalls;
3288   moveEdgeToExistingCalleeClone(Edge, Clone, /*NewClone=*/true,
3289                                 ContextIdsToMove);
3290   return Clone;
3291 }
3292 
3293 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3294 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3295     moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
3296                                   ContextNode *NewCallee, bool NewClone,
3297                                   DenseSet<uint32_t> ContextIdsToMove) {
3298   // NewCallee and Edge's current callee must be clones of the same original
3299   // node (Edge's current callee may be the original node too).
3300   assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode());
3301 
3302   bool EdgeIsRecursive = Edge->Callee == Edge->Caller;
3303 
3304   ContextNode *OldCallee = Edge->Callee;
3305 
3306   // We might already have an edge to the new callee from earlier cloning for a
3307   // different allocation. If one exists we will reuse it.
3308   auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller);
3309 
3310   // Callers will pass an empty ContextIdsToMove set when they want to move
3311   // the whole edge. Copy in Edge's ids for simplicity.
3312   if (ContextIdsToMove.empty())
3313     ContextIdsToMove = Edge->getContextIds();
3314 
3315   // If we are moving all of Edge's ids, then just move the whole Edge.
3316   // Otherwise only move the specified subset, to a new edge if needed.
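       // E.g. (hypothetical ids): if Edge carries {1, 2, 3} and we are asked
       // to move {1, 2}, a new or existing edge to NewCallee receives {1, 2}
       // and Edge keeps {3}. If all three ids are being moved, Edge itself is
       // reconnected to NewCallee.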
3317   if (Edge->getContextIds().size() == ContextIdsToMove.size()) {
3318     // First, update the alloc types on NewCallee from Edge.
3319     // Do this before we potentially clear Edge's fields below!
3320     NewCallee->AllocTypes |= Edge->AllocTypes;
3321     // Moving the whole Edge.
3322     if (ExistingEdgeToNewCallee) {
3323       // Since we already have an edge to NewCallee, simply move the ids
3324       // onto it, and remove the existing Edge.
3325       ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3326       ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes;
3327       assert(Edge->ContextIds == ContextIdsToMove);
3328       removeEdgeFromGraph(Edge.get());
3329     } else {
3330       // Otherwise just reconnect Edge to NewCallee.
3331       Edge->Callee = NewCallee;
3332       NewCallee->CallerEdges.push_back(Edge);
3333       // Remove it from callee where it was previously connected.
3334       OldCallee->eraseCallerEdge(Edge.get());
3335       // Don't need to update Edge's context ids since we are simply
3336       // reconnecting it.
3337     }
3338   } else {
3339     // Only moving a subset of Edge's ids.
3340     // Compute the alloc type of the subset of ids being moved.
3341     auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove);
3342     if (ExistingEdgeToNewCallee) {
3343       // Since we already have an edge to NewCallee, simply move the ids
3344       // onto it.
3345       ExistingEdgeToNewCallee->getContextIds().insert_range(ContextIdsToMove);
3346       ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType;
3347     } else {
3348       // Otherwise, create a new edge to NewCallee for the ids being moved.
3349       auto NewEdge = std::make_shared<ContextEdge>(
3350           NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove);
3351       Edge->Caller->CalleeEdges.push_back(NewEdge);
3352       NewCallee->CallerEdges.push_back(NewEdge);
3353     }
3354     // In either case, need to update the alloc types on NewCallee, and remove
3355     // those ids and update the alloc type on the original Edge.
3356     NewCallee->AllocTypes |= CallerEdgeAllocType;
3357     set_subtract(Edge->ContextIds, ContextIdsToMove);
3358     Edge->AllocTypes = computeAllocType(Edge->ContextIds);
3359   }
3360   // Now walk the old callee node's callee edges and move Edge's context ids
3361   // over to the corresponding edge into the clone (which is created here if
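       // Continuing the example above: if OldCallee has a callee edge to X
       // carrying {1, 2, 3} and we moved {1, 2}, the corresponding
       // NewCallee -> X edge receives {1, 2} and OldCallee's edge to X is
       // left with {3}.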
3362   // this is a newly created clone).
3363   for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
3364     ContextNode *CalleeToUse = OldCalleeEdge->Callee;
3365     // If this is a direct recursion edge, use NewCallee (the clone) as the
3366     // callee as well, so that any edge updated/created here is also direct
3367     // recursive.
3368     if (CalleeToUse == OldCallee) {
3369       // If this is a recursive edge, see if we already moved a recursive edge
3370       // (which would have to have been this one) - if we were only moving a
3371       // subset of context ids it would still be on OldCallee.
3372       if (EdgeIsRecursive) {
3373         assert(OldCalleeEdge == Edge);
3374         continue;
3375       }
3376       CalleeToUse = NewCallee;
3377     }
3378     // The context ids moving to the new callee are the subset of this edge's
3379     // context ids and the context ids on the caller edge being moved.
3380     DenseSet<uint32_t> EdgeContextIdsToMove =
3381         set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove);
3382     set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
3383     OldCalleeEdge->AllocTypes =
3384         computeAllocType(OldCalleeEdge->getContextIds());
3385     if (!NewClone) {
3386       // Update context ids / alloc type on corresponding edge to NewCallee.
3387       // There is a chance this may not exist if we are reusing an existing
3388       // clone, specifically during function assignment, where we would have
3389       // removed none type edges after creating the clone. If we can't find
3390       // a corresponding edge there, fall through to the cloning below.
3391       if (auto *NewCalleeEdge = NewCallee->findEdgeFromCallee(CalleeToUse)) {
3392         NewCalleeEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3393         NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
3394         continue;
3395       }
3396     }
3397     auto NewEdge = std::make_shared<ContextEdge>(
3398         CalleeToUse, NewCallee, computeAllocType(EdgeContextIdsToMove),
3399         EdgeContextIdsToMove);
3400     NewCallee->CalleeEdges.push_back(NewEdge);
3401     NewEdge->Callee->CallerEdges.push_back(NewEdge);
3402   }
3403   // Recompute the node alloc type now that its callee edges have been
3404   // updated (since we will compute from those edges).
3405   OldCallee->AllocTypes = OldCallee->computeAllocType();
3406   // OldCallee alloc type should be None iff its context id set is now empty.
3407   assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
3408          OldCallee->emptyContextIds());
3409   if (VerifyCCG) {
3410     checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
3411     checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
3412     for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
3413       checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
3414                                             /*CheckEdges=*/false);
3415     for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
3416       checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
3417                                             /*CheckEdges=*/false);
3418   }
3419 }
3420 
3421 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3422 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3423     moveCalleeEdgeToNewCaller(const std::shared_ptr<ContextEdge> &Edge,
3424                               ContextNode *NewCaller) {
3425   auto *OldCallee = Edge->Callee;
3426   auto *NewCallee = OldCallee;
3427   // If this edge was direct recursive, make any new/updated edge also direct
3428   // recursive to NewCaller.
3429   bool Recursive = Edge->Caller == Edge->Callee;
3430   if (Recursive)
3431     NewCallee = NewCaller;
3432 
3433   ContextNode *OldCaller = Edge->Caller;
3434   OldCaller->eraseCalleeEdge(Edge.get());
3435 
3436   // We might already have an edge to the new caller. If one exists we will
3437   // reuse it.
3438   auto ExistingEdgeToNewCaller = NewCaller->findEdgeFromCallee(NewCallee);
3439 
3440   if (ExistingEdgeToNewCaller) {
3441     // Since we already have an edge to NewCaller, simply move the ids
3442     // onto it, and remove the existing Edge.
3443     ExistingEdgeToNewCaller->getContextIds().insert_range(
3444         Edge->getContextIds());
3445     ExistingEdgeToNewCaller->AllocTypes |= Edge->AllocTypes;
3446     Edge->ContextIds.clear();
3447     Edge->AllocTypes = (uint8_t)AllocationType::None;
3448     OldCallee->eraseCallerEdge(Edge.get());
3449   } else {
3450     // Otherwise just reconnect Edge to NewCaller.
3451     Edge->Caller = NewCaller;
3452     NewCaller->CalleeEdges.push_back(Edge);
3453     if (Recursive) {
3454       assert(NewCallee == NewCaller);
3455       // In the case of (direct) recursive edges, we update the callee as well
3456       // so that it becomes recursive on the new caller.
3457       Edge->Callee = NewCallee;
3458       NewCallee->CallerEdges.push_back(Edge);
3459       OldCallee->eraseCallerEdge(Edge.get());
3460     }
3461     // Don't need to update Edge's context ids since we are simply
3462     // reconnecting it.
3463   }
3464   // In either case, need to update the alloc types on NewCaller.
3465   NewCaller->AllocTypes |= Edge->AllocTypes;
3466 
3467   // Now walk the old caller node's caller edges and move Edge's context ids
3468   // over to the corresponding edge into the node (which is created here if
3469   // this is a newly created node). We can tell whether this is a newly created
3470   // node by seeing if it has any caller edges yet.
3471 #ifndef NDEBUG
3472   bool IsNewNode = NewCaller->CallerEdges.empty();
3473 #endif
3474   // If we just moved a direct recursive edge, presumably its context ids should
3475   // also flow out of OldCaller via some other non-recursive callee edge. We
3476   // don't want to remove the recursive context ids from other caller edges yet,
3477   // otherwise the context ids get into an inconsistent state on OldCaller.
3478   // We will update these context ids on the non-recursive caller edge when and
3479   // if they are updated on the non-recursive callee.
3480   if (!Recursive) {
3481     for (auto &OldCallerEdge : OldCaller->CallerEdges) {
3482       auto OldCallerCaller = OldCallerEdge->Caller;
3483       // The context ids moving to the new caller are the intersection of this
3484       // edge's context ids with the context ids on the callee edge being moved.
3485       DenseSet<uint32_t> EdgeContextIdsToMove = set_intersection(
3486           OldCallerEdge->getContextIds(), Edge->getContextIds());
3487       if (OldCaller == OldCallerCaller) {
3488         OldCallerCaller = NewCaller;
3489         // Don't actually move this one. The caller will move it directly via a
3490         // call to this function with this as the Edge if it is appropriate to
3491         // move it to a different node that has a matching callee (itself).
3492         continue;
3493       }
3494       set_subtract(OldCallerEdge->getContextIds(), EdgeContextIdsToMove);
3495       OldCallerEdge->AllocTypes =
3496           computeAllocType(OldCallerEdge->getContextIds());
3497       // In this function we expect that any pre-existing node already has edges
3498       // from the same callers as the old node. That should be true in the
3499       // current use case, where we will remove None-type edges after copying
3500       // over all caller edges from the callee.
3501       auto *ExistingCallerEdge = NewCaller->findEdgeFromCaller(OldCallerCaller);
3502       // Since we would have skipped caller edges when moving a direct recursive
3503       // edge, this may not hold true when recursive handling is enabled.
3504       assert(IsNewNode || ExistingCallerEdge || AllowRecursiveCallsites);
3505       if (ExistingCallerEdge) {
3506         ExistingCallerEdge->getContextIds().insert_range(EdgeContextIdsToMove);
3507         ExistingCallerEdge->AllocTypes |=
3508             computeAllocType(EdgeContextIdsToMove);
3509         continue;
3510       }
3511       auto NewEdge = std::make_shared<ContextEdge>(
3512           NewCaller, OldCallerCaller, computeAllocType(EdgeContextIdsToMove),
3513           EdgeContextIdsToMove);
3514       NewCaller->CallerEdges.push_back(NewEdge);
3515       NewEdge->Caller->CalleeEdges.push_back(NewEdge);
3516     }
3517   }
3518   // Recompute the node alloc type now that its caller edges have been
3519   // updated (since we will compute from those edges).
3520   OldCaller->AllocTypes = OldCaller->computeAllocType();
3521   // OldCaller alloc type should be None iff its context id set is now empty.
3522   assert((OldCaller->AllocTypes == (uint8_t)AllocationType::None) ==
3523          OldCaller->emptyContextIds());
3524   if (VerifyCCG) {
3525     checkNode<DerivedCCG, FuncTy, CallTy>(OldCaller, /*CheckEdges=*/false);
3526     checkNode<DerivedCCG, FuncTy, CallTy>(NewCaller, /*CheckEdges=*/false);
3527     for (const auto &OldCallerEdge : OldCaller->CallerEdges)
3528       checkNode<DerivedCCG, FuncTy, CallTy>(OldCallerEdge->Caller,
3529                                             /*CheckEdges=*/false);
3530     for (const auto &NewCallerEdge : NewCaller->CallerEdges)
3531       checkNode<DerivedCCG, FuncTy, CallTy>(NewCallerEdge->Caller,
3532                                             /*CheckEdges=*/false);
3533   }
3534 }
3535 
3536 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3537 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
3538     recursivelyRemoveNoneTypeCalleeEdges(
3539         ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
3540   auto Inserted = Visited.insert(Node);
3541   if (!Inserted.second)
3542     return;
3543 
3544   removeNoneTypeCalleeEdges(Node);
3545 
3546   for (auto *Clone : Node->Clones)
3547     recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
3548 
3549   // The recursive call may remove some of this Node's caller edges.
3550   // Iterate over a copy and skip any that were removed.
3551   auto CallerEdges = Node->CallerEdges;
3552   for (auto &Edge : CallerEdges) {
3553     // Skip any that have been removed by an earlier recursive call.
3554     if (Edge->isRemoved()) {
3555       assert(!is_contained(Node->CallerEdges, Edge));
3556       continue;
3557     }
3558     recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
3559   }
3560 }
3561 
3562 // This is the standard DFS based backedge discovery algorithm.
3563 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3564 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges() {
3565   // If we are cloning recursive contexts, find and mark backedges from all root
3566   // callers, using the typical DFS based backedge analysis.
3567   if (!CloneRecursiveContexts)
3568     return;
3569   DenseSet<const ContextNode *> Visited;
3570   DenseSet<const ContextNode *> CurrentStack;
3571   for (auto &Entry : NonAllocationCallToContextNodeMap) {
3572     auto *Node = Entry.second;
3573     if (Node->isRemoved())
3574       continue;
3575     // It is a root if it doesn't have callers.
3576     if (!Node->CallerEdges.empty())
3577       continue;
3578     markBackedges(Node, Visited, CurrentStack);
3579     assert(CurrentStack.empty());
3580   }
3581 }
3582 
3583 // Recursive helper for above markBackedges method.
3584 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3585 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::markBackedges(
3586     ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3587     DenseSet<const ContextNode *> &CurrentStack) {
3588   auto I = Visited.insert(Node);
3589   // We should only call this for unvisited nodes.
3590   assert(I.second);
3591   (void)I;
3592   for (auto &CalleeEdge : Node->CalleeEdges) {
3593     auto *Callee = CalleeEdge->Callee;
3594     if (Visited.count(Callee)) {
3595       // Since this node was already visited, check whether it is currently on
3596       // the recursion stack, in which case this edge is a backedge.
3597       if (CurrentStack.count(Callee))
3598         CalleeEdge->IsBackedge = true;
3599       continue;
3600     }
3601     CurrentStack.insert(Callee);
3602     markBackedges(Callee, Visited, CurrentStack);
3603     CurrentStack.erase(Callee);
3604   }
3605 }
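// Illustrative sketch (hypothetical graph, not from any test): for
// caller->callee edges A->B, B->C and C->B, a DFS rooted at A visits A, B,
// then C; when scanning C's callee edges it finds B already in CurrentStack,
// so only the C->B edge gets IsBackedge set, while A->B and B->C do not.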
3606 
3607 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3608 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
3609   DenseSet<const ContextNode *> Visited;
3610   for (auto &Entry : AllocationCallToContextNodeMap) {
3611     Visited.clear();
3612     identifyClones(Entry.second, Visited, Entry.second->getContextIds());
3613   }
3614   Visited.clear();
3615   for (auto &Entry : AllocationCallToContextNodeMap)
3616     recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
3617   if (VerifyCCG)
3618     check();
3619 }
3620 
3621 // Helper function to check whether an AllocType is cold, notcold, or both.
3622 bool checkColdOrNotCold(uint8_t AllocType) {
3623   return (AllocType == (uint8_t)AllocationType::Cold) ||
3624          (AllocType == (uint8_t)AllocationType::NotCold) ||
3625          (AllocType ==
3626           ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
3627 }
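// For example (illustrative, assuming the usual AllocationType bit encoding
// from llvm/Analysis/MemoryProfileInfo.h, where NotCold and Cold are distinct
// bits):
//   checkColdOrNotCold((uint8_t)AllocationType::Cold);     // true
//   checkColdOrNotCold((uint8_t)AllocationType::NotCold);  // true
//   checkColdOrNotCold((uint8_t)AllocationType::Cold |
//                      (uint8_t)AllocationType::NotCold);  // true
//   checkColdOrNotCold((uint8_t)AllocationType::None);     // false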
3628 
3629 template <typename DerivedCCG, typename FuncTy, typename CallTy>
3630 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
3631     ContextNode *Node, DenseSet<const ContextNode *> &Visited,
3632     const DenseSet<uint32_t> &AllocContextIds) {
3633   if (VerifyNodes)
3634     checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3635   assert(!Node->CloneOf);
3636 
3637   // If Node has a null call, then either it wasn't found in the module (regular
3638   // LTO) or summary index (ThinLTO), or there were other conditions blocking
3639   // cloning (e.g. recursion, calls multiple targets, etc).
3640   // Do this here so that we don't try to recursively clone callers below, which
3641   // isn't useful at least for this node.
3642   if (!Node->hasCall())
3643     return;
3644 
3645   // No need to look at any callers if allocation type already unambiguous.
3646   if (hasSingleAllocType(Node->AllocTypes))
3647     return;
3648 
3649 #ifndef NDEBUG
3650   auto Insert =
3651 #endif
3652       Visited.insert(Node);
3653   // We should not have visited this node yet.
3654   assert(Insert.second);
3655   // The recursive call to identifyClones may delete the current edge from the
3656   // CallerEdges vector. Make a copy and iterate on that, simpler than passing
3657   // in an iterator and having the recursive call erase from it. Other edges may
3658   // also get removed during the recursion, which will have null Callee and
3659   // Caller pointers (and are deleted later), so we skip those below.
3660   {
3661     auto CallerEdges = Node->CallerEdges;
3662     for (auto &Edge : CallerEdges) {
3663       // Skip any that have been removed by an earlier recursive call.
3664       if (Edge->isRemoved()) {
3665         assert(!is_contained(Node->CallerEdges, Edge));
3666         continue;
3667       }
3668       // Defer backedges. See comments further below where these edges are
3669       // handled during the cloning of this Node.
3670       if (Edge->IsBackedge) {
3671         // We should only mark these if cloning recursive contexts, where we
3672         // need to do this deferral.
3673         assert(CloneRecursiveContexts);
3674         continue;
3675       }
3676       // Ignore any caller we previously visited via another edge.
3677       if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
3678         identifyClones(Edge->Caller, Visited, AllocContextIds);
3679       }
3680     }
3681   }
3682 
3683   // Check if we reached an unambiguous call or have only a single caller.
3684   if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3685     return;
3686 
3687   // We need to clone.
3688 
3689   // Try to keep the original version as alloc type NotCold. This way, cases
3690   // with indirect calls, or any other situation with an unknown call to the
3691   // original function, get the default behavior. We do this by sorting the
3692   // CallerEdges of the Node we will clone by alloc type.
3693   //
3694   // Give NotCold edge the lowest sort priority so those edges are at the end of
3695   // the caller edges vector, and stay on the original version (since the below
3696   // code clones greedily until it finds all remaining edges have the same type
3697   // and leaves the remaining ones on the original Node).
3698   //
3699   // We shouldn't actually have any None type edges, so the sorting priority for
3700   // that is arbitrary, and we assert in that case below.
3701   const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
3702                                                /*Cold*/ 1,
3703                                                /*NotColdCold*/ 2};
3704   llvm::stable_sort(Node->CallerEdges,
3705                     [&](const std::shared_ptr<ContextEdge> &A,
3706                         const std::shared_ptr<ContextEdge> &B) {
3707                       // Nodes with non-empty context ids should be sorted
3708                       // before those with empty context ids.
3709                       if (A->ContextIds.empty())
3710                         // Either B ContextIds are non-empty (in which case we
3711                         // should return false because B < A), or B ContextIds
3712                         // are empty, in which case they are equal, and we
3713                         // should maintain the original relative ordering.
3714                         return false;
3715                       if (B->ContextIds.empty())
3716                         return true;
3717 
3718                       if (A->AllocTypes == B->AllocTypes)
3719                         // Use the first context id for each edge as a
3720                         // tie-breaker.
3721                         return *A->ContextIds.begin() < *B->ContextIds.begin();
3722                       return AllocTypeCloningPriority[A->AllocTypes] <
3723                              AllocTypeCloningPriority[B->AllocTypes];
3724                     });
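  // For example (illustrative): given caller edges with alloc types NotCold,
  // Cold and NotColdCold, the priorities above (Cold = 1, NotColdCold = 2,
  // NotCold = 4) order them Cold, NotColdCold, NotCold, so the cloning loop
  // below peels cold contexts off first and the trailing NotCold edges stay
  // on the original Node.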
3725 
3726   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3727 
3728   DenseSet<uint32_t> RecursiveContextIds;
3729   assert(AllowRecursiveContexts || !CloneRecursiveContexts);
3730   // If we are allowing recursive callsites, but have also disabled recursive
3731   // contexts, look for context ids that show up in multiple caller edges.
3732   if (AllowRecursiveCallsites && !AllowRecursiveContexts) {
3733     DenseSet<uint32_t> AllCallerContextIds;
3734     for (auto &CE : Node->CallerEdges) {
3735       // Reserve space for the largest set of caller context ids, since we know
3736       // the final set will be at least that large.
3737       AllCallerContextIds.reserve(CE->getContextIds().size());
3738       for (auto Id : CE->getContextIds())
3739         if (!AllCallerContextIds.insert(Id).second)
3740           RecursiveContextIds.insert(Id);
3741     }
3742   }
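  // Illustrative example: if context id 7 appears on two different caller
  // edges of Node (i.e. the context recursively reaches Node twice), the
  // second insert of 7 into AllCallerContextIds fails and 7 lands in
  // RecursiveContextIds, excluding it from the per-caller-edge cloning
  // decisions below.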
3743 
3744   // Iterate until we find no more opportunities for disambiguating the alloc
3745   // types via cloning. In most cases this loop will terminate once the Node
3746   // has a single allocation type, in which case no more cloning is needed.
3747   // Iterate over a copy of Node's caller edges, since we may need to remove
3748   // edges in the moveEdgeTo* methods, and this simplifies the handling and
3749   // makes it less error-prone.
3750   auto CallerEdges = Node->CallerEdges;
3751   for (auto &CallerEdge : CallerEdges) {
3752     // Skip any that have been removed by an earlier recursive call.
3753     if (CallerEdge->isRemoved()) {
3754       assert(!is_contained(Node->CallerEdges, CallerEdge));
3755       continue;
3756     }
3757     assert(CallerEdge->Callee == Node);
3758 
3759     // See if cloning the prior caller edge left this node with a single alloc
3760     // type or a single caller. In that case no more cloning of Node is needed.
3761     if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
3762       break;
3763 
3764     // If the caller was not successfully matched to a call in the IR/summary,
3765     // there is no point in trying to clone for it as we can't update that call.
3766     if (!CallerEdge->Caller->hasCall())
3767       continue;
3768 
3769     // Only need to process the ids along this edge pertaining to the given
3770     // allocation.
3771     auto CallerEdgeContextsForAlloc =
3772         set_intersection(CallerEdge->getContextIds(), AllocContextIds);
3773     if (!RecursiveContextIds.empty())
3774       CallerEdgeContextsForAlloc =
3775           set_difference(CallerEdgeContextsForAlloc, RecursiveContextIds);
3776     if (CallerEdgeContextsForAlloc.empty())
3777       continue;
3778 
3779     auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3780 
3781     // Compute the node callee edge alloc types corresponding to the context ids
3782     // for this caller edge.
3783     std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
3784     CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
3785     for (auto &CalleeEdge : Node->CalleeEdges)
3786       CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3787           CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3788 
3789     // Don't clone if doing so will not disambiguate any alloc types amongst
3790     // caller edges (including the callee edges that would be cloned).
3791     // Otherwise we will simply move all edges to the clone.
3792     //
3793     // First check if by cloning we will disambiguate the caller allocation
3794     // type from node's allocation type. Query allocTypeToUse so that we don't
3795     // bother cloning to distinguish NotCold+Cold from NotCold. Note that
3796     // neither of these should be None type.
3797     //
3798     // Then check if by cloning node at least one of the callee edges will be
3799     // disambiguated by splitting out different context ids.
3800     //
3801     // However, always do the cloning if this is a backedge, in which case we
3802     // have not yet cloned along this caller edge.
3803     assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None);
3804     assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3805     if (!CallerEdge->IsBackedge &&
3806         allocTypeToUse(CallerAllocTypeForAlloc) ==
3807             allocTypeToUse(Node->AllocTypes) &&
3808         allocTypesMatch<DerivedCCG, FuncTy, CallTy>(
3809             CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) {
3810       continue;
3811     }
3812 
3813     if (CallerEdge->IsBackedge) {
3814       // We should only mark these if cloning recursive contexts, where we
3815       // need to do this deferral.
3816       assert(CloneRecursiveContexts);
3817       DeferredBackedges++;
3818     }
3819 
3820     // If this is a backedge, we now do recursive cloning starting from its
3821     // caller since we may have moved unambiguous caller contexts to a clone
3822     // of this Node in a previous iteration of the current loop, giving more
3823     // opportunity for cloning through the backedge. Because we sorted the
3824     // caller edges earlier so that cold caller edges are first, we would have
3825     // visited and cloned this node for any unambiguously cold non-recursive
3826     // callers before any ambiguous backedge callers. Note that we don't do this
3827     // if the caller is already cloned or visited during cloning (e.g. via a
3828     // different context path from the allocation).
3829     // TODO: Can we do better in the case where the caller was already visited?
3830     if (CallerEdge->IsBackedge && !CallerEdge->Caller->CloneOf &&
3831         !Visited.count(CallerEdge->Caller)) {
3832       const auto OrigIdCount = CallerEdge->getContextIds().size();
3833       // Now do the recursive cloning of this backedge's caller, which was
3834       // deferred earlier.
3835       identifyClones(CallerEdge->Caller, Visited, CallerEdgeContextsForAlloc);
3836       removeNoneTypeCalleeEdges(CallerEdge->Caller);
3837       // See if the recursive call to identifyClones moved the context ids to a
3838       // new edge from this node to a clone of caller, and switch to looking at
3839       // that new edge so that we clone Node for the new caller clone.
3840       bool UpdatedEdge = false;
3841       if (OrigIdCount > CallerEdge->getContextIds().size()) {
3842         for (auto E : Node->CallerEdges) {
3843           // Only interested in clones of the current edge's caller.
3844           if (E->Caller->CloneOf != CallerEdge->Caller)
3845             continue;
3846           // See if this edge contains any of the context ids originally on the
3847           // current caller edge.
3848           auto CallerEdgeContextsForAllocNew =
3849               set_intersection(CallerEdgeContextsForAlloc, E->getContextIds());
3850           if (CallerEdgeContextsForAllocNew.empty())
3851             continue;
3852           // Make sure we don't pick a previously existing caller edge of this
3853           // Node, which would be processed on a different iteration of the
3854           // outer loop over the saved CallerEdges.
3855           if (llvm::is_contained(CallerEdges, E))
3856             continue;
3857           // The CallerAllocTypeForAlloc and CalleeEdgeAllocTypesForCallerEdge
3858           // are updated further below for all cases where we just invoked
3859           // identifyClones recursively.
3860           CallerEdgeContextsForAlloc.swap(CallerEdgeContextsForAllocNew);
3861           CallerEdge = E;
3862           UpdatedEdge = true;
3863           break;
3864         }
3865       }
3866       // If cloning removed this edge (and we didn't update it to a new edge
3867       // above), we're done with this edge. It's possible we moved all of the
3868       // context ids to an existing clone, in which case there's no need to do
3869       // further processing for them.
3870       if (CallerEdge->isRemoved())
3871         continue;
3872 
3873       // Now we need to update the information used for the cloning decisions
3874       // further below, as we may have modified edges and their context ids.
3875 
3876       // Note if we changed the CallerEdge above we would have already updated
3877       // the context ids.
3878       if (!UpdatedEdge) {
3879         CallerEdgeContextsForAlloc = set_intersection(
3880             CallerEdgeContextsForAlloc, CallerEdge->getContextIds());
3881         if (CallerEdgeContextsForAlloc.empty())
3882           continue;
3883       }
3884       // Update the other information that depends on the edges and on the now
3885       // updated CallerEdgeContextsForAlloc.
3886       CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc);
3887       CalleeEdgeAllocTypesForCallerEdge.clear();
3888       for (auto &CalleeEdge : Node->CalleeEdges) {
3889         CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
3890             CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc));
3891       }
3892     }
3893 
3894     // First see if we can use an existing clone. Check each clone and its
3895     // callee edges for matching alloc types.
3896     ContextNode *Clone = nullptr;
3897     for (auto *CurClone : Node->Clones) {
3898       if (allocTypeToUse(CurClone->AllocTypes) !=
3899           allocTypeToUse(CallerAllocTypeForAlloc))
3900         continue;
3901 
3902       bool BothSingleAlloc = hasSingleAllocType(CurClone->AllocTypes) &&
3903                              hasSingleAllocType(CallerAllocTypeForAlloc);
3904       // The above check should mean that if both have single alloc types that
3905       // they should be equal.
3906       assert(!BothSingleAlloc ||
3907              CurClone->AllocTypes == CallerAllocTypeForAlloc);
3908 
3909       // If either both have a single alloc type (which are the same), or if the
3910       // clone's callee edges have the same alloc types as those for the current
3911       // allocation on Node's callee edges (CalleeEdgeAllocTypesForCallerEdge),
3912       // then we can reuse this clone.
3913       if (BothSingleAlloc || allocTypesMatchClone<DerivedCCG, FuncTy, CallTy>(
3914                                  CalleeEdgeAllocTypesForCallerEdge, CurClone)) {
3915         Clone = CurClone;
3916         break;
3917       }
3918     }
3919 
3920     // The edge iterator is adjusted when we move the CallerEdge to the clone.
3921     if (Clone)
3922       moveEdgeToExistingCalleeClone(CallerEdge, Clone, /*NewClone=*/false,
3923                                     CallerEdgeContextsForAlloc);
3924     else
3925       Clone = moveEdgeToNewCalleeClone(CallerEdge, CallerEdgeContextsForAlloc);
3926 
3927     // Sanity check that no alloc types on clone or its edges are None.
3928     assert(Clone->AllocTypes != (uint8_t)AllocationType::None);
3929   }
3930 
3931   // We should still have some context ids on the original Node.
3932   assert(!Node->emptyContextIds());
3933 
3934   // Sanity check that no alloc types on node or edges are None.
3935   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
3936 
3937   if (VerifyNodes)
3938     checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
3939 }
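// Illustrative end-to-end sketch (hypothetical): for an allocation node N
// reached via a cold context through caller A and a notcold context through
// caller B, identifyClones recurses to the callers, sorts N's caller edges
// so the cold edge is first, and moves the edge from A onto a new clone N'.
// N' then has the single alloc type Cold, while the original N keeps the
// NotCold context from B.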
3940 
3941 void ModuleCallsiteContextGraph::updateAllocationCall(
3942     CallInfo &Call, AllocationType AllocType) {
3943   std::string AllocTypeString = getAllocTypeAttributeString(AllocType);
3944   auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(),
3945                                 "memprof", AllocTypeString);
3946   cast<CallBase>(Call.call())->addFnAttr(A);
3947   OREGetter(Call.call()->getFunction())
3948       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call())
3949             << ore::NV("AllocationCall", Call.call()) << " in clone "
3950             << ore::NV("Caller", Call.call()->getFunction())
3951             << " marked with memprof allocation attribute "
3952             << ore::NV("Attribute", AllocTypeString));
3953 }
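// Illustrative IR effect (a sketch; exact attribute lists will vary): after
// the update above, a cold allocation call in a clone carries the "memprof"
// string function attribute, e.g.
//   %call = call ptr @malloc(i64 %size) #1
//   attributes #1 = { "memprof"="cold" }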
3954 
3955 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call,
3956                                                      AllocationType AllocType) {
3957   auto *AI = cast<AllocInfo *>(Call.call());
3958   assert(AI);
3959   assert(AI->Versions.size() > Call.cloneNo());
3960   AI->Versions[Call.cloneNo()] = (uint8_t)AllocType;
3961 }
3962 
3963 AllocationType
3964 ModuleCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3965   const auto *CB = cast<CallBase>(Call.call());
3966   if (!CB->getAttributes().hasFnAttr("memprof"))
3967     return AllocationType::None;
3968   return CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
3969              ? AllocationType::Cold
3970              : AllocationType::NotCold;
3971 }
3972 
3973 AllocationType
3974 IndexCallsiteContextGraph::getAllocationCallType(const CallInfo &Call) const {
3975   const auto *AI = cast<AllocInfo *>(Call.call());
3976   assert(AI->Versions.size() > Call.cloneNo());
3977   return (AllocationType)AI->Versions[Call.cloneNo()];
3978 }
3979 
3980 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3981                                             FuncInfo CalleeFunc) {
3982   if (CalleeFunc.cloneNo() > 0)
3983     cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func());
3984   OREGetter(CallerCall.call()->getFunction())
3985       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call())
3986             << ore::NV("Call", CallerCall.call()) << " in clone "
3987             << ore::NV("Caller", CallerCall.call()->getFunction())
3988             << " assigned to call function clone "
3989             << ore::NV("Callee", CalleeFunc.func()));
3990 }
3991 
3992 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall,
3993                                            FuncInfo CalleeFunc) {
3994   auto *CI = cast<CallsiteInfo *>(CallerCall.call());
3995   assert(CI &&
3996          "Caller cannot be an allocation which should not have profiled calls");
3997   assert(CI->Clones.size() > CallerCall.cloneNo());
3998   CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo();
3999 }
4000 
4001 CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
4002                      Instruction *>::FuncInfo
4003 ModuleCallsiteContextGraph::cloneFunctionForCallsite(
4004     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
4005     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4006   // Use existing LLVM facilities for cloning and obtaining Call in clone
4007   ValueToValueMapTy VMap;
4008   auto *NewFunc = CloneFunction(Func.func(), VMap);
4009   std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo);
4010   assert(!Func.func()->getParent()->getFunction(Name));
4011   NewFunc->setName(Name);
4012   if (auto *SP = NewFunc->getSubprogram())
4013     SP->replaceLinkageName(
4014         MDString::get(NewFunc->getParent()->getContext(), Name));
4015   for (auto &Inst : CallsWithMetadataInFunc) {
4016     // This map always has the initial version in it.
4017     assert(Inst.cloneNo() == 0);
4018     CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo};
4019   }
4020   OREGetter(Func.func())
4021       .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func())
4022             << "created clone " << ore::NV("NewFunction", NewFunc));
4023   return {NewFunc, CloneNo};
4024 }
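// Illustrative note (suffix form assumed from getMemProfFuncName): cloning
// function "foo" with CloneNo == 1 yields a clone named "foo.memprof.1",
// with the DISubprogram linkage name updated to match above.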
4025 
4026 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
4027                      IndexCall>::FuncInfo
4028 IndexCallsiteContextGraph::cloneFunctionForCallsite(
4029     FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
4030     std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
4031   // Check how many clones we have of Call (and therefore function).
4032   // The next clone number is the current size of versions array.
4033   // Confirm this matches the CloneNo provided by the caller, which is based on
4034   // the number of function clones we have.
4035   assert(CloneNo == (isa<AllocInfo *>(Call.call())
4036                          ? cast<AllocInfo *>(Call.call())->Versions.size()
4037                          : cast<CallsiteInfo *>(Call.call())->Clones.size()));
4038   // Walk all the instructions in this function. Create a new version for
4039   // each (by adding an entry to the Versions/Clones summary array), and copy
4040   // over the version being called for the function clone being cloned here.
4041   // Additionally, add an entry to the CallMap for the new function clone,
4042   // mapping the original call (clone 0, what is in CallsWithMetadataInFunc)
4043   // to the new call clone.
4044   for (auto &Inst : CallsWithMetadataInFunc) {
4045     // This map always has the initial version in it.
4046     assert(Inst.cloneNo() == 0);
4047     if (auto *AI = dyn_cast<AllocInfo *>(Inst.call())) {
4048       assert(AI->Versions.size() == CloneNo);
4049       // We assign the allocation type later (in updateAllocationCall), just add
4050       // an entry for it here.
4051       AI->Versions.push_back(0);
4052     } else {
4053       auto *CI = cast<CallsiteInfo *>(Inst.call());
4054       assert(CI && CI->Clones.size() == CloneNo);
4055       // We assign the clone number later (in updateCall), just add an entry for
4056       // it here.
4057       CI->Clones.push_back(0);
4058     }
4059     CallMap[Inst] = {Inst.call(), CloneNo};
4060   }
4061   return {Func.func(), CloneNo};
4062 }
4063 
4064 // We perform cloning for each allocation node separately. However, this
4065 // sometimes results in a situation where the same node calls multiple
4066 // clones of the same callee, created for different allocations. This
4067 // causes issues when assigning functions to these clones, as each node can
4068 // in reality only call a single callee clone.
4069 //
4070 // To address this, before assigning functions, merge callee clone nodes as
4071 // needed using a post order traversal from the allocations. We attempt to
4072 // use existing clones as the merge node when legal, and to share them
4073 // among callers with the same properties (callers calling the same set of
4074 // callee clone nodes for the same allocations).
4075 //
4076 // Without this fix, in some cases incorrect function assignment will lead
4077 // to calling the wrong allocation clone.
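//
// Illustrative example (hypothetical): if callsite node C ends up with callee
// edges to two clones B' and B'' of the same function B, created for
// different allocations, the actual call in C can only target one copy of B.
// The traversal below therefore merges B' and B'' (reusing one of them as the
// merge node when legal) so that C calls a single callee clone.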
4078 template <typename DerivedCCG, typename FuncTy, typename CallTy>
4079 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones() {
4080   if (!MergeClones)
4081     return;
4082 
4083   // Generate a map from context id to the associated allocation node for use
4084   // when merging clones.
4085   DenseMap<uint32_t, ContextNode *> ContextIdToAllocationNode;
4086   for (auto &Entry : AllocationCallToContextNodeMap) {
4087     auto *Node = Entry.second;
4088     for (auto Id : Node->getContextIds())
4089       ContextIdToAllocationNode[Id] = Node->getOrigNode();
4090     for (auto *Clone : Node->Clones) {
4091       for (auto Id : Clone->getContextIds())
4092         ContextIdToAllocationNode[Id] = Clone->getOrigNode();
4093     }
4094   }
4095 
4096   // Post order traversal starting from allocations to ensure each callsite
4097   // calls a single clone of its callee. Callee nodes that are clones of each
4098   // other are merged (via new merge nodes if needed) to achieve this.
4099   DenseSet<const ContextNode *> Visited;
4100   for (auto &Entry : AllocationCallToContextNodeMap) {
4101     auto *Node = Entry.second;
4102 
4103     mergeClones(Node, Visited, ContextIdToAllocationNode);
4104 
4105     // Make a copy so the recursive post order traversal that may create new
4106     // clones doesn't mess up iteration. Note that the recursive traversal
4107     // itself does not call mergeClones on any of these nodes, which are all
4108     // (clones of) allocations.
4109     auto Clones = Node->Clones;
4110     for (auto *Clone : Clones)
4111       mergeClones(Clone, Visited, ContextIdToAllocationNode);
4112   }
4113 
4114   if (DumpCCG) {
4115     dbgs() << "CCG after merging:\n";
4116     dbgs() << *this;
4117   }
4118   if (ExportToDot)
4119     exportToDot("aftermerge");
4120 
4121   if (VerifyCCG) {
4122     check();
4123   }
4124 }
4125 
4126 // Recursive helper for above mergeClones method.
4127 template <typename DerivedCCG, typename FuncTy, typename CallTy>
4128 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeClones(
4129     ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4130     DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4131   auto Inserted = Visited.insert(Node);
4132   if (!Inserted.second)
4133     return;
4134 
4135   // Make a copy since the recursive call may move a caller edge to a new
4136   // callee, messing up the iterator.
4137   auto CallerEdges = Node->CallerEdges;
4138   for (auto CallerEdge : CallerEdges) {
4139     // Skip any caller edge moved onto a different callee during recursion.
4140     if (CallerEdge->Callee != Node)
4141       continue;
4142     mergeClones(CallerEdge->Caller, Visited, ContextIdToAllocationNode);
4143   }
4144 
4145   // Merge for this node after we handle its callers.
4146   mergeNodeCalleeClones(Node, Visited, ContextIdToAllocationNode);
4147 }
4148 
4149 template <typename DerivedCCG, typename FuncTy, typename CallTy>
4150 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::mergeNodeCalleeClones(
4151     ContextNode *Node, DenseSet<const ContextNode *> &Visited,
4152     DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode) {
4153   // Ignore Node if we moved all of its contexts to clones.
4154   if (Node->emptyContextIds())
4155     return;
4156 
4157   // First identify groups of clones among Node's callee edges, by building
4158   // a map from each callee base node to the associated callee edges from Node.
4159   MapVector<ContextNode *, std::vector<std::shared_ptr<ContextEdge>>>
4160       OrigNodeToCloneEdges;
4161   for (const auto &E : Node->CalleeEdges) {
4162     auto *Callee = E->Callee;
4163     if (!Callee->CloneOf && Callee->Clones.empty())
4164       continue;
4165     ContextNode *Base = Callee->getOrigNode();
4166     OrigNodeToCloneEdges[Base].push_back(E);
4167   }
4168 
4169   // Helper for callee edge sorting below. Return true if A's callee has fewer
4170   // caller edges than B's; ties are broken by ordering clones before the
4171   // original node, and finally by the first context id on each edge.
4172   auto CalleeCallerEdgeLessThan = [](const std::shared_ptr<ContextEdge> &A,
4173                                      const std::shared_ptr<ContextEdge> &B) {
4174     if (A->Callee->CallerEdges.size() != B->Callee->CallerEdges.size())
4175       return A->Callee->CallerEdges.size() < B->Callee->CallerEdges.size();
4176     if (A->Callee->CloneOf && !B->Callee->CloneOf)
4177       return true;
4178     else if (!A->Callee->CloneOf && B->Callee->CloneOf)
4179       return false;
4180     // Use the first context id for each edge as a
4181     // tie-breaker.
4182     return *A->ContextIds.begin() < *B->ContextIds.begin();
4183   };
4184 
4185   // Process each set of callee clones called by Node, performing the needed
4186   // merging.
4187   for (auto &Entry : OrigNodeToCloneEdges) {
4188     // CalleeEdges is the set of edges from Node reaching callees that are
4189     // mutual clones of each other.
4190     auto &CalleeEdges = Entry.second;
4191     auto NumCalleeClones = CalleeEdges.size();
4192     // A single edge means there is no merging needed.
4193     if (NumCalleeClones == 1)
4194       continue;
4195     // Sort the CalleeEdges calling this group of clones in ascending order of
4196     // their caller edge counts, putting clones before the original non-clone
4197     // node in case of a tie. This simplifies finding an existing node to use
4198     // as the merge node.
4199     llvm::stable_sort(CalleeEdges, CalleeCallerEdgeLessThan);
4200 
4201     /// Find other callers of the given set of callee edges that can
4202     /// share the same callee merge node. See the comments at this method
4203     /// definition for details.
4204     DenseSet<ContextNode *> OtherCallersToShareMerge;
4205     findOtherCallersToShareMerge(Node, CalleeEdges, ContextIdToAllocationNode,
4206                                  OtherCallersToShareMerge);
4207 
4208     // Now do the actual merging. Identify existing or create a new MergeNode
4209     // during the first iteration. Move each callee over, along with edges from
4210     // other callers we've determined above can share the same merge node.
4211     ContextNode *MergeNode = nullptr;
4212     DenseMap<ContextNode *, unsigned> CallerToMoveCount;
4213     for (auto CalleeEdge : CalleeEdges) {
4214       auto *OrigCallee = CalleeEdge->Callee;
4215       // If we don't have a MergeNode yet (only happens on the first iteration,
4216       // as a new one will be created when we go to move the first callee edge
4217       // over as needed), see if we can use this callee.
4218       if (!MergeNode) {
4219         // If there are no other callers, simply use this callee.
4220         if (CalleeEdge->Callee->CallerEdges.size() == 1) {
4221           MergeNode = OrigCallee;
4222           NonNewMergedNodes++;
4223           continue;
4224         }
4225         // Otherwise, if we have identified other caller nodes that can share
4226         // the merge node with Node, see if all of OrigCallee's callers are
4227         // going to share the same merge node. In that case we can use callee
4228         // (since all of its callers would move to the new merge node).
4229         if (!OtherCallersToShareMerge.empty()) {
4230           bool MoveAllCallerEdges = true;
4231           for (auto CalleeCallerE : OrigCallee->CallerEdges) {
4232             if (CalleeCallerE == CalleeEdge)
4233               continue;
4234             if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller)) {
4235               MoveAllCallerEdges = false;
4236               break;
4237             }
4238           }
4239           // If we are going to move all callers over, we can use this callee as
4240           // the MergeNode.
4241           if (MoveAllCallerEdges) {
4242             MergeNode = OrigCallee;
4243             NonNewMergedNodes++;
4244             continue;
4245           }
4246         }
4247       }
4248       // Move this callee edge, creating a new merge node if necessary.
4249       if (MergeNode) {
4250         assert(MergeNode != OrigCallee);
4251         moveEdgeToExistingCalleeClone(CalleeEdge, MergeNode,
4252                                       /*NewClone*/ false);
4253       } else {
4254         MergeNode = moveEdgeToNewCalleeClone(CalleeEdge);
4255         NewMergedNodes++;
4256       }
4257       // Now move all identified edges from other callers over to the merge node
4258       // as well.
4259       if (!OtherCallersToShareMerge.empty()) {
4260         // Make and iterate over a copy of OrigCallee's caller edges because
4261         // some of these will be moved off of the OrigCallee and that would mess
4262         // up the iteration from OrigCallee.
4263         auto OrigCalleeCallerEdges = OrigCallee->CallerEdges;
4264         for (auto &CalleeCallerE : OrigCalleeCallerEdges) {
4265           if (CalleeCallerE == CalleeEdge)
4266             continue;
4267           if (!OtherCallersToShareMerge.contains(CalleeCallerE->Caller))
4268             continue;
4269           CallerToMoveCount[CalleeCallerE->Caller]++;
4270           moveEdgeToExistingCalleeClone(CalleeCallerE, MergeNode,
4271                                         /*NewClone*/ false);
4272         }
4273       }
4274       removeNoneTypeCalleeEdges(OrigCallee);
4275       removeNoneTypeCalleeEdges(MergeNode);
4276     }
4277   }
4278 }
4279 
4280 // Look for other nodes that have edges to the same set of callee
4281 // clones as the current Node. Those can share the eventual merge node
4282 // (reducing cloning and binary size overhead) iff:
4283 // - they have edges to the same set of callee clones
4284 // - each callee edge reaches a subset of the same allocations as Node's
4285 //   corresponding edge to the same callee clone.
4286 // The second requirement is to ensure that we don't undo any of the
4287 // necessary cloning to distinguish contexts with different allocation
4288 // behavior.
4289 // FIXME: This is somewhat conservative, as we really just need to ensure
4290 // that they don't reach the same allocations as contexts on edges from Node
4291 // going to any of the *other* callee clones being merged. However, that
4292 // requires more tracking and checking to get right.
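//
// Illustrative example (hypothetical): if Node and another caller D both have
// edges to the same callee clones B' and B'', and every context on D's edges
// reaches a subset of the allocations reached by Node's corresponding edges,
// D can be redirected to the same merge node as Node instead of forcing yet
// another clone of B.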
4293 template <typename DerivedCCG, typename FuncTy, typename CallTy>
4294 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
4295     findOtherCallersToShareMerge(
4296         ContextNode *Node,
4297         std::vector<std::shared_ptr<ContextEdge>> &CalleeEdges,
4298         DenseMap<uint32_t, ContextNode *> &ContextIdToAllocationNode,
4299         DenseSet<ContextNode *> &OtherCallersToShareMerge) {
4300   auto NumCalleeClones = CalleeEdges.size();
4301   // This map counts how many edges to the same callee clone exist for other
4302   // caller nodes of each callee clone.
4303   DenseMap<ContextNode *, unsigned> OtherCallersToSharedCalleeEdgeCount;
4304   // Counts the number of other caller nodes that have edges to all callee
4305   // clones that don't violate the allocation context checking.
4306   unsigned PossibleOtherCallerNodes = 0;
4307 
4308   // We only need to look at other Caller nodes if the first callee edge has
4309   // multiple callers (recall they are sorted in ascending order above).
4310   if (CalleeEdges[0]->Callee->CallerEdges.size() < 2)
4311     return;
4312 
4313   // For each callee edge:
4314   // - Collect the count of other caller nodes calling the same callees.
4315   // - Collect the alloc nodes reached by contexts on each callee edge.
4316   DenseMap<ContextEdge *, DenseSet<ContextNode *>> CalleeEdgeToAllocNodes;
4317   for (auto CalleeEdge : CalleeEdges) {
4318     assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4319     // For each other caller of the same callee, increment the count of
4320     // edges reaching the same callee clone.
4321     for (auto CalleeCallerEdges : CalleeEdge->Callee->CallerEdges) {
4322       if (CalleeCallerEdges->Caller == Node) {
4323         assert(CalleeCallerEdges == CalleeEdge);
4324         continue;
4325       }
4326       OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller]++;
4327       // If this caller edge now reaches all of the same callee clones,
4328       // increment the count of candidate other caller nodes.
4329       if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerEdges->Caller] ==
4330           NumCalleeClones)
4331         PossibleOtherCallerNodes++;
4332     }
4333     // Collect the alloc nodes reached by contexts on each callee edge, for
4334     // later analysis.
4335     for (auto Id : CalleeEdge->getContextIds()) {
4336       auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4337       if (!Alloc) {
4338         // FIXME: unclear why this happens occasionally, presumably due to
4339         // imperfect graph updates, possibly with recursion.
4340         MissingAllocForContextId++;
4341         continue;
4342       }
4343       CalleeEdgeToAllocNodes[CalleeEdge.get()].insert(Alloc);
4344     }
4345   }
4346 
4347   // Now walk the callee edges again, and make sure that for each candidate
4348   // caller node all of its edges to the callees reach the same allocs (or
4349   // a subset) as those along the corresponding callee edge from Node.
4350   for (auto CalleeEdge : CalleeEdges) {
4351     assert(CalleeEdge->Callee->CallerEdges.size() > 1);
4352     // Stop if we do not have any (more) candidate other caller nodes.
4353     if (!PossibleOtherCallerNodes)
4354       break;
4355     auto &CurCalleeAllocNodes = CalleeEdgeToAllocNodes[CalleeEdge.get()];
4356     // Check each other caller of this callee clone.
4357     for (auto &CalleeCallerE : CalleeEdge->Callee->CallerEdges) {
4358       // Not interested in the callee edge from Node itself.
4359       if (CalleeCallerE == CalleeEdge)
4360         continue;
4361       // Skip any callers that didn't have callee edges to all the same
4362       // callee clones.
4363       if (OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] !=
4364           NumCalleeClones)
4365         continue;
4366       // Make sure that each context along edge from candidate caller node
4367       // reaches an allocation also reached by this callee edge from Node.
4368       for (auto Id : CalleeCallerE->getContextIds()) {
4369         auto *Alloc = ContextIdToAllocationNode.lookup(Id);
4370         if (!Alloc)
4371           continue;
4372         // If not, simply reset the map entry to 0 so the caller is ignored, and
4373         // reduce the count of candidate other caller nodes.
4374         if (!CurCalleeAllocNodes.contains(Alloc)) {
4375           OtherCallersToSharedCalleeEdgeCount[CalleeCallerE->Caller] = 0;
4376           PossibleOtherCallerNodes--;
4377           break;
4378         }
4379       }
4380     }
4381   }
4382 
4383   if (!PossibleOtherCallerNodes)
4384     return;
4385 
4386   // Build the set of other caller nodes that can use the same callee merge
4387   // node.
4388   for (auto &[OtherCaller, Count] : OtherCallersToSharedCalleeEdgeCount) {
4389     if (Count != NumCalleeClones)
4390       continue;
4391     OtherCallersToShareMerge.insert(OtherCaller);
4392   }
4393 }
4394 
4395 // This method assigns cloned callsites to functions, cloning the functions as
4396 // needed. The assignment is greedy and proceeds roughly as follows:
4397 //
4398 // For each function Func:
4399 //   For each call with graph Node having clones:
4400 //     Initialize ClonesWorklist to Node and its clones
4401 //     Initialize NodeCloneCount to 0
4402 //     While ClonesWorklist is not empty:
4403 //        Clone = pop front ClonesWorklist
4404 //        NodeCloneCount++
4405 //        If Func has been cloned less than NodeCloneCount times:
4406 //           If NodeCloneCount is 1:
4407 //             Assign Clone to original Func
4408 //             Continue
4409 //           Create a new function clone
4410 //           If other callers not assigned to call a function clone yet:
4411 //              Assign them to call new function clone
4412 //              Continue
4413 //           Assign any other caller calling the cloned version to new clone
4414 //
4415 //        For each caller of Clone:
4416 //           If caller is assigned to call a specific function clone:
4417 //             If we cannot assign Clone to that function clone:
4418 //               Create new callsite Clone NewClone
4419 //               Add NewClone to ClonesWorklist
4420 //               Continue
4421 //             Assign Clone to existing caller's called function clone
4422 //           Else:
4423 //             If Clone not already assigned to a function clone:
4424 //                Assign to first function clone without assignment
4425 //             Assign caller to selected function clone
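//
// Illustrative walk-through (hypothetical): if a callsite Node in function
// Foo has one clone Node', the worklist starts as {Node, Node'}. Node is
// assigned to the original Foo (NodeCloneCount == 1); Node' then triggers
// creation of a function clone via cloneFunctionForCallsite, unless one of
// its callers was already assigned to call an existing clone of Foo.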
4426 template <typename DerivedCCG, typename FuncTy, typename CallTy>
4427 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() {
4428   bool Changed = false;
4429 
4430   mergeClones();
4431 
4432   // Keep track of the assignment of nodes (callsites) to function clones they
4433   // call.
4434   DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap;
4435 
4436   // Update caller node to call function version CalleeFunc, by recording the
4437   // assignment in CallsiteToCalleeFuncCloneMap.
4438   auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller,
4439                                         const FuncInfo &CalleeFunc) {
4440     assert(Caller->hasCall());
4441     CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc;
4442   };
4443 
4444   // Walk all functions for which we saw calls with memprof metadata, and handle
4445   // cloning for each of its calls.
4446   for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
4447     FuncInfo OrigFunc(Func);
4448     // Map from each clone of OrigFunc to a map of remappings of each call of
4449     // interest (from original uncloned call to the corresponding cloned call in
4450     // that function clone).
4451     std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap;
4452     for (auto &Call : CallsWithMetadata) {
4453       ContextNode *Node = getNodeForInst(Call);
4454       // Skip call if we do not have a node for it (all uses of its stack ids
4455       // were either on inlined chains or pruned from the MIBs), or if we did
4456       // not create any clones for it.
4457       if (!Node || Node->Clones.empty())
4458         continue;
4459       assert(Node->hasCall() &&
4460              "Not having a call should have prevented cloning");
4461 
4462       // Track the assignment of function clones to clones of the current
4463       // callsite Node being handled.
4464       std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap;
4465 
4466       // Assign callsite version CallsiteClone to function version FuncClone,
4467       // and also assign (possibly cloned) Call to CallsiteClone.
4468       auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone,
4469                                                 CallInfo &Call,
4470                                                 ContextNode *CallsiteClone,
4471                                                 bool IsAlloc) {
4472         // Record the clone of callsite node assigned to this function clone.
4473         FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone;
4474 
4475         assert(FuncClonesToCallMap.count(FuncClone));
4476         std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone];
4477         CallInfo CallClone(Call);
4478         if (auto It = CallMap.find(Call); It != CallMap.end())
4479           CallClone = It->second;
4480         CallsiteClone->setCall(CallClone);
4481         // Need to do the same for all matching calls.
4482         for (auto &MatchingCall : Node->MatchingCalls) {
4483           CallInfo CallClone(MatchingCall);
4484           if (auto It = CallMap.find(MatchingCall); It != CallMap.end())
4485             CallClone = It->second;
4486           // Updates the call in the list.
4487           MatchingCall = CallClone;
4488         }
4489       };
4490 
4491       // Keep track of the clones of callsite Node that need to be assigned to
4492       // function clones. This list may be expanded in the loop body below if we
4493       // find additional cloning is required.
4494       std::deque<ContextNode *> ClonesWorklist;
4495       // Ignore original Node if we moved all of its contexts to clones.
4496       if (!Node->emptyContextIds())
4497         ClonesWorklist.push_back(Node);
4498       llvm::append_range(ClonesWorklist, Node->Clones);
4499 
4500       // Now walk through all of the clones of this callsite Node that we need,
4501       // and determine the assignment to a corresponding clone of the current
4502       // function (creating new function clones as needed).
4503       unsigned NodeCloneCount = 0;
4504       while (!ClonesWorklist.empty()) {
4505         ContextNode *Clone = ClonesWorklist.front();
4506         ClonesWorklist.pop_front();
4507         NodeCloneCount++;
4508         if (VerifyNodes)
4509           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4510 
4511         // Need to create a new function clone if we have more callsite clones
4512         // than existing function clones, which would have been assigned to an
4513         // earlier clone in the list (we assign callsite clones to function
4514         // clones greedily).
4515         if (FuncClonesToCallMap.size() < NodeCloneCount) {
4516           // If this is the first callsite copy, assign to original function.
4517           if (NodeCloneCount == 1) {
4518             // Since FuncClonesToCallMap is empty in this case, no clones have
4519             // been created for this function yet, and no callers should have
4520             // been assigned a function clone for this callee node yet.
4521             assert(llvm::none_of(
4522                 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4523                   return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4524                 }));
4525             // Initialize with empty call map, assign Clone to original function
4526             // and its callers, and skip to the next clone.
4527             FuncClonesToCallMap[OrigFunc] = {};
4528             AssignCallsiteCloneToFuncClone(
4529                 OrigFunc, Call, Clone,
4530                 AllocationCallToContextNodeMap.count(Call));
4531             for (auto &CE : Clone->CallerEdges) {
4532               // Ignore any caller that does not have a recorded callsite Call.
4533               if (!CE->Caller->hasCall())
4534                 continue;
4535               RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc);
4536             }
4537             continue;
4538           }
4539 
4540           // First locate which copy of OrigFunc to clone again. If a caller
4541           // of this callsite clone was already assigned to call a particular
4542           // function clone, we need to redirect all of those callers to the
4543           // new function clone, and update their other callees within this
4544           // function.
4545           FuncInfo PreviousAssignedFuncClone;
4546           auto EI = llvm::find_if(
4547               Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) {
4548                 return CallsiteToCalleeFuncCloneMap.count(E->Caller);
4549               });
4550           bool CallerAssignedToCloneOfFunc = false;
4551           if (EI != Clone->CallerEdges.end()) {
4552             const std::shared_ptr<ContextEdge> &Edge = *EI;
4553             PreviousAssignedFuncClone =
4554                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4555             CallerAssignedToCloneOfFunc = true;
4556           }
4557 
4558           // Clone the function, and save it in FuncClonesToCallMap along
4559           // with the CallInfo map created during cloning.
4560           std::map<CallInfo, CallInfo> NewCallMap;
4561           unsigned CloneNo = FuncClonesToCallMap.size();
4562           assert(CloneNo > 0 && "Clone 0 is the original function, which "
4563                                 "should already exist in the map");
4564           FuncInfo NewFuncClone = cloneFunctionForCallsite(
4565               OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo);
4566           FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap));
4567           FunctionClonesAnalysis++;
4568           Changed = true;
4569 
4570           // If no caller callsites were already assigned to a clone of this
4571           // function, we can simply assign this clone to the new func clone
4572           // and update all callers to it, then skip to the next clone.
4573           if (!CallerAssignedToCloneOfFunc) {
4574             AssignCallsiteCloneToFuncClone(
4575                 NewFuncClone, Call, Clone,
4576                 AllocationCallToContextNodeMap.count(Call));
4577             for (auto &CE : Clone->CallerEdges) {
4578               // Ignore any caller that does not have a recorded callsite Call.
4579               if (!CE->Caller->hasCall())
4580                 continue;
4581               RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4582             }
4583             continue;
4584           }
4585 
4586           // We may need to do additional node cloning in this case.
4587           // Reset the CallsiteToCalleeFuncCloneMap entry for any callers
4588           // that were previously assigned to call PreviousAssignedFuncClone,
4589           // to record that they now call NewFuncClone.
4590           // The none type edge removal may remove some of this Clone's caller
4591           // edges, if it is reached via another of its caller's callees.
4592           // Iterate over a copy and skip any that were removed.
4593           auto CallerEdges = Clone->CallerEdges;
4594           for (auto CE : CallerEdges) {
4595             // Skip any that have been removed on an earlier iteration.
4596             if (CE->isRemoved()) {
4597               assert(!is_contained(Clone->CallerEdges, CE));
4598               continue;
4599             }
4600             assert(CE);
4601             // Ignore any caller that does not have a recorded callsite Call.
4602             if (!CE->Caller->hasCall())
4603               continue;
4604 
4605             if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) ||
4606                 // We subsequently fall through to later handling that
4607                 // will perform any additional cloning required for
4608                 // callers that were calling other function clones.
4609                 CallsiteToCalleeFuncCloneMap[CE->Caller] !=
4610                     PreviousAssignedFuncClone)
4611               continue;
4612 
4613             RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone);
4614 
4615             // If we are cloning a function that was already assigned to some
4616             // callers, then essentially we are creating new callsite clones
4617             // of the other callsites in that function that are reached by those
4618             // callers. Clone the other callees of the current callsite's caller
4619             // that were already assigned to PreviousAssignedFuncClone
4620             // accordingly. This is important since we subsequently update the
4621             // calls from the nodes in the graph and their assignments to callee
4622             // functions recorded in CallsiteToCalleeFuncCloneMap.
4623             // The none type edge removal may remove some of this caller's
4624             // callee edges, if it is reached via another of its callees.
4625             // Iterate over a copy and skip any that were removed.
4626             auto CalleeEdges = CE->Caller->CalleeEdges;
4627             for (auto CalleeEdge : CalleeEdges) {
4628               // Skip any that have been removed on an earlier iteration when
4629               // cleaning up newly None type callee edges.
4630               if (CalleeEdge->isRemoved()) {
4631                 assert(!is_contained(CE->Caller->CalleeEdges, CalleeEdge));
4632                 continue;
4633               }
4634               assert(CalleeEdge);
4635               ContextNode *Callee = CalleeEdge->Callee;
4636               // Skip the current callsite; we are looking for the other
4637               // callsites the Caller calls, and also skip any callee that
4638               // does not have a recorded callsite Call.
4639               if (Callee == Clone || !Callee->hasCall())
4640                 continue;
4641               // Skip direct recursive calls. We don't need/want to clone the
4642               // caller node again, and this loop will not behave as expected if
4643               // we tried.
4644               if (Callee == CalleeEdge->Caller)
4645                 continue;
4646               ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge);
4647               removeNoneTypeCalleeEdges(NewClone);
4648               // Moving the edge may have resulted in some none type
4649               // callee edges on the original Callee.
4650               removeNoneTypeCalleeEdges(Callee);
4651               assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4652               // If the Callee node was already assigned to call a specific
4653               // function version, make sure its new clone is assigned to call
4654               // that same function clone.
4655               if (CallsiteToCalleeFuncCloneMap.count(Callee))
4656                 RecordCalleeFuncOfCallsite(
4657                     NewClone, CallsiteToCalleeFuncCloneMap[Callee]);
4658               // Update NewClone with the new Call clone of this callsite's Call
4659               // created for the new function clone created earlier.
4660               // Recall that we have already ensured when building the graph
4661               // that each caller can only call callsites within the same
4662               // function, so we are guaranteed that Callee Call is in the
4663               // current OrigFunc.
4664               // CallMap is set up as indexed by original Call at clone 0.
4665               CallInfo OrigCall(Callee->getOrigNode()->Call);
4666               OrigCall.setCloneNo(0);
4667               std::map<CallInfo, CallInfo> &CallMap =
4668                   FuncClonesToCallMap[NewFuncClone];
4669               assert(CallMap.count(OrigCall));
4670               CallInfo NewCall(CallMap[OrigCall]);
4671               assert(NewCall);
4672               NewClone->setCall(NewCall);
4673               // Need to do the same for all matching calls.
4674               for (auto &MatchingCall : NewClone->MatchingCalls) {
4675                 CallInfo OrigMatchingCall(MatchingCall);
4676                 OrigMatchingCall.setCloneNo(0);
4677                 assert(CallMap.count(OrigMatchingCall));
4678                 CallInfo NewCall(CallMap[OrigMatchingCall]);
4679                 assert(NewCall);
4680                 // Updates the call in the list.
4681                 MatchingCall = NewCall;
4682               }
4683             }
4684           }
4685           // Fall through to handling below to perform the recording of the
4686           // function for this callsite clone. This enables handling of cases
4687           // where the callers were assigned to different clones of a function.
4688         }
4689 
4690         // See if we can use an existing function clone. Walk through
4691         // all caller edges to see if any have already been assigned to
4692         // a clone of this callsite's function. If so, use it. If that
4693         // function clone is already assigned to a different clone of this
4694         // callsite, then we need to clone again.
4695         // Basically, this checking is needed to handle the case where different
4696         // caller functions/callsites may need versions of this function
4697         // containing different mixes of callsite clones across the different
4698         // callsites within the function. If that happens, we need to create
4699         // additional function clones to handle the various combinations.
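             // Illustrative example: if OrigFunc contains callsites A and B,
             // one caller's contexts require clone A1 with the original B,
             // while another caller's contexts require the original A with
             // clone B1, then no single copy of OrigFunc can satisfy both,
             // and we must create function clones with the mixes (A1, B) and
             // (A, B1).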
4700         //
4701         // Keep track of any new clones of this callsite created by the
4702         // following loop, as well as any existing clone that we decided to
4703         // assign this clone to.
4704         std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap;
4705         FuncInfo FuncCloneAssignedToCurCallsiteClone;
4706         // Iterate over a copy of Clone's caller edges, since we may need to
4707         // remove edges in the moveEdgeTo* methods, and this simplifies the
4708         // handling and makes it less error-prone.
4709         auto CloneCallerEdges = Clone->CallerEdges;
4710         for (auto &Edge : CloneCallerEdges) {
4711           // Skip removed edges (due to direct recursive edges updated when
4712           // updating callee edges when moving an edge and subsequently
4713           // removed by call to removeNoneTypeCalleeEdges on the Clone).
4714           if (Edge->isRemoved())
4715             continue;
4716           // Ignore any caller that does not have a recorded callsite Call.
4717           if (!Edge->Caller->hasCall())
4718             continue;
4719           // If this caller was already assigned to call a version of
4720           // OrigFunc, ensure we can assign this callsite clone to that clone.
4721           if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) {
4722             FuncInfo FuncCloneCalledByCaller =
4723                 CallsiteToCalleeFuncCloneMap[Edge->Caller];
4724             // First we need to confirm that this function clone is available
4725             // for use by this callsite node clone.
4726             //
4727             // While FuncCloneToCurNodeCloneMap is built only for this Node and
4728             // its callsite clones, one of those callsite clones X could have
4729             // been assigned to the same function clone called by Edge's caller
4730             // - if Edge's caller calls another callsite within Node's original
4731             // function, and that callsite has another caller reaching clone X.
4732             // We need to clone Node again in this case.
4733             if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) &&
4734                  FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] !=
4735                      Clone) ||
4736                 // Detect when we have multiple callers of this callsite that
4737                 // have already been assigned to specific, and different, clones
4738                 // of OrigFunc (due to other unrelated callsites in Func they
4739                 // reach via call contexts). Is this Clone of callsite Node
4740                 // assigned to a different clone of OrigFunc? If so, clone Node
4741                 // again.
4742                 (FuncCloneAssignedToCurCallsiteClone &&
4743                  FuncCloneAssignedToCurCallsiteClone !=
4744                      FuncCloneCalledByCaller)) {
4745               // We need to use a different newly created callsite clone, in
4746               // order to assign it to another new function clone on a
4747               // subsequent iteration over the Clones array (adjusted below).
4748               // Note we specifically do not reset the
4749               // CallsiteToCalleeFuncCloneMap entry for this caller, so that
4750               // when this new clone is processed later we know which version of
4751               // the function to copy (so that other callsite clones we have
4752               // assigned to that function clone are properly cloned over). See
4753               // comments in the function cloning handling earlier.
4754 
4755               // Check if we have already cloned this callsite again while
4756               // walking through caller edges, for a caller calling the same
4757               // function clone. If so, we can move this edge to that new clone
4758               // rather than creating yet another new clone.
4759               if (FuncCloneToNewCallsiteCloneMap.count(
4760                       FuncCloneCalledByCaller)) {
4761                 ContextNode *NewClone =
4762                     FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller];
4763                 moveEdgeToExistingCalleeClone(Edge, NewClone);
4764                 // Cleanup any none type edges cloned over.
4765                 removeNoneTypeCalleeEdges(NewClone);
4766               } else {
4767                 // Create a new callsite clone.
4768                 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge);
4769                 removeNoneTypeCalleeEdges(NewClone);
4770                 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] =
4771                     NewClone;
4772                 // Add to list of clones and process later.
4773                 ClonesWorklist.push_back(NewClone);
4774                 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None);
4775               }
4776               // Moving the caller edge may have resulted in some none type
4777               // callee edges.
4778               removeNoneTypeCalleeEdges(Clone);
4779               // We will handle the newly created callsite clone in a subsequent
4780               // iteration over this Node's Clones.
4781               continue;
4782             }
4783 
4784             // Otherwise, we can use the function clone already assigned to this
4785             // caller.
4786             if (!FuncCloneAssignedToCurCallsiteClone) {
4787               FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller;
4788               // Assign Clone to FuncCloneCalledByCaller
4789               AssignCallsiteCloneToFuncClone(
4790                   FuncCloneCalledByCaller, Call, Clone,
4791                   AllocationCallToContextNodeMap.count(Call));
4792             } else
4793               // Don't need to do anything - callsite is already calling this
4794               // function clone.
4795               assert(FuncCloneAssignedToCurCallsiteClone ==
4796                      FuncCloneCalledByCaller);
4797 
4798           } else {
4799             // We have not already assigned this caller to a version of
4800             // OrigFunc. Do the assignment now.
4801 
4802             // First check if we have already assigned this callsite clone to a
4803             // clone of OrigFunc for another caller during this iteration over
4804             // its caller edges.
4805             if (!FuncCloneAssignedToCurCallsiteClone) {
4806               // Find first function in FuncClonesToCallMap without an assigned
4807               // clone of this callsite Node. We should always have one
4808               // available at this point due to the earlier cloning when the
4809               // FuncClonesToCallMap size was smaller than the clone number.
4810               for (auto &CF : FuncClonesToCallMap) {
4811                 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) {
4812                   FuncCloneAssignedToCurCallsiteClone = CF.first;
4813                   break;
4814                 }
4815               }
4816               assert(FuncCloneAssignedToCurCallsiteClone);
4817               // Assign Clone to FuncCloneAssignedToCurCallsiteClone
4818               AssignCallsiteCloneToFuncClone(
4819                   FuncCloneAssignedToCurCallsiteClone, Call, Clone,
4820                   AllocationCallToContextNodeMap.count(Call));
4821             } else
4822               assert(FuncCloneToCurNodeCloneMap
4823                          [FuncCloneAssignedToCurCallsiteClone] == Clone);
4824             // Update callers to record function version called.
4825             RecordCalleeFuncOfCallsite(Edge->Caller,
4826                                        FuncCloneAssignedToCurCallsiteClone);
4827           }
4828         }
4829       }
4830       if (VerifyCCG) {
4831         checkNode<DerivedCCG, FuncTy, CallTy>(Node);
4832         for (const auto &PE : Node->CalleeEdges)
4833           checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4834         for (const auto &CE : Node->CallerEdges)
4835           checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4836         for (auto *Clone : Node->Clones) {
4837           checkNode<DerivedCCG, FuncTy, CallTy>(Clone);
4838           for (const auto &PE : Clone->CalleeEdges)
4839             checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee);
4840           for (const auto &CE : Clone->CallerEdges)
4841             checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller);
4842         }
4843       }
4844     }
4845   }
4846 
4847   uint8_t BothTypes =
4848       (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
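       // I.e. the mask with both the Cold and NotCold bits set, used below to
       // detect allocations whose contexts remained ambiguous after cloning.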
4849 
4850   auto UpdateCalls = [&](ContextNode *Node,
4851                          DenseSet<const ContextNode *> &Visited,
4852                          auto &&UpdateCalls) {
4853     auto Inserted = Visited.insert(Node);
4854     if (!Inserted.second)
4855       return;
4856 
4857     for (auto *Clone : Node->Clones)
4858       UpdateCalls(Clone, Visited, UpdateCalls);
4859 
4860     for (auto &Edge : Node->CallerEdges)
4861       UpdateCalls(Edge->Caller, Visited, UpdateCalls);
4862 
4863     // Skip if either no call to update, or if we ended up with no context ids
4864     // (we moved all edges onto other clones).
4865     if (!Node->hasCall() || Node->emptyContextIds())
4866       return;
4867 
4868     if (Node->IsAllocation) {
4869       auto AT = allocTypeToUse(Node->AllocTypes);
4870       // If the allocation type is ambiguous, and more aggressive hinting
4871       // has been enabled via the MinClonedColdBytePercent flag, see if this
4872       // allocation should be hinted cold anyway, because its fraction of
4873       // cold bytes allocated is at least the given threshold.
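           // For example, if MinClonedColdBytePercent is 80 and 850 of the
           // 1000 total profiled bytes for this node's contexts were cold,
           // then 850 * 100 >= 1000 * 80 holds and we hint the allocation
           // cold despite the ambiguous alloc type.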
4874       if (Node->AllocTypes == BothTypes && MinClonedColdBytePercent < 100 &&
4875           !ContextIdToContextSizeInfos.empty()) {
4876         uint64_t TotalCold = 0;
4877         uint64_t Total = 0;
4878         for (auto Id : Node->getContextIds()) {
4879           auto TypeI = ContextIdToAllocationType.find(Id);
4880           assert(TypeI != ContextIdToAllocationType.end());
4881           auto CSI = ContextIdToContextSizeInfos.find(Id);
4882           if (CSI != ContextIdToContextSizeInfos.end()) {
4883             for (auto &Info : CSI->second) {
4884               Total += Info.TotalSize;
4885               if (TypeI->second == AllocationType::Cold)
4886                 TotalCold += Info.TotalSize;
4887             }
4888           }
4889         }
4890         if (TotalCold * 100 >= Total * MinClonedColdBytePercent)
4891           AT = AllocationType::Cold;
4892       }
4893       updateAllocationCall(Node->Call, AT);
4894       assert(Node->MatchingCalls.empty());
4895       return;
4896     }
4897 
4898     if (!CallsiteToCalleeFuncCloneMap.count(Node))
4899       return;
4900 
4901     auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node];
4902     updateCall(Node->Call, CalleeFunc);
4903     // Update all the matching calls as well.
4904     for (auto &Call : Node->MatchingCalls)
4905       updateCall(Call, CalleeFunc);
4906   };
4907 
4908   // Performs DFS traversal starting from allocation nodes to update calls to
4909   // reflect cloning decisions recorded earlier. For regular LTO this will
4910   // update the actual calls in the IR to call the appropriate function clone
4911   // (and add attributes to allocation calls), whereas for ThinLTO the decisions
4912   // are recorded in the summary entries.
4913   DenseSet<const ContextNode *> Visited;
4914   for (auto &Entry : AllocationCallToContextNodeMap)
4915     UpdateCalls(Entry.second, Visited, UpdateCalls);
4916 
4917   return Changed;
4918 }
4919 
4920 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones(
4921     Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE,
4922     std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
4923         &FuncToAliasMap) {
4924   // The first "clone" is the original copy, we should only call this if we
4925   // needed to create new clones.
4926   assert(NumClones > 1);
4927   SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
4928   VMaps.reserve(NumClones - 1);
4929   FunctionsClonedThinBackend++;
4930   for (unsigned I = 1; I < NumClones; I++) {
4931     VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
4932     auto *NewF = CloneFunction(&F, *VMaps.back());
4933     FunctionClonesThinBackend++;
4934     // Strip memprof and callsite metadata from clone as they are no longer
4935     // needed.
4936     for (auto &BB : *NewF) {
4937       for (auto &Inst : BB) {
4938         Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
4939         Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
4940       }
4941     }
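         // Clone names append a ".memprof.<CloneNo>" style suffix to the
         // original name, e.g. clone 1 of foo becomes foo.memprof.1.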
4942     std::string Name = getMemProfFuncName(F.getName(), I);
4943     auto *PrevF = M.getFunction(Name);
4944     if (PrevF) {
4945       // We might have created this when adjusting a callsite in another
4946       // function. It should be a declaration.
4947       assert(PrevF->isDeclaration());
4948       NewF->takeName(PrevF);
4949       PrevF->replaceAllUsesWith(NewF);
4950       PrevF->eraseFromParent();
4951     } else
4952       NewF->setName(Name);
4953     if (auto *SP = NewF->getSubprogram())
4954       SP->replaceLinkageName(
4955           MDString::get(NewF->getParent()->getContext(), Name));
4956     ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
4957              << "created clone " << ore::NV("NewFunction", NewF));
4958 
4959     // Now handle aliases to this function, and clone those as well.
4960     if (!FuncToAliasMap.count(&F))
4961       continue;
4962     for (auto *A : FuncToAliasMap[&F]) {
4963       std::string Name = getMemProfFuncName(A->getName(), I);
4964       auto *PrevA = M.getNamedAlias(Name);
4965       auto *NewA = GlobalAlias::create(A->getValueType(),
4966                                        A->getType()->getPointerAddressSpace(),
4967                                        A->getLinkage(), Name, NewF);
4968       NewA->copyAttributesFrom(A);
4969       if (PrevA) {
4970         // We might have created this when adjusting a callsite in another
4971         // function. It should be a declaration.
4972         assert(PrevA->isDeclaration());
4973         NewA->takeName(PrevA);
4974         PrevA->replaceAllUsesWith(NewA);
4975         PrevA->eraseFromParent();
4976       }
4977     }
4978   }
4979   return VMaps;
4980 }
4981 
4982 // Locate the summary for F. This is complicated by the fact that it might
4983 // have been internalized or promoted.
4984 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
4985                                       const ModuleSummaryIndex *ImportSummary,
4986                                       const Function *CallingFunc = nullptr) {
4987   // FIXME: Ideally we would retain the original GUID in some fashion on the
4988   // function (e.g. as metadata), but for now do our best to locate the
4989   // summary without that information.
4990   ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
4991   if (!TheFnVI)
4992     // See if the function was internalized, by checking the index directly
4993     // with the original name (this avoids the name adjustment done by
4994     // getGUID() for internal symbols).
4995     TheFnVI = ImportSummary->getValueInfo(
4996         GlobalValue::getGUIDAssumingExternalLinkage(F.getName()));
4997   if (TheFnVI)
4998     return TheFnVI;
4999   // Now query with the original name before any promotion was performed.
5000   StringRef OrigName =
5001       ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
5002   // When this pass is enabled, we always add thinlto_src_file provenance
5003   // metadata to imported function definitions, which allows us to recreate the
5004   // original internal symbol's GUID.
5005   auto SrcFileMD = F.getMetadata("thinlto_src_file");
5006   // If this is a call to an imported/promoted local for which we didn't import
5007   // the definition, the metadata will not exist on the declaration. However,
5008   // since we are doing this early, before any inlining in the LTO backend, we
5009   // can simply look at the metadata on the calling function which must have
5010   // been from the same module if F was an internal symbol originally.
5011   if (!SrcFileMD && F.isDeclaration()) {
5012     // We would only call this for a declaration for a direct callsite, in which
5013     // case the caller would have provided the calling function pointer.
5014     assert(CallingFunc);
5015     SrcFileMD = CallingFunc->getMetadata("thinlto_src_file");
5016     // If this is a promoted local (OrigName != F.getName()), since this is a
5017     // declaration, it must be imported from a different module and therefore we
5018     // should always find the metadata on its calling function. Any call to a
5019     // promoted local that came from this module should still be a definition.
5020     assert(SrcFileMD || OrigName == F.getName());
5021   }
5022   StringRef SrcFile = M.getSourceFileName();
5023   if (SrcFileMD)
5024     SrcFile = dyn_cast<MDString>(SrcFileMD->getOperand(0))->getString();
5025   std::string OrigId = GlobalValue::getGlobalIdentifier(
5026       OrigName, GlobalValue::InternalLinkage, SrcFile);
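       // Illustratively, for an internal function foo originally defined in
       // a.cpp, this reconstructs a source-file-qualified identifier (roughly
       // "a.cpp;foo"; the exact delimiter is an implementation detail of
       // getGlobalIdentifier) whose hash is the pre-promotion GUID.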
5027   TheFnVI = ImportSummary->getValueInfo(
5028       GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5029   // An internal function in the original module may have gotten a numbered
5030   // suffix if we imported an external function with the same name. This
5031   // happens automatically during IR linking for naming conflicts. It must
5032   // still be internal in that case (otherwise it would have been renamed on
5033   // promotion, in which case we wouldn't have a naming conflict).
5034   if (!TheFnVI && OrigName == F.getName() && F.hasLocalLinkage() &&
5035       F.getName().contains('.')) {
5036     OrigName = F.getName().rsplit('.').first;
5037     OrigId = GlobalValue::getGlobalIdentifier(
5038         OrigName, GlobalValue::InternalLinkage, SrcFile);
5039     TheFnVI = ImportSummary->getValueInfo(
5040         GlobalValue::getGUIDAssumingExternalLinkage(OrigId));
5041   }
5042   // The only way we may not have a VI is if this is a declaration created for
5043   // an imported reference. For distributed ThinLTO we may not have a VI for
5044   // such declarations in the distributed summary.
5045   assert(TheFnVI || F.isDeclaration());
5046   return TheFnVI;
5047 }
5048 
5049 bool MemProfContextDisambiguation::initializeIndirectCallPromotionInfo(
5050     Module &M) {
5051   ICallAnalysis = std::make_unique<ICallPromotionAnalysis>();
5052   Symtab = std::make_unique<InstrProfSymtab>();
5053   // Don't add canonical names, to avoid mapping multiple functions to the
5054   // same symtab entry when they share the same root name with "." suffixes
5055   // stripped. If we picked the wrong one, this could lead to incorrect ICP,
5056   // calling a memprof clone that we never create (resulting in linker unsats).
5057   // What this means is that the GUID of the function (or its PGOFuncName
5058   // metadata) *must* match that in the VP metadata to allow promotion.
5059   // In practice this should not be a limitation, since local functions should
5060   // have PGOFuncName metadata and global function names shouldn't need any
5061   // special handling (they should not get the ".llvm.*" suffix that the
5062   // canonicalization handling is attempting to strip).
5063   if (Error E = Symtab->create(M, /*InLTO=*/true, /*AddCanonical=*/false)) {
5064     std::string SymtabFailure = toString(std::move(E));
5065     M.getContext().emitError("Failed to create symtab: " + SymtabFailure);
5066     return false;
5067   }
5068   return true;
5069 }
5070 
5071 #ifndef NDEBUG
5072 // Sanity check that the MIB stack ids match between the summary and
5073 // instruction metadata.
5074 static void checkAllocContextIds(
5075     const AllocInfo &AllocNode, const MDNode *MemProfMD,
5076     const CallStack<MDNode, MDNode::op_iterator> &CallsiteContext,
5077     const ModuleSummaryIndex *ImportSummary) {
5078   auto MIBIter = AllocNode.MIBs.begin();
5079   for (auto &MDOp : MemProfMD->operands()) {
5080     assert(MIBIter != AllocNode.MIBs.end());
5081     auto StackIdIndexIter = MIBIter->StackIdIndices.begin();
5082     auto *MIBMD = cast<const MDNode>(MDOp);
5083     MDNode *StackMDNode = getMIBStackNode(MIBMD);
5084     assert(StackMDNode);
5085     CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode);
5086     auto ContextIterBegin =
5087         StackContext.beginAfterSharedPrefix(CallsiteContext);
5088     // Initialize so the duplicate check below is skipped on the first iteration.
5089     uint64_t LastStackContextId =
5090         (ContextIterBegin != StackContext.end() && *ContextIterBegin == 0) ? 1
5091                                                                            : 0;
5092     for (auto ContextIter = ContextIterBegin; ContextIter != StackContext.end();
5093          ++ContextIter) {
5094       // If this is a direct recursion, simply skip the duplicate
5095       // entries, to be consistent with how the summary ids were
5096       // generated during ModuleSummaryAnalysis.
5097       if (LastStackContextId == *ContextIter)
5098         continue;
5099       LastStackContextId = *ContextIter;
5100       assert(StackIdIndexIter != MIBIter->StackIdIndices.end());
5101       assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5102              *ContextIter);
5103       StackIdIndexIter++;
5104     }
5105     MIBIter++;
5106   }
5107 }
5108 #endif
5109 
5110 bool MemProfContextDisambiguation::applyImport(Module &M) {
5111   assert(ImportSummary);
5112   bool Changed = false;
5113 
5114   // We also need to clone any aliases that reference cloned functions, because
5115   // the modified callsites may invoke via the alias. Keep track of the aliases
5116   // for each function.
5117   std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
5118       FuncToAliasMap;
5119   for (auto &A : M.aliases()) {
5120     auto *Aliasee = A.getAliaseeObject();
5121     if (auto *F = dyn_cast<Function>(Aliasee))
5122       FuncToAliasMap[F].insert(&A);
5123   }
5124 
5125   if (!initializeIndirectCallPromotionInfo(M))
5126     return false;
5127 
5128   for (auto &F : M) {
5129     if (F.isDeclaration() || isMemProfClone(F))
5130       continue;
5131 
5132     OptimizationRemarkEmitter ORE(&F);
5133 
5134     SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
5135     bool ClonesCreated = false;
5136     unsigned NumClonesCreated = 0;
5137     auto CloneFuncIfNeeded = [&](unsigned NumClones) {
5138       // We should at least have version 0 which is the original copy.
5139       assert(NumClones > 0);
5140       // If only one copy is needed, use the original.
5141       if (NumClones == 1)
5142         return;
5143       // If we already performed cloning of this function, confirm that the
5144       // requested number of clones matches (the thin link should ensure the
5145       // number of clones for each constituent callsite is consistent within
5146       // each function), before returning.
5147       if (ClonesCreated) {
5148         assert(NumClonesCreated == NumClones);
5149         return;
5150       }
5151       VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
5152       // The first "clone" is the original copy, which doesn't have a VMap.
5153       assert(VMaps.size() == NumClones - 1);
5154       Changed = true;
5155       ClonesCreated = true;
5156       NumClonesCreated = NumClones;
5157     };
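         // CloneFuncIfNeeded is invoked lazily below, the first time an alloc
         // or callsite summary record for F requires more than one version;
         // later invocations for the same function just assert that the
         // requested clone count is unchanged.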
5158 
5159     auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
5160                              Function *CalledFunction) {
5161       // Perform cloning if not yet done.
5162       CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
5163 
5164       assert(!isMemProfClone(*CalledFunction));
5165 
5166       // Because we update the cloned calls by calling setCalledOperand (see
5167       // comment below), out of an abundance of caution make sure the called
5168       // function was actually the called operand (or its aliasee). We also
5169       // strip pointer casts when looking for calls (to match behavior during
5170       // summary generation), however, with opaque pointers in theory this
5171       // should not be an issue. Note we still clone the current function
5172       // (containing this call) above, as that could be needed for its callers.
5173       auto *GA = dyn_cast_or_null<GlobalAlias>(CB->getCalledOperand());
5174       if (CalledFunction != CB->getCalledOperand() &&
5175           (!GA || CalledFunction != GA->getAliaseeObject())) {
5176         SkippedCallsCloning++;
5177         return;
5178       }
5179       // Update the calls per the summary info.
5180       // Save orig name since it gets updated in the first iteration
5181       // below.
5182       auto CalleeOrigName = CalledFunction->getName();
5183       for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
5184         // Do nothing if this version calls the original version of its
5185         // callee.
5186         if (!StackNode.Clones[J])
5187           continue;
5188         auto NewF = M.getOrInsertFunction(
5189             getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
5190             CalledFunction->getFunctionType());
5191         CallBase *CBClone;
5192         // Copy 0 is the original function.
5193         if (!J)
5194           CBClone = CB;
5195         else
5196           CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5197         // Set the called operand directly instead of calling setCalledFunction,
5198         // as the latter mutates the function type on the call. In rare cases
5199         // we may have a slightly different type on a callee function
5200         // declaration due to it being imported from a different module with
5201         // incomplete types. We really just want to change the name of the
5202         // function to the clone, and not make any type changes.
5203         CBClone->setCalledOperand(NewF.getCallee());
5204         ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5205                  << ore::NV("Call", CBClone) << " in clone "
5206                  << ore::NV("Caller", CBClone->getFunction())
5207                  << " assigned to call function clone "
5208                  << ore::NV("Callee", NewF.getCallee()));
5209       }
5210     };
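         // CloneCallsite is invoked below both for callsites matched to
         // summary records via stack id metadata, and for tail calls matched
         // via the synthesized callsite records.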
5211 
5212     // Locate the summary for F.
5213     ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
5214     // If not found, this could be an imported local (see comment in
5215     // findValueInfoForFunc). Skip for now as it will be cloned in its original
5216     // module (where it would have been promoted to global scope so should
5217     // satisfy any reference in this module).
5218     if (!TheFnVI)
5219       continue;
5220 
5221     auto *GVSummary =
5222         ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
5223     if (!GVSummary) {
5224       // Must have been imported, use the summary which matches the definition
5225       // (might be multiple if this was a linkonce_odr).
5226       auto SrcModuleMD = F.getMetadata("thinlto_src_module");
5227       assert(SrcModuleMD &&
5228              "enable-import-metadata is needed to emit thinlto_src_module");
5229       StringRef SrcModule =
5230           dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString();
5231       for (auto &GVS : TheFnVI.getSummaryList()) {
5232         if (GVS->modulePath() == SrcModule) {
5233           GVSummary = GVS.get();
5234           break;
5235         }
5236       }
5237       assert(GVSummary && GVSummary->modulePath() == SrcModule);
5238     }
5239 
5240     // If this was an imported alias skip it as we won't have the function
5241     // summary, and it should be cloned in the original module.
5242     if (isa<AliasSummary>(GVSummary))
5243       continue;
5244 
5245     auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject());
5246 
5247     if (FS->allocs().empty() && FS->callsites().empty())
5248       continue;
5249 
5250     auto SI = FS->callsites().begin();
5251     auto AI = FS->allocs().begin();
5252 
5253     // To handle callsite infos synthesized for tail calls which have missing
5254     // frames in the profiled context, map callee VI to the synthesized callsite
5255     // info.
5256     DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite;
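         // Illustrative example: if the profiled context shows frame foo
         // directly calling the allocating bar, but in the IR foo reaches bar
         // only through a tail call to an intermediate function, the thin
         // link synthesized a CallsiteInfo (with empty StackIdIndices) keyed
         // by the discovered callee; we collect those here to handle such
         // calls below.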
5257     // Iterate the callsites for this function in reverse, since we place all
5258     // those synthesized for tail calls at the end.
5259     for (auto CallsiteIt = FS->callsites().rbegin();
5260          CallsiteIt != FS->callsites().rend(); CallsiteIt++) {
5261       auto &Callsite = *CallsiteIt;
5262       // Stop as soon as we see a non-synthesized callsite info (see comment
5263       // above loop). All the entries added for discovered tail calls have empty
5264       // stack ids.
5265       if (!Callsite.StackIdIndices.empty())
5266         break;
5267       MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite});
5268     }
5269 
5270     // Keeps track of needed ICP for the function.
5271     SmallVector<ICallAnalysisData> ICallAnalysisInfo;
5272 
5273     // Assume for now that the instructions are in the exact same order
5274     // as when the summary was created, but confirm this is correct by
5275     // matching the stack ids.
5276     for (auto &BB : F) {
5277       for (auto &I : BB) {
5278         auto *CB = dyn_cast<CallBase>(&I);
5279         // Same handling as when creating module summary.
5280         if (!mayHaveMemprofSummary(CB))
5281           continue;
5282 
5283         auto *CalledValue = CB->getCalledOperand();
5284         auto *CalledFunction = CB->getCalledFunction();
5285         if (CalledValue && !CalledFunction) {
5286           CalledValue = CalledValue->stripPointerCasts();
5287           // Stripping pointer casts can reveal a called function.
5288           CalledFunction = dyn_cast<Function>(CalledValue);
5289         }
5290         // Check if this is an alias to a function. If so, get the
5291         // called aliasee for the checks below.
5292         if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) {
5293           assert(!CalledFunction &&
5294                  "Expected null called function in callsite for alias");
5295           CalledFunction = dyn_cast<Function>(GA->getAliaseeObject());
5296         }
5297 
5298         CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
5299             I.getMetadata(LLVMContext::MD_callsite));
5300         auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof);
5301 
5302         // Include allocs that were already assigned a memprof function
5303         // attribute in the statistics.
5304         if (CB->getAttributes().hasFnAttr("memprof")) {
5305           assert(!MemProfMD);
5306           CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold"
5307               ? AllocTypeColdThinBackend++
5308               : AllocTypeNotColdThinBackend++;
5309           OrigAllocsThinBackend++;
5310           AllocVersionsThinBackend++;
5311           if (!MaxAllocVersionsThinBackend)
5312             MaxAllocVersionsThinBackend = 1;
5313           continue;
5314         }
5315 
5316         if (MemProfMD) {
5317           // Consult the next alloc node.
5318           assert(AI != FS->allocs().end());
5319           auto &AllocNode = *(AI++);
5320 
5321 #ifndef NDEBUG
5322           checkAllocContextIds(AllocNode, MemProfMD, CallsiteContext,
5323                                ImportSummary);
5324 #endif
5325 
5326           // Perform cloning if not yet done.
5327           CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size());
5328 
5329           OrigAllocsThinBackend++;
5330           AllocVersionsThinBackend += AllocNode.Versions.size();
5331           if (MaxAllocVersionsThinBackend < AllocNode.Versions.size())
5332             MaxAllocVersionsThinBackend = AllocNode.Versions.size();
5333 
5334           // If there is only one version, that means we didn't end up
5335           // considering this function for cloning, in which case the alloc
5336           // will still be None type or should have gotten the default NotCold.
5337           // Skip it, but only after calling the clone helper, since the helper
5338           // does some sanity checks confirming we haven't yet decided that
5339           // cloning is needed. We might have a single version that is Cold due
5340           // to the MinClonedColdBytePercent heuristic; make sure we don't skip
5341           // in that case.
5342           if (AllocNode.Versions.size() == 1 &&
5343               (AllocationType)AllocNode.Versions[0] != AllocationType::Cold) {
5344             assert((AllocationType)AllocNode.Versions[0] ==
5345                        AllocationType::NotCold ||
5346                    (AllocationType)AllocNode.Versions[0] ==
5347                        AllocationType::None);
5348             UnclonableAllocsThinBackend++;
5349             continue;
5350           }
5351 
5352           // All versions should have a singular allocation type.
5353           assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) {
5354             return Type == ((uint8_t)AllocationType::NotCold |
5355                             (uint8_t)AllocationType::Cold);
5356           }));
5357 
5358           // Update the allocation types per the summary info.
5359           for (unsigned J = 0; J < AllocNode.Versions.size(); J++) {
5360             // Ignore any that didn't get an assigned allocation type.
5361             if (AllocNode.Versions[J] == (uint8_t)AllocationType::None)
5362               continue;
5363             AllocationType AllocTy = (AllocationType)AllocNode.Versions[J];
5364             AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++
5365                                             : AllocTypeNotColdThinBackend++;
5366             std::string AllocTypeString = getAllocTypeAttributeString(AllocTy);
5367             auto A = llvm::Attribute::get(F.getContext(), "memprof",
5368                                           AllocTypeString);
5369             CallBase *CBClone;
5370             // Copy 0 is the original function.
5371             if (!J)
5372               CBClone = CB;
5373             else
5374               // Since VMaps are only created for new clones, we index with
5375               // clone J-1 (J==0 is the original clone and does not have a VMaps
5376               // entry).
5377               CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5378             CBClone->addFnAttr(A);
5379             ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone)
5380                      << ore::NV("AllocationCall", CBClone) << " in clone "
5381                      << ore::NV("Caller", CBClone->getFunction())
5382                      << " marked with memprof allocation attribute "
5383                      << ore::NV("Attribute", AllocTypeString));
5384           }
5385         } else if (!CallsiteContext.empty()) {
5386           if (!CalledFunction) {
5387 #ifndef NDEBUG
5388             // We should have skipped inline assembly calls.
5389             auto *CI = dyn_cast<CallInst>(CB);
5390             assert(!CI || !CI->isInlineAsm());
5391 #endif
5392             // We should have skipped direct calls via a Constant.
5393             assert(CalledValue && !isa<Constant>(CalledValue));
5394 
5395             // This is an indirect call, see if we have profile information and
5396             // whether any clones were recorded for the profiled targets (that
5397             // we synthesized CallsiteInfo summary records for when building the
5398             // index).
5399             auto NumClones =
5400                 recordICPInfo(CB, FS->callsites(), SI, ICallAnalysisInfo);
5401 
5402             // Perform cloning if not yet done. This is done here in case
5403             // we don't need to do ICP, but might need to clone this
5404             // function as it is the target of other cloned calls.
5405             if (NumClones)
5406               CloneFuncIfNeeded(NumClones);
5407           } else {
5410             // Consult the next callsite node.
5411             assert(SI != FS->callsites().end());
5412             auto &StackNode = *(SI++);
5413 
5414 #ifndef NDEBUG
5415             // Sanity check that the stack ids match between the summary and
5416             // instruction metadata.
5417             auto StackIdIndexIter = StackNode.StackIdIndices.begin();
5418             for (auto StackId : CallsiteContext) {
5419               assert(StackIdIndexIter != StackNode.StackIdIndices.end());
5420               assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) ==
5421                      StackId);
5422               StackIdIndexIter++;
5423             }
5424 #endif
5425 
5426             CloneCallsite(StackNode, CB, CalledFunction);
5427           }
5428         } else if (CB->isTailCall() && CalledFunction) {
5429           // Locate the synthesized callsite info for the callee VI, if any was
5430           // created, and use that for cloning.
5431           ValueInfo CalleeVI =
5432               findValueInfoForFunc(*CalledFunction, M, ImportSummary, &F);
5433           if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) {
5434             auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI);
5435             assert(Callsite != MapTailCallCalleeVIToCallsite.end());
5436             CloneCallsite(Callsite->second, CB, CalledFunction);
5437           }
5438         }
5439       }
5440     }
5441 
5442     // Now do any promotion required for cloning.
5443     performICP(M, FS->callsites(), VMaps, ICallAnalysisInfo, ORE);
5444   }
5445 
5446   // We skip some of the functions and instructions above, so remove all the
5447   // metadata in a single sweep here.
5448   for (auto &F : M) {
5449     // We can skip memprof clones because createFunctionClones already strips
5450     // the metadata from the newly created clones.
5451     if (F.isDeclaration() || isMemProfClone(F))
5452       continue;
5453     for (auto &BB : F) {
5454       for (auto &I : BB) {
5455         if (!isa<CallBase>(I))
5456           continue;
5457         I.setMetadata(LLVMContext::MD_memprof, nullptr);
5458         I.setMetadata(LLVMContext::MD_callsite, nullptr);
5459       }
5460     }
5461   }
5462 
5463   return Changed;
5464 }
5465 
5466 unsigned MemProfContextDisambiguation::recordICPInfo(
5467     CallBase *CB, ArrayRef<CallsiteInfo> AllCallsites,
5468     ArrayRef<CallsiteInfo>::iterator &SI,
5469     SmallVector<ICallAnalysisData> &ICallAnalysisInfo) {
5470   // First see if we have profile information for this indirect call.
5471   uint32_t NumCandidates;
5472   uint64_t TotalCount;
5473   auto CandidateProfileData =
5474       ICallAnalysis->getPromotionCandidatesForInstruction(CB, TotalCount,
5475                                                           NumCandidates);
5476   if (CandidateProfileData.empty())
5477     return 0;
5478 
5479   // Iterate through all of the candidate profiled targets along with the
5480   // CallsiteInfo summary records synthesized for them when building the index,
5481   // and see if any are cloned and/or refer to clones.
5482   bool ICPNeeded = false;
5483   unsigned NumClones = 0;
5484   size_t CallsiteInfoStartIndex = std::distance(AllCallsites.begin(), SI);
5485   for (const auto &Candidate : CandidateProfileData) {
5486 #ifndef NDEBUG
5487     auto CalleeValueInfo =
5488 #endif
5489         ImportSummary->getValueInfo(Candidate.Value);
5490     assert(SI != AllCallsites.end());
5491     // We might not have a ValueInfo if this is a distributed
5492     // ThinLTO backend and decided not to import that function.
5493     assert(!CalleeValueInfo || SI->Callee == CalleeValueInfo);
5494     auto &StackNode = *(SI++);
5495     // See if any of the clones of the indirect callsite for this
5496     // profiled target should call a cloned version of the profiled
5497     // target. We only need to do the ICP here if so.
5498     ICPNeeded |= llvm::any_of(StackNode.Clones,
5499                               [](unsigned CloneNo) { return CloneNo != 0; });
5500     // Every callsite in the same function should have been cloned the same
5501     // number of times.
5502     assert(!NumClones || NumClones == StackNode.Clones.size());
5503     NumClones = StackNode.Clones.size();
5504   }
5505   if (!ICPNeeded)
5506     return NumClones;
5507   // Save information for ICP, which is performed later to avoid messing up the
5508   // current function traversal.
5509   ICallAnalysisInfo.push_back({CB, CandidateProfileData.vec(), NumCandidates,
5510                                TotalCount, CallsiteInfoStartIndex});
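       // Returning the clone count lets the caller clone the enclosing
       // function even when no ICP is needed here (e.g. all profiled targets
       // call their original versions) but other callsites require cloning.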
5511   return NumClones;
5512 }
5513 
5514 void MemProfContextDisambiguation::performICP(
5515     Module &M, ArrayRef<CallsiteInfo> AllCallsites,
5516     ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
5517     ArrayRef<ICallAnalysisData> ICallAnalysisInfo,
5518     OptimizationRemarkEmitter &ORE) {
5519   // Now do any promotion required for cloning. Specifically, for each
5520   // recorded ICP candidate (which was only recorded because one clone of that
5521   // candidate should call a cloned target), we perform ICP (speculative
5522   // devirtualization) for each clone of the callsite, and update its callee
5523   // to the appropriate clone. Note that the ICP compares against the original
5524   // version of the target, which is what is in the vtable.
5525   for (auto &Info : ICallAnalysisInfo) {
5526     auto *CB = Info.CB;
5527     auto CallsiteIndex = Info.CallsiteInfoStartIndex;
5528     auto TotalCount = Info.TotalCount;
5529     unsigned NumPromoted = 0;
5530     unsigned NumClones = 0;
5531 
5532     for (auto &Candidate : Info.CandidateProfileData) {
5533       auto &StackNode = AllCallsites[CallsiteIndex++];
5534 
5535       // All calls in the same function must have the same number of clones.
5536       assert(!NumClones || NumClones == StackNode.Clones.size());
5537       NumClones = StackNode.Clones.size();
5538 
5539       // See if the target is in the module. If it wasn't imported, it is
5540       // possible that this profile could have been collected on a different
5541       // target (or version of the code), and we need to be conservative
5542       // (similar to what is done in the ICP pass).
5543       Function *TargetFunction = Symtab->getFunction(Candidate.Value);
5544       if (TargetFunction == nullptr ||
5545           // Any ThinLTO global dead symbol removal should have already
5546           // occurred, so it should be safe to promote when the target is a
5547           // declaration.
5548           // TODO: Remove internal option once more fully tested.
5549           (MemProfRequireDefinitionForPromotion &&
5550            TargetFunction->isDeclaration())) {
5551         ORE.emit([&]() {
5552           return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", CB)
5553                  << "Memprof cannot promote indirect call: target with md5sum "
5554                  << ore::NV("target md5sum", Candidate.Value) << " not found";
5555         });
5556         // FIXME: See if we can use the new declaration importing support to
5557         // at least get the declarations imported for this case. Hot indirect
5558         // targets should have been imported normally, however.
5559         continue;
5560       }
5561 
5562       // Check if it is legal to promote.
5563       const char *Reason = nullptr;
5564       if (!isLegalToPromote(*CB, TargetFunction, &Reason)) {
5565         ORE.emit([&]() {
5566           return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", CB)
5567                  << "Memprof cannot promote indirect call to "
5568                  << ore::NV("TargetFunction", TargetFunction)
5569                  << " with count of " << ore::NV("TotalCount", TotalCount)
5570                  << ": " << Reason;
5571         });
5572         continue;
5573       }
5574 
5575       assert(!isMemProfClone(*TargetFunction));
5576 
5577       // Handle each call clone, applying ICP so that each clone directly
5578       // calls the specified callee clone, guarded by the appropriate ICP
5579       // check.
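           // Conceptually the promotion below rewrites each clone's indirect
           // call into a guarded direct call, roughly (sketch):
           //   if (FnPtr == &Target) Target(...); else (*FnPtr)(...);
           // after which we retarget the new direct call at the desired clone.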
5580       CallBase *CBClone = CB;
5581       for (unsigned J = 0; J < NumClones; J++) {
5582         // Copy 0 is the original function.
5583         if (J > 0)
5584           CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5585         // We do the promotion using the original name, so that the comparison
5586         // is against the name in the vtable. Then just below, change the new
5587         // direct call to call the cloned function.
5588         auto &DirectCall =
5589             pgo::promoteIndirectCall(*CBClone, TargetFunction, Candidate.Count,
5590                                      TotalCount, isSamplePGO, &ORE);
5591         auto *TargetToUse = TargetFunction;
5592         // Use the original callee if this clone calls the original version
5593         // of its callee; otherwise redirect to the assigned callee clone.
5594         if (StackNode.Clones[J]) {
5595           TargetToUse =
5596               cast<Function>(M.getOrInsertFunction(
5597                                   getMemProfFuncName(TargetFunction->getName(),
5598                                                      StackNode.Clones[J]),
5599                                   TargetFunction->getFunctionType())
5600                                  .getCallee());
5601         }
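        // Naming note (an assumption, for illustration): getMemProfFuncName
        // is expected to produce names of the form <name>.memprof.<CloneNo>,
        // e.g. "foo.memprof.1" for clone 1 of "foo".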
5602         DirectCall.setCalledFunction(TargetToUse);
5603         // During matching we generate synthetic VP metadata, from the memprof
5604         // profile's callee GUIDs, for indirect calls that don't already have any.
5605         // If we subsequently promote and inline those callees, we currently lose
5606         // the ability to generate this synthetic VP metadata. Optionally apply a
5607         // noinline attribute to promoted direct calls, with the threshold set to
5608         // capture synthetic VP metadata targets, which get a count of 1.
5609         if (MemProfICPNoInlineThreshold &&
5610             Candidate.Count < MemProfICPNoInlineThreshold)
5611           DirectCall.setIsNoInline();
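        // E.g., with a threshold of 2 (hypothetical value), promoted calls
        // whose candidate count is 1, i.e. the synthetic VP metadata targets
        // mentioned above, are marked noinline.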
5612         ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
5613                  << ore::NV("Call", CBClone) << " in clone "
5614                  << ore::NV("Caller", CBClone->getFunction())
5615                  << " promoted and assigned to call function clone "
5616                  << ore::NV("Callee", TargetToUse));
5617       }
5618 
5619       // Update TotalCount (all clones should get the same count above).
5620       TotalCount -= Candidate.Count;
5621       NumPromoted++;
5622     }
5623     // Adjust the MD.prof metadata for all clones, now that we have the new
5624     // TotalCount and the number promoted.
5625     CallBase *CBClone = CB;
5626     for (unsigned J = 0; J < NumClones; J++) {
5627       // Copy 0 is the original function.
5628       if (J > 0)
5629         CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
5630       // First delete the old one.
5631       CBClone->setMetadata(LLVMContext::MD_prof, nullptr);
5632       // If all candidates were promoted, we don't need the MD.prof metadata.
5633       // Otherwise we need to re-annotate it with the un-promoted records.
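      // For illustration (hypothetical numbers): if 1 of 3 candidate records
      // was promoted, the 2 remaining records are re-attached below with
      // TotalCount reduced by the promoted candidate's count.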
5634       if (TotalCount != 0)
5635         annotateValueSite(
5636             M, *CBClone, ArrayRef(Info.CandidateProfileData).slice(NumPromoted),
5637             TotalCount, IPVK_IndirectCallTarget, Info.NumCandidates);
5638     }
5639   }
5640 }
5641 
5642 template <typename DerivedCCG, typename FuncTy, typename CallTy>
5643 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() {
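  // High-level flow, per the calls below: optionally dump/verify the graph,
  // identify the clones needed to disambiguate allocation contexts, assign
  // them to function clones, then optionally dump/verify again and report
  // hinted sizes.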
5644   if (DumpCCG) {
5645     dbgs() << "CCG before cloning:\n";
5646     dbgs() << *this;
5647   }
5648   if (ExportToDot)
5649     exportToDot("postbuild");
5650 
5651   if (VerifyCCG) {
5652     check();
5653   }
5654 
5655   identifyClones();
5656 
5657   if (VerifyCCG) {
5658     check();
5659   }
5660 
5661   if (DumpCCG) {
5662     dbgs() << "CCG after cloning:\n";
5663     dbgs() << *this;
5664   }
5665   if (ExportToDot)
5666     exportToDot("cloned");
5667 
5668   bool Changed = assignFunctions();
5669 
5670   if (DumpCCG) {
5671     dbgs() << "CCG after assigning function clones:\n";
5672     dbgs() << *this;
5673   }
5674   if (ExportToDot)
5675     exportToDot("clonefuncassign");
5676 
5677   if (MemProfReportHintedSizes)
5678     printTotalSizes(errs());
5679 
5680   return Changed;
5681 }
5682 
5683 bool MemProfContextDisambiguation::processModule(
5684     Module &M,
5685     llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
5686 
5687   // If we have an import summary, then the cloning decisions were made during
5688   // the thin link on the index. Apply them and return.
5689   if (ImportSummary)
5690     return applyImport(M);
5691 
5692   // TODO: If/when other types of memprof cloning are enabled beyond just for
5693   // hot and cold, we will need to change this to individually control the
5694   // AllocationType passed to addStackNodesForMIB during CCG construction.
5695   // Note that we specifically check this after applying imports above, so that
5696   // the option doesn't need to be passed to distributed ThinLTO backend
5697   // clang processes, which won't necessarily have visibility into the linker
5698   // dependences. Instead the information is communicated from the LTO link to
5699   // the backends via the combined summary index.
5700   if (!SupportsHotColdNew)
5701     return false;
5702 
5703   ModuleCallsiteContextGraph CCG(M, OREGetter);
5704   return CCG.process();
5705 }
5706 
5707 MemProfContextDisambiguation::MemProfContextDisambiguation(
5708     const ModuleSummaryIndex *Summary, bool isSamplePGO)
5709     : ImportSummary(Summary), isSamplePGO(isSamplePGO) {
5710   // Check the dot graph printing options once here, to make sure we have valid
5711   // and expected combinations.
5712   if (DotGraphScope == DotScope::Alloc && !AllocIdForDot.getNumOccurrences())
5713     llvm::report_fatal_error(
5714         "-memprof-dot-scope=alloc requires -memprof-dot-alloc-id");
5715   if (DotGraphScope == DotScope::Context &&
5716       !ContextIdForDot.getNumOccurrences())
5717     llvm::report_fatal_error(
5718         "-memprof-dot-scope=context requires -memprof-dot-context-id");
5719   if (DotGraphScope == DotScope::All && AllocIdForDot.getNumOccurrences() &&
5720       ContextIdForDot.getNumOccurrences())
5721     llvm::report_fatal_error(
5722         "-memprof-dot-scope=all can't have both -memprof-dot-alloc-id and "
5723         "-memprof-dot-context-id");
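  // E.g., -memprof-dot-scope=alloc -memprof-dot-alloc-id=<id> and
  // -memprof-dot-scope=context -memprof-dot-context-id=<id> are valid
  // combinations, while -memprof-dot-scope=all accepts at most one of the
  // two id options.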
5724   if (ImportSummary) {
5725     // The MemProfImportSummary should only be used for testing ThinLTO
5726     // distributed backend handling via opt, in which case we don't have a
5727     // summary from the pass pipeline.
5728     assert(MemProfImportSummary.empty());
5729     return;
5730   }
5731   if (MemProfImportSummary.empty())
5732     return;
5733 
5734   auto ReadSummaryFile =
5735       errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary));
5736   if (!ReadSummaryFile) {
5737     logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(),
5738                           "Error loading file '" + MemProfImportSummary +
5739                               "': ");
5740     return;
5741   }
5742   auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile);
5743   if (!ImportSummaryForTestingOrErr) {
5744     logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(),
5745                           "Error parsing file '" + MemProfImportSummary +
5746                               "': ");
5747     return;
5748   }
5749   ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr);
5750   ImportSummary = ImportSummaryForTesting.get();
5751 }
5752 
5753 PreservedAnalyses MemProfContextDisambiguation::run(Module &M,
5754                                                     ModuleAnalysisManager &AM) {
5755   auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
5756   auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
5757     return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
5758   };
5759   if (!processModule(M, OREGetter))
5760     return PreservedAnalyses::all();
5761   return PreservedAnalyses::none();
5762 }
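
// A minimal usage sketch for the testing path handled by the constructor
// above (the pass name and flag spelling are assumptions for illustration,
// not confirmed by this section):
//   opt -passes=memprof-context-disambiguation \
//       -memprof-import-summary=summary.thinlto.bc input.bc -S -o out.ll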
5763 
5764 void MemProfContextDisambiguation::run(
5765     ModuleSummaryIndex &Index,
5766     llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
5767         isPrevailing) {
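  // Thin link entry point: cloning decisions made on the index here are
  // recorded in the combined summary and applied to the IR later in the
  // backends (see the ImportSummary handling in processModule above).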
5768   // TODO: If/when other types of memprof cloning are enabled beyond just for
5769   // hot and cold, we will need to change this to individually control the
5770   // AllocationType passed to addStackNodesForMIB during CCG construction.
5771   // The index was set from the option, so these should be in sync.
5772   assert(Index.withSupportsHotColdNew() == SupportsHotColdNew);
5773   if (!SupportsHotColdNew)
5774     return;
5775 
5776   IndexCallsiteContextGraph CCG(Index, isPrevailing);
5777   CCG.process();
5778 }
5779