//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses Memprof
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <deque>
#include <sstream>
#include <unordered_map>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;

#define DEBUG_TYPE "memprof-context-disambiguation"

STATISTIC(FunctionClonesAnalysis,
          "Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
          "Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
          "Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                            "cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                         "during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
          "Number of not cold static allocations (possibly cloned) during "
          "ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
                                    "(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
          "Number of original (not cloned) allocations with memprof profiles "
          "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
          "Maximum number of allocation versions created for an original "
          "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");
STATISTIC(RemovedEdgesWithMismatchedCallees,
          "Number of edges removed due to mismatched callees (profiled vs IR)");
STATISTIC(FoundProfiledCalleeCount,
          "Number of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeDepth,
          "Aggregate depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeMaxDepth,
          "Maximum depth of profiled callees found via tail calls");
STATISTIC(FoundProfiledCalleeNonUniquelyCount,
          "Number of profiled callees found via multiple tail call chains");

static cl::opt<std::string> DotFilePathPrefix(
    "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
    cl::value_desc("filename"),
    cl::desc("Specify the path prefix of the MemProf dot files."));

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
                                 cl::Hidden,
                                 cl::desc("Export graph to dot files."));

static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
            cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
              cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
    "memprof-import-summary",
    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
    cl::Hidden);

static cl::opt<unsigned>
    TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5),
                        cl::Hidden,
                        cl::desc("Max depth to recursively search for missing "
                                 "frames through tail calls."));

namespace llvm {
cl::opt<bool> EnableMemProfContextDisambiguation(
    "enable-memprof-context-disambiguation", cl::init(false), cl::Hidden,
    cl::ZeroOrMore, cl::desc("Enable MemProf context disambiguation"));

// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));
} // namespace llvm

extern cl::opt<bool> MemProfReportHintedSizes;

namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, updating it to reflect any inlining performed on
/// those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
class CallsiteContextGraph {
public:
  CallsiteContextGraph() = default;
  CallsiteContextGraph(const CallsiteContextGraph &) = default;
  CallsiteContextGraph(CallsiteContextGraph &&) = default;

  /// Main entry point to perform analysis and transformations on graph.
  bool process();

  /// Perform cloning on the graph necessary to uniquely identify the allocation
  /// behavior of an allocation based on its context.
  void identifyClones();

  /// Assign callsite clones to functions, cloning functions as needed to
  /// accommodate the combinations of their callsite clones reached by callers.
  /// For regular LTO this clones functions and callsites in the IR, but for
  /// ThinLTO the cloning decisions are noted in the summaries and later applied
  /// in applyImport.
  bool assignFunctions();

  void dump() const;
  void print(raw_ostream &OS) const;
  void printTotalSizes(raw_ostream &OS) const;

  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const CallsiteContextGraph &CCG) {
    CCG.print(OS);
    return OS;
  }

  friend struct GraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
  friend struct DOTGraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;

  void exportToDot(std::string Label) const;

  /// Represents a function clone via FuncTy pointer and clone number pair.
  struct FuncInfo final
      : public std::pair<FuncTy *, unsigned /*Clone number*/> {
    using Base = std::pair<FuncTy *, unsigned>;
    FuncInfo(const Base &B) : Base(B) {}
    FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
    explicit operator bool() const { return this->first != nullptr; }
    FuncTy *func() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
  };

  /// Represents a callsite clone via CallTy and clone number pair.
  struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
    using Base = std::pair<CallTy, unsigned>;
    CallInfo(const Base &B) : Base(B) {}
    CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
        : Base(Call, CloneNo) {}
    explicit operator bool() const { return (bool)this->first; }
    CallTy call() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
    void setCloneNo(unsigned N) { this->second = N; }
    void print(raw_ostream &OS) const {
      if (!operator bool()) {
        assert(!cloneNo());
        OS << "null Call";
        return;
      }
      call()->print(OS);
      OS << "\t(clone " << cloneNo() << ")";
    }
    void dump() const {
      print(dbgs());
      dbgs() << "\n";
    }
    friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
      Call.print(OS);
      return OS;
    }
  };

  struct ContextEdge;

  /// Node in the Callsite Context Graph
  struct ContextNode {
    // Keep this for now since in the IR case where we have an Instruction* it
    // is not as immediately discoverable. Used for printing richer information
    // when dumping graph.
    bool IsAllocation;

    // Keeps track of when the Call was reset to null because there was
    // recursion.
    bool Recursive = false;

    // The corresponding allocation or interior call.
    CallInfo Call;

    // For alloc nodes this is a unique id assigned when constructed, and for
    // callsite stack nodes it is the original stack id when the node is
    // constructed from the memprof MIB metadata on the alloc nodes. Note that
    // this is only used when matching callsite metadata onto the stack nodes
    // created when processing the allocation memprof MIBs, and for labeling
    // nodes in the dot graph. Therefore we don't bother to assign a value for
    // clones.
    uint64_t OrigStackOrAllocId = 0;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this node.
    uint8_t AllocTypes = 0;

    // Edges to all callees in the profiled call stacks.
    // TODO: Should this be a map (from Callee node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;

    // Edges to all callers in the profiled call stacks.
    // TODO: Should this be a map (from Caller node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CallerEdges;

    // Get the list of edges from which we can compute allocation information
    // such as the context ids and allocation type of this node.
    const std::vector<std::shared_ptr<ContextEdge>> *
    getEdgesWithAllocInfo() const {
      // If node has any callees, compute from those, otherwise compute from
      // callers (i.e. if this is the leaf allocation node).
      if (!CalleeEdges.empty())
        return &CalleeEdges;
      if (!CallerEdges.empty()) {
        // A node with caller edges but no callee edges must be the allocation
        // node.
        assert(IsAllocation);
        return &CallerEdges;
      }
      return nullptr;
    }

    // Compute the context ids for this node from the union of its edge context
    // ids.
    DenseSet<uint32_t> getContextIds() const {
      DenseSet<uint32_t> ContextIds;
      auto *Edges = getEdgesWithAllocInfo();
      if (!Edges)
        return {};
      unsigned Count = 0;
      for (auto &Edge : *Edges)
        Count += Edge->getContextIds().size();
      ContextIds.reserve(Count);
      for (auto &Edge : *Edges)
        ContextIds.insert(Edge->getContextIds().begin(),
                          Edge->getContextIds().end());
      return ContextIds;
    }

    // Compute the allocation type for this node from the OR of its edge
    // allocation types.
    uint8_t computeAllocType() const {
      auto *Edges = getEdgesWithAllocInfo();
      if (!Edges)
        return (uint8_t)AllocationType::None;
      uint8_t BothTypes =
          (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
      uint8_t AllocType = (uint8_t)AllocationType::None;
      for (auto &Edge : *Edges) {
        AllocType |= Edge->AllocTypes;
        // Bail early if alloc type reached both, no further refinement.
        if (AllocType == BothTypes)
          return AllocType;
      }
      return AllocType;
    }

    // The context ids set for this node is empty if its edge context ids are
    // also all empty.
    bool emptyContextIds() const {
      auto *Edges = getEdgesWithAllocInfo();
      if (!Edges)
        return true;
      for (auto &Edge : *Edges) {
        if (!Edge->getContextIds().empty())
          return false;
      }
      return true;
    }

    // List of clones of this ContextNode, initially empty.
    std::vector<ContextNode *> Clones;

    // If a clone, points to the original uncloned node.
    ContextNode *CloneOf = nullptr;

    ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}

    ContextNode(bool IsAllocation, CallInfo C)
        : IsAllocation(IsAllocation), Call(C) {}

    void addClone(ContextNode *Clone) {
      if (CloneOf) {
        CloneOf->Clones.push_back(Clone);
        Clone->CloneOf = CloneOf;
      } else {
        Clones.push_back(Clone);
        assert(!Clone->CloneOf);
        Clone->CloneOf = this;
      }
    }

    ContextNode *getOrigNode() {
      if (!CloneOf)
        return this;
      return CloneOf;
    }

    void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                               unsigned int ContextId);

    ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
    ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
    void eraseCalleeEdge(const ContextEdge *Edge);
    void eraseCallerEdge(const ContextEdge *Edge);

    void setCall(CallInfo C) { Call = C; }

    bool hasCall() const { return (bool)Call.call(); }

    void printCall(raw_ostream &OS) const { Call.print(OS); }

    // True if this node was effectively removed from the graph, in which case
    // it should have an allocation type of None and empty context ids.
    bool isRemoved() const {
      assert((AllocTypes == (uint8_t)AllocationType::None) ==
             emptyContextIds());
      return AllocTypes == (uint8_t)AllocationType::None;
    }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
      Node.print(OS);
      return OS;
    }
  };

  /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
  /// callee.
  struct ContextEdge {
    ContextNode *Callee;
    ContextNode *Caller;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this edge.
    uint8_t AllocTypes = 0;

    // The set of IDs for contexts including this edge.
    DenseSet<uint32_t> ContextIds;

    ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
                DenseSet<uint32_t> ContextIds)
        : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
          ContextIds(std::move(ContextIds)) {}

    DenseSet<uint32_t> &getContextIds() { return ContextIds; }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
      Edge.print(OS);
      return OS;
    }
  };

  /// Helpers to remove callee edges that have allocation type None (due to not
  /// carrying any context ids) after transformations.
  void removeNoneTypeCalleeEdges(ContextNode *Node);
  void
  recursivelyRemoveNoneTypeCalleeEdges(ContextNode *Node,
                                       DenseSet<const ContextNode *> &Visited);

protected:
  /// Get a list of nodes corresponding to the stack ids in the given callsite
  /// context.
  template <class NodeT, class IteratorT>
  std::vector<uint64_t>
  getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);

  /// Adds nodes for the given allocation and any stack ids on its memprof MIB
  /// metadata (or summary).
  ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);

  /// Adds nodes for the given MIB stack ids.
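  ///
  /// Illustrative sketch (hypothetical frames, not from any real profile): for
  /// an MIB whose context is the stack <alloc, foo, main> with allocation type
  /// Cold, a fresh context id is assigned, and stack nodes for foo and main are
  /// created (or reused) and chained via caller edges carrying that id:
  ///
  ///   [alloc node] <--{id}-- [foo] <--{id}-- [main]
  ///
  /// A second MIB that shares the foo frame reuses the same stack node and
  /// simply adds its own context id (and ORs in its alloc type) on the shared
  /// edges and nodes.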
  template <class NodeT, class IteratorT>
  void addStackNodesForMIB(ContextNode *AllocNode,
                           CallStack<NodeT, IteratorT> &StackContext,
                           CallStack<NodeT, IteratorT> &CallsiteContext,
                           AllocationType AllocType, uint64_t TotalSize);

  /// Matches all callsite metadata (or summary) to the nodes created for
  /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
  /// inlining performed on those callsite instructions.
  void updateStackNodes();

  /// Update graph to conservatively handle any callsite stack nodes that target
  /// multiple different callee target functions.
  void handleCallsitesWithMultipleTargets();

  /// Save lists of calls with MemProf metadata in each function, for faster
  /// iteration.
  MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata;

  /// Map from callsite node to the enclosing caller function.
  std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;

private:
  using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;

  using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
                                     const FuncTy *, DenseSet<uint32_t>>;

  /// Assigns the given Node to calls at or inlined into the location with
  /// the Node's stack id, after post order traversing and processing its
  /// caller nodes. Uses the call information recorded in the given
  /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
  /// as needed. Called by updateStackNodes which sets up the given
  /// StackIdToMatchingCalls map.
  void assignStackNodesPostOrder(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);

  /// Duplicates the given set of context ids, updating the provided
  /// map from each original id with the newly generated context ids,
  /// and returning the new duplicated id set.
  DenseSet<uint32_t> duplicateContextIds(
      const DenseSet<uint32_t> &StackSequenceContextIds,
      DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Propagates all duplicated context ids across the graph.
  void propagateDuplicateContextIds(
      const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
  /// else to its callers. Also updates OrigNode's edges to remove any context
  /// ids moved to the newly created edge.
  void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
                      bool TowardsCallee,
                      DenseSet<uint32_t> RemainingContextIds);

  /// Get the stack id corresponding to the given Id or Index (for IR this will
  /// return itself, for a summary index this will return the id recorded in the
  /// index for that stack id index value).
  uint64_t getStackId(uint64_t IdOrIndex) const {
    return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
  }

  /// Returns true if the given call targets the callee of the given edge, or if
  /// we were able to identify the call chain through intermediate tail calls.
  /// In the latter case new context nodes are added to the graph for the
  /// identified tail calls, and their synthesized nodes are added to
  /// TailCallToContextNodeMap. The EdgeIter is updated in the latter case for
  /// the updated edges and to prepare it for an increment in the caller.
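  ///
  /// Illustration (hypothetical functions A, B and C): if a profiled context
  /// contains the frames A -> C, but in the IR (or summary) A calls B which
  /// tail calls C, then the B frame is missing from the profile. The derived
  /// classes' findProfiledCalleeThroughTailCalls can discover the A -> B -> C
  /// chain, and a context node for the synthesized B callsite is spliced in
  /// between the nodes for A and C.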
  bool
  calleesMatch(CallTy Call, EdgeIter &EI,
               MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap);

  /// Returns true if the given call targets the given function, or if we were
  /// able to identify the call chain through intermediate tail calls (in which
  /// case FoundCalleeChain will be populated).
  bool calleeMatchesFunc(
      CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc,
      std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) {
    return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(
        Call, Func, CallerFunc, FoundCalleeChain);
  }

  /// Get a list of nodes corresponding to the stack ids in the given
  /// callsite's context.
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
        Call);
  }

  /// Get the last stack id in the context for callsite.
  uint64_t getLastStackId(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
  }

  /// Update the allocation call to record type of allocated memory.
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
    AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
    static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
  }

  /// Update non-allocation call to invoke (possibly cloned) function
  /// CalleeFunc.
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
  }

  /// Clone the given function for the given callsite, recording the mapping of
  /// all of the function's tracked calls to their new versions in the CallMap.
  /// Assigns new clones to clone number CloneNo.
  FuncInfo cloneFunctionForCallsite(
      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
  }

  /// Gets a label to use in the dot graph for the given call clone in the given
  /// function.
  std::string getLabel(const FuncTy *Func, const CallTy Call,
                       unsigned CloneNo) const {
    return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
  }

  /// Helpers to find the node corresponding to the given call or stack id.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids);

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
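  ///
  /// For example (hypothetical ids): with Node1Ids = {1, 2} and
  /// Node2Ids = {2, 3}, only context id 2 is shared, so the result is just the
  /// allocation type recorded for context 2; if multiple shared ids have
  /// differing types, the types are ORed together.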
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids);

  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If callee's caller edge iterator is supplied, it is updated when removing
  /// the edge from that list. If ContextIdsToMove is non-empty, only that
  /// subset of Edge's ids are moved to an edge to the new callee.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           EdgeIter *CallerEdgeI = nullptr,
                           DenseSet<uint32_t> ContextIdsToMove = {});

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If callee's caller edge iterator is supplied, it is updated when removing
  /// the edge from that list. If ContextIdsToMove is non-empty, only that
  /// subset of Edge's ids are moved to an edge to the new callee.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     EdgeIter *CallerEdgeI = nullptr,
                                     bool NewClone = false,
                                     DenseSet<uint32_t> ContextIdsToMove = {});

  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context. The context ids of the allocation being
  /// processed are given in AllocContextIds.
  void identifyClones(ContextNode *Node, DenseSet<const ContextNode *> &Visited,
                      const DenseSet<uint32_t> &AllocContextIds);

  /// Map from each context ID to the AllocationType assigned to that context.
  DenseMap<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Map from each context ID to the profiled aggregate allocation size,
  /// optionally populated when requested (via MemProfReportHintedSizes).
  DenseMap<uint32_t, uint64_t> ContextIdToTotalSize;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  DenseMap<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Maps to track the calls to their corresponding nodes in the graph.
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
  unsigned int LastContextId = 0;
};

template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;

/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  bool calleeMatchesFunc(
      Instruction *Call, const Function *Func, const Function *CallerFunc,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain);
  bool findProfiledCalleeThroughTailCalls(
      const Function *ProfiledCallee, Value *CurCallee, unsigned Depth,
      std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  const Module &Mod;
  llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};

/// Represents a call in the summary index graph, which can either be an
/// allocation or an interior callsite node in an allocation's context.
/// Holds a pointer to the corresponding data structure in the index.
struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
  IndexCall() : PointerUnion() {}
  IndexCall(std::nullptr_t) : IndexCall() {}
  IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
  IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
  IndexCall(PointerUnion PT) : PointerUnion(PT) {}

  IndexCall *operator->() { return this; }

  PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }

  void print(raw_ostream &OS) const {
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
      assert(CI);
      OS << *CI;
    }
  }
};

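// Note on the CRTP dispatch shared by both derived graph types (an explanatory
// aside, not additional functionality): the CallsiteContextGraph base
// implements the graph algorithms and forwards format-specific queries to the
// derived class via static_cast, e.g.
//
//   uint64_t getStackId(uint64_t IdOrIndex) const {
//     return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
//   }
//
// so ModuleCallsiteContextGraph answers these queries from IR metadata, while
// IndexCallsiteContextGraph (below) answers them from the summary index.
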
/// CRTP derived class for graphs built from summary index (ThinLTO).
class IndexCallsiteContextGraph
    : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                                  IndexCall> {
public:
  IndexCallsiteContextGraph(
      ModuleSummaryIndex &Index,
      llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
          isPrevailing);

  ~IndexCallsiteContextGraph() {
    // Now that we are done with the graph it is safe to add the new
    // CallsiteInfo structs to the function summary vectors. The graph nodes
    // point into locations within these vectors, so we don't want to add them
    // any earlier.
    for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) {
      auto *FS = I.first;
      for (auto &Callsite : I.second)
        FS->addCallsite(*Callsite.second);
    }
  }

private:
  friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                              IndexCall>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  bool calleeMatchesFunc(
      IndexCall &Call, const FunctionSummary *Func,
      const FunctionSummary *CallerFunc,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain);
  bool findProfiledCalleeThroughTailCalls(
      ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth,
      std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain,
      bool &FoundMultipleCalleeChains);
  uint64_t getLastStackId(IndexCall &Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                       IndexCall>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                       unsigned CloneNo) const;

  // Saves the mapping from function summaries containing memprof records back
  // to their VIs, for use in checking and debugging.
  std::map<const FunctionSummary *, ValueInfo> FSToVIMap;

  const ModuleSummaryIndex &Index;
  llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
      isPrevailing;

  // Saves/owns the callsite info structures synthesized for missing tail call
  // frames that we discover while building the graph.
  // It maps from the summary of the function making the tail call, to a map
  // of callee ValueInfo to corresponding synthesized callsite info.
  std::unordered_map<FunctionSummary *,
                     std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>>
      FunctionCalleesToSynthesizedCallsiteInfos;
};
} // namespace

namespace llvm {
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
} // end namespace llvm

namespace {

struct FieldSeparator {
  bool Skip = true;
  const char *Sep;

  FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
};

raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
  if (FS.Skip) {
    FS.Skip = false;
    return OS;
  }
  return OS << FS.Sep;
}

// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
// type we should actually use on the corresponding allocation.
// If we can't clone a node that has NotCold+Cold alloc type, we will fall
// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
// from NotCold.
AllocationType allocTypeToUse(uint8_t AllocTypes) {
  assert(AllocTypes != (uint8_t)AllocationType::None);
  if (AllocTypes ==
      ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
    return AllocationType::NotCold;
  else
    return (AllocationType)AllocTypes;
}

// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for all edges in the Edges
// vector.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatch(
    const std::vector<uint8_t> &InAllocTypes,
    const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
        &Edges) {
  return std::equal(
      InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
      [](const uint8_t &l,
         const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
        // Can share if one of the edges is None type - don't
        // care about the type along that edge as it doesn't
        // exist for those context ids.
        if (l == (uint8_t)AllocationType::None ||
            r->AllocTypes == (uint8_t)AllocationType::None)
          return true;
        return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
      });
}

} // end anonymous namespace

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
    const CallInfo &C) {
  ContextNode *Node = getNodeForAlloc(C);
  if (Node)
    return Node;

  return NonAllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
    const CallInfo &C) {
  return AllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
    uint64_t StackId) {
  auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
  if (StackEntryNode != StackEntryIdToContextNodeMap.end())
    return StackEntryNode->second;
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                          unsigned int ContextId) {
  for (auto &Edge : CallerEdges) {
    if (Edge->Caller == Caller) {
      Edge->AllocTypes |= (uint8_t)AllocType;
      Edge->getContextIds().insert(ContextId);
      return;
    }
  }
  std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
      this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
  CallerEdges.push_back(Edge);
  Caller->CalleeEdges.push_back(Edge);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
    DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
  for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
    auto Edge = *EI;
    if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
      assert(Edge->ContextIds.empty());
      Edge->Callee->eraseCallerEdge(Edge.get());
      EI = Node->CalleeEdges.erase(EI);
    } else
      ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCallee(const ContextNode *Callee) {
  for (const auto &Edge : CalleeEdges)
    if (Edge->Callee == Callee)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCaller(const ContextNode *Caller) {
  for (const auto &Edge : CallerEdges)
    if (Edge->Caller == Caller)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCalleeEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
        return CalleeEdge.get() == Edge;
      });
  assert(EI != CalleeEdges.end());
  CalleeEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCallerEdge(const ContextEdge *Edge) {
  auto EI = llvm::find_if(
      CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
        return CallerEdge.get() == Edge;
      });
  assert(EI != CallerEdges.end());
  CallerEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
    DenseSet<uint32_t> &ContextIds) {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : ContextIds) {
    AllocType |= (uint8_t)ContextIdToAllocationType[Id];
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
    const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : Node1Ids) {
    if (!Node2Ids.count(Id))
      continue;
    AllocType |= (uint8_t)ContextIdToAllocationType[Id];
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
    const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
  if (Node1Ids.size() < Node2Ids.size())
    return intersectAllocTypesImpl(Node1Ids, Node2Ids);
  else
    return intersectAllocTypesImpl(Node2Ids, Node1Ids);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
    CallInfo Call, const FuncTy *F) {
  assert(!getNodeForAlloc(Call));
  NodeOwner.push_back(
      std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
  ContextNode *AllocNode = NodeOwner.back().get();
  AllocationCallToContextNodeMap[Call] = AllocNode;
  NodeToCallingFunc[AllocNode] = F;
  // Use LastContextId as a unique id for MIB allocation nodes.
  AllocNode->OrigStackOrAllocId = LastContextId;
  // Alloc type should be updated as we add in the MIBs. We should assert
  // afterwards that it is not still None.
  AllocNode->AllocTypes = (uint8_t)AllocationType::None;

  return AllocNode;
}
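
// Illustrative sketch of how the graph build (driven elsewhere in this file)
// uses the helpers above, for a single allocation call with MemProf metadata;
// names in angle brackets are placeholders, not real variables:
//
//   ContextNode *Alloc = addAllocNode(<alloc call>, <enclosing function>);
//   for (<each MIB on the allocation>)
//     addStackNodesForMIB(Alloc, <MIB stack>, <callsite stack>, <alloc type>,
//                         <total size>);
//   // Then updateStackNodes() matches callsite metadata onto the stack nodes,
//   // and handleCallsitesWithMultipleTargets() conservatively handles any
//   // callsite stack nodes targeting multiple callee functions.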

static std::string getAllocTypeString(uint8_t AllocTypes) {
  if (!AllocTypes)
    return "None";
  std::string Str;
  if (AllocTypes & (uint8_t)AllocationType::NotCold)
    Str += "NotCold";
  if (AllocTypes & (uint8_t)AllocationType::Cold)
    Str += "Cold";
  return Str;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
    ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
    CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType,
    uint64_t TotalSize) {
  assert(!MemProfReportHintedSizes || TotalSize > 0);
  // Treat the hot alloc type as NotCold before the disambiguation for "hot"
  // is done.
  if (AllocType == AllocationType::Hot)
    AllocType = AllocationType::NotCold;

  ContextIdToAllocationType[++LastContextId] = AllocType;

  if (MemProfReportHintedSizes) {
    assert(TotalSize);
    ContextIdToTotalSize[LastContextId] = TotalSize;
  }

  // Update alloc type and context ids for this MIB.
  AllocNode->AllocTypes |= (uint8_t)AllocType;

  // Now add or update nodes for each stack id in alloc's context.
  // Later when processing the stack ids on non-alloc callsites we will adjust
  // for any inlining in the context.
  ContextNode *PrevNode = AllocNode;
  // Look for recursion (direct recursion should have been collapsed by
  // module summary analysis, here we should just be detecting mutual
  // recursion). Mark these nodes so we don't try to clone.
  SmallSet<uint64_t, 8> StackIdSet;
  // Skip any on the allocation call (inlining).
  for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
       ContextIter != StackContext.end(); ++ContextIter) {
    auto StackId = getStackId(*ContextIter);
    ContextNode *StackNode = getNodeForStackId(StackId);
    if (!StackNode) {
      NodeOwner.push_back(
          std::make_unique<ContextNode>(/*IsAllocation=*/false));
      StackNode = NodeOwner.back().get();
      StackEntryIdToContextNodeMap[StackId] = StackNode;
      StackNode->OrigStackOrAllocId = StackId;
    }
    auto Ins = StackIdSet.insert(StackId);
    if (!Ins.second)
      StackNode->Recursive = true;
    StackNode->AllocTypes |= (uint8_t)AllocType;
    PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
    PrevNode = StackNode;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
DenseSet<uint32_t>
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
    const DenseSet<uint32_t> &StackSequenceContextIds,
    DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  DenseSet<uint32_t> NewContextIds;
  for (auto OldId : StackSequenceContextIds) {
    NewContextIds.insert(++LastContextId);
    OldToNewContextIds[OldId].insert(LastContextId);
    assert(ContextIdToAllocationType.count(OldId));
    // The new context has the same allocation type as the original.
    ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
    // For now set this to 0 so we don't duplicate sizes. Not clear how to divvy
    // up the size. Assume that if we are able to duplicate context ids that we
    // will be able to disambiguate all copies.
    ContextIdToTotalSize[LastContextId] = 0;
  }
  return NewContextIds;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    propagateDuplicateContextIds(
        const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  // Build a set of duplicated context ids corresponding to the input id set.
  auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
    DenseSet<uint32_t> NewIds;
    for (auto Id : ContextIds)
      if (auto NewId = OldToNewContextIds.find(Id);
          NewId != OldToNewContextIds.end())
        NewIds.insert(NewId->second.begin(), NewId->second.end());
    return NewIds;
  };

  // Recursively update context ids sets along caller edges.
  auto UpdateCallers = [&](ContextNode *Node,
                           DenseSet<const ContextEdge *> &Visited,
                           auto &&UpdateCallers) -> void {
    for (const auto &Edge : Node->CallerEdges) {
      auto Inserted = Visited.insert(Edge.get());
      if (!Inserted.second)
        continue;
      ContextNode *NextNode = Edge->Caller;
      DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
      // Only need to recursively iterate to NextNode via this caller edge if
      // it resulted in any added ids to NextNode.
      if (!NewIdsToAdd.empty()) {
        Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
        UpdateCallers(NextNode, Visited, UpdateCallers);
      }
    }
  };

  DenseSet<const ContextEdge *> Visited;
  for (auto &Entry : AllocationCallToContextNodeMap) {
    auto *Node = Entry.second;
    UpdateCallers(Node, Visited, UpdateCallers);
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
    ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee,
    // This must be passed by value to make a copy since it will be adjusted
    // as ids are moved.
    DenseSet<uint32_t> RemainingContextIds) {
  auto &OrigEdges =
      TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
  // Increment iterator in loop so that we can remove edges as needed.
  for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
    auto Edge = *EI;
    // Remove any matching context ids from Edge and return the set that were
    // found and removed; these are the new edge's context ids. Also update the
    // remaining (not found) ids.
    DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
    set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
                 NotFoundContextIds);
    RemainingContextIds.swap(NotFoundContextIds);
    // If no matching context ids for this edge, skip it.
    if (NewEdgeContextIds.empty()) {
      ++EI;
      continue;
    }
    if (TowardsCallee) {
      uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
      auto NewEdge = std::make_shared<ContextEdge>(
          Edge->Callee, NewNode, NewAllocType, std::move(NewEdgeContextIds));
      NewNode->CalleeEdges.push_back(NewEdge);
      NewEdge->Callee->CallerEdges.push_back(NewEdge);
    } else {
      uint8_t NewAllocType = computeAllocType(NewEdgeContextIds);
      auto NewEdge = std::make_shared<ContextEdge>(
          NewNode, Edge->Caller, NewAllocType, std::move(NewEdgeContextIds));
      NewNode->CallerEdges.push_back(NewEdge);
      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
    }
    // Remove old edge if context ids empty.
    if (Edge->getContextIds().empty()) {
      if (TowardsCallee) {
        Edge->Callee->eraseCallerEdge(Edge.get());
        EI = OrigNode->CalleeEdges.erase(EI);
      } else {
        Edge->Caller->eraseCalleeEdge(Edge.get());
        EI = OrigNode->CallerEdges.erase(EI);
      }
      continue;
    }
    ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkEdge(
    const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) {
  // Confirm that alloc type is not None and that we have at least one context
  // id.
  assert(Edge->AllocTypes != (uint8_t)AllocationType::None);
  assert(!Edge->ContextIds.empty());
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node,
                      bool CheckEdges = true) {
  if (Node->isRemoved())
    return;
#ifndef NDEBUG
  // Compute node's context ids once for use in asserts.
  auto NodeContextIds = Node->getContextIds();
#endif
  // Node's context ids should be the union of both its callee and caller edge
  // context ids.
  if (Node->CallerEdges.size()) {
    DenseSet<uint32_t> CallerEdgeContextIds(
        Node->CallerEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CallerEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CallerEdgeContextIds, Edge->ContextIds);
    }
    // Node can have more context ids than callers if some contexts terminate at
    // node and some are longer.
    assert(NodeContextIds == CallerEdgeContextIds ||
           set_is_subset(CallerEdgeContextIds, NodeContextIds));
  }
  if (Node->CalleeEdges.size()) {
    DenseSet<uint32_t> CalleeEdgeContextIds(
        Node->CalleeEdges.front()->ContextIds);
    for (const auto &Edge : llvm::drop_begin(Node->CalleeEdges)) {
      if (CheckEdges)
        checkEdge<DerivedCCG, FuncTy, CallTy>(Edge);
      set_union(CalleeEdgeContextIds, Edge->getContextIds());
    }
    assert(NodeContextIds == CalleeEdgeContextIds);
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    assignStackNodesPostOrder(ContextNode *Node,
                              DenseSet<const ContextNode *> &Visited,
                              DenseMap<uint64_t, std::vector<CallContextInfo>>
                                  &StackIdToMatchingCalls) {
  auto Inserted = Visited.insert(Node);
  if (!Inserted.second)
    return;
  // Post order traversal. Iterate over a copy since we may add nodes and
  // therefore new callers during the recursive call, invalidating any
  // iterator over the original edge vector. We don't need to process these
  // new nodes as they were already processed on creation.
  auto CallerEdges = Node->CallerEdges;
  for (auto &Edge : CallerEdges) {
    // Skip any that have been removed during the recursion.
    if (!Edge)
      continue;
    assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
  }

  // If this node's stack id is in the map, update the graph to contain new
  // nodes representing any inlining at interior callsites. Note we move the
  // associated context ids over to the new nodes.

  // Ignore this node if it is for an allocation or we didn't record any
  // stack id lists ending at it.
  if (Node->IsAllocation ||
      !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
    return;

  auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
  // Handle the simple case first. A single call with a single stack id.
  // In this case there is no need to create any new context nodes, simply
  // assign the context node for stack id to this Call.
  if (Calls.size() == 1) {
    auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
    if (Ids.size() == 1) {
      assert(SavedContextIds.empty());
      // It should be this Node
      assert(Node == getNodeForStackId(Ids[0]));
      if (Node->Recursive)
        return;
      Node->setCall(Call);
      NonAllocationCallToContextNodeMap[Call] = Node;
      NodeToCallingFunc[Node] = Func;
      return;
    }
  }

  // Find the node for the last stack id, which should be the same
  // across all calls recorded for this id, and is this node's id.
  uint64_t LastId = Node->OrigStackOrAllocId;
  ContextNode *LastNode = getNodeForStackId(LastId);
  // We should only have kept stack ids that had nodes.
  assert(LastNode);

  for (unsigned I = 0; I < Calls.size(); I++) {
    auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
    // Skip any for which we didn't assign any ids, these don't get a node in
    // the graph.
    if (SavedContextIds.empty())
      continue;

    assert(LastId == Ids.back());

    ContextNode *FirstNode = getNodeForStackId(Ids[0]);
    assert(FirstNode);

    // Recompute the context ids for this stack id sequence (the
    // intersection of the context ids of the corresponding nodes).
    // Start with the ids we saved in the map for this call, which could be
    // duplicated context ids. We have to recompute as we might have overlap
    // between the saved context ids for different last nodes, and
    // removed them already during the post order traversal.
    set_intersect(SavedContextIds, FirstNode->getContextIds());
    ContextNode *PrevNode = nullptr;
    for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(Id);
      // We should only have kept stack ids that had nodes and weren't
      // recursive.
      assert(CurNode);
      assert(!CurNode->Recursive);
      if (!PrevNode) {
        PrevNode = CurNode;
        continue;
      }
      auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
      if (!Edge) {
        SavedContextIds.clear();
        break;
      }
      PrevNode = CurNode;
      set_intersect(SavedContextIds, Edge->getContextIds());

      // If we now have no context ids for clone, skip this call.
      if (SavedContextIds.empty())
        break;
    }
    if (SavedContextIds.empty())
      continue;

    // Create new context node.
    NodeOwner.push_back(
        std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
    ContextNode *NewNode = NodeOwner.back().get();
    NodeToCallingFunc[NewNode] = Func;
    NonAllocationCallToContextNodeMap[Call] = NewNode;
    NewNode->AllocTypes = computeAllocType(SavedContextIds);

    // Connect to callees of innermost stack frame in inlined call chain.
    // This updates context ids for FirstNode's callees to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true, SavedContextIds);

    // Connect to callers of outermost stack frame in inlined call chain.
    // This updates context ids for LastNode's callers to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false, SavedContextIds);

    // Now we need to remove context ids from edges/nodes between First and
    // Last Node.
    PrevNode = nullptr;
    for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(Id);
      // We should only have kept stack ids that had nodes.
      assert(CurNode);

      // Remove the context ids moved to NewNode from CurNode, and the
      // edge from the prior node.
      if (PrevNode) {
        auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
        assert(PrevEdge);
        set_subtract(PrevEdge->getContextIds(), SavedContextIds);
        if (PrevEdge->getContextIds().empty()) {
          PrevNode->eraseCallerEdge(PrevEdge);
          CurNode->eraseCalleeEdge(PrevEdge);
        }
      }
      // Since we update the edges from leaf to tail, only look at the callee
      // edges. This isn't an alloc node, so if there are no callee edges, the
      // alloc type is None.
      CurNode->AllocTypes = CurNode->CalleeEdges.empty()
                                ? (uint8_t)AllocationType::None
                                : CurNode->computeAllocType();
      PrevNode = CurNode;
    }
    if (VerifyNodes) {
      checkNode<DerivedCCG, FuncTy, CallTy>(NewNode, /*CheckEdges=*/true);
      for (auto Id : Ids) {
        ContextNode *CurNode = getNodeForStackId(Id);
        // We should only have kept stack ids that had nodes.
        assert(CurNode);
        checkNode<DerivedCCG, FuncTy, CallTy>(CurNode, /*CheckEdges=*/true);
      }
    }
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
  // Map of stack id to all calls with that as the last (outermost caller)
  // callsite id that has a context node (some might not due to pruning
  // performed during matching of the allocation profile contexts).
  // The CallContextInfo contains the Call and a list of its stack ids with
  // ContextNodes, the function containing Call, and the set of context ids
  // the analysis will eventually identify for use in any new node created
  // for that callsite.
  DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
  for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
    for (auto &Call : CallsWithMetadata) {
      // Ignore allocations, already handled.
      if (AllocationCallToContextNodeMap.count(Call))
        continue;
      auto StackIdsWithContextNodes =
          getStackIdsWithContextNodesForCall(Call.call());
      // If there were no nodes created for MIBs on allocs (maybe this was in
      // the unambiguous part of the MIB stack that was pruned), ignore.
      if (StackIdsWithContextNodes.empty())
        continue;
      // Otherwise, record this Call along with the list of ids for the last
      // (outermost caller) stack id with a node.
      StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
          {Call.call(), StackIdsWithContextNodes, Func, {}});
    }
  }

  // First make a pass through all stack ids that correspond to a call,
  // as identified in the above loop. Compute the context ids corresponding to
  // each of these calls when they correspond to multiple stack ids due to
  // inlining. Perform any duplication of context ids required when
  // there is more than one call with the same stack ids. Their (possibly newly
  // duplicated) context ids are saved in the StackIdToMatchingCalls map.
1424 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds; 1425 for (auto &It : StackIdToMatchingCalls) { 1426 auto &Calls = It.getSecond(); 1427 // Skip single calls with a single stack id. These don't need a new node. 1428 if (Calls.size() == 1) { 1429 auto &Ids = std::get<1>(Calls[0]); 1430 if (Ids.size() == 1) 1431 continue; 1432 } 1433 // In order to do the best and maximal matching of inlined calls to context 1434 // node sequences we will sort the vectors of stack ids in descending order 1435 // of length, and within each length, lexicographically by stack id. The 1436 // latter is so that we can specially handle calls that have identical stack 1437 // id sequences (either due to cloning or artificially because of the MIB 1438 // context pruning). 1439 std::stable_sort(Calls.begin(), Calls.end(), 1440 [](const CallContextInfo &A, const CallContextInfo &B) { 1441 auto &IdsA = std::get<1>(A); 1442 auto &IdsB = std::get<1>(B); 1443 return IdsA.size() > IdsB.size() || 1444 (IdsA.size() == IdsB.size() && IdsA < IdsB); 1445 }); 1446 1447 // Find the node for the last stack id, which should be the same 1448 // across all calls recorded for this id, and is the id for this 1449 // entry in the StackIdToMatchingCalls map. 1450 uint64_t LastId = It.getFirst(); 1451 ContextNode *LastNode = getNodeForStackId(LastId); 1452 // We should only have kept stack ids that had nodes. 1453 assert(LastNode); 1454 1455 if (LastNode->Recursive) 1456 continue; 1457 1458 // Initialize the context ids with the last node's. We will subsequently 1459 // refine the context ids by computing the intersection along all edges. 1460 DenseSet<uint32_t> LastNodeContextIds = LastNode->getContextIds(); 1461 assert(!LastNodeContextIds.empty()); 1462 1463 for (unsigned I = 0; I < Calls.size(); I++) { 1464 auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; 1465 assert(SavedContextIds.empty()); 1466 assert(LastId == Ids.back()); 1467 1468 // First compute the context ids for this stack id sequence (the 1469 // intersection of the context ids of the corresponding nodes). 1470 // Start with the remaining saved ids for the last node. 1471 assert(!LastNodeContextIds.empty()); 1472 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds; 1473 1474 ContextNode *PrevNode = LastNode; 1475 ContextNode *CurNode = LastNode; 1476 bool Skip = false; 1477 1478 // Iterate backwards through the stack Ids, starting after the last Id 1479 // in the list, which was handled once outside for all Calls. 1480 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { 1481 auto Id = *IdIter; 1482 CurNode = getNodeForStackId(Id); 1483 // We should only have kept stack ids that had nodes. 1484 assert(CurNode); 1485 1486 if (CurNode->Recursive) { 1487 Skip = true; 1488 break; 1489 } 1490 1491 auto *Edge = CurNode->findEdgeFromCaller(PrevNode); 1492 // If there is no edge then the nodes belong to different MIB contexts, 1493 // and we should skip this inlined context sequence. For example, this 1494 // particular inlined context may include stack ids A->B, and we may 1495 // indeed have nodes for both A and B, but it is possible that they were 1496 // never profiled in sequence in a single MIB for any allocation (i.e. 1497 // we might have profiled an allocation that involves the callsite A, 1498 // but through a different one of its callee callsites, and we might 1499 // have profiled an allocation that involves callsite B, but reached 1500 // from a different caller callsite). 
1501 if (!Edge) { 1502 Skip = true; 1503 break; 1504 } 1505 PrevNode = CurNode; 1506 1507 // Update the context ids, which is the intersection of the ids along 1508 // all edges in the sequence. 1509 set_intersect(StackSequenceContextIds, Edge->getContextIds()); 1510 1511 // If we now have no context ids for clone, skip this call. 1512 if (StackSequenceContextIds.empty()) { 1513 Skip = true; 1514 break; 1515 } 1516 } 1517 if (Skip) 1518 continue; 1519 1520 // If some of this call's stack ids did not have corresponding nodes (due 1521 // to pruning), don't include any context ids for contexts that extend 1522 // beyond these nodes. Otherwise we would be matching part of unrelated / 1523 // not fully matching stack contexts. To do this, subtract any context ids 1524 // found in caller nodes of the last node found above. 1525 if (Ids.back() != getLastStackId(Call)) { 1526 for (const auto &PE : LastNode->CallerEdges) { 1527 set_subtract(StackSequenceContextIds, PE->getContextIds()); 1528 if (StackSequenceContextIds.empty()) 1529 break; 1530 } 1531 // If we now have no context ids for clone, skip this call. 1532 if (StackSequenceContextIds.empty()) 1533 continue; 1534 } 1535 1536 // Check if the next set of stack ids is the same (since the Calls vector 1537 // of tuples is sorted by the stack ids we can just look at the next one). 1538 bool DuplicateContextIds = false; 1539 if (I + 1 < Calls.size()) { 1540 auto NextIds = std::get<1>(Calls[I + 1]); 1541 DuplicateContextIds = Ids == NextIds; 1542 } 1543 1544 // If we don't have duplicate context ids, then we can assign all the 1545 // context ids computed for the original node sequence to this call. 1546 // If there are duplicate calls with the same stack ids then we synthesize 1547 // new context ids that are duplicates of the originals. These are 1548 // assigned to SavedContextIds, which is a reference into the map entry 1549 // for this call, allowing us to access these ids later on. 1550 OldToNewContextIds.reserve(OldToNewContextIds.size() + 1551 StackSequenceContextIds.size()); 1552 SavedContextIds = 1553 DuplicateContextIds 1554 ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds) 1555 : StackSequenceContextIds; 1556 assert(!SavedContextIds.empty()); 1557 1558 if (!DuplicateContextIds) { 1559 // Update saved last node's context ids to remove those that are 1560 // assigned to other calls, so that it is ready for the next call at 1561 // this stack id. 1562 set_subtract(LastNodeContextIds, StackSequenceContextIds); 1563 if (LastNodeContextIds.empty()) 1564 break; 1565 } 1566 } 1567 } 1568 1569 // Propagate the duplicate context ids over the graph. 1570 propagateDuplicateContextIds(OldToNewContextIds); 1571 1572 if (VerifyCCG) 1573 check(); 1574 1575 // Now perform a post-order traversal over the graph, starting with the 1576 // allocation nodes, essentially processing nodes from callers to callees. 1577 // For any that contains an id in the map, update the graph to contain new 1578 // nodes representing any inlining at interior callsites. Note we move the 1579 // associated context ids over to the new nodes. 
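// A single Visited set is shared across all of the per-allocation traversals
// below, since interior stack nodes may be reachable from more than one
// allocation; this ensures each node is only processed once.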
1580 DenseSet<const ContextNode *> Visited; 1581 for (auto &Entry : AllocationCallToContextNodeMap) 1582 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls); 1583 if (VerifyCCG) 1584 check(); 1585 } 1586 1587 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { 1588 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 1589 Call->getMetadata(LLVMContext::MD_callsite)); 1590 return CallsiteContext.back(); 1591 } 1592 1593 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { 1594 assert(isa<CallsiteInfo *>(Call.getBase())); 1595 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> 1596 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase())); 1597 // Need to convert index into stack id. 1598 return Index.getStackIdAtIndex(CallsiteContext.back()); 1599 } 1600 1601 static const std::string MemProfCloneSuffix = ".memprof."; 1602 1603 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { 1604 // We use CloneNo == 0 to refer to the original version, which doesn't get 1605 // renamed with a suffix. 1606 if (!CloneNo) 1607 return Base.str(); 1608 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); 1609 } 1610 1611 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, 1612 const Instruction *Call, 1613 unsigned CloneNo) const { 1614 return (Twine(Call->getFunction()->getName()) + " -> " + 1615 cast<CallBase>(Call)->getCalledFunction()->getName()) 1616 .str(); 1617 } 1618 1619 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, 1620 const IndexCall &Call, 1621 unsigned CloneNo) const { 1622 auto VI = FSToVIMap.find(Func); 1623 assert(VI != FSToVIMap.end()); 1624 if (isa<AllocInfo *>(Call.getBase())) 1625 return (VI->second.name() + " -> alloc").str(); 1626 else { 1627 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase()); 1628 return (VI->second.name() + " -> " + 1629 getMemProfFuncName(Callsite->Callee.name(), 1630 Callsite->Clones[CloneNo])) 1631 .str(); 1632 } 1633 } 1634 1635 std::vector<uint64_t> 1636 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( 1637 Instruction *Call) { 1638 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 1639 Call->getMetadata(LLVMContext::MD_callsite)); 1640 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>( 1641 CallsiteContext); 1642 } 1643 1644 std::vector<uint64_t> 1645 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { 1646 assert(isa<CallsiteInfo *>(Call.getBase())); 1647 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> 1648 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase())); 1649 return getStackIdsWithContextNodes<CallsiteInfo, 1650 SmallVector<unsigned>::const_iterator>( 1651 CallsiteContext); 1652 } 1653 1654 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1655 template <class NodeT, class IteratorT> 1656 std::vector<uint64_t> 1657 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes( 1658 CallStack<NodeT, IteratorT> &CallsiteContext) { 1659 std::vector<uint64_t> StackIds; 1660 for (auto IdOrIndex : CallsiteContext) { 1661 auto StackId = getStackId(IdOrIndex); 1662 ContextNode *Node = getNodeForStackId(StackId); 1663 if (!Node) 1664 break; 1665 StackIds.push_back(StackId); 1666 } 1667 return StackIds; 1668 } 1669 1670 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( 1671 Module &M, 1672 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) 
1673 : Mod(M), OREGetter(OREGetter) { 1674 for (auto &F : M) { 1675 std::vector<CallInfo> CallsWithMetadata; 1676 for (auto &BB : F) { 1677 for (auto &I : BB) { 1678 if (!isa<CallBase>(I)) 1679 continue; 1680 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) { 1681 CallsWithMetadata.push_back(&I); 1682 auto *AllocNode = addAllocNode(&I, &F); 1683 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite); 1684 assert(CallsiteMD); 1685 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD); 1686 // Add all of the MIBs and their stack nodes. 1687 for (auto &MDOp : MemProfMD->operands()) { 1688 auto *MIBMD = cast<const MDNode>(MDOp); 1689 MDNode *StackNode = getMIBStackNode(MIBMD); 1690 assert(StackNode); 1691 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode); 1692 addStackNodesForMIB<MDNode, MDNode::op_iterator>( 1693 AllocNode, StackContext, CallsiteContext, 1694 getMIBAllocType(MIBMD), getMIBTotalSize(MIBMD)); 1695 } 1696 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); 1697 // Memprof and callsite metadata on memory allocations no longer 1698 // needed. 1699 I.setMetadata(LLVMContext::MD_memprof, nullptr); 1700 I.setMetadata(LLVMContext::MD_callsite, nullptr); 1701 } 1702 // For callsite metadata, add to list for this function for later use. 1703 else if (I.getMetadata(LLVMContext::MD_callsite)) 1704 CallsWithMetadata.push_back(&I); 1705 } 1706 } 1707 if (!CallsWithMetadata.empty()) 1708 FuncToCallsWithMetadata[&F] = CallsWithMetadata; 1709 } 1710 1711 if (DumpCCG) { 1712 dbgs() << "CCG before updating call stack chains:\n"; 1713 dbgs() << *this; 1714 } 1715 1716 if (ExportToDot) 1717 exportToDot("prestackupdate"); 1718 1719 updateStackNodes(); 1720 1721 handleCallsitesWithMultipleTargets(); 1722 1723 // Strip off remaining callsite metadata, no longer needed. 1724 for (auto &FuncEntry : FuncToCallsWithMetadata) 1725 for (auto &Call : FuncEntry.second) 1726 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr); 1727 } 1728 1729 IndexCallsiteContextGraph::IndexCallsiteContextGraph( 1730 ModuleSummaryIndex &Index, 1731 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 1732 isPrevailing) 1733 : Index(Index), isPrevailing(isPrevailing) { 1734 for (auto &I : Index) { 1735 auto VI = Index.getValueInfo(I); 1736 for (auto &S : VI.getSummaryList()) { 1737 // We should only add the prevailing nodes. Otherwise we may try to clone 1738 // in a weak copy that won't be linked (and may be different than the 1739 // prevailing version). 1740 // We only keep the memprof summary on the prevailing copy now when 1741 // building the combined index, as a space optimization, however don't 1742 // rely on this optimization. The linker doesn't resolve local linkage 1743 // values so don't check whether those are prevailing. 1744 if (!GlobalValue::isLocalLinkage(S->linkage()) && 1745 !isPrevailing(VI.getGUID(), S.get())) 1746 continue; 1747 auto *FS = dyn_cast<FunctionSummary>(S.get()); 1748 if (!FS) 1749 continue; 1750 std::vector<CallInfo> CallsWithMetadata; 1751 if (!FS->allocs().empty()) { 1752 for (auto &AN : FS->mutableAllocs()) { 1753 // This can happen because of recursion elimination handling that 1754 // currently exists in ModuleSummaryAnalysis. Skip these for now. 1755 // We still added them to the summary because we need to be able to 1756 // correlate properly in applyImport in the backends. 
1757 if (AN.MIBs.empty()) 1758 continue; 1759 CallsWithMetadata.push_back({&AN}); 1760 auto *AllocNode = addAllocNode({&AN}, FS); 1761 // Pass an empty CallStack to the CallsiteContext (second) 1762 // parameter, since for ThinLTO we already collapsed out the inlined 1763 // stack ids on the allocation call during ModuleSummaryAnalysis. 1764 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> 1765 EmptyContext; 1766 unsigned I = 0; 1767 assert(!MemProfReportHintedSizes || 1768 AN.TotalSizes.size() == AN.MIBs.size()); 1769 // Now add all of the MIBs and their stack nodes. 1770 for (auto &MIB : AN.MIBs) { 1771 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> 1772 StackContext(&MIB); 1773 uint64_t TotalSize = 0; 1774 if (MemProfReportHintedSizes) 1775 TotalSize = AN.TotalSizes[I]; 1776 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( 1777 AllocNode, StackContext, EmptyContext, MIB.AllocType, 1778 TotalSize); 1779 I++; 1780 } 1781 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); 1782 // Initialize version 0 on the summary alloc node to the current alloc 1783 // type, unless it has both types in which case make it default, so 1784 // that in the case where we aren't able to clone the original version 1785 // always ends up with the default allocation behavior. 1786 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes); 1787 } 1788 } 1789 // For callsite metadata, add to list for this function for later use. 1790 if (!FS->callsites().empty()) 1791 for (auto &SN : FS->mutableCallsites()) 1792 CallsWithMetadata.push_back({&SN}); 1793 1794 if (!CallsWithMetadata.empty()) 1795 FuncToCallsWithMetadata[FS] = CallsWithMetadata; 1796 1797 if (!FS->allocs().empty() || !FS->callsites().empty()) 1798 FSToVIMap[FS] = VI; 1799 } 1800 } 1801 1802 if (DumpCCG) { 1803 dbgs() << "CCG before updating call stack chains:\n"; 1804 dbgs() << *this; 1805 } 1806 1807 if (ExportToDot) 1808 exportToDot("prestackupdate"); 1809 1810 updateStackNodes(); 1811 1812 handleCallsitesWithMultipleTargets(); 1813 } 1814 1815 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1816 void CallsiteContextGraph<DerivedCCG, FuncTy, 1817 CallTy>::handleCallsitesWithMultipleTargets() { 1818 // Look for and workaround callsites that call multiple functions. 1819 // This can happen for indirect calls, which needs better handling, and in 1820 // more rare cases (e.g. macro expansion). 1821 // TODO: To fix this for indirect calls we will want to perform speculative 1822 // devirtualization using either the normal PGO info with ICP, or using the 1823 // information in the profiled MemProf contexts. We can do this prior to 1824 // this transformation for regular LTO, and for ThinLTO we can simulate that 1825 // effect in the summary and perform the actual speculative devirtualization 1826 // while cloning in the ThinLTO backend. 1827 1828 // Keep track of the new nodes synthesized for discovered tail calls missing 1829 // from the profiled contexts. 1830 MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap; 1831 1832 for (auto &Entry : NonAllocationCallToContextNodeMap) { 1833 auto *Node = Entry.second; 1834 assert(Node->Clones.empty()); 1835 // Check all node callees and see if in the same function. 
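// A mismatch between the profiled callee and the IR/summary callee can occur
// for indirect calls, or when frames reached via tail calls are missing from
// the profiled context; calleesMatch() below detects the latter case and
// synthesizes context nodes for the missing tail call frames.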
1836 auto Call = Node->Call.call(); 1837 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end(); 1838 ++EI) { 1839 auto Edge = *EI; 1840 if (!Edge->Callee->hasCall()) 1841 continue; 1842 assert(NodeToCallingFunc.count(Edge->Callee)); 1843 // Check if the called function matches that of the callee node. 1844 if (calleesMatch(Call, EI, TailCallToContextNodeMap)) 1845 continue; 1846 RemovedEdgesWithMismatchedCallees++; 1847 // Work around by setting Node to have a null call, so it gets 1848 // skipped during cloning. Otherwise assignFunctions will assert 1849 // because its data structures are not designed to handle this case. 1850 Node->setCall(CallInfo()); 1851 break; 1852 } 1853 } 1854 1855 // Remove all mismatched nodes identified in the above loop from the node map 1856 // (checking whether they have a null call which is set above). For a 1857 // MapVector like NonAllocationCallToContextNodeMap it is much more efficient 1858 // to do the removal via remove_if than by individually erasing entries above. 1859 NonAllocationCallToContextNodeMap.remove_if( 1860 [](const auto &it) { return !it.second->hasCall(); }); 1861 1862 // Add the new nodes after the above loop so that the iteration is not 1863 // invalidated. 1864 for (auto &[Call, Node] : TailCallToContextNodeMap) 1865 NonAllocationCallToContextNodeMap[Call] = Node; 1866 } 1867 1868 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { 1869 // In the Module (IR) case this is already the Id. 1870 return IdOrIndex; 1871 } 1872 1873 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { 1874 // In the Index case this is an index into the stack id list in the summary 1875 // index, convert it to an Id. 1876 return Index.getStackIdAtIndex(IdOrIndex); 1877 } 1878 1879 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1880 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( 1881 CallTy Call, EdgeIter &EI, 1882 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) { 1883 auto Edge = *EI; 1884 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; 1885 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller]; 1886 // Will be populated in order of callee to caller if we find a chain of tail 1887 // calls between the profiled caller and callee. 1888 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain; 1889 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc, 1890 FoundCalleeChain)) 1891 return false; 1892 1893 // The usual case where the profiled callee matches that of the IR/summary. 1894 if (FoundCalleeChain.empty()) 1895 return true; 1896 1897 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) { 1898 auto *CurEdge = Callee->findEdgeFromCaller(Caller); 1899 // If there is already an edge between these nodes, simply update it and 1900 // return. 1901 if (CurEdge) { 1902 CurEdge->ContextIds.insert(Edge->ContextIds.begin(), 1903 Edge->ContextIds.end()); 1904 CurEdge->AllocTypes |= Edge->AllocTypes; 1905 return; 1906 } 1907 // Otherwise, create a new edge and insert it into the caller and callee 1908 // lists. 1909 auto NewEdge = std::make_shared<ContextEdge>( 1910 Callee, Caller, Edge->AllocTypes, Edge->ContextIds); 1911 Callee->CallerEdges.push_back(NewEdge); 1912 if (Caller == Edge->Caller) { 1913 // If we are inserting the new edge into the current edge's caller, insert 1914 // the new edge before the current iterator position, and then increment 1915 // back to the current edge. 
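// (CalleeEdges is a std::vector, so insert() may invalidate EI; re-establish
// it from the returned iterator before advancing back to the original edge.)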
1916 EI = Caller->CalleeEdges.insert(EI, NewEdge); 1917 ++EI; 1918 assert(*EI == Edge && 1919 "Iterator position not restored after insert and increment"); 1920 } else 1921 Caller->CalleeEdges.push_back(NewEdge); 1922 }; 1923 1924 // Create new nodes for each found callee and connect in between the profiled 1925 // caller and callee. 1926 auto *CurCalleeNode = Edge->Callee; 1927 for (auto &[NewCall, Func] : FoundCalleeChain) { 1928 ContextNode *NewNode = nullptr; 1929 // First check if we have already synthesized a node for this tail call. 1930 if (TailCallToContextNodeMap.count(NewCall)) { 1931 NewNode = TailCallToContextNodeMap[NewCall]; 1932 NewNode->AllocTypes |= Edge->AllocTypes; 1933 } else { 1934 FuncToCallsWithMetadata[Func].push_back({NewCall}); 1935 // Create Node and record node info. 1936 NodeOwner.push_back( 1937 std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall)); 1938 NewNode = NodeOwner.back().get(); 1939 NodeToCallingFunc[NewNode] = Func; 1940 TailCallToContextNodeMap[NewCall] = NewNode; 1941 NewNode->AllocTypes = Edge->AllocTypes; 1942 } 1943 1944 // Hook up node to its callee node 1945 AddEdge(NewNode, CurCalleeNode); 1946 1947 CurCalleeNode = NewNode; 1948 } 1949 1950 // Hook up edge's original caller to new callee node. 1951 AddEdge(Edge->Caller, CurCalleeNode); 1952 1953 // Remove old edge 1954 Edge->Callee->eraseCallerEdge(Edge.get()); 1955 EI = Edge->Caller->CalleeEdges.erase(EI); 1956 1957 // To simplify the increment of EI in the caller, subtract one from EI. 1958 // In the final AddEdge call we would have either added a new callee edge, 1959 // to Edge->Caller, or found an existing one. Either way we are guaranteed 1960 // that there is at least one callee edge. 1961 assert(!Edge->Caller->CalleeEdges.empty()); 1962 --EI; 1963 1964 return true; 1965 } 1966 1967 bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls( 1968 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, 1969 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, 1970 bool &FoundMultipleCalleeChains) { 1971 // Stop recursive search if we have already explored the maximum specified 1972 // depth. 1973 if (Depth > TailCallSearchDepth) 1974 return false; 1975 1976 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) { 1977 FoundCalleeChain.push_back({Callsite, F}); 1978 }; 1979 1980 auto *CalleeFunc = dyn_cast<Function>(CurCallee); 1981 if (!CalleeFunc) { 1982 auto *Alias = dyn_cast<GlobalAlias>(CurCallee); 1983 assert(Alias); 1984 CalleeFunc = dyn_cast<Function>(Alias->getAliasee()); 1985 assert(CalleeFunc); 1986 } 1987 1988 // Look for tail calls in this function, and check if they either call the 1989 // profiled callee directly, or indirectly (via a recursive search). 1990 // Only succeed if there is a single unique tail call chain found between the 1991 // profiled caller and callee, otherwise we could perform incorrect cloning. 1992 bool FoundSingleCalleeChain = false; 1993 for (auto &BB : *CalleeFunc) { 1994 for (auto &I : BB) { 1995 auto *CB = dyn_cast<CallBase>(&I); 1996 if (!CB || !CB->isTailCall()) 1997 continue; 1998 auto *CalledValue = CB->getCalledOperand(); 1999 auto *CalledFunction = CB->getCalledFunction(); 2000 if (CalledValue && !CalledFunction) { 2001 CalledValue = CalledValue->stripPointerCasts(); 2002 // Stripping pointer casts can reveal a called function. 2003 CalledFunction = dyn_cast<Function>(CalledValue); 2004 } 2005 // Check if this is an alias to a function. 
If so, get the 2006 // called aliasee for the checks below. 2007 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) { 2008 assert(!CalledFunction && 2009 "Expected null called function in callsite for alias"); 2010 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject()); 2011 } 2012 if (!CalledFunction) 2013 continue; 2014 if (CalledFunction == ProfiledCallee) { 2015 if (FoundSingleCalleeChain) { 2016 FoundMultipleCalleeChains = true; 2017 return false; 2018 } 2019 FoundSingleCalleeChain = true; 2020 FoundProfiledCalleeCount++; 2021 FoundProfiledCalleeDepth += Depth; 2022 if (Depth > FoundProfiledCalleeMaxDepth) 2023 FoundProfiledCalleeMaxDepth = Depth; 2024 SaveCallsiteInfo(&I, CalleeFunc); 2025 } else if (findProfiledCalleeThroughTailCalls( 2026 ProfiledCallee, CalledFunction, Depth + 1, 2027 FoundCalleeChain, FoundMultipleCalleeChains)) { 2028 // findProfiledCalleeThroughTailCalls should not have returned 2029 // true if FoundMultipleCalleeChains. 2030 assert(!FoundMultipleCalleeChains); 2031 if (FoundSingleCalleeChain) { 2032 FoundMultipleCalleeChains = true; 2033 return false; 2034 } 2035 FoundSingleCalleeChain = true; 2036 SaveCallsiteInfo(&I, CalleeFunc); 2037 } else if (FoundMultipleCalleeChains) 2038 return false; 2039 } 2040 } 2041 2042 return FoundSingleCalleeChain; 2043 } 2044 2045 bool ModuleCallsiteContextGraph::calleeMatchesFunc( 2046 Instruction *Call, const Function *Func, const Function *CallerFunc, 2047 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) { 2048 auto *CB = dyn_cast<CallBase>(Call); 2049 if (!CB->getCalledOperand()) 2050 return false; 2051 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); 2052 auto *CalleeFunc = dyn_cast<Function>(CalleeVal); 2053 if (CalleeFunc == Func) 2054 return true; 2055 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal); 2056 if (Alias && Alias->getAliasee() == Func) 2057 return true; 2058 2059 // Recursively search for the profiled callee through tail calls starting with 2060 // the actual Callee. The discovered tail call chain is saved in 2061 // FoundCalleeChain, and we will fixup the graph to include these callsites 2062 // after returning. 2063 // FIXME: We will currently redo the same recursive walk if we find the same 2064 // mismatched callee from another callsite. We can improve this with more 2065 // bookkeeping of the created chain of new nodes for each mismatch. 2066 unsigned Depth = 1; 2067 bool FoundMultipleCalleeChains = false; 2068 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth, 2069 FoundCalleeChain, 2070 FoundMultipleCalleeChains)) { 2071 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " 2072 << Func->getName() << " from " << CallerFunc->getName() 2073 << " that actually called " << CalleeVal->getName() 2074 << (FoundMultipleCalleeChains 2075 ? " (found multiple possible chains)" 2076 : "") 2077 << "\n"); 2078 if (FoundMultipleCalleeChains) 2079 FoundProfiledCalleeNonUniquelyCount++; 2080 return false; 2081 } 2082 2083 return true; 2084 } 2085 2086 bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( 2087 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, 2088 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, 2089 bool &FoundMultipleCalleeChains) { 2090 // Stop recursive search if we have already explored the maximum specified 2091 // depth. 
2092 if (Depth > TailCallSearchDepth) 2093 return false; 2094 2095 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) { 2096 // Make a CallsiteInfo for each discovered callee, if one hasn't already 2097 // been synthesized. 2098 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) || 2099 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee)) 2100 // StackIds is empty (we don't have debug info available in the index for 2101 // these callsites) 2102 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] = 2103 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>()); 2104 CallsiteInfo *NewCallsiteInfo = 2105 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get(); 2106 FoundCalleeChain.push_back({NewCallsiteInfo, FS}); 2107 }; 2108 2109 // Look for tail calls in this function, and check if they either call the 2110 // profiled callee directly, or indirectly (via a recursive search). 2111 // Only succeed if there is a single unique tail call chain found between the 2112 // profiled caller and callee, otherwise we could perform incorrect cloning. 2113 bool FoundSingleCalleeChain = false; 2114 for (auto &S : CurCallee.getSummaryList()) { 2115 if (!GlobalValue::isLocalLinkage(S->linkage()) && 2116 !isPrevailing(CurCallee.getGUID(), S.get())) 2117 continue; 2118 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject()); 2119 if (!FS) 2120 continue; 2121 auto FSVI = CurCallee; 2122 auto *AS = dyn_cast<AliasSummary>(S.get()); 2123 if (AS) 2124 FSVI = AS->getAliaseeVI(); 2125 for (auto &CallEdge : FS->calls()) { 2126 if (!CallEdge.second.hasTailCall()) 2127 continue; 2128 if (CallEdge.first == ProfiledCallee) { 2129 if (FoundSingleCalleeChain) { 2130 FoundMultipleCalleeChains = true; 2131 return false; 2132 } 2133 FoundSingleCalleeChain = true; 2134 FoundProfiledCalleeCount++; 2135 FoundProfiledCalleeDepth += Depth; 2136 if (Depth > FoundProfiledCalleeMaxDepth) 2137 FoundProfiledCalleeMaxDepth = Depth; 2138 CreateAndSaveCallsiteInfo(CallEdge.first, FS); 2139 // Add FS to FSToVIMap in case it isn't already there. 2140 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); 2141 FSToVIMap[FS] = FSVI; 2142 } else if (findProfiledCalleeThroughTailCalls( 2143 ProfiledCallee, CallEdge.first, Depth + 1, 2144 FoundCalleeChain, FoundMultipleCalleeChains)) { 2145 // findProfiledCalleeThroughTailCalls should not have returned 2146 // true if FoundMultipleCalleeChains. 2147 assert(!FoundMultipleCalleeChains); 2148 if (FoundSingleCalleeChain) { 2149 FoundMultipleCalleeChains = true; 2150 return false; 2151 } 2152 FoundSingleCalleeChain = true; 2153 CreateAndSaveCallsiteInfo(CallEdge.first, FS); 2154 // Add FS to FSToVIMap in case it isn't already there. 2155 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); 2156 FSToVIMap[FS] = FSVI; 2157 } else if (FoundMultipleCalleeChains) 2158 return false; 2159 } 2160 } 2161 2162 return FoundSingleCalleeChain; 2163 } 2164 2165 bool IndexCallsiteContextGraph::calleeMatchesFunc( 2166 IndexCall &Call, const FunctionSummary *Func, 2167 const FunctionSummary *CallerFunc, 2168 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) { 2169 ValueInfo Callee = 2170 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee; 2171 // If there is no summary list then this is a call to an externally defined 2172 // symbol. 2173 AliasSummary *Alias = 2174 Callee.getSummaryList().empty() 2175 ? 
nullptr 2176 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get()); 2177 assert(FSToVIMap.count(Func)); 2178 auto FuncVI = FSToVIMap[Func]; 2179 if (Callee == FuncVI || 2180 // If callee is an alias, check the aliasee, since only function 2181 // summary base objects will contain the stack node summaries and thus 2182 // get a context node. 2183 (Alias && Alias->getAliaseeVI() == FuncVI)) 2184 return true; 2185 2186 // Recursively search for the profiled callee through tail calls starting with 2187 // the actual Callee. The discovered tail call chain is saved in 2188 // FoundCalleeChain, and we will fixup the graph to include these callsites 2189 // after returning. 2190 // FIXME: We will currently redo the same recursive walk if we find the same 2191 // mismatched callee from another callsite. We can improve this with more 2192 // bookkeeping of the created chain of new nodes for each mismatch. 2193 unsigned Depth = 1; 2194 bool FoundMultipleCalleeChains = false; 2195 if (!findProfiledCalleeThroughTailCalls( 2196 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) { 2197 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI 2198 << " from " << FSToVIMap[CallerFunc] 2199 << " that actually called " << Callee 2200 << (FoundMultipleCalleeChains 2201 ? " (found multiple possible chains)" 2202 : "") 2203 << "\n"); 2204 if (FoundMultipleCalleeChains) 2205 FoundProfiledCalleeNonUniquelyCount++; 2206 return false; 2207 } 2208 2209 return true; 2210 } 2211 2212 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2213 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump() 2214 const { 2215 print(dbgs()); 2216 dbgs() << "\n"; 2217 } 2218 2219 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2220 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( 2221 raw_ostream &OS) const { 2222 OS << "Node " << this << "\n"; 2223 OS << "\t"; 2224 printCall(OS); 2225 if (Recursive) 2226 OS << " (recursive)"; 2227 OS << "\n"; 2228 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; 2229 OS << "\tContextIds:"; 2230 // Make a copy of the computed context ids that we can sort for stability. 
2231 auto ContextIds = getContextIds(); 2232 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 2233 std::sort(SortedIds.begin(), SortedIds.end()); 2234 for (auto Id : SortedIds) 2235 OS << " " << Id; 2236 OS << "\n"; 2237 OS << "\tCalleeEdges:\n"; 2238 for (auto &Edge : CalleeEdges) 2239 OS << "\t\t" << *Edge << "\n"; 2240 OS << "\tCallerEdges:\n"; 2241 for (auto &Edge : CallerEdges) 2242 OS << "\t\t" << *Edge << "\n"; 2243 if (!Clones.empty()) { 2244 OS << "\tClones: "; 2245 FieldSeparator FS; 2246 for (auto *Clone : Clones) 2247 OS << FS << Clone; 2248 OS << "\n"; 2249 } else if (CloneOf) { 2250 OS << "\tClone of " << CloneOf << "\n"; 2251 } 2252 } 2253 2254 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2255 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump() 2256 const { 2257 print(dbgs()); 2258 dbgs() << "\n"; 2259 } 2260 2261 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2262 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print( 2263 raw_ostream &OS) const { 2264 OS << "Edge from Callee " << Callee << " to Caller: " << Caller 2265 << " AllocTypes: " << getAllocTypeString(AllocTypes); 2266 OS << " ContextIds:"; 2267 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 2268 std::sort(SortedIds.begin(), SortedIds.end()); 2269 for (auto Id : SortedIds) 2270 OS << " " << Id; 2271 } 2272 2273 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2274 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const { 2275 print(dbgs()); 2276 } 2277 2278 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2279 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( 2280 raw_ostream &OS) const { 2281 OS << "Callsite Context Graph:\n"; 2282 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2283 for (const auto Node : nodes<GraphType>(this)) { 2284 if (Node->isRemoved()) 2285 continue; 2286 Node->print(OS); 2287 OS << "\n"; 2288 } 2289 } 2290 2291 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2292 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::printTotalSizes( 2293 raw_ostream &OS) const { 2294 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2295 for (const auto Node : nodes<GraphType>(this)) { 2296 if (Node->isRemoved()) 2297 continue; 2298 if (!Node->IsAllocation) 2299 continue; 2300 DenseSet<uint32_t> ContextIds = Node->getContextIds(); 2301 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 2302 std::sort(SortedIds.begin(), SortedIds.end()); 2303 for (auto Id : SortedIds) { 2304 auto SizeI = ContextIdToTotalSize.find(Id); 2305 assert(SizeI != ContextIdToTotalSize.end()); 2306 auto TypeI = ContextIdToAllocationType.find(Id); 2307 assert(TypeI != ContextIdToAllocationType.end()); 2308 OS << getAllocTypeString((uint8_t)TypeI->second) << " context " << Id 2309 << " with total size " << SizeI->second << " is " 2310 << getAllocTypeString(Node->AllocTypes) << " after cloning\n"; 2311 } 2312 } 2313 } 2314 2315 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2316 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { 2317 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2318 for (const auto Node : nodes<GraphType>(this)) { 2319 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); 2320 for (auto &Edge : Node->CallerEdges) 2321 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 2322 } 
2323 } 2324 2325 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2326 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> { 2327 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2328 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *; 2329 2330 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>; 2331 static NodeRef getNode(const NodePtrTy &P) { return P.get(); } 2332 2333 using nodes_iterator = 2334 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator, 2335 decltype(&getNode)>; 2336 2337 static nodes_iterator nodes_begin(GraphType G) { 2338 return nodes_iterator(G->NodeOwner.begin(), &getNode); 2339 } 2340 2341 static nodes_iterator nodes_end(GraphType G) { 2342 return nodes_iterator(G->NodeOwner.end(), &getNode); 2343 } 2344 2345 static NodeRef getEntryNode(GraphType G) { 2346 return G->NodeOwner.begin()->get(); 2347 } 2348 2349 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>; 2350 static const ContextNode<DerivedCCG, FuncTy, CallTy> * 2351 GetCallee(const EdgePtrTy &P) { 2352 return P->Callee; 2353 } 2354 2355 using ChildIteratorType = 2356 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge< 2357 DerivedCCG, FuncTy, CallTy>>>::const_iterator, 2358 decltype(&GetCallee)>; 2359 2360 static ChildIteratorType child_begin(NodeRef N) { 2361 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); 2362 } 2363 2364 static ChildIteratorType child_end(NodeRef N) { 2365 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); 2366 } 2367 }; 2368 2369 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2370 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> 2371 : public DefaultDOTGraphTraits { 2372 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} 2373 2374 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2375 using GTraits = GraphTraits<GraphType>; 2376 using NodeRef = typename GTraits::NodeRef; 2377 using ChildIteratorType = typename GTraits::ChildIteratorType; 2378 2379 static std::string getNodeLabel(NodeRef Node, GraphType G) { 2380 std::string LabelString = 2381 (Twine("OrigId: ") + (Node->IsAllocation ? 
"Alloc" : "") + 2382 Twine(Node->OrigStackOrAllocId)) 2383 .str(); 2384 LabelString += "\n"; 2385 if (Node->hasCall()) { 2386 auto Func = G->NodeToCallingFunc.find(Node); 2387 assert(Func != G->NodeToCallingFunc.end()); 2388 LabelString += 2389 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); 2390 } else { 2391 LabelString += "null call"; 2392 if (Node->Recursive) 2393 LabelString += " (recursive)"; 2394 else 2395 LabelString += " (external)"; 2396 } 2397 return LabelString; 2398 } 2399 2400 static std::string getNodeAttributes(NodeRef Node, GraphType) { 2401 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " + 2402 getContextIds(Node->getContextIds()) + "\"") 2403 .str(); 2404 AttributeString += 2405 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str(); 2406 AttributeString += ",style=\"filled\""; 2407 if (Node->CloneOf) { 2408 AttributeString += ",color=\"blue\""; 2409 AttributeString += ",style=\"filled,bold,dashed\""; 2410 } else 2411 AttributeString += ",style=\"filled\""; 2412 return AttributeString; 2413 } 2414 2415 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, 2416 GraphType) { 2417 auto &Edge = *(ChildIter.getCurrent()); 2418 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" + 2419 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"") 2420 .str(); 2421 } 2422 2423 // Since the NodeOwners list includes nodes that are no longer connected to 2424 // the graph, skip them here. 2425 static bool isNodeHidden(NodeRef Node, GraphType) { 2426 return Node->isRemoved(); 2427 } 2428 2429 private: 2430 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) { 2431 std::string IdString = "ContextIds:"; 2432 if (ContextIds.size() < 100) { 2433 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 2434 std::sort(SortedIds.begin(), SortedIds.end()); 2435 for (auto Id : SortedIds) 2436 IdString += (" " + Twine(Id)).str(); 2437 } else { 2438 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str(); 2439 } 2440 return IdString; 2441 } 2442 2443 static std::string getColor(uint8_t AllocTypes) { 2444 if (AllocTypes == (uint8_t)AllocationType::NotCold) 2445 // Color "brown1" actually looks like a lighter red. 2446 return "brown1"; 2447 if (AllocTypes == (uint8_t)AllocationType::Cold) 2448 return "cyan"; 2449 if (AllocTypes == 2450 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) 2451 // Lighter purple. 2452 return "mediumorchid1"; 2453 return "gray"; 2454 } 2455 2456 static std::string getNodeId(NodeRef Node) { 2457 std::stringstream SStream; 2458 SStream << std::hex << "N0x" << (unsigned long long)Node; 2459 std::string Result = SStream.str(); 2460 return Result; 2461 } 2462 }; 2463 2464 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2465 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot( 2466 std::string Label) const { 2467 WriteGraph(this, "", false, Label, 2468 DotFilePathPrefix + "ccg." 
+ Label + ".dot"); 2469 } 2470 2471 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2472 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 2473 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone( 2474 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI, 2475 DenseSet<uint32_t> ContextIdsToMove) { 2476 ContextNode *Node = Edge->Callee; 2477 NodeOwner.push_back( 2478 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call)); 2479 ContextNode *Clone = NodeOwner.back().get(); 2480 Node->addClone(Clone); 2481 assert(NodeToCallingFunc.count(Node)); 2482 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node]; 2483 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true, 2484 ContextIdsToMove); 2485 return Clone; 2486 } 2487 2488 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2489 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 2490 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, 2491 ContextNode *NewCallee, EdgeIter *CallerEdgeI, 2492 bool NewClone, 2493 DenseSet<uint32_t> ContextIdsToMove) { 2494 // NewCallee and Edge's current callee must be clones of the same original 2495 // node (Edge's current callee may be the original node too). 2496 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); 2497 2498 ContextNode *OldCallee = Edge->Callee; 2499 2500 // We might already have an edge to the new callee from earlier cloning for a 2501 // different allocation. If one exists we will reuse it. 2502 auto ExistingEdgeToNewCallee = NewCallee->findEdgeFromCaller(Edge->Caller); 2503 2504 // Callers will pass an empty ContextIdsToMove set when they want to move the 2505 // edge. Copy in Edge's ids for simplicity. 2506 if (ContextIdsToMove.empty()) 2507 ContextIdsToMove = Edge->getContextIds(); 2508 2509 // If we are moving all of Edge's ids, then just move the whole Edge. 2510 // Otherwise only move the specified subset, to a new edge if needed. 2511 if (Edge->getContextIds().size() == ContextIdsToMove.size()) { 2512 // Moving the whole Edge. 2513 if (CallerEdgeI) 2514 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI); 2515 else 2516 OldCallee->eraseCallerEdge(Edge.get()); 2517 if (ExistingEdgeToNewCallee) { 2518 // Since we already have an edge to NewCallee, simply move the ids 2519 // onto it, and remove the existing Edge. 2520 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(), 2521 ContextIdsToMove.end()); 2522 ExistingEdgeToNewCallee->AllocTypes |= Edge->AllocTypes; 2523 assert(Edge->ContextIds == ContextIdsToMove); 2524 Edge->ContextIds.clear(); 2525 Edge->AllocTypes = (uint8_t)AllocationType::None; 2526 Edge->Caller->eraseCalleeEdge(Edge.get()); 2527 } else { 2528 // Otherwise just reconnect Edge to NewCallee. 2529 Edge->Callee = NewCallee; 2530 NewCallee->CallerEdges.push_back(Edge); 2531 // Don't need to update Edge's context ids since we are simply 2532 // reconnecting it. 2533 } 2534 // In either case, need to update the alloc types on New Callee. 2535 NewCallee->AllocTypes |= Edge->AllocTypes; 2536 } else { 2537 // Only moving a subset of Edge's ids. 2538 if (CallerEdgeI) 2539 ++CallerEdgeI; 2540 // Compute the alloc type of the subset of ids being moved. 2541 auto CallerEdgeAllocType = computeAllocType(ContextIdsToMove); 2542 if (ExistingEdgeToNewCallee) { 2543 // Since we already have an edge to NewCallee, simply move the ids 2544 // onto it. 
2545 ExistingEdgeToNewCallee->getContextIds().insert(ContextIdsToMove.begin(), 2546 ContextIdsToMove.end()); 2547 ExistingEdgeToNewCallee->AllocTypes |= CallerEdgeAllocType; 2548 } else { 2549 // Otherwise, create a new edge to NewCallee for the ids being moved. 2550 auto NewEdge = std::make_shared<ContextEdge>( 2551 NewCallee, Edge->Caller, CallerEdgeAllocType, ContextIdsToMove); 2552 Edge->Caller->CalleeEdges.push_back(NewEdge); 2553 NewCallee->CallerEdges.push_back(NewEdge); 2554 } 2555 // In either case, need to update the alloc types on NewCallee, and remove 2556 // those ids and update the alloc type on the original Edge. 2557 NewCallee->AllocTypes |= CallerEdgeAllocType; 2558 set_subtract(Edge->ContextIds, ContextIdsToMove); 2559 Edge->AllocTypes = computeAllocType(Edge->ContextIds); 2560 } 2561 // Now walk the old callee node's callee edges and move Edge's context ids 2562 // over to the corresponding edge into the clone (which is created here if 2563 // this is a newly created clone). 2564 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) { 2565 // The context ids moving to the new callee are the subset of this edge's 2566 // context ids and the context ids on the caller edge being moved. 2567 DenseSet<uint32_t> EdgeContextIdsToMove = 2568 set_intersection(OldCalleeEdge->getContextIds(), ContextIdsToMove); 2569 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove); 2570 OldCalleeEdge->AllocTypes = 2571 computeAllocType(OldCalleeEdge->getContextIds()); 2572 if (!NewClone) { 2573 // Update context ids / alloc type on corresponding edge to NewCallee. 2574 // There is a chance this may not exist if we are reusing an existing 2575 // clone, specifically during function assignment, where we would have 2576 // removed none type edges after creating the clone. If we can't find 2577 // a corresponding edge there, fall through to the cloning below. 2578 if (auto *NewCalleeEdge = 2579 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) { 2580 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(), 2581 EdgeContextIdsToMove.end()); 2582 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove); 2583 continue; 2584 } 2585 } 2586 auto NewEdge = std::make_shared<ContextEdge>( 2587 OldCalleeEdge->Callee, NewCallee, 2588 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove); 2589 NewCallee->CalleeEdges.push_back(NewEdge); 2590 NewEdge->Callee->CallerEdges.push_back(NewEdge); 2591 } 2592 // Recompute the node alloc type now that its callee edges have been 2593 // updated (since we will compute from those edges). 2594 OldCallee->AllocTypes = OldCallee->computeAllocType(); 2595 // OldCallee alloc type should be None iff its context id set is now empty. 
2596 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2597 OldCallee->emptyContextIds());
2598 if (VerifyCCG) {
2599 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2600 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2601 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2602 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2603 /*CheckEdges=*/false);
2604 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2605 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2606 /*CheckEdges=*/false);
2607 }
2608 }
2609
2610 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2611 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
2612 recursivelyRemoveNoneTypeCalleeEdges(
2613 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2614 auto Inserted = Visited.insert(Node);
2615 if (!Inserted.second)
2616 return;
2617
2618 removeNoneTypeCalleeEdges(Node);
2619
2620 for (auto *Clone : Node->Clones)
2621 recursivelyRemoveNoneTypeCalleeEdges(Clone, Visited);
2622
2623 // The recursive call may remove some of this Node's caller edges.
2624 // Iterate over a copy and skip any that were removed.
2625 auto CallerEdges = Node->CallerEdges;
2626 for (auto &Edge : CallerEdges) {
2627 // Skip any that have been removed by an earlier recursive call.
2628 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2629 assert(!is_contained(Node->CallerEdges, Edge));
2630 continue;
2631 }
2632 recursivelyRemoveNoneTypeCalleeEdges(Edge->Caller, Visited);
2633 }
2634 }
2635
2636 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2637 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2638 DenseSet<const ContextNode *> Visited;
2639 for (auto &Entry : AllocationCallToContextNodeMap) {
2640 Visited.clear();
2641 identifyClones(Entry.second, Visited, Entry.second->getContextIds());
2642 }
2643 Visited.clear();
2644 for (auto &Entry : AllocationCallToContextNodeMap)
2645 recursivelyRemoveNoneTypeCalleeEdges(Entry.second, Visited);
2646 if (VerifyCCG)
2647 check();
2648 }
2649
2650 // Helper function to check whether an AllocType is cold or notcold or both.
2651 bool checkColdOrNotCold(uint8_t AllocType) {
2652 return (AllocType == (uint8_t)AllocationType::Cold) ||
2653 (AllocType == (uint8_t)AllocationType::NotCold) ||
2654 (AllocType ==
2655 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2656 }
2657
2658 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2659 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2660 ContextNode *Node, DenseSet<const ContextNode *> &Visited,
2661 const DenseSet<uint32_t> &AllocContextIds) {
2662 if (VerifyNodes)
2663 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false);
2664 assert(!Node->CloneOf);
2665
2666 // If Node has a null call, then either it wasn't found in the module (regular
2667 // LTO) or summary index (ThinLTO), or there were other conditions blocking
2668 // cloning (e.g. recursion, calls multiple targets, etc.).
2669 // Do this here so that we don't try to recursively clone callers below, which
2670 // isn't useful at least for this node.
2671 if (!Node->hasCall())
2672 return;
2673
2674 #ifndef NDEBUG
2675 auto Insert =
2676 #endif
2677 Visited.insert(Node);
2678 // We should not have visited this node yet.
2679 assert(Insert.second);
2680 // The recursive call to identifyClones may delete the current edge from the
2681 // CallerEdges vector.
Make a copy and iterate on that, simpler than passing
2682 // in an iterator and having the recursive call erase from it. Other edges may
2683 // also get removed during the recursion, which will have null Callee and
2684 // Caller pointers (and are deleted later), so we skip those below.
2685 {
2686 auto CallerEdges = Node->CallerEdges;
2687 for (auto &Edge : CallerEdges) {
2688 // Skip any that have been removed by an earlier recursive call.
2689 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2690 assert(!llvm::count(Node->CallerEdges, Edge));
2691 continue;
2692 }
2693 // Ignore any caller we previously visited via another edge.
2694 if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2695 identifyClones(Edge->Caller, Visited, AllocContextIds);
2696 }
2697 }
2698 }
2699
2700 // Check if we reached an unambiguous call or have only a single caller.
2701 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2702 return;
2703
2704 // We need to clone.
2705
2706 // Try to keep the original version as alloc type NotCold. This will make
2707 // cases with indirect calls or any other situation with an unknown call to
2708 // the original function get the default behavior. We do this by sorting the
2709 // CallerEdges of the Node we will clone by alloc type.
2710 //
2711 // Give NotCold edge the lowest sort priority so those edges are at the end of
2712 // the caller edges vector, and stay on the original version (since the below
2713 // code clones greedily until it finds all remaining edges have the same type
2714 // and leaves the remaining ones on the original Node).
2715 //
2716 // We shouldn't actually have any None type edges, so the sorting priority for
2717 // that is arbitrary, and we assert in that case below.
2718 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2719 /*Cold*/ 1,
2720 /*NotColdCold*/ 2};
2721 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2722 [&](const std::shared_ptr<ContextEdge> &A,
2723 const std::shared_ptr<ContextEdge> &B) {
2724 // Nodes with non-empty context ids should be sorted before
2725 // those with empty context ids.
2726 if (A->ContextIds.empty())
2727 // Either B ContextIds are non-empty (in which case we
2728 // should return false because B < A), or B ContextIds
2729 // are empty, in which case they are equal, and we should
2730 // maintain the original relative ordering.
2731 return false;
2732 if (B->ContextIds.empty())
2733 return true;
2734
2735 if (A->AllocTypes == B->AllocTypes)
2736 // Use the first context id for each edge as a
2737 // tie-breaker.
2738 return *A->ContextIds.begin() < *B->ContextIds.begin();
2739 return AllocTypeCloningPriority[A->AllocTypes] <
2740 AllocTypeCloningPriority[B->AllocTypes];
2741 });
2742
2743 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2744
2745 // Iterate until we find no more opportunities for disambiguating the alloc
2746 // types via cloning. In most cases this loop will terminate once the Node
2747 // has a single allocation type, in which case no more cloning is needed.
2748 // We need to be able to remove Edge from CallerEdges, so need to adjust
2749 // iterator inside the loop.
2750 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2751 auto CallerEdge = *EI;
2752
2753 // See if cloning the prior caller edge left this node with a single alloc
2754 // type or a single caller. In that case no more cloning of Node is needed.
2755 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) 2756 break; 2757 2758 // Only need to process the ids along this edge pertaining to the given 2759 // allocation. 2760 auto CallerEdgeContextsForAlloc = 2761 set_intersection(CallerEdge->getContextIds(), AllocContextIds); 2762 if (CallerEdgeContextsForAlloc.empty()) { 2763 ++EI; 2764 continue; 2765 } 2766 auto CallerAllocTypeForAlloc = computeAllocType(CallerEdgeContextsForAlloc); 2767 2768 // Compute the node callee edge alloc types corresponding to the context ids 2769 // for this caller edge. 2770 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge; 2771 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size()); 2772 for (auto &CalleeEdge : Node->CalleeEdges) 2773 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes( 2774 CalleeEdge->getContextIds(), CallerEdgeContextsForAlloc)); 2775 2776 // Don't clone if doing so will not disambiguate any alloc types amongst 2777 // caller edges (including the callee edges that would be cloned). 2778 // Otherwise we will simply move all edges to the clone. 2779 // 2780 // First check if by cloning we will disambiguate the caller allocation 2781 // type from node's allocation type. Query allocTypeToUse so that we don't 2782 // bother cloning to distinguish NotCold+Cold from NotCold. Note that 2783 // neither of these should be None type. 2784 // 2785 // Then check if by cloning node at least one of the callee edges will be 2786 // disambiguated by splitting out different context ids. 2787 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None); 2788 assert(Node->AllocTypes != (uint8_t)AllocationType::None); 2789 if (allocTypeToUse(CallerAllocTypeForAlloc) == 2790 allocTypeToUse(Node->AllocTypes) && 2791 allocTypesMatch<DerivedCCG, FuncTy, CallTy>( 2792 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) { 2793 ++EI; 2794 continue; 2795 } 2796 2797 // First see if we can use an existing clone. Check each clone and its 2798 // callee edges for matching alloc types. 2799 ContextNode *Clone = nullptr; 2800 for (auto *CurClone : Node->Clones) { 2801 if (allocTypeToUse(CurClone->AllocTypes) != 2802 allocTypeToUse(CallerAllocTypeForAlloc)) 2803 continue; 2804 2805 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>( 2806 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges)) 2807 continue; 2808 Clone = CurClone; 2809 break; 2810 } 2811 2812 // The edge iterator is adjusted when we move the CallerEdge to the clone. 2813 if (Clone) 2814 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI, /*NewClone=*/false, 2815 CallerEdgeContextsForAlloc); 2816 else 2817 Clone = 2818 moveEdgeToNewCalleeClone(CallerEdge, &EI, CallerEdgeContextsForAlloc); 2819 2820 assert(EI == Node->CallerEdges.end() || 2821 Node->AllocTypes != (uint8_t)AllocationType::None); 2822 // Sanity check that no alloc types on clone or its edges are None. 2823 assert(Clone->AllocTypes != (uint8_t)AllocationType::None); 2824 } 2825 2826 // We should still have some context ids on the original Node. 2827 assert(!Node->emptyContextIds()); 2828 2829 // Sanity check that no alloc types on node or edges are None. 
2830 assert(Node->AllocTypes != (uint8_t)AllocationType::None); 2831 2832 if (VerifyNodes) 2833 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); 2834 } 2835 2836 void ModuleCallsiteContextGraph::updateAllocationCall( 2837 CallInfo &Call, AllocationType AllocType) { 2838 std::string AllocTypeString = getAllocTypeAttributeString(AllocType); 2839 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(), 2840 "memprof", AllocTypeString); 2841 cast<CallBase>(Call.call())->addFnAttr(A); 2842 OREGetter(Call.call()->getFunction()) 2843 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call()) 2844 << ore::NV("AllocationCall", Call.call()) << " in clone " 2845 << ore::NV("Caller", Call.call()->getFunction()) 2846 << " marked with memprof allocation attribute " 2847 << ore::NV("Attribute", AllocTypeString)); 2848 } 2849 2850 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, 2851 AllocationType AllocType) { 2852 auto *AI = Call.call().dyn_cast<AllocInfo *>(); 2853 assert(AI); 2854 assert(AI->Versions.size() > Call.cloneNo()); 2855 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; 2856 } 2857 2858 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, 2859 FuncInfo CalleeFunc) { 2860 if (CalleeFunc.cloneNo() > 0) 2861 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func()); 2862 OREGetter(CallerCall.call()->getFunction()) 2863 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) 2864 << ore::NV("Call", CallerCall.call()) << " in clone " 2865 << ore::NV("Caller", CallerCall.call()->getFunction()) 2866 << " assigned to call function clone " 2867 << ore::NV("Callee", CalleeFunc.func())); 2868 } 2869 2870 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, 2871 FuncInfo CalleeFunc) { 2872 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>(); 2873 assert(CI && 2874 "Caller cannot be an allocation which should not have profiled calls"); 2875 assert(CI->Clones.size() > CallerCall.cloneNo()); 2876 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); 2877 } 2878 2879 CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 2880 Instruction *>::FuncInfo 2881 ModuleCallsiteContextGraph::cloneFunctionForCallsite( 2882 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 2883 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 2884 // Use existing LLVM facilities for cloning and obtaining Call in clone 2885 ValueToValueMapTy VMap; 2886 auto *NewFunc = CloneFunction(Func.func(), VMap); 2887 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo); 2888 assert(!Func.func()->getParent()->getFunction(Name)); 2889 NewFunc->setName(Name); 2890 for (auto &Inst : CallsWithMetadataInFunc) { 2891 // This map always has the initial version in it. 2892 assert(Inst.cloneNo() == 0); 2893 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo}; 2894 } 2895 OREGetter(Func.func()) 2896 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func()) 2897 << "created clone " << ore::NV("NewFunction", NewFunc)); 2898 return {NewFunc, CloneNo}; 2899 } 2900 2901 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 2902 IndexCall>::FuncInfo 2903 IndexCallsiteContextGraph::cloneFunctionForCallsite( 2904 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 2905 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 2906 // Check how many clones we have of Call (and therefore function). 
2907 // The next clone number is the current size of versions array. 2908 // Confirm this matches the CloneNo provided by the caller, which is based on 2909 // the number of function clones we have. 2910 assert(CloneNo == 2911 (Call.call().is<AllocInfo *>() 2912 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size() 2913 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size())); 2914 // Walk all the instructions in this function. Create a new version for 2915 // each (by adding an entry to the Versions/Clones summary array), and copy 2916 // over the version being called for the function clone being cloned here. 2917 // Additionally, add an entry to the CallMap for the new function clone, 2918 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) 2919 // to the new call clone. 2920 for (auto &Inst : CallsWithMetadataInFunc) { 2921 // This map always has the initial version in it. 2922 assert(Inst.cloneNo() == 0); 2923 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) { 2924 assert(AI->Versions.size() == CloneNo); 2925 // We assign the allocation type later (in updateAllocationCall), just add 2926 // an entry for it here. 2927 AI->Versions.push_back(0); 2928 } else { 2929 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>(); 2930 assert(CI && CI->Clones.size() == CloneNo); 2931 // We assign the clone number later (in updateCall), just add an entry for 2932 // it here. 2933 CI->Clones.push_back(0); 2934 } 2935 CallMap[Inst] = {Inst.call(), CloneNo}; 2936 } 2937 return {Func.func(), CloneNo}; 2938 } 2939 2940 // This method assigns cloned callsites to functions, cloning the functions as 2941 // needed. The assignment is greedy and proceeds roughly as follows: 2942 // 2943 // For each function Func: 2944 // For each call with graph Node having clones: 2945 // Initialize ClonesWorklist to Node and its clones 2946 // Initialize NodeCloneCount to 0 2947 // While ClonesWorklist is not empty: 2948 // Clone = pop front ClonesWorklist 2949 // NodeCloneCount++ 2950 // If Func has been cloned less than NodeCloneCount times: 2951 // If NodeCloneCount is 1: 2952 // Assign Clone to original Func 2953 // Continue 2954 // Create a new function clone 2955 // If other callers not assigned to call a function clone yet: 2956 // Assign them to call new function clone 2957 // Continue 2958 // Assign any other caller calling the cloned version to new clone 2959 // 2960 // For each caller of Clone: 2961 // If caller is assigned to call a specific function clone: 2962 // If we cannot assign Clone to that function clone: 2963 // Create new callsite Clone NewClone 2964 // Add NewClone to ClonesWorklist 2965 // Continue 2966 // Assign Clone to existing caller's called function clone 2967 // Else: 2968 // If Clone not already assigned to a function clone: 2969 // Assign to first function clone without assignment 2970 // Assign caller to selected function clone 2971 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2972 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { 2973 bool Changed = false; 2974 2975 // Keep track of the assignment of nodes (callsites) to function clones they 2976 // call. 2977 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap; 2978 2979 // Update caller node to call function version CalleeFunc, by recording the 2980 // assignment in CallsiteToCalleeFuncCloneMap. 
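// The recorded assignment is consulted both when deciding whether an existing
// function clone can be reused for a later callsite clone, and at the end (in
// UpdateCalls below) when the calls are actually rewritten to target the
// assigned function clone.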
2981 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, 2982 const FuncInfo &CalleeFunc) { 2983 assert(Caller->hasCall()); 2984 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; 2985 }; 2986 2987 // Walk all functions for which we saw calls with memprof metadata, and handle 2988 // cloning for each of its calls. 2989 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { 2990 FuncInfo OrigFunc(Func); 2991 // Map from each clone of OrigFunc to a map of remappings of each call of 2992 // interest (from original uncloned call to the corresponding cloned call in 2993 // that function clone). 2994 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; 2995 for (auto &Call : CallsWithMetadata) { 2996 ContextNode *Node = getNodeForInst(Call); 2997 // Skip call if we do not have a node for it (all uses of its stack ids 2998 // were either on inlined chains or pruned from the MIBs), or if we did 2999 // not create any clones for it. 3000 if (!Node || Node->Clones.empty()) 3001 continue; 3002 assert(Node->hasCall() && 3003 "Not having a call should have prevented cloning"); 3004 3005 // Track the assignment of function clones to clones of the current 3006 // callsite Node being handled. 3007 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap; 3008 3009 // Assign callsite version CallsiteClone to function version FuncClone, 3010 // and also assign (possibly cloned) Call to CallsiteClone. 3011 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, 3012 CallInfo &Call, 3013 ContextNode *CallsiteClone, 3014 bool IsAlloc) { 3015 // Record the clone of callsite node assigned to this function clone. 3016 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; 3017 3018 assert(FuncClonesToCallMap.count(FuncClone)); 3019 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; 3020 CallInfo CallClone(Call); 3021 if (CallMap.count(Call)) 3022 CallClone = CallMap[Call]; 3023 CallsiteClone->setCall(CallClone); 3024 }; 3025 3026 // Keep track of the clones of callsite Node that need to be assigned to 3027 // function clones. This list may be expanded in the loop body below if we 3028 // find additional cloning is required. 3029 std::deque<ContextNode *> ClonesWorklist; 3030 // Ignore original Node if we moved all of its contexts to clones. 3031 if (!Node->emptyContextIds()) 3032 ClonesWorklist.push_back(Node); 3033 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(), 3034 Node->Clones.end()); 3035 3036 // Now walk through all of the clones of this callsite Node that we need, 3037 // and determine the assignment to a corresponding clone of the current 3038 // function (creating new function clones as needed). 3039 unsigned NodeCloneCount = 0; 3040 while (!ClonesWorklist.empty()) { 3041 ContextNode *Clone = ClonesWorklist.front(); 3042 ClonesWorklist.pop_front(); 3043 NodeCloneCount++; 3044 if (VerifyNodes) 3045 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 3046 3047 // Need to create a new function clone if we have more callsite clones 3048 // than existing function clones, which would have been assigned to an 3049 // earlier clone in the list (we assign callsite clones to function 3050 // clones greedily). 3051 if (FuncClonesToCallMap.size() < NodeCloneCount) { 3052 // If this is the first callsite copy, assign to original function. 
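// The original function acts as clone 0, so no new function clone needs to
// be created for the first callsite copy.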
3053 if (NodeCloneCount == 1) { 3054 // Since FuncClonesToCallMap is empty in this case, no clones have 3055 // been created for this function yet, and no callers should have 3056 // been assigned a function clone for this callee node yet. 3057 assert(llvm::none_of( 3058 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 3059 return CallsiteToCalleeFuncCloneMap.count(E->Caller); 3060 })); 3061 // Initialize with empty call map, assign Clone to original function 3062 // and its callers, and skip to the next clone. 3063 FuncClonesToCallMap[OrigFunc] = {}; 3064 AssignCallsiteCloneToFuncClone( 3065 OrigFunc, Call, Clone, 3066 AllocationCallToContextNodeMap.count(Call)); 3067 for (auto &CE : Clone->CallerEdges) { 3068 // Ignore any caller that does not have a recorded callsite Call. 3069 if (!CE->Caller->hasCall()) 3070 continue; 3071 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); 3072 } 3073 continue; 3074 } 3075 3076 // First locate which copy of OrigFunc to clone again. If a caller 3077 // of this callsite clone was already assigned to call a particular 3078 // function clone, we need to redirect all of those callers to the 3079 // new function clone, and update their other callees within this 3080 // function. 3081 FuncInfo PreviousAssignedFuncClone; 3082 auto EI = llvm::find_if( 3083 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 3084 return CallsiteToCalleeFuncCloneMap.count(E->Caller); 3085 }); 3086 bool CallerAssignedToCloneOfFunc = false; 3087 if (EI != Clone->CallerEdges.end()) { 3088 const std::shared_ptr<ContextEdge> &Edge = *EI; 3089 PreviousAssignedFuncClone = 3090 CallsiteToCalleeFuncCloneMap[Edge->Caller]; 3091 CallerAssignedToCloneOfFunc = true; 3092 } 3093 3094 // Clone function and save it along with the CallInfo map created 3095 // during cloning in the FuncClonesToCallMap. 3096 std::map<CallInfo, CallInfo> NewCallMap; 3097 unsigned CloneNo = FuncClonesToCallMap.size(); 3098 assert(CloneNo > 0 && "Clone 0 is the original function, which " 3099 "should already exist in the map"); 3100 FuncInfo NewFuncClone = cloneFunctionForCallsite( 3101 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo); 3102 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); 3103 FunctionClonesAnalysis++; 3104 Changed = true; 3105 3106 // If no caller callsites were already assigned to a clone of this 3107 // function, we can simply assign this clone to the new func clone 3108 // and update all callers to it, then skip to the next clone. 3109 if (!CallerAssignedToCloneOfFunc) { 3110 AssignCallsiteCloneToFuncClone( 3111 NewFuncClone, Call, Clone, 3112 AllocationCallToContextNodeMap.count(Call)); 3113 for (auto &CE : Clone->CallerEdges) { 3114 // Ignore any caller that does not have a recorded callsite Call. 3115 if (!CE->Caller->hasCall()) 3116 continue; 3117 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); 3118 } 3119 continue; 3120 } 3121 3122 // We may need to do additional node cloning in this case. 3123 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers 3124 // that were previously assigned to call PreviousAssignedFuncClone, 3125 // to record that they now call NewFuncClone. 3126 for (auto CE : Clone->CallerEdges) { 3127 // Skip any that have been removed on an earlier iteration. 3128 if (!CE) 3129 continue; 3130 // Ignore any caller that does not have a recorded callsite Call. 
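// (Such callers have no matched IR/summary call, so there is nothing to
// redirect and no function clone assignment is recorded for them.)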
3131 if (!CE->Caller->hasCall()) 3132 continue; 3133 3134 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || 3135 // We subsequently fall through to later handling that 3136 // will perform any additional cloning required for 3137 // callers that were calling other function clones. 3138 CallsiteToCalleeFuncCloneMap[CE->Caller] != 3139 PreviousAssignedFuncClone) 3140 continue; 3141 3142 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); 3143 3144 // If we are cloning a function that was already assigned to some 3145 // callers, then essentially we are creating new callsite clones 3146 // of the other callsites in that function that are reached by those 3147 // callers. Clone the other callees of the current callsite's caller 3148 // that were already assigned to PreviousAssignedFuncClone 3149 // accordingly. This is important since we subsequently update the 3150 // calls from the nodes in the graph and their assignments to callee 3151 // functions recorded in CallsiteToCalleeFuncCloneMap. 3152 for (auto CalleeEdge : CE->Caller->CalleeEdges) { 3153 // Skip any that have been removed on an earlier iteration when 3154 // cleaning up newly None type callee edges. 3155 if (!CalleeEdge) 3156 continue; 3157 ContextNode *Callee = CalleeEdge->Callee; 3158 // Skip the current callsite, we are looking for other 3159 // callsites Caller calls, as well as any that does not have a 3160 // recorded callsite Call. 3161 if (Callee == Clone || !Callee->hasCall()) 3162 continue; 3163 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge); 3164 removeNoneTypeCalleeEdges(NewClone); 3165 // Moving the edge may have resulted in some none type 3166 // callee edges on the original Callee. 3167 removeNoneTypeCalleeEdges(Callee); 3168 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); 3169 // If the Callee node was already assigned to call a specific 3170 // function version, make sure its new clone is assigned to call 3171 // that same function clone. 3172 if (CallsiteToCalleeFuncCloneMap.count(Callee)) 3173 RecordCalleeFuncOfCallsite( 3174 NewClone, CallsiteToCalleeFuncCloneMap[Callee]); 3175 // Update NewClone with the new Call clone of this callsite's Call 3176 // created for the new function clone created earlier. 3177 // Recall that we have already ensured when building the graph 3178 // that each caller can only call callsites within the same 3179 // function, so we are guaranteed that Callee Call is in the 3180 // current OrigFunc. 3181 // CallMap is set up as indexed by original Call at clone 0. 3182 CallInfo OrigCall(Callee->getOrigNode()->Call); 3183 OrigCall.setCloneNo(0); 3184 std::map<CallInfo, CallInfo> &CallMap = 3185 FuncClonesToCallMap[NewFuncClone]; 3186 assert(CallMap.count(OrigCall)); 3187 CallInfo NewCall(CallMap[OrigCall]); 3188 assert(NewCall); 3189 NewClone->setCall(NewCall); 3190 } 3191 } 3192 // Fall through to handling below to perform the recording of the 3193 // function for this callsite clone. This enables handling of cases 3194 // where the callers were assigned to different clones of a function. 3195 } 3196 3197 // See if we can use existing function clone. Walk through 3198 // all caller edges to see if any have already been assigned to 3199 // a clone of this callsite's function. If we can use it, do so. If not, 3200 // because that function clone is already assigned to a different clone 3201 // of this callsite, then we need to clone again. 
3202 // Basically, this checking is needed to handle the case where different 3203 // caller functions/callsites may need versions of this function 3204 // containing different mixes of callsite clones across the different 3205 // callsites within the function. If that happens, we need to create 3206 // additional function clones to handle the various combinations. 3207 // 3208 // Keep track of any new clones of this callsite created by the 3209 // following loop, as well as any existing clone that we decided to 3210 // assign this clone to. 3211 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap; 3212 FuncInfo FuncCloneAssignedToCurCallsiteClone; 3213 // We need to be able to remove Edge from CallerEdges, so need to adjust 3214 // iterator in the loop. 3215 for (auto EI = Clone->CallerEdges.begin(); 3216 EI != Clone->CallerEdges.end();) { 3217 auto Edge = *EI; 3218 // Ignore any caller that does not have a recorded callsite Call. 3219 if (!Edge->Caller->hasCall()) { 3220 EI++; 3221 continue; 3222 } 3223 // If this caller already assigned to call a version of OrigFunc, need 3224 // to ensure we can assign this callsite clone to that function clone. 3225 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { 3226 FuncInfo FuncCloneCalledByCaller = 3227 CallsiteToCalleeFuncCloneMap[Edge->Caller]; 3228 // First we need to confirm that this function clone is available 3229 // for use by this callsite node clone. 3230 // 3231 // While FuncCloneToCurNodeCloneMap is built only for this Node and 3232 // its callsite clones, one of those callsite clones X could have 3233 // been assigned to the same function clone called by Edge's caller 3234 // - if Edge's caller calls another callsite within Node's original 3235 // function, and that callsite has another caller reaching clone X. 3236 // We need to clone Node again in this case. 3237 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && 3238 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != 3239 Clone) || 3240 // Detect when we have multiple callers of this callsite that 3241 // have already been assigned to specific, and different, clones 3242 // of OrigFunc (due to other unrelated callsites in Func they 3243 // reach via call contexts). Is this Clone of callsite Node 3244 // assigned to a different clone of OrigFunc? If so, clone Node 3245 // again. 3246 (FuncCloneAssignedToCurCallsiteClone && 3247 FuncCloneAssignedToCurCallsiteClone != 3248 FuncCloneCalledByCaller)) { 3249 // We need to use a different newly created callsite clone, in 3250 // order to assign it to another new function clone on a 3251 // subsequent iteration over the Clones array (adjusted below). 3252 // Note we specifically do not reset the 3253 // CallsiteToCalleeFuncCloneMap entry for this caller, so that 3254 // when this new clone is processed later we know which version of 3255 // the function to copy (so that other callsite clones we have 3256 // assigned to that function clone are properly cloned over). See 3257 // comments in the function cloning handling earlier. 3258 3259 // Check if we already have cloned this callsite again while 3260 // walking through caller edges, for a caller calling the same 3261 // function clone. If so, we can move this edge to that new clone 3262 // rather than creating yet another new clone. 
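// (FuncCloneToNewCallsiteCloneMap records, per function clone, the new
// callsite clone created for it while walking these caller edges.)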
3263 if (FuncCloneToNewCallsiteCloneMap.count( 3264 FuncCloneCalledByCaller)) { 3265 ContextNode *NewClone = 3266 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; 3267 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI); 3268 // Cleanup any none type edges cloned over. 3269 removeNoneTypeCalleeEdges(NewClone); 3270 } else { 3271 // Create a new callsite clone. 3272 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI); 3273 removeNoneTypeCalleeEdges(NewClone); 3274 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = 3275 NewClone; 3276 // Add to list of clones and process later. 3277 ClonesWorklist.push_back(NewClone); 3278 assert(EI == Clone->CallerEdges.end() || 3279 Clone->AllocTypes != (uint8_t)AllocationType::None); 3280 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); 3281 } 3282 // Moving the caller edge may have resulted in some none type 3283 // callee edges. 3284 removeNoneTypeCalleeEdges(Clone); 3285 // We will handle the newly created callsite clone in a subsequent 3286 // iteration over this Node's Clones. Continue here since we 3287 // already adjusted iterator EI while moving the edge. 3288 continue; 3289 } 3290 3291 // Otherwise, we can use the function clone already assigned to this 3292 // caller. 3293 if (!FuncCloneAssignedToCurCallsiteClone) { 3294 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; 3295 // Assign Clone to FuncCloneCalledByCaller 3296 AssignCallsiteCloneToFuncClone( 3297 FuncCloneCalledByCaller, Call, Clone, 3298 AllocationCallToContextNodeMap.count(Call)); 3299 } else 3300 // Don't need to do anything - callsite is already calling this 3301 // function clone. 3302 assert(FuncCloneAssignedToCurCallsiteClone == 3303 FuncCloneCalledByCaller); 3304 3305 } else { 3306 // We have not already assigned this caller to a version of 3307 // OrigFunc. Do the assignment now. 3308 3309 // First check if we have already assigned this callsite clone to a 3310 // clone of OrigFunc for another caller during this iteration over 3311 // its caller edges. 3312 if (!FuncCloneAssignedToCurCallsiteClone) { 3313 // Find first function in FuncClonesToCallMap without an assigned 3314 // clone of this callsite Node. We should always have one 3315 // available at this point due to the earlier cloning when the 3316 // FuncClonesToCallMap size was smaller than the clone number. 3317 for (auto &CF : FuncClonesToCallMap) { 3318 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { 3319 FuncCloneAssignedToCurCallsiteClone = CF.first; 3320 break; 3321 } 3322 } 3323 assert(FuncCloneAssignedToCurCallsiteClone); 3324 // Assign Clone to FuncCloneAssignedToCurCallsiteClone 3325 AssignCallsiteCloneToFuncClone( 3326 FuncCloneAssignedToCurCallsiteClone, Call, Clone, 3327 AllocationCallToContextNodeMap.count(Call)); 3328 } else 3329 assert(FuncCloneToCurNodeCloneMap 3330 [FuncCloneAssignedToCurCallsiteClone] == Clone); 3331 // Update callers to record function version called. 
3332 RecordCalleeFuncOfCallsite(Edge->Caller, 3333 FuncCloneAssignedToCurCallsiteClone); 3334 } 3335 3336 EI++; 3337 } 3338 } 3339 if (VerifyCCG) { 3340 checkNode<DerivedCCG, FuncTy, CallTy>(Node); 3341 for (const auto &PE : Node->CalleeEdges) 3342 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); 3343 for (const auto &CE : Node->CallerEdges) 3344 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); 3345 for (auto *Clone : Node->Clones) { 3346 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 3347 for (const auto &PE : Clone->CalleeEdges) 3348 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); 3349 for (const auto &CE : Clone->CallerEdges) 3350 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); 3351 } 3352 } 3353 } 3354 } 3355 3356 auto UpdateCalls = [&](ContextNode *Node, 3357 DenseSet<const ContextNode *> &Visited, 3358 auto &&UpdateCalls) { 3359 auto Inserted = Visited.insert(Node); 3360 if (!Inserted.second) 3361 return; 3362 3363 for (auto *Clone : Node->Clones) 3364 UpdateCalls(Clone, Visited, UpdateCalls); 3365 3366 for (auto &Edge : Node->CallerEdges) 3367 UpdateCalls(Edge->Caller, Visited, UpdateCalls); 3368 3369 // Skip if either no call to update, or if we ended up with no context ids 3370 // (we moved all edges onto other clones). 3371 if (!Node->hasCall() || Node->emptyContextIds()) 3372 return; 3373 3374 if (Node->IsAllocation) { 3375 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes)); 3376 return; 3377 } 3378 3379 if (!CallsiteToCalleeFuncCloneMap.count(Node)) 3380 return; 3381 3382 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; 3383 updateCall(Node->Call, CalleeFunc); 3384 }; 3385 3386 // Performs DFS traversal starting from allocation nodes to update calls to 3387 // reflect cloning decisions recorded earlier. For regular LTO this will 3388 // update the actual calls in the IR to call the appropriate function clone 3389 // (and add attributes to allocation calls), whereas for ThinLTO the decisions 3390 // are recorded in the summary entries. 3391 DenseSet<const ContextNode *> Visited; 3392 for (auto &Entry : AllocationCallToContextNodeMap) 3393 UpdateCalls(Entry.second, Visited, UpdateCalls); 3394 3395 return Changed; 3396 } 3397 3398 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones( 3399 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, 3400 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> 3401 &FuncToAliasMap) { 3402 // The first "clone" is the original copy, we should only call this if we 3403 // needed to create new clones. 3404 assert(NumClones > 1); 3405 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; 3406 VMaps.reserve(NumClones - 1); 3407 FunctionsClonedThinBackend++; 3408 for (unsigned I = 1; I < NumClones; I++) { 3409 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>()); 3410 auto *NewF = CloneFunction(&F, *VMaps.back()); 3411 FunctionClonesThinBackend++; 3412 // Strip memprof and callsite metadata from clone as they are no longer 3413 // needed. 3414 for (auto &BB : *NewF) { 3415 for (auto &Inst : BB) { 3416 Inst.setMetadata(LLVMContext::MD_memprof, nullptr); 3417 Inst.setMetadata(LLVMContext::MD_callsite, nullptr); 3418 } 3419 } 3420 std::string Name = getMemProfFuncName(F.getName(), I); 3421 auto *PrevF = M.getFunction(Name); 3422 if (PrevF) { 3423 // We might have created this when adjusting callsite in another 3424 // function. It should be a declaration. 
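// (CloneCallsite in applyImport below uses getOrInsertFunction with the
// clone's name, which can create such a declaration before the callee
// function itself has been cloned here.)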
3425 assert(PrevF->isDeclaration()); 3426 NewF->takeName(PrevF); 3427 PrevF->replaceAllUsesWith(NewF); 3428 PrevF->eraseFromParent(); 3429 } else 3430 NewF->setName(Name); 3431 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) 3432 << "created clone " << ore::NV("NewFunction", NewF)); 3433 3434 // Now handle aliases to this function, and clone those as well. 3435 if (!FuncToAliasMap.count(&F)) 3436 continue; 3437 for (auto *A : FuncToAliasMap[&F]) { 3438 std::string Name = getMemProfFuncName(A->getName(), I); 3439 auto *PrevA = M.getNamedAlias(Name); 3440 auto *NewA = GlobalAlias::create(A->getValueType(), 3441 A->getType()->getPointerAddressSpace(), 3442 A->getLinkage(), Name, NewF); 3443 NewA->copyAttributesFrom(A); 3444 if (PrevA) { 3445 // We might have created this when adjusting callsite in another 3446 // function. It should be a declaration. 3447 assert(PrevA->isDeclaration()); 3448 NewA->takeName(PrevA); 3449 PrevA->replaceAllUsesWith(NewA); 3450 PrevA->eraseFromParent(); 3451 } 3452 } 3453 } 3454 return VMaps; 3455 } 3456 3457 // Locate the summary for F. This is complicated by the fact that it might 3458 // have been internalized or promoted. 3459 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, 3460 const ModuleSummaryIndex *ImportSummary) { 3461 // FIXME: Ideally we would retain the original GUID in some fashion on the 3462 // function (e.g. as metadata), but for now do our best to locate the 3463 // summary without that information. 3464 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID()); 3465 if (!TheFnVI) 3466 // See if theFn was internalized, by checking index directly with 3467 // original name (this avoids the name adjustment done by getGUID() for 3468 // internal symbols). 3469 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName())); 3470 if (TheFnVI) 3471 return TheFnVI; 3472 // Now query with the original name before any promotion was performed. 3473 StringRef OrigName = 3474 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName()); 3475 std::string OrigId = GlobalValue::getGlobalIdentifier( 3476 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName()); 3477 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId)); 3478 if (TheFnVI) 3479 return TheFnVI; 3480 // Could be a promoted local imported from another module. We need to pass 3481 // down more info here to find the original module id. For now, try with 3482 // the OrigName which might have been stored in the OidGuidMap in the 3483 // index. This would not work if there were same-named locals in multiple 3484 // modules, however. 3485 auto OrigGUID = 3486 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName)); 3487 if (OrigGUID) 3488 TheFnVI = ImportSummary->getValueInfo(OrigGUID); 3489 return TheFnVI; 3490 } 3491 3492 bool MemProfContextDisambiguation::applyImport(Module &M) { 3493 assert(ImportSummary); 3494 bool Changed = false; 3495 3496 auto IsMemProfClone = [](const Function &F) { 3497 return F.getName().contains(MemProfCloneSuffix); 3498 }; 3499 3500 // We also need to clone any aliases that reference cloned functions, because 3501 // the modified callsites may invoke via the alias. Keep track of the aliases 3502 // for each function. 
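// (createFunctionClones above uses this map to also emit a clone-named alias,
// via getMemProfFuncName, for each alias of a function being cloned, pointing
// at the corresponding new function clone.)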
3503 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>>
3504 FuncToAliasMap;
3505 for (auto &A : M.aliases()) {
3506 auto *Aliasee = A.getAliaseeObject();
3507 if (auto *F = dyn_cast<Function>(Aliasee))
3508 FuncToAliasMap[F].insert(&A);
3509 }
3510
3511 for (auto &F : M) {
3512 if (F.isDeclaration() || IsMemProfClone(F))
3513 continue;
3514
3515 OptimizationRemarkEmitter ORE(&F);
3516
3517 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
3518 bool ClonesCreated = false;
3519 unsigned NumClonesCreated = 0;
3520 auto CloneFuncIfNeeded = [&](unsigned NumClones) {
3521 // We should at least have version 0 which is the original copy.
3522 assert(NumClones > 0);
3523 // If only one copy needed use original.
3524 if (NumClones == 1)
3525 return;
3526 // If we already performed cloning of this function, confirm that the
3527 // requested number of clones matches (the thin link should ensure the
3528 // number of clones for each constituent callsite is consistent within
3529 // each function), before returning.
3530 if (ClonesCreated) {
3531 assert(NumClonesCreated == NumClones);
3532 return;
3533 }
3534 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap);
3535 // The first "clone" is the original copy, which doesn't have a VMap.
3536 assert(VMaps.size() == NumClones - 1);
3537 Changed = true;
3538 ClonesCreated = true;
3539 NumClonesCreated = NumClones;
3540 };
3541
3542 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB,
3543 Function *CalledFunction) {
3544 // Perform cloning if not yet done.
3545 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size());
3546
3547 // Should have skipped indirect calls via mayHaveMemprofSummary.
3548 assert(CalledFunction);
3549 assert(!IsMemProfClone(*CalledFunction));
3550
3551 // Update the calls per the summary info.
3552 // Save orig name since it gets updated in the first iteration
3553 // below.
3554 auto CalleeOrigName = CalledFunction->getName();
3555 for (unsigned J = 0; J < StackNode.Clones.size(); J++) {
3556 // Do nothing if this version calls the original version of its
3557 // callee.
3558 if (!StackNode.Clones[J])
3559 continue;
3560 auto NewF = M.getOrInsertFunction(
3561 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]),
3562 CalledFunction->getFunctionType());
3563 CallBase *CBClone;
3564 // Copy 0 is the original function.
3565 if (!J)
3566 CBClone = CB;
3567 else
3568 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]);
3569 CBClone->setCalledFunction(NewF);
3570 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone)
3571 << ore::NV("Call", CBClone) << " in clone "
3572 << ore::NV("Caller", CBClone->getFunction())
3573 << " assigned to call function clone "
3574 << ore::NV("Callee", NewF.getCallee()));
3575 }
3576 };
3577
3578 // Locate the summary for F.
3579 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary);
3580 // If not found, this could be an imported local (see comment in
3581 // findValueInfoForFunc). Skip for now as it will be cloned in its original
3582 // module (where it would have been promoted to global scope so should
3583 // satisfy any reference in this module).
3584 if (!TheFnVI)
3585 continue;
3586
3587 auto *GVSummary =
3588 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier());
3589 if (!GVSummary) {
3590 // Must have been imported, use the summary which matches the definition
3591 // (might be multiple if this was a linkonce_odr).
3592 auto SrcModuleMD = F.getMetadata("thinlto_src_module"); 3593 assert(SrcModuleMD && 3594 "enable-import-metadata is needed to emit thinlto_src_module"); 3595 StringRef SrcModule = 3596 dyn_cast<MDString>(SrcModuleMD->getOperand(0))->getString(); 3597 for (auto &GVS : TheFnVI.getSummaryList()) { 3598 if (GVS->modulePath() == SrcModule) { 3599 GVSummary = GVS.get(); 3600 break; 3601 } 3602 } 3603 assert(GVSummary && GVSummary->modulePath() == SrcModule); 3604 } 3605 3606 // If this was an imported alias skip it as we won't have the function 3607 // summary, and it should be cloned in the original module. 3608 if (isa<AliasSummary>(GVSummary)) 3609 continue; 3610 3611 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject()); 3612 3613 if (FS->allocs().empty() && FS->callsites().empty()) 3614 continue; 3615 3616 auto SI = FS->callsites().begin(); 3617 auto AI = FS->allocs().begin(); 3618 3619 // To handle callsite infos synthesized for tail calls which have missing 3620 // frames in the profiled context, map callee VI to the synthesized callsite 3621 // info. 3622 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite; 3623 // Iterate the callsites for this function in reverse, since we place all 3624 // those synthesized for tail calls at the end. 3625 for (auto CallsiteIt = FS->callsites().rbegin(); 3626 CallsiteIt != FS->callsites().rend(); CallsiteIt++) { 3627 auto &Callsite = *CallsiteIt; 3628 // Stop as soon as we see a non-synthesized callsite info (see comment 3629 // above loop). All the entries added for discovered tail calls have empty 3630 // stack ids. 3631 if (!Callsite.StackIdIndices.empty()) 3632 break; 3633 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite}); 3634 } 3635 3636 // Assume for now that the instructions are in the exact same order 3637 // as when the summary was created, but confirm this is correct by 3638 // matching the stack ids. 3639 for (auto &BB : F) { 3640 for (auto &I : BB) { 3641 auto *CB = dyn_cast<CallBase>(&I); 3642 // Same handling as when creating module summary. 3643 if (!mayHaveMemprofSummary(CB)) 3644 continue; 3645 3646 auto *CalledValue = CB->getCalledOperand(); 3647 auto *CalledFunction = CB->getCalledFunction(); 3648 if (CalledValue && !CalledFunction) { 3649 CalledValue = CalledValue->stripPointerCasts(); 3650 // Stripping pointer casts can reveal a called function. 3651 CalledFunction = dyn_cast<Function>(CalledValue); 3652 } 3653 // Check if this is an alias to a function. If so, get the 3654 // called aliasee for the checks below. 3655 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) { 3656 assert(!CalledFunction && 3657 "Expected null called function in callsite for alias"); 3658 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject()); 3659 } 3660 3661 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 3662 I.getMetadata(LLVMContext::MD_callsite)); 3663 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof); 3664 3665 // Include allocs that were already assigned a memprof function 3666 // attribute in the statistics. 3667 if (CB->getAttributes().hasFnAttr("memprof")) { 3668 assert(!MemProfMD); 3669 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" 3670 ? 
AllocTypeColdThinBackend++ 3671 : AllocTypeNotColdThinBackend++; 3672 OrigAllocsThinBackend++; 3673 AllocVersionsThinBackend++; 3674 if (!MaxAllocVersionsThinBackend) 3675 MaxAllocVersionsThinBackend = 1; 3676 // Remove any remaining callsite metadata and we can skip the rest of 3677 // the handling for this instruction, since no cloning needed. 3678 I.setMetadata(LLVMContext::MD_callsite, nullptr); 3679 continue; 3680 } 3681 3682 if (MemProfMD) { 3683 // Consult the next alloc node. 3684 assert(AI != FS->allocs().end()); 3685 auto &AllocNode = *(AI++); 3686 3687 // Sanity check that the MIB stack ids match between the summary and 3688 // instruction metadata. 3689 auto MIBIter = AllocNode.MIBs.begin(); 3690 for (auto &MDOp : MemProfMD->operands()) { 3691 assert(MIBIter != AllocNode.MIBs.end()); 3692 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter = 3693 MIBIter->StackIdIndices.begin(); 3694 auto *MIBMD = cast<const MDNode>(MDOp); 3695 MDNode *StackMDNode = getMIBStackNode(MIBMD); 3696 assert(StackMDNode); 3697 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode); 3698 auto ContextIterBegin = 3699 StackContext.beginAfterSharedPrefix(CallsiteContext); 3700 // Skip the checking on the first iteration. 3701 uint64_t LastStackContextId = 3702 (ContextIterBegin != StackContext.end() && 3703 *ContextIterBegin == 0) 3704 ? 1 3705 : 0; 3706 for (auto ContextIter = ContextIterBegin; 3707 ContextIter != StackContext.end(); ++ContextIter) { 3708 // If this is a direct recursion, simply skip the duplicate 3709 // entries, to be consistent with how the summary ids were 3710 // generated during ModuleSummaryAnalysis. 3711 if (LastStackContextId == *ContextIter) 3712 continue; 3713 LastStackContextId = *ContextIter; 3714 assert(StackIdIndexIter != MIBIter->StackIdIndices.end()); 3715 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == 3716 *ContextIter); 3717 StackIdIndexIter++; 3718 } 3719 MIBIter++; 3720 } 3721 3722 // Perform cloning if not yet done. 3723 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); 3724 3725 OrigAllocsThinBackend++; 3726 AllocVersionsThinBackend += AllocNode.Versions.size(); 3727 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size()) 3728 MaxAllocVersionsThinBackend = AllocNode.Versions.size(); 3729 3730 // If there is only one version that means we didn't end up 3731 // considering this function for cloning, and in that case the alloc 3732 // will still be none type or should have gotten the default NotCold. 3733 // Skip that after calling clone helper since that does some sanity 3734 // checks that confirm we haven't decided yet that we need cloning. 3735 if (AllocNode.Versions.size() == 1) { 3736 assert((AllocationType)AllocNode.Versions[0] == 3737 AllocationType::NotCold || 3738 (AllocationType)AllocNode.Versions[0] == 3739 AllocationType::None); 3740 UnclonableAllocsThinBackend++; 3741 continue; 3742 } 3743 3744 // All versions should have a singular allocation type. 3745 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { 3746 return Type == ((uint8_t)AllocationType::NotCold | 3747 (uint8_t)AllocationType::Cold); 3748 })); 3749 3750 // Update the allocation types per the summary info. 3751 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { 3752 // Ignore any that didn't get an assigned allocation type. 3753 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) 3754 continue; 3755 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; 3756 AllocTy == AllocationType::Cold ? 
AllocTypeColdThinBackend++ 3757 : AllocTypeNotColdThinBackend++; 3758 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy); 3759 auto A = llvm::Attribute::get(F.getContext(), "memprof", 3760 AllocTypeString); 3761 CallBase *CBClone; 3762 // Copy 0 is the original function. 3763 if (!J) 3764 CBClone = CB; 3765 else 3766 // Since VMaps are only created for new clones, we index with 3767 // clone J-1 (J==0 is the original clone and does not have a VMaps 3768 // entry). 3769 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); 3770 CBClone->addFnAttr(A); 3771 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone) 3772 << ore::NV("AllocationCall", CBClone) << " in clone " 3773 << ore::NV("Caller", CBClone->getFunction()) 3774 << " marked with memprof allocation attribute " 3775 << ore::NV("Attribute", AllocTypeString)); 3776 } 3777 } else if (!CallsiteContext.empty()) { 3778 // Consult the next callsite node. 3779 assert(SI != FS->callsites().end()); 3780 auto &StackNode = *(SI++); 3781 3782 #ifndef NDEBUG 3783 // Sanity check that the stack ids match between the summary and 3784 // instruction metadata. 3785 auto StackIdIndexIter = StackNode.StackIdIndices.begin(); 3786 for (auto StackId : CallsiteContext) { 3787 assert(StackIdIndexIter != StackNode.StackIdIndices.end()); 3788 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == 3789 StackId); 3790 StackIdIndexIter++; 3791 } 3792 #endif 3793 3794 CloneCallsite(StackNode, CB, CalledFunction); 3795 } else if (CB->isTailCall()) { 3796 // Locate the synthesized callsite info for the callee VI, if any was 3797 // created, and use that for cloning. 3798 ValueInfo CalleeVI = 3799 findValueInfoForFunc(*CalledFunction, M, ImportSummary); 3800 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) { 3801 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI); 3802 assert(Callsite != MapTailCallCalleeVIToCallsite.end()); 3803 CloneCallsite(Callsite->second, CB, CalledFunction); 3804 } 3805 } 3806 // Memprof and callsite metadata on memory allocations no longer needed. 3807 I.setMetadata(LLVMContext::MD_memprof, nullptr); 3808 I.setMetadata(LLVMContext::MD_callsite, nullptr); 3809 } 3810 } 3811 } 3812 3813 return Changed; 3814 } 3815 3816 template <typename DerivedCCG, typename FuncTy, typename CallTy> 3817 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() { 3818 if (DumpCCG) { 3819 dbgs() << "CCG before cloning:\n"; 3820 dbgs() << *this; 3821 } 3822 if (ExportToDot) 3823 exportToDot("postbuild"); 3824 3825 if (VerifyCCG) { 3826 check(); 3827 } 3828 3829 identifyClones(); 3830 3831 if (VerifyCCG) { 3832 check(); 3833 } 3834 3835 if (DumpCCG) { 3836 dbgs() << "CCG after cloning:\n"; 3837 dbgs() << *this; 3838 } 3839 if (ExportToDot) 3840 exportToDot("cloned"); 3841 3842 bool Changed = assignFunctions(); 3843 3844 if (DumpCCG) { 3845 dbgs() << "CCG after assigning function clones:\n"; 3846 dbgs() << *this; 3847 } 3848 if (ExportToDot) 3849 exportToDot("clonefuncassign"); 3850 3851 if (MemProfReportHintedSizes) 3852 printTotalSizes(errs()); 3853 3854 return Changed; 3855 } 3856 3857 bool MemProfContextDisambiguation::processModule( 3858 Module &M, 3859 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) { 3860 3861 // If we have an import summary, then the cloning decisions were made during 3862 // the thin link on the index. Apply them and return. 
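// (This is the ThinLTO backend case; applyImport above replays the decisions
// recorded in the summary on this module's IR.)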
3863 if (ImportSummary) 3864 return applyImport(M); 3865 3866 // TODO: If/when other types of memprof cloning are enabled beyond just for 3867 // hot and cold, we will need to change this to individually control the 3868 // AllocationType passed to addStackNodesForMIB during CCG construction. 3869 // Note that we specifically check this after applying imports above, so that 3870 // the option isn't needed to be passed to distributed ThinLTO backend 3871 // clang processes, which won't necessarily have visibility into the linker 3872 // dependences. Instead the information is communicated from the LTO link to 3873 // the backends via the combined summary index. 3874 if (!SupportsHotColdNew) 3875 return false; 3876 3877 ModuleCallsiteContextGraph CCG(M, OREGetter); 3878 return CCG.process(); 3879 } 3880 3881 MemProfContextDisambiguation::MemProfContextDisambiguation( 3882 const ModuleSummaryIndex *Summary) 3883 : ImportSummary(Summary) { 3884 if (ImportSummary) { 3885 // The MemProfImportSummary should only be used for testing ThinLTO 3886 // distributed backend handling via opt, in which case we don't have a 3887 // summary from the pass pipeline. 3888 assert(MemProfImportSummary.empty()); 3889 return; 3890 } 3891 if (MemProfImportSummary.empty()) 3892 return; 3893 3894 auto ReadSummaryFile = 3895 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary)); 3896 if (!ReadSummaryFile) { 3897 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(), 3898 "Error loading file '" + MemProfImportSummary + 3899 "': "); 3900 return; 3901 } 3902 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile); 3903 if (!ImportSummaryForTestingOrErr) { 3904 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(), 3905 "Error parsing file '" + MemProfImportSummary + 3906 "': "); 3907 return; 3908 } 3909 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr); 3910 ImportSummary = ImportSummaryForTesting.get(); 3911 } 3912 3913 PreservedAnalyses MemProfContextDisambiguation::run(Module &M, 3914 ModuleAnalysisManager &AM) { 3915 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 3916 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { 3917 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); 3918 }; 3919 if (!processModule(M, OREGetter)) 3920 return PreservedAnalyses::all(); 3921 return PreservedAnalyses::none(); 3922 } 3923 3924 void MemProfContextDisambiguation::run( 3925 ModuleSummaryIndex &Index, 3926 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 3927 isPrevailing) { 3928 // TODO: If/when other types of memprof cloning are enabled beyond just for 3929 // hot and cold, we will need to change this to individually control the 3930 // AllocationType passed to addStackNodesForMIB during CCG construction. 3931 // The index was set from the option, so these should be in sync. 3932 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew); 3933 if (!SupportsHotColdNew) 3934 return; 3935 3936 IndexCallsiteContextGraph CCG(Index, isPrevailing); 3937 CCG.process(); 3938 } 3939