1 //==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements support for context disambiguation of allocation 10 // calls for profile guided heap optimization. Specifically, it uses Memprof 11 // profiles which indicate context specific allocation behavior (currently 12 // distinguishing cold vs hot memory allocations). Cloning is performed to 13 // expose the cold allocation call contexts, and the allocation calls are 14 // subsequently annotated with an attribute for later transformation. 15 // 16 // The transformations can be performed either directly on IR (regular LTO), or 17 // on a ThinLTO index (and later applied to the IR during the ThinLTO backend). 18 // Both types of LTO operate on a the same base graph representation, which 19 // uses CRTP to support either IR or Index formats. 20 // 21 //===----------------------------------------------------------------------===// 22 23 #include "llvm/Transforms/IPO/MemProfContextDisambiguation.h" 24 #include "llvm/ADT/DenseMap.h" 25 #include "llvm/ADT/DenseSet.h" 26 #include "llvm/ADT/MapVector.h" 27 #include "llvm/ADT/SetOperations.h" 28 #include "llvm/ADT/SmallPtrSet.h" 29 #include "llvm/ADT/SmallSet.h" 30 #include "llvm/ADT/SmallVector.h" 31 #include "llvm/ADT/Statistic.h" 32 #include "llvm/Analysis/MemoryProfileInfo.h" 33 #include "llvm/Analysis/ModuleSummaryAnalysis.h" 34 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 35 #include "llvm/Bitcode/BitcodeReader.h" 36 #include "llvm/IR/Constants.h" 37 #include "llvm/IR/Instructions.h" 38 #include "llvm/IR/Module.h" 39 #include "llvm/IR/ModuleSummaryIndex.h" 40 #include "llvm/Pass.h" 41 #include "llvm/Support/CommandLine.h" 42 #include "llvm/Support/FileSystem.h" 43 #include "llvm/Support/GraphWriter.h" 44 #include "llvm/Support/raw_ostream.h" 45 #include "llvm/Transforms/IPO.h" 46 #include "llvm/Transforms/Utils/Cloning.h" 47 #include <sstream> 48 #include <unordered_map> 49 #include <vector> 50 using namespace llvm; 51 using namespace llvm::memprof; 52 53 #define DEBUG_TYPE "memprof-context-disambiguation" 54 55 STATISTIC(FunctionClonesAnalysis, 56 "Number of function clones created during whole program analysis"); 57 STATISTIC(FunctionClonesThinBackend, 58 "Number of function clones created during ThinLTO backend"); 59 STATISTIC(FunctionsClonedThinBackend, 60 "Number of functions that had clones created during ThinLTO backend"); 61 STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly " 62 "cloned) during whole program analysis"); 63 STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) " 64 "during whole program analysis"); 65 STATISTIC(AllocTypeNotColdThinBackend, 66 "Number of not cold static allocations (possibly cloned) during " 67 "ThinLTO backend"); 68 STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations " 69 "(possibly cloned) during ThinLTO backend"); 70 STATISTIC(OrigAllocsThinBackend, 71 "Number of original (not cloned) allocations with memprof profiles " 72 "during ThinLTO backend"); 73 STATISTIC( 74 AllocVersionsThinBackend, 75 "Number of allocation versions (including clones) during ThinLTO backend"); 76 STATISTIC(MaxAllocVersionsThinBackend, 77 "Maximum 
number of allocation versions created for an original " 78 "allocation during ThinLTO backend"); 79 STATISTIC(UnclonableAllocsThinBackend, 80 "Number of unclonable ambigous allocations during ThinLTO backend"); 81 STATISTIC(RemovedEdgesWithMismatchedCallees, 82 "Number of edges removed due to mismatched callees (profiled vs IR)"); 83 STATISTIC(FoundProfiledCalleeCount, 84 "Number of profiled callees found via tail calls"); 85 STATISTIC(FoundProfiledCalleeDepth, 86 "Aggregate depth of profiled callees found via tail calls"); 87 STATISTIC(FoundProfiledCalleeMaxDepth, 88 "Maximum depth of profiled callees found via tail calls"); 89 STATISTIC(FoundProfiledCalleeNonUniquelyCount, 90 "Number of profiled callees found via multiple tail call chains"); 91 92 static cl::opt<std::string> DotFilePathPrefix( 93 "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden, 94 cl::value_desc("filename"), 95 cl::desc("Specify the path prefix of the MemProf dot files.")); 96 97 static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false), 98 cl::Hidden, 99 cl::desc("Export graph to dot files.")); 100 101 static cl::opt<bool> 102 DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden, 103 cl::desc("Dump CallingContextGraph to stdout after each stage.")); 104 105 static cl::opt<bool> 106 VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden, 107 cl::desc("Perform verification checks on CallingContextGraph.")); 108 109 static cl::opt<bool> 110 VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden, 111 cl::desc("Perform frequent verification checks on nodes.")); 112 113 static cl::opt<std::string> MemProfImportSummary( 114 "memprof-import-summary", 115 cl::desc("Import summary to use for testing the ThinLTO backend via opt"), 116 cl::Hidden); 117 118 static cl::opt<unsigned> 119 TailCallSearchDepth("memprof-tail-call-search-depth", cl::init(5), 120 cl::Hidden, 121 cl::desc("Max depth to recursively search for missing " 122 "frames through tail calls.")); 123 124 namespace llvm { 125 // Indicate we are linking with an allocator that supports hot/cold operator 126 // new interfaces. 127 cl::opt<bool> SupportsHotColdNew( 128 "supports-hot-cold-new", cl::init(false), cl::Hidden, 129 cl::desc("Linking with hot/cold operator new interfaces")); 130 } // namespace llvm 131 132 namespace { 133 /// CRTP base for graphs built from either IR or ThinLTO summary index. 134 /// 135 /// The graph represents the call contexts in all memprof metadata on allocation 136 /// calls, with nodes for the allocations themselves, as well as for the calls 137 /// in each context. The graph is initially built from the allocation memprof 138 /// metadata (or summary) MIBs. It is then updated to match calls with callsite 139 /// metadata onto the nodes, updating it to reflect any inlining performed on 140 /// those calls. 141 /// 142 /// Each MIB (representing an allocation's call context with allocation 143 /// behavior) is assigned a unique context id during the graph build. The edges 144 /// and nodes in the graph are decorated with the context ids they carry. This 145 /// is used to correctly update the graph when cloning is performed so that we 146 /// can uniquify the context for a single (possibly cloned) allocation. 
147 template <typename DerivedCCG, typename FuncTy, typename CallTy> 148 class CallsiteContextGraph { 149 public: 150 CallsiteContextGraph() = default; 151 CallsiteContextGraph(const CallsiteContextGraph &) = default; 152 CallsiteContextGraph(CallsiteContextGraph &&) = default; 153 154 /// Main entry point to perform analysis and transformations on graph. 155 bool process(); 156 157 /// Perform cloning on the graph necessary to uniquely identify the allocation 158 /// behavior of an allocation based on its context. 159 void identifyClones(); 160 161 /// Assign callsite clones to functions, cloning functions as needed to 162 /// accommodate the combinations of their callsite clones reached by callers. 163 /// For regular LTO this clones functions and callsites in the IR, but for 164 /// ThinLTO the cloning decisions are noted in the summaries and later applied 165 /// in applyImport. 166 bool assignFunctions(); 167 168 void dump() const; 169 void print(raw_ostream &OS) const; 170 171 friend raw_ostream &operator<<(raw_ostream &OS, 172 const CallsiteContextGraph &CCG) { 173 CCG.print(OS); 174 return OS; 175 } 176 177 friend struct GraphTraits< 178 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; 179 friend struct DOTGraphTraits< 180 const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>; 181 182 void exportToDot(std::string Label) const; 183 184 /// Represents a function clone via FuncTy pointer and clone number pair. 185 struct FuncInfo final 186 : public std::pair<FuncTy *, unsigned /*Clone number*/> { 187 using Base = std::pair<FuncTy *, unsigned>; 188 FuncInfo(const Base &B) : Base(B) {} 189 FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {} 190 explicit operator bool() const { return this->first != nullptr; } 191 FuncTy *func() const { return this->first; } 192 unsigned cloneNo() const { return this->second; } 193 }; 194 195 /// Represents a callsite clone via CallTy and clone number pair. 196 struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> { 197 using Base = std::pair<CallTy, unsigned>; 198 CallInfo(const Base &B) : Base(B) {} 199 CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0) 200 : Base(Call, CloneNo) {} 201 explicit operator bool() const { return (bool)this->first; } 202 CallTy call() const { return this->first; } 203 unsigned cloneNo() const { return this->second; } 204 void setCloneNo(unsigned N) { this->second = N; } 205 void print(raw_ostream &OS) const { 206 if (!operator bool()) { 207 assert(!cloneNo()); 208 OS << "null Call"; 209 return; 210 } 211 call()->print(OS); 212 OS << "\t(clone " << cloneNo() << ")"; 213 } 214 void dump() const { 215 print(dbgs()); 216 dbgs() << "\n"; 217 } 218 friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) { 219 Call.print(OS); 220 return OS; 221 } 222 }; 223 224 struct ContextEdge; 225 226 /// Node in the Callsite Context Graph 227 struct ContextNode { 228 // Keep this for now since in the IR case where we have an Instruction* it 229 // is not as immediately discoverable. Used for printing richer information 230 // when dumping graph. 231 bool IsAllocation; 232 233 // Keeps track of when the Call was reset to null because there was 234 // recursion. 235 bool Recursive = false; 236 237 // The corresponding allocation or interior call. 
238 CallInfo Call; 239 240 // For alloc nodes this is a unique id assigned when constructed, and for 241 // callsite stack nodes it is the original stack id when the node is 242 // constructed from the memprof MIB metadata on the alloc nodes. Note that 243 // this is only used when matching callsite metadata onto the stack nodes 244 // created when processing the allocation memprof MIBs, and for labeling 245 // nodes in the dot graph. Therefore we don't bother to assign a value for 246 // clones. 247 uint64_t OrigStackOrAllocId = 0; 248 249 // This will be formed by ORing together the AllocationType enum values 250 // for contexts including this node. 251 uint8_t AllocTypes = 0; 252 253 // Edges to all callees in the profiled call stacks. 254 // TODO: Should this be a map (from Callee node) for more efficient lookup? 255 std::vector<std::shared_ptr<ContextEdge>> CalleeEdges; 256 257 // Edges to all callers in the profiled call stacks. 258 // TODO: Should this be a map (from Caller node) for more efficient lookup? 259 std::vector<std::shared_ptr<ContextEdge>> CallerEdges; 260 261 // The set of IDs for contexts including this node. 262 DenseSet<uint32_t> ContextIds; 263 264 // List of clones of this ContextNode, initially empty. 265 std::vector<ContextNode *> Clones; 266 267 // If a clone, points to the original uncloned node. 268 ContextNode *CloneOf = nullptr; 269 270 ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {} 271 272 ContextNode(bool IsAllocation, CallInfo C) 273 : IsAllocation(IsAllocation), Call(C) {} 274 275 void addClone(ContextNode *Clone) { 276 if (CloneOf) { 277 CloneOf->Clones.push_back(Clone); 278 Clone->CloneOf = CloneOf; 279 } else { 280 Clones.push_back(Clone); 281 assert(!Clone->CloneOf); 282 Clone->CloneOf = this; 283 } 284 } 285 286 ContextNode *getOrigNode() { 287 if (!CloneOf) 288 return this; 289 return CloneOf; 290 } 291 292 void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, 293 unsigned int ContextId); 294 295 ContextEdge *findEdgeFromCallee(const ContextNode *Callee); 296 ContextEdge *findEdgeFromCaller(const ContextNode *Caller); 297 void eraseCalleeEdge(const ContextEdge *Edge); 298 void eraseCallerEdge(const ContextEdge *Edge); 299 300 void setCall(CallInfo C) { Call = C; } 301 302 bool hasCall() const { return (bool)Call.call(); } 303 304 void printCall(raw_ostream &OS) const { Call.print(OS); } 305 306 // True if this node was effectively removed from the graph, in which case 307 // its context id set, caller edges, and callee edges should all be empty. 308 bool isRemoved() const { 309 assert(ContextIds.empty() == 310 (CalleeEdges.empty() && CallerEdges.empty())); 311 return ContextIds.empty(); 312 } 313 314 void dump() const; 315 void print(raw_ostream &OS) const; 316 317 friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) { 318 Node.print(OS); 319 return OS; 320 } 321 }; 322 323 /// Edge in the Callsite Context Graph from a ContextNode N to a caller or 324 /// callee. 325 struct ContextEdge { 326 ContextNode *Callee; 327 ContextNode *Caller; 328 329 // This will be formed by ORing together the AllocationType enum values 330 // for contexts including this edge. 331 uint8_t AllocTypes = 0; 332 333 // The set of IDs for contexts including this edge. 
334 DenseSet<uint32_t> ContextIds; 335 336 ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType, 337 DenseSet<uint32_t> ContextIds) 338 : Callee(Callee), Caller(Caller), AllocTypes(AllocType), 339 ContextIds(ContextIds) {} 340 341 DenseSet<uint32_t> &getContextIds() { return ContextIds; } 342 343 void dump() const; 344 void print(raw_ostream &OS) const; 345 346 friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) { 347 Edge.print(OS); 348 return OS; 349 } 350 }; 351 352 /// Helper to remove callee edges that have allocation type None (due to not 353 /// carrying any context ids) after transformations. 354 void removeNoneTypeCalleeEdges(ContextNode *Node); 355 356 protected: 357 /// Get a list of nodes corresponding to the stack ids in the given callsite 358 /// context. 359 template <class NodeT, class IteratorT> 360 std::vector<uint64_t> 361 getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext); 362 363 /// Adds nodes for the given allocation and any stack ids on its memprof MIB 364 /// metadata (or summary). 365 ContextNode *addAllocNode(CallInfo Call, const FuncTy *F); 366 367 /// Adds nodes for the given MIB stack ids. 368 template <class NodeT, class IteratorT> 369 void addStackNodesForMIB(ContextNode *AllocNode, 370 CallStack<NodeT, IteratorT> &StackContext, 371 CallStack<NodeT, IteratorT> &CallsiteContext, 372 AllocationType AllocType); 373 374 /// Matches all callsite metadata (or summary) to the nodes created for 375 /// allocation memprof MIB metadata, synthesizing new nodes to reflect any 376 /// inlining performed on those callsite instructions. 377 void updateStackNodes(); 378 379 /// Update graph to conservatively handle any callsite stack nodes that target 380 /// multiple different callee target functions. 381 void handleCallsitesWithMultipleTargets(); 382 383 /// Save lists of calls with MemProf metadata in each function, for faster 384 /// iteration. 385 MapVector<FuncTy *, std::vector<CallInfo>> FuncToCallsWithMetadata; 386 387 /// Map from callsite node to the enclosing caller function. 388 std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc; 389 390 private: 391 using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator; 392 393 using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>, 394 const FuncTy *, DenseSet<uint32_t>>; 395 396 /// Assigns the given Node to calls at or inlined into the location with 397 /// the Node's stack id, after post order traversing and processing its 398 /// caller nodes. Uses the call information recorded in the given 399 /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences 400 /// as needed. Called by updateStackNodes which sets up the given 401 /// StackIdToMatchingCalls map. 402 void assignStackNodesPostOrder( 403 ContextNode *Node, DenseSet<const ContextNode *> &Visited, 404 DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls); 405 406 /// Duplicates the given set of context ids, updating the provided 407 /// map from each original id with the newly generated context ids, 408 /// and returning the new duplicated id set. 409 DenseSet<uint32_t> duplicateContextIds( 410 const DenseSet<uint32_t> &StackSequenceContextIds, 411 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); 412 413 /// Propagates all duplicated context ids across the graph. 
414 void propagateDuplicateContextIds( 415 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds); 416 417 /// Connect the NewNode to OrigNode's callees if TowardsCallee is true, 418 /// else to its callers. Also updates OrigNode's edges to remove any context 419 /// ids moved to the newly created edge. 420 void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode, 421 bool TowardsCallee); 422 423 /// Get the stack id corresponding to the given Id or Index (for IR this will 424 /// return itself, for a summary index this will return the id recorded in the 425 /// index for that stack id index value). 426 uint64_t getStackId(uint64_t IdOrIndex) const { 427 return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex); 428 } 429 430 /// Returns true if the given call targets the callee of the given edge, or if 431 /// we were able to identify the call chain through intermediate tail calls. 432 /// In the latter case new context nodes are added to the graph for the 433 /// identified tail calls, and their synthesized nodes are added to 434 /// TailCallToContextNodeMap. The EdgeIter is updated in either case to the 435 /// next element after the input position (either incremented or updated after 436 /// removing the old edge). 437 bool 438 calleesMatch(CallTy Call, EdgeIter &EI, 439 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap); 440 441 /// Returns true if the given call targets the given function, or if we were 442 /// able to identify the call chain through intermediate tail calls (in which 443 /// case FoundCalleeChain will be populated). 444 bool calleeMatchesFunc( 445 CallTy Call, const FuncTy *Func, const FuncTy *CallerFunc, 446 std::vector<std::pair<CallTy, FuncTy *>> &FoundCalleeChain) { 447 return static_cast<DerivedCCG *>(this)->calleeMatchesFunc( 448 Call, Func, CallerFunc, FoundCalleeChain); 449 } 450 451 /// Get a list of nodes corresponding to the stack ids in the given 452 /// callsite's context. 453 std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) { 454 return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall( 455 Call); 456 } 457 458 /// Get the last stack id in the context for callsite. 459 uint64_t getLastStackId(CallTy Call) { 460 return static_cast<DerivedCCG *>(this)->getLastStackId(Call); 461 } 462 463 /// Update the allocation call to record type of allocated memory. 464 void updateAllocationCall(CallInfo &Call, AllocationType AllocType) { 465 AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++; 466 static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType); 467 } 468 469 /// Update non-allocation call to invoke (possibly cloned) function 470 /// CalleeFunc. 471 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) { 472 static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc); 473 } 474 475 /// Clone the given function for the given callsite, recording mapping of all 476 /// of the functions tracked calls to their new versions in the CallMap. 477 /// Assigns new clones to clone number CloneNo. 478 FuncInfo cloneFunctionForCallsite( 479 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 480 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 481 return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite( 482 Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo); 483 } 484 485 /// Gets a label to use in the dot graph for the given call clone in the given 486 /// function. 
487 std::string getLabel(const FuncTy *Func, const CallTy Call, 488 unsigned CloneNo) const { 489 return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo); 490 } 491 492 /// Helpers to find the node corresponding to the given call or stackid. 493 ContextNode *getNodeForInst(const CallInfo &C); 494 ContextNode *getNodeForAlloc(const CallInfo &C); 495 ContextNode *getNodeForStackId(uint64_t StackId); 496 497 /// Removes the node information recorded for the given call. 498 void unsetNodeForInst(const CallInfo &C); 499 500 /// Computes the alloc type corresponding to the given context ids, by 501 /// unioning their recorded alloc types. 502 uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds); 503 504 /// Returns the alloction type of the intersection of the contexts of two 505 /// nodes (based on their provided context id sets), optimized for the case 506 /// when Node1Ids is smaller than Node2Ids. 507 uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids, 508 const DenseSet<uint32_t> &Node2Ids); 509 510 /// Returns the alloction type of the intersection of the contexts of two 511 /// nodes (based on their provided context id sets). 512 uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids, 513 const DenseSet<uint32_t> &Node2Ids); 514 515 /// Create a clone of Edge's callee and move Edge to that new callee node, 516 /// performing the necessary context id and allocation type updates. 517 /// If callee's caller edge iterator is supplied, it is updated when removing 518 /// the edge from that list. 519 ContextNode * 520 moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge, 521 EdgeIter *CallerEdgeI = nullptr); 522 523 /// Change the callee of Edge to existing callee clone NewCallee, performing 524 /// the necessary context id and allocation type updates. 525 /// If callee's caller edge iterator is supplied, it is updated when removing 526 /// the edge from that list. 527 void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, 528 ContextNode *NewCallee, 529 EdgeIter *CallerEdgeI = nullptr, 530 bool NewClone = false); 531 532 /// Recursively perform cloning on the graph for the given Node and its 533 /// callers, in order to uniquely identify the allocation behavior of an 534 /// allocation given its context. 535 void identifyClones(ContextNode *Node, 536 DenseSet<const ContextNode *> &Visited); 537 538 /// Map from each context ID to the AllocationType assigned to that context. 539 std::map<uint32_t, AllocationType> ContextIdToAllocationType; 540 541 /// Identifies the context node created for a stack id when adding the MIB 542 /// contexts to the graph. This is used to locate the context nodes when 543 /// trying to assign the corresponding callsites with those stack ids to these 544 /// nodes. 545 std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap; 546 547 /// Maps to track the calls to their corresponding nodes in the graph. 548 MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap; 549 MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap; 550 551 /// Owner of all ContextNode unique_ptrs. 552 std::vector<std::unique_ptr<ContextNode>> NodeOwner; 553 554 /// Perform sanity checks on graph when requested. 555 void check() const; 556 557 /// Keeps track of the last unique context id assigned. 
558 unsigned int LastContextId = 0; 559 }; 560 561 template <typename DerivedCCG, typename FuncTy, typename CallTy> 562 using ContextNode = 563 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode; 564 template <typename DerivedCCG, typename FuncTy, typename CallTy> 565 using ContextEdge = 566 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge; 567 template <typename DerivedCCG, typename FuncTy, typename CallTy> 568 using FuncInfo = 569 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo; 570 template <typename DerivedCCG, typename FuncTy, typename CallTy> 571 using CallInfo = 572 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo; 573 574 /// CRTP derived class for graphs built from IR (regular LTO). 575 class ModuleCallsiteContextGraph 576 : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 577 Instruction *> { 578 public: 579 ModuleCallsiteContextGraph( 580 Module &M, 581 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter); 582 583 private: 584 friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 585 Instruction *>; 586 587 uint64_t getStackId(uint64_t IdOrIndex) const; 588 bool calleeMatchesFunc( 589 Instruction *Call, const Function *Func, const Function *CallerFunc, 590 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain); 591 bool findProfiledCalleeThroughTailCalls( 592 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, 593 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, 594 bool &FoundMultipleCalleeChains); 595 uint64_t getLastStackId(Instruction *Call); 596 std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call); 597 void updateAllocationCall(CallInfo &Call, AllocationType AllocType); 598 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); 599 CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 600 Instruction *>::FuncInfo 601 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, 602 std::map<CallInfo, CallInfo> &CallMap, 603 std::vector<CallInfo> &CallsWithMetadataInFunc, 604 unsigned CloneNo); 605 std::string getLabel(const Function *Func, const Instruction *Call, 606 unsigned CloneNo) const; 607 608 const Module &Mod; 609 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter; 610 }; 611 612 /// Represents a call in the summary index graph, which can either be an 613 /// allocation or an interior callsite node in an allocation's context. 614 /// Holds a pointer to the corresponding data structure in the index. 615 struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> { 616 IndexCall() : PointerUnion() {} 617 IndexCall(std::nullptr_t) : IndexCall() {} 618 IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {} 619 IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {} 620 IndexCall(PointerUnion PT) : PointerUnion(PT) {} 621 622 IndexCall *operator->() { return this; } 623 624 PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; } 625 626 void print(raw_ostream &OS) const { 627 if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) { 628 OS << *AI; 629 } else { 630 auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase()); 631 assert(CI); 632 OS << *CI; 633 } 634 } 635 }; 636 637 /// CRTP derived class for graphs built from summary index (ThinLTO). 
638 class IndexCallsiteContextGraph 639 : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 640 IndexCall> { 641 public: 642 IndexCallsiteContextGraph( 643 ModuleSummaryIndex &Index, 644 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 645 isPrevailing); 646 647 ~IndexCallsiteContextGraph() { 648 // Now that we are done with the graph it is safe to add the new 649 // CallsiteInfo structs to the function summary vectors. The graph nodes 650 // point into locations within these vectors, so we don't want to add them 651 // any earlier. 652 for (auto &I : FunctionCalleesToSynthesizedCallsiteInfos) { 653 auto *FS = I.first; 654 for (auto &Callsite : I.second) 655 FS->addCallsite(*Callsite.second); 656 } 657 } 658 659 private: 660 friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 661 IndexCall>; 662 663 uint64_t getStackId(uint64_t IdOrIndex) const; 664 bool calleeMatchesFunc( 665 IndexCall &Call, const FunctionSummary *Func, 666 const FunctionSummary *CallerFunc, 667 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain); 668 bool findProfiledCalleeThroughTailCalls( 669 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, 670 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, 671 bool &FoundMultipleCalleeChains); 672 uint64_t getLastStackId(IndexCall &Call); 673 std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call); 674 void updateAllocationCall(CallInfo &Call, AllocationType AllocType); 675 void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc); 676 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 677 IndexCall>::FuncInfo 678 cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call, 679 std::map<CallInfo, CallInfo> &CallMap, 680 std::vector<CallInfo> &CallsWithMetadataInFunc, 681 unsigned CloneNo); 682 std::string getLabel(const FunctionSummary *Func, const IndexCall &Call, 683 unsigned CloneNo) const; 684 685 // Saves mapping from function summaries containing memprof records back to 686 // its VI, for use in checking and debugging. 687 std::map<const FunctionSummary *, ValueInfo> FSToVIMap; 688 689 const ModuleSummaryIndex &Index; 690 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 691 isPrevailing; 692 693 // Saves/owns the callsite info structures synthesized for missing tail call 694 // frames that we discover while building the graph. 695 // It maps from the summary of the function making the tail call, to a map 696 // of callee ValueInfo to corresponding synthesized callsite info. 
697 std::unordered_map<FunctionSummary *, 698 std::map<ValueInfo, std::unique_ptr<CallsiteInfo>>> 699 FunctionCalleesToSynthesizedCallsiteInfos; 700 }; 701 } // namespace 702 703 namespace llvm { 704 template <> 705 struct DenseMapInfo<typename CallsiteContextGraph< 706 ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo> 707 : public DenseMapInfo<std::pair<Instruction *, unsigned>> {}; 708 template <> 709 struct DenseMapInfo<typename CallsiteContextGraph< 710 IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo> 711 : public DenseMapInfo<std::pair<IndexCall, unsigned>> {}; 712 template <> 713 struct DenseMapInfo<IndexCall> 714 : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {}; 715 } // end namespace llvm 716 717 namespace { 718 719 struct FieldSeparator { 720 bool Skip = true; 721 const char *Sep; 722 723 FieldSeparator(const char *Sep = ", ") : Sep(Sep) {} 724 }; 725 726 raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) { 727 if (FS.Skip) { 728 FS.Skip = false; 729 return OS; 730 } 731 return OS << FS.Sep; 732 } 733 734 // Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc 735 // type we should actually use on the corresponding allocation. 736 // If we can't clone a node that has NotCold+Cold alloc type, we will fall 737 // back to using NotCold. So don't bother cloning to distinguish NotCold+Cold 738 // from NotCold. 739 AllocationType allocTypeToUse(uint8_t AllocTypes) { 740 assert(AllocTypes != (uint8_t)AllocationType::None); 741 if (AllocTypes == 742 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) 743 return AllocationType::NotCold; 744 else 745 return (AllocationType)AllocTypes; 746 } 747 748 // Helper to check if the alloc types for all edges recorded in the 749 // InAllocTypes vector match the alloc types for all edges in the Edges 750 // vector. 751 template <typename DerivedCCG, typename FuncTy, typename CallTy> 752 bool allocTypesMatch( 753 const std::vector<uint8_t> &InAllocTypes, 754 const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>> 755 &Edges) { 756 return std::equal( 757 InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(), 758 [](const uint8_t &l, 759 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) { 760 // Can share if one of the edges is None type - don't 761 // care about the type along that edge as it doesn't 762 // exist for those context ids. 
763 if (l == (uint8_t)AllocationType::None || 764 r->AllocTypes == (uint8_t)AllocationType::None) 765 return true; 766 return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes); 767 }); 768 } 769 770 } // end anonymous namespace 771 772 template <typename DerivedCCG, typename FuncTy, typename CallTy> 773 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 774 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst( 775 const CallInfo &C) { 776 ContextNode *Node = getNodeForAlloc(C); 777 if (Node) 778 return Node; 779 780 return NonAllocationCallToContextNodeMap.lookup(C); 781 } 782 783 template <typename DerivedCCG, typename FuncTy, typename CallTy> 784 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 785 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc( 786 const CallInfo &C) { 787 return AllocationCallToContextNodeMap.lookup(C); 788 } 789 790 template <typename DerivedCCG, typename FuncTy, typename CallTy> 791 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 792 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId( 793 uint64_t StackId) { 794 auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId); 795 if (StackEntryNode != StackEntryIdToContextNodeMap.end()) 796 return StackEntryNode->second; 797 return nullptr; 798 } 799 800 template <typename DerivedCCG, typename FuncTy, typename CallTy> 801 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst( 802 const CallInfo &C) { 803 AllocationCallToContextNodeMap.erase(C) || 804 NonAllocationCallToContextNodeMap.erase(C); 805 assert(!AllocationCallToContextNodeMap.count(C) && 806 !NonAllocationCallToContextNodeMap.count(C)); 807 } 808 809 template <typename DerivedCCG, typename FuncTy, typename CallTy> 810 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 811 addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType, 812 unsigned int ContextId) { 813 for (auto &Edge : CallerEdges) { 814 if (Edge->Caller == Caller) { 815 Edge->AllocTypes |= (uint8_t)AllocType; 816 Edge->getContextIds().insert(ContextId); 817 return; 818 } 819 } 820 std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>( 821 this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId})); 822 CallerEdges.push_back(Edge); 823 Caller->CalleeEdges.push_back(Edge); 824 } 825 826 template <typename DerivedCCG, typename FuncTy, typename CallTy> 827 void CallsiteContextGraph< 828 DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) { 829 for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) { 830 auto Edge = *EI; 831 if (Edge->AllocTypes == (uint8_t)AllocationType::None) { 832 assert(Edge->ContextIds.empty()); 833 Edge->Callee->eraseCallerEdge(Edge.get()); 834 EI = Node->CalleeEdges.erase(EI); 835 } else 836 ++EI; 837 } 838 } 839 840 template <typename DerivedCCG, typename FuncTy, typename CallTy> 841 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * 842 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 843 findEdgeFromCallee(const ContextNode *Callee) { 844 for (const auto &Edge : CalleeEdges) 845 if (Edge->Callee == Callee) 846 return Edge.get(); 847 return nullptr; 848 } 849 850 template <typename DerivedCCG, typename FuncTy, typename CallTy> 851 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge * 852 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 853 findEdgeFromCaller(const ContextNode 
*Caller) { 854 for (const auto &Edge : CallerEdges) 855 if (Edge->Caller == Caller) 856 return Edge.get(); 857 return nullptr; 858 } 859 860 template <typename DerivedCCG, typename FuncTy, typename CallTy> 861 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 862 eraseCalleeEdge(const ContextEdge *Edge) { 863 auto EI = llvm::find_if( 864 CalleeEdges, [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) { 865 return CalleeEdge.get() == Edge; 866 }); 867 assert(EI != CalleeEdges.end()); 868 CalleeEdges.erase(EI); 869 } 870 871 template <typename DerivedCCG, typename FuncTy, typename CallTy> 872 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode:: 873 eraseCallerEdge(const ContextEdge *Edge) { 874 auto EI = llvm::find_if( 875 CallerEdges, [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) { 876 return CallerEdge.get() == Edge; 877 }); 878 assert(EI != CallerEdges.end()); 879 CallerEdges.erase(EI); 880 } 881 882 template <typename DerivedCCG, typename FuncTy, typename CallTy> 883 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType( 884 DenseSet<uint32_t> &ContextIds) { 885 uint8_t BothTypes = 886 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; 887 uint8_t AllocType = (uint8_t)AllocationType::None; 888 for (auto Id : ContextIds) { 889 AllocType |= (uint8_t)ContextIdToAllocationType[Id]; 890 // Bail early if alloc type reached both, no further refinement. 891 if (AllocType == BothTypes) 892 return AllocType; 893 } 894 return AllocType; 895 } 896 897 template <typename DerivedCCG, typename FuncTy, typename CallTy> 898 uint8_t 899 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl( 900 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) { 901 uint8_t BothTypes = 902 (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold; 903 uint8_t AllocType = (uint8_t)AllocationType::None; 904 for (auto Id : Node1Ids) { 905 if (!Node2Ids.count(Id)) 906 continue; 907 AllocType |= (uint8_t)ContextIdToAllocationType[Id]; 908 // Bail early if alloc type reached both, no further refinement. 909 if (AllocType == BothTypes) 910 return AllocType; 911 } 912 return AllocType; 913 } 914 915 template <typename DerivedCCG, typename FuncTy, typename CallTy> 916 uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes( 917 const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) { 918 if (Node1Ids.size() < Node2Ids.size()) 919 return intersectAllocTypesImpl(Node1Ids, Node2Ids); 920 else 921 return intersectAllocTypesImpl(Node2Ids, Node1Ids); 922 } 923 924 template <typename DerivedCCG, typename FuncTy, typename CallTy> 925 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 926 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode( 927 CallInfo Call, const FuncTy *F) { 928 assert(!getNodeForAlloc(Call)); 929 NodeOwner.push_back( 930 std::make_unique<ContextNode>(/*IsAllocation=*/true, Call)); 931 ContextNode *AllocNode = NodeOwner.back().get(); 932 AllocationCallToContextNodeMap[Call] = AllocNode; 933 NodeToCallingFunc[AllocNode] = F; 934 // Use LastContextId as a uniq id for MIB allocation nodes. 935 AllocNode->OrigStackOrAllocId = LastContextId; 936 // Alloc type should be updated as we add in the MIBs. We should assert 937 // afterwards that it is not still None. 
938 AllocNode->AllocTypes = (uint8_t)AllocationType::None; 939 940 return AllocNode; 941 } 942 943 template <typename DerivedCCG, typename FuncTy, typename CallTy> 944 template <class NodeT, class IteratorT> 945 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB( 946 ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext, 947 CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) { 948 // Treating the hot alloc type as NotCold before the disambiguation for "hot" 949 // is done. 950 if (AllocType == AllocationType::Hot) 951 AllocType = AllocationType::NotCold; 952 953 ContextIdToAllocationType[++LastContextId] = AllocType; 954 955 // Update alloc type and context ids for this MIB. 956 AllocNode->AllocTypes |= (uint8_t)AllocType; 957 AllocNode->ContextIds.insert(LastContextId); 958 959 // Now add or update nodes for each stack id in alloc's context. 960 // Later when processing the stack ids on non-alloc callsites we will adjust 961 // for any inlining in the context. 962 ContextNode *PrevNode = AllocNode; 963 // Look for recursion (direct recursion should have been collapsed by 964 // module summary analysis, here we should just be detecting mutual 965 // recursion). Mark these nodes so we don't try to clone. 966 SmallSet<uint64_t, 8> StackIdSet; 967 // Skip any on the allocation call (inlining). 968 for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext); 969 ContextIter != StackContext.end(); ++ContextIter) { 970 auto StackId = getStackId(*ContextIter); 971 ContextNode *StackNode = getNodeForStackId(StackId); 972 if (!StackNode) { 973 NodeOwner.push_back( 974 std::make_unique<ContextNode>(/*IsAllocation=*/false)); 975 StackNode = NodeOwner.back().get(); 976 StackEntryIdToContextNodeMap[StackId] = StackNode; 977 StackNode->OrigStackOrAllocId = StackId; 978 } 979 auto Ins = StackIdSet.insert(StackId); 980 if (!Ins.second) 981 StackNode->Recursive = true; 982 StackNode->ContextIds.insert(LastContextId); 983 StackNode->AllocTypes |= (uint8_t)AllocType; 984 PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId); 985 PrevNode = StackNode; 986 } 987 } 988 989 template <typename DerivedCCG, typename FuncTy, typename CallTy> 990 DenseSet<uint32_t> 991 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds( 992 const DenseSet<uint32_t> &StackSequenceContextIds, 993 DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { 994 DenseSet<uint32_t> NewContextIds; 995 for (auto OldId : StackSequenceContextIds) { 996 NewContextIds.insert(++LastContextId); 997 OldToNewContextIds[OldId].insert(LastContextId); 998 assert(ContextIdToAllocationType.count(OldId)); 999 // The new context has the same allocation type as original. 1000 ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId]; 1001 } 1002 return NewContextIds; 1003 } 1004 1005 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1006 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 1007 propagateDuplicateContextIds( 1008 const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) { 1009 // Build a set of duplicated context ids corresponding to the input id set. 
1010 auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) { 1011 DenseSet<uint32_t> NewIds; 1012 for (auto Id : ContextIds) 1013 if (auto NewId = OldToNewContextIds.find(Id); 1014 NewId != OldToNewContextIds.end()) 1015 NewIds.insert(NewId->second.begin(), NewId->second.end()); 1016 return NewIds; 1017 }; 1018 1019 // Recursively update context ids sets along caller edges. 1020 auto UpdateCallers = [&](ContextNode *Node, 1021 DenseSet<const ContextEdge *> &Visited, 1022 auto &&UpdateCallers) -> void { 1023 for (const auto &Edge : Node->CallerEdges) { 1024 auto Inserted = Visited.insert(Edge.get()); 1025 if (!Inserted.second) 1026 continue; 1027 ContextNode *NextNode = Edge->Caller; 1028 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds()); 1029 // Only need to recursively iterate to NextNode via this caller edge if 1030 // it resulted in any added ids to NextNode. 1031 if (!NewIdsToAdd.empty()) { 1032 Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); 1033 NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); 1034 UpdateCallers(NextNode, Visited, UpdateCallers); 1035 } 1036 } 1037 }; 1038 1039 DenseSet<const ContextEdge *> Visited; 1040 for (auto &Entry : AllocationCallToContextNodeMap) { 1041 auto *Node = Entry.second; 1042 // Update ids on the allocation nodes before calling the recursive 1043 // update along caller edges, since this simplifies the logic during 1044 // that traversal. 1045 DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds); 1046 Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end()); 1047 UpdateCallers(Node, Visited, UpdateCallers); 1048 } 1049 } 1050 1051 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1052 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode( 1053 ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) { 1054 // Make a copy of the context ids, since this will be adjusted below as they 1055 // are moved. 1056 DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds; 1057 auto &OrigEdges = 1058 TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges; 1059 // Increment iterator in loop so that we can remove edges as needed. 1060 for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) { 1061 auto Edge = *EI; 1062 // Remove any matching context ids from Edge, return set that were found and 1063 // removed, these are the new edge's context ids. Also update the remaining 1064 // (not found ids). 1065 DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds; 1066 set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds, 1067 NotFoundContextIds); 1068 RemainingContextIds.swap(NotFoundContextIds); 1069 // If no matching context ids for this edge, skip it. 1070 if (NewEdgeContextIds.empty()) { 1071 ++EI; 1072 continue; 1073 } 1074 if (TowardsCallee) { 1075 auto NewEdge = std::make_shared<ContextEdge>( 1076 Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds), 1077 NewEdgeContextIds); 1078 NewNode->CalleeEdges.push_back(NewEdge); 1079 NewEdge->Callee->CallerEdges.push_back(NewEdge); 1080 } else { 1081 auto NewEdge = std::make_shared<ContextEdge>( 1082 NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds), 1083 NewEdgeContextIds); 1084 NewNode->CallerEdges.push_back(NewEdge); 1085 NewEdge->Caller->CalleeEdges.push_back(NewEdge); 1086 } 1087 // Remove old edge if context ids empty. 
1088 if (Edge->getContextIds().empty()) { 1089 if (TowardsCallee) { 1090 Edge->Callee->eraseCallerEdge(Edge.get()); 1091 EI = OrigNode->CalleeEdges.erase(EI); 1092 } else { 1093 Edge->Caller->eraseCalleeEdge(Edge.get()); 1094 EI = OrigNode->CallerEdges.erase(EI); 1095 } 1096 continue; 1097 } 1098 ++EI; 1099 } 1100 } 1101 1102 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1103 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 1104 assignStackNodesPostOrder(ContextNode *Node, 1105 DenseSet<const ContextNode *> &Visited, 1106 DenseMap<uint64_t, std::vector<CallContextInfo>> 1107 &StackIdToMatchingCalls) { 1108 auto Inserted = Visited.insert(Node); 1109 if (!Inserted.second) 1110 return; 1111 // Post order traversal. Iterate over a copy since we may add nodes and 1112 // therefore new callers during the recursive call, invalidating any 1113 // iterator over the original edge vector. We don't need to process these 1114 // new nodes as they were already processed on creation. 1115 auto CallerEdges = Node->CallerEdges; 1116 for (auto &Edge : CallerEdges) { 1117 // Skip any that have been removed during the recursion. 1118 if (!Edge) 1119 continue; 1120 assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls); 1121 } 1122 1123 // If this node's stack id is in the map, update the graph to contain new 1124 // nodes representing any inlining at interior callsites. Note we move the 1125 // associated context ids over to the new nodes. 1126 1127 // Ignore this node if it is for an allocation or we didn't record any 1128 // stack id lists ending at it. 1129 if (Node->IsAllocation || 1130 !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId)) 1131 return; 1132 1133 auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId]; 1134 // Handle the simple case first. A single call with a single stack id. 1135 // In this case there is no need to create any new context nodes, simply 1136 // assign the context node for stack id to this Call. 1137 if (Calls.size() == 1) { 1138 auto &[Call, Ids, Func, SavedContextIds] = Calls[0]; 1139 if (Ids.size() == 1) { 1140 assert(SavedContextIds.empty()); 1141 // It should be this Node 1142 assert(Node == getNodeForStackId(Ids[0])); 1143 if (Node->Recursive) 1144 return; 1145 Node->setCall(Call); 1146 NonAllocationCallToContextNodeMap[Call] = Node; 1147 NodeToCallingFunc[Node] = Func; 1148 return; 1149 } 1150 } 1151 1152 // Find the node for the last stack id, which should be the same 1153 // across all calls recorded for this id, and is this node's id. 1154 uint64_t LastId = Node->OrigStackOrAllocId; 1155 ContextNode *LastNode = getNodeForStackId(LastId); 1156 // We should only have kept stack ids that had nodes. 1157 assert(LastNode); 1158 1159 for (unsigned I = 0; I < Calls.size(); I++) { 1160 auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; 1161 // Skip any for which we didn't assign any ids, these don't get a node in 1162 // the graph. 1163 if (SavedContextIds.empty()) 1164 continue; 1165 1166 assert(LastId == Ids.back()); 1167 1168 ContextNode *FirstNode = getNodeForStackId(Ids[0]); 1169 assert(FirstNode); 1170 1171 // Recompute the context ids for this stack id sequence (the 1172 // intersection of the context ids of the corresponding nodes). 1173 // Start with the ids we saved in the map for this call, which could be 1174 // duplicated context ids. 
We have to recompute as we might have overlap 1175 // overlap between the saved context ids for different last nodes, and 1176 // removed them already during the post order traversal. 1177 set_intersect(SavedContextIds, FirstNode->ContextIds); 1178 ContextNode *PrevNode = nullptr; 1179 for (auto Id : Ids) { 1180 ContextNode *CurNode = getNodeForStackId(Id); 1181 // We should only have kept stack ids that had nodes and weren't 1182 // recursive. 1183 assert(CurNode); 1184 assert(!CurNode->Recursive); 1185 if (!PrevNode) { 1186 PrevNode = CurNode; 1187 continue; 1188 } 1189 auto *Edge = CurNode->findEdgeFromCallee(PrevNode); 1190 if (!Edge) { 1191 SavedContextIds.clear(); 1192 break; 1193 } 1194 PrevNode = CurNode; 1195 set_intersect(SavedContextIds, Edge->getContextIds()); 1196 1197 // If we now have no context ids for clone, skip this call. 1198 if (SavedContextIds.empty()) 1199 break; 1200 } 1201 if (SavedContextIds.empty()) 1202 continue; 1203 1204 // Create new context node. 1205 NodeOwner.push_back( 1206 std::make_unique<ContextNode>(/*IsAllocation=*/false, Call)); 1207 ContextNode *NewNode = NodeOwner.back().get(); 1208 NodeToCallingFunc[NewNode] = Func; 1209 NonAllocationCallToContextNodeMap[Call] = NewNode; 1210 NewNode->ContextIds = SavedContextIds; 1211 NewNode->AllocTypes = computeAllocType(NewNode->ContextIds); 1212 1213 // Connect to callees of innermost stack frame in inlined call chain. 1214 // This updates context ids for FirstNode's callee's to reflect those 1215 // moved to NewNode. 1216 connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true); 1217 1218 // Connect to callers of outermost stack frame in inlined call chain. 1219 // This updates context ids for FirstNode's caller's to reflect those 1220 // moved to NewNode. 1221 connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false); 1222 1223 // Now we need to remove context ids from edges/nodes between First and 1224 // Last Node. 1225 PrevNode = nullptr; 1226 for (auto Id : Ids) { 1227 ContextNode *CurNode = getNodeForStackId(Id); 1228 // We should only have kept stack ids that had nodes. 1229 assert(CurNode); 1230 1231 // Remove the context ids moved to NewNode from CurNode, and the 1232 // edge from the prior node. 1233 set_subtract(CurNode->ContextIds, NewNode->ContextIds); 1234 if (PrevNode) { 1235 auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode); 1236 assert(PrevEdge); 1237 set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds); 1238 if (PrevEdge->getContextIds().empty()) { 1239 PrevNode->eraseCallerEdge(PrevEdge); 1240 CurNode->eraseCalleeEdge(PrevEdge); 1241 } 1242 } 1243 PrevNode = CurNode; 1244 } 1245 } 1246 } 1247 1248 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1249 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() { 1250 // Map of stack id to all calls with that as the last (outermost caller) 1251 // callsite id that has a context node (some might not due to pruning 1252 // performed during matching of the allocation profile contexts). 1253 // The CallContextInfo contains the Call and a list of its stack ids with 1254 // ContextNodes, the function containing Call, and the set of context ids 1255 // the analysis will eventually identify for use in any new node created 1256 // for that callsite. 1257 DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls; 1258 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { 1259 for (auto &Call : CallsWithMetadata) { 1260 // Ignore allocations, already handled. 
1261 if (AllocationCallToContextNodeMap.count(Call)) 1262 continue; 1263 auto StackIdsWithContextNodes = 1264 getStackIdsWithContextNodesForCall(Call.call()); 1265 // If there were no nodes created for MIBs on allocs (maybe this was in 1266 // the unambiguous part of the MIB stack that was pruned), ignore. 1267 if (StackIdsWithContextNodes.empty()) 1268 continue; 1269 // Otherwise, record this Call along with the list of ids for the last 1270 // (outermost caller) stack id with a node. 1271 StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back( 1272 {Call.call(), StackIdsWithContextNodes, Func, {}}); 1273 } 1274 } 1275 1276 // First make a pass through all stack ids that correspond to a call, 1277 // as identified in the above loop. Compute the context ids corresponding to 1278 // each of these calls when they correspond to multiple stack ids due to 1279 // due to inlining. Perform any duplication of context ids required when 1280 // there is more than one call with the same stack ids. Their (possibly newly 1281 // duplicated) context ids are saved in the StackIdToMatchingCalls map. 1282 DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds; 1283 for (auto &It : StackIdToMatchingCalls) { 1284 auto &Calls = It.getSecond(); 1285 // Skip single calls with a single stack id. These don't need a new node. 1286 if (Calls.size() == 1) { 1287 auto &Ids = std::get<1>(Calls[0]); 1288 if (Ids.size() == 1) 1289 continue; 1290 } 1291 // In order to do the best and maximal matching of inlined calls to context 1292 // node sequences we will sort the vectors of stack ids in descending order 1293 // of length, and within each length, lexicographically by stack id. The 1294 // latter is so that we can specially handle calls that have identical stack 1295 // id sequences (either due to cloning or artificially because of the MIB 1296 // context pruning). 1297 std::stable_sort(Calls.begin(), Calls.end(), 1298 [](const CallContextInfo &A, const CallContextInfo &B) { 1299 auto &IdsA = std::get<1>(A); 1300 auto &IdsB = std::get<1>(B); 1301 return IdsA.size() > IdsB.size() || 1302 (IdsA.size() == IdsB.size() && IdsA < IdsB); 1303 }); 1304 1305 // Find the node for the last stack id, which should be the same 1306 // across all calls recorded for this id, and is the id for this 1307 // entry in the StackIdToMatchingCalls map. 1308 uint64_t LastId = It.getFirst(); 1309 ContextNode *LastNode = getNodeForStackId(LastId); 1310 // We should only have kept stack ids that had nodes. 1311 assert(LastNode); 1312 1313 if (LastNode->Recursive) 1314 continue; 1315 1316 // Initialize the context ids with the last node's. We will subsequently 1317 // refine the context ids by computing the intersection along all edges. 1318 DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds; 1319 assert(!LastNodeContextIds.empty()); 1320 1321 for (unsigned I = 0; I < Calls.size(); I++) { 1322 auto &[Call, Ids, Func, SavedContextIds] = Calls[I]; 1323 assert(SavedContextIds.empty()); 1324 assert(LastId == Ids.back()); 1325 1326 // First compute the context ids for this stack id sequence (the 1327 // intersection of the context ids of the corresponding nodes). 1328 // Start with the remaining saved ids for the last node. 
1329 assert(!LastNodeContextIds.empty()); 1330 DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds; 1331 1332 ContextNode *PrevNode = LastNode; 1333 ContextNode *CurNode = LastNode; 1334 bool Skip = false; 1335 1336 // Iterate backwards through the stack Ids, starting after the last Id 1337 // in the list, which was handled once outside for all Calls. 1338 for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) { 1339 auto Id = *IdIter; 1340 CurNode = getNodeForStackId(Id); 1341 // We should only have kept stack ids that had nodes. 1342 assert(CurNode); 1343 1344 if (CurNode->Recursive) { 1345 Skip = true; 1346 break; 1347 } 1348 1349 auto *Edge = CurNode->findEdgeFromCaller(PrevNode); 1350 // If there is no edge then the nodes belong to different MIB contexts, 1351 // and we should skip this inlined context sequence. For example, this 1352 // particular inlined context may include stack ids A->B, and we may 1353 // indeed have nodes for both A and B, but it is possible that they were 1354 // never profiled in sequence in a single MIB for any allocation (i.e. 1355 // we might have profiled an allocation that involves the callsite A, 1356 // but through a different one of its callee callsites, and we might 1357 // have profiled an allocation that involves callsite B, but reached 1358 // from a different caller callsite). 1359 if (!Edge) { 1360 Skip = true; 1361 break; 1362 } 1363 PrevNode = CurNode; 1364 1365 // Update the context ids, which is the intersection of the ids along 1366 // all edges in the sequence. 1367 set_intersect(StackSequenceContextIds, Edge->getContextIds()); 1368 1369 // If we now have no context ids for clone, skip this call. 1370 if (StackSequenceContextIds.empty()) { 1371 Skip = true; 1372 break; 1373 } 1374 } 1375 if (Skip) 1376 continue; 1377 1378 // If some of this call's stack ids did not have corresponding nodes (due 1379 // to pruning), don't include any context ids for contexts that extend 1380 // beyond these nodes. Otherwise we would be matching part of unrelated / 1381 // not fully matching stack contexts. To do this, subtract any context ids 1382 // found in caller nodes of the last node found above. 1383 if (Ids.back() != getLastStackId(Call)) { 1384 for (const auto &PE : CurNode->CallerEdges) { 1385 set_subtract(StackSequenceContextIds, PE->getContextIds()); 1386 if (StackSequenceContextIds.empty()) 1387 break; 1388 } 1389 // If we now have no context ids for clone, skip this call. 1390 if (StackSequenceContextIds.empty()) 1391 continue; 1392 } 1393 1394 // Check if the next set of stack ids is the same (since the Calls vector 1395 // of tuples is sorted by the stack ids we can just look at the next one). 1396 bool DuplicateContextIds = false; 1397 if (I + 1 < Calls.size()) { 1398 auto NextIds = std::get<1>(Calls[I + 1]); 1399 DuplicateContextIds = Ids == NextIds; 1400 } 1401 1402 // If we don't have duplicate context ids, then we can assign all the 1403 // context ids computed for the original node sequence to this call. 1404 // If there are duplicate calls with the same stack ids then we synthesize 1405 // new context ids that are duplicates of the originals. These are 1406 // assigned to SavedContextIds, which is a reference into the map entry 1407 // for this call, allowing us to access these ids later on. 1408 OldToNewContextIds.reserve(OldToNewContextIds.size() + 1409 StackSequenceContextIds.size()); 1410 SavedContextIds = 1411 DuplicateContextIds 1412 ? 
duplicateContextIds(StackSequenceContextIds, OldToNewContextIds) 1413 : StackSequenceContextIds; 1414 assert(!SavedContextIds.empty()); 1415 1416 if (!DuplicateContextIds) { 1417 // Update saved last node's context ids to remove those that are 1418 // assigned to other calls, so that it is ready for the next call at 1419 // this stack id. 1420 set_subtract(LastNodeContextIds, StackSequenceContextIds); 1421 if (LastNodeContextIds.empty()) 1422 break; 1423 } 1424 } 1425 } 1426 1427 // Propagate the duplicate context ids over the graph. 1428 propagateDuplicateContextIds(OldToNewContextIds); 1429 1430 if (VerifyCCG) 1431 check(); 1432 1433 // Now perform a post-order traversal over the graph, starting with the 1434 // allocation nodes, essentially processing nodes from callers to callees. 1435 // For any that contains an id in the map, update the graph to contain new 1436 // nodes representing any inlining at interior callsites. Note we move the 1437 // associated context ids over to the new nodes. 1438 DenseSet<const ContextNode *> Visited; 1439 for (auto &Entry : AllocationCallToContextNodeMap) 1440 assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls); 1441 } 1442 1443 uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) { 1444 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 1445 Call->getMetadata(LLVMContext::MD_callsite)); 1446 return CallsiteContext.back(); 1447 } 1448 1449 uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) { 1450 assert(isa<CallsiteInfo *>(Call.getBase())); 1451 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> 1452 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase())); 1453 // Need to convert index into stack id. 1454 return Index.getStackIdAtIndex(CallsiteContext.back()); 1455 } 1456 1457 static const std::string MemProfCloneSuffix = ".memprof."; 1458 1459 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { 1460 // We use CloneNo == 0 to refer to the original version, which doesn't get 1461 // renamed with a suffix. 
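// For example (illustrative only): getMemProfFuncName("foo", 0) returns "foo",
// while getMemProfFuncName("foo", 2) returns "foo.memprof.2".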
1462 if (!CloneNo) 1463 return Base.str(); 1464 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); 1465 } 1466 1467 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, 1468 const Instruction *Call, 1469 unsigned CloneNo) const { 1470 return (Twine(Call->getFunction()->getName()) + " -> " + 1471 cast<CallBase>(Call)->getCalledFunction()->getName()) 1472 .str(); 1473 } 1474 1475 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, 1476 const IndexCall &Call, 1477 unsigned CloneNo) const { 1478 auto VI = FSToVIMap.find(Func); 1479 assert(VI != FSToVIMap.end()); 1480 if (isa<AllocInfo *>(Call.getBase())) 1481 return (VI->second.name() + " -> alloc").str(); 1482 else { 1483 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase()); 1484 return (VI->second.name() + " -> " + 1485 getMemProfFuncName(Callsite->Callee.name(), 1486 Callsite->Clones[CloneNo])) 1487 .str(); 1488 } 1489 } 1490 1491 std::vector<uint64_t> 1492 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( 1493 Instruction *Call) { 1494 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 1495 Call->getMetadata(LLVMContext::MD_callsite)); 1496 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>( 1497 CallsiteContext); 1498 } 1499 1500 std::vector<uint64_t> 1501 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { 1502 assert(isa<CallsiteInfo *>(Call.getBase())); 1503 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> 1504 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase())); 1505 return getStackIdsWithContextNodes<CallsiteInfo, 1506 SmallVector<unsigned>::const_iterator>( 1507 CallsiteContext); 1508 } 1509 1510 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1511 template <class NodeT, class IteratorT> 1512 std::vector<uint64_t> 1513 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes( 1514 CallStack<NodeT, IteratorT> &CallsiteContext) { 1515 std::vector<uint64_t> StackIds; 1516 for (auto IdOrIndex : CallsiteContext) { 1517 auto StackId = getStackId(IdOrIndex); 1518 ContextNode *Node = getNodeForStackId(StackId); 1519 if (!Node) 1520 break; 1521 StackIds.push_back(StackId); 1522 } 1523 return StackIds; 1524 } 1525 1526 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( 1527 Module &M, 1528 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) 1529 : Mod(M), OREGetter(OREGetter) { 1530 for (auto &F : M) { 1531 std::vector<CallInfo> CallsWithMetadata; 1532 for (auto &BB : F) { 1533 for (auto &I : BB) { 1534 if (!isa<CallBase>(I)) 1535 continue; 1536 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) { 1537 CallsWithMetadata.push_back(&I); 1538 auto *AllocNode = addAllocNode(&I, &F); 1539 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite); 1540 assert(CallsiteMD); 1541 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD); 1542 // Add all of the MIBs and their stack nodes. 1543 for (auto &MDOp : MemProfMD->operands()) { 1544 auto *MIBMD = cast<const MDNode>(MDOp); 1545 MDNode *StackNode = getMIBStackNode(MIBMD); 1546 assert(StackNode); 1547 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode); 1548 addStackNodesForMIB<MDNode, MDNode::op_iterator>( 1549 AllocNode, StackContext, CallsiteContext, 1550 getMIBAllocType(MIBMD)); 1551 } 1552 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); 1553 // Memprof and callsite metadata on memory allocations no longer 1554 // needed. 
1555 I.setMetadata(LLVMContext::MD_memprof, nullptr); 1556 I.setMetadata(LLVMContext::MD_callsite, nullptr); 1557 } 1558 // For callsite metadata, add to list for this function for later use. 1559 else if (I.getMetadata(LLVMContext::MD_callsite)) 1560 CallsWithMetadata.push_back(&I); 1561 } 1562 } 1563 if (!CallsWithMetadata.empty()) 1564 FuncToCallsWithMetadata[&F] = CallsWithMetadata; 1565 } 1566 1567 if (DumpCCG) { 1568 dbgs() << "CCG before updating call stack chains:\n"; 1569 dbgs() << *this; 1570 } 1571 1572 if (ExportToDot) 1573 exportToDot("prestackupdate"); 1574 1575 updateStackNodes(); 1576 1577 handleCallsitesWithMultipleTargets(); 1578 1579 // Strip off remaining callsite metadata, no longer needed. 1580 for (auto &FuncEntry : FuncToCallsWithMetadata) 1581 for (auto &Call : FuncEntry.second) 1582 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr); 1583 } 1584 1585 IndexCallsiteContextGraph::IndexCallsiteContextGraph( 1586 ModuleSummaryIndex &Index, 1587 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 1588 isPrevailing) 1589 : Index(Index), isPrevailing(isPrevailing) { 1590 for (auto &I : Index) { 1591 auto VI = Index.getValueInfo(I); 1592 for (auto &S : VI.getSummaryList()) { 1593 // We should only add the prevailing nodes. Otherwise we may try to clone 1594 // in a weak copy that won't be linked (and may be different than the 1595 // prevailing version). 1596 // We only keep the memprof summary on the prevailing copy now when 1597 // building the combined index, as a space optimization, however don't 1598 // rely on this optimization. The linker doesn't resolve local linkage 1599 // values so don't check whether those are prevailing. 1600 if (!GlobalValue::isLocalLinkage(S->linkage()) && 1601 !isPrevailing(VI.getGUID(), S.get())) 1602 continue; 1603 auto *FS = dyn_cast<FunctionSummary>(S.get()); 1604 if (!FS) 1605 continue; 1606 std::vector<CallInfo> CallsWithMetadata; 1607 if (!FS->allocs().empty()) { 1608 for (auto &AN : FS->mutableAllocs()) { 1609 // This can happen because of recursion elimination handling that 1610 // currently exists in ModuleSummaryAnalysis. Skip these for now. 1611 // We still added them to the summary because we need to be able to 1612 // correlate properly in applyImport in the backends. 1613 if (AN.MIBs.empty()) 1614 continue; 1615 CallsWithMetadata.push_back({&AN}); 1616 auto *AllocNode = addAllocNode({&AN}, FS); 1617 // Pass an empty CallStack to the CallsiteContext (second) 1618 // parameter, since for ThinLTO we already collapsed out the inlined 1619 // stack ids on the allocation call during ModuleSummaryAnalysis. 1620 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> 1621 EmptyContext; 1622 // Now add all of the MIBs and their stack nodes. 1623 for (auto &MIB : AN.MIBs) { 1624 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> 1625 StackContext(&MIB); 1626 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( 1627 AllocNode, StackContext, EmptyContext, MIB.AllocType); 1628 } 1629 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); 1630 // Initialize version 0 on the summary alloc node to the current alloc 1631 // type, unless it has both types in which case make it default, so 1632 // that in the case where we aren't able to clone the original version 1633 // always ends up with the default allocation behavior. 1634 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes); 1635 } 1636 } 1637 // For callsite metadata, add to list for this function for later use. 
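      // (These callsite records are matched onto the graph's stack nodes by
      // the call to updateStackNodes() below.)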
1638       if (!FS->callsites().empty())
1639         for (auto &SN : FS->mutableCallsites())
1640           CallsWithMetadata.push_back({&SN});
1641 
1642       if (!CallsWithMetadata.empty())
1643         FuncToCallsWithMetadata[FS] = CallsWithMetadata;
1644 
1645       if (!FS->allocs().empty() || !FS->callsites().empty())
1646         FSToVIMap[FS] = VI;
1647     }
1648   }
1649 
1650   if (DumpCCG) {
1651     dbgs() << "CCG before updating call stack chains:\n";
1652     dbgs() << *this;
1653   }
1654 
1655   if (ExportToDot)
1656     exportToDot("prestackupdate");
1657 
1658   updateStackNodes();
1659 
1660   handleCallsitesWithMultipleTargets();
1661 }
1662 
1663 template <typename DerivedCCG, typename FuncTy, typename CallTy>
1664 void CallsiteContextGraph<DerivedCCG, FuncTy,
1665                           CallTy>::handleCallsitesWithMultipleTargets() {
1666   // Look for and work around callsites that call multiple functions.
1667   // This can happen for indirect calls, which need better handling, and in
1668   // rarer cases (e.g. macro expansion).
1669   // TODO: To fix this for indirect calls we will want to perform speculative
1670   // devirtualization using either the normal PGO info with ICP, or using the
1671   // information in the profiled MemProf contexts. We can do this prior to
1672   // this transformation for regular LTO, and for ThinLTO we can simulate that
1673   // effect in the summary and perform the actual speculative devirtualization
1674   // while cloning in the ThinLTO backend.
1675 
1676   // Keep track of the new nodes synthesized for discovered tail calls missing
1677   // from the profiled contexts.
1678   MapVector<CallInfo, ContextNode *> TailCallToContextNodeMap;
1679 
1680   for (auto Entry = NonAllocationCallToContextNodeMap.begin();
1681        Entry != NonAllocationCallToContextNodeMap.end();) {
1682     auto *Node = Entry->second;
1683     assert(Node->Clones.empty());
1684     // Check all node callees and see if they are in the same function.
1685     bool Removed = false;
1686     auto Call = Node->Call.call();
1687     for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
1688       auto Edge = *EI;
1689       if (!Edge->Callee->hasCall()) {
1690         ++EI;
1691         continue;
1692       }
1693       assert(NodeToCallingFunc.count(Edge->Callee));
1694       // Check if the called function matches that of the callee node.
1695       if (calleesMatch(Call, EI, TailCallToContextNodeMap))
1696         continue;
1697       RemovedEdgesWithMismatchedCallees++;
1698       // Work around by setting Node to have a null call, so it gets
1699       // skipped during cloning. Otherwise assignFunctions will assert
1700       // because its data structures are not designed to handle this case.
1701       Entry = NonAllocationCallToContextNodeMap.erase(Entry);
1702       Node->setCall(CallInfo());
1703       Removed = true;
1704       break;
1705     }
1706     if (!Removed)
1707       Entry++;
1708   }
1709 
1710   // Add the new nodes after the above loop so that the iteration is not
1711   // invalidated.
1712   for (auto &[Call, Node] : TailCallToContextNodeMap)
1713     NonAllocationCallToContextNodeMap[Call] = Node;
1714 }
1715 
1716 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1717   // In the Module (IR) case this is already the Id.
1718   return IdOrIndex;
1719 }
1720 
1721 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const {
1722   // In the Index case this is an index into the stack id list in the summary
1723   // index, convert it to an Id.
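  // For example (hypothetical values): if the summary's stack id list were
  // [0x1234, 0x5678], an IdOrIndex of 1 would map to stack id 0x5678.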
1724 return Index.getStackIdAtIndex(IdOrIndex); 1725 } 1726 1727 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1728 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::calleesMatch( 1729 CallTy Call, EdgeIter &EI, 1730 MapVector<CallInfo, ContextNode *> &TailCallToContextNodeMap) { 1731 auto Edge = *EI; 1732 const FuncTy *ProfiledCalleeFunc = NodeToCallingFunc[Edge->Callee]; 1733 const FuncTy *CallerFunc = NodeToCallingFunc[Edge->Caller]; 1734 // Will be populated in order of callee to caller if we find a chain of tail 1735 // calls between the profiled caller and callee. 1736 std::vector<std::pair<CallTy, FuncTy *>> FoundCalleeChain; 1737 if (!calleeMatchesFunc(Call, ProfiledCalleeFunc, CallerFunc, 1738 FoundCalleeChain)) { 1739 ++EI; 1740 return false; 1741 } 1742 1743 // The usual case where the profiled callee matches that of the IR/summary. 1744 if (FoundCalleeChain.empty()) { 1745 ++EI; 1746 return true; 1747 } 1748 1749 auto AddEdge = [Edge, &EI](ContextNode *Caller, ContextNode *Callee) { 1750 auto *CurEdge = Callee->findEdgeFromCaller(Caller); 1751 // If there is already an edge between these nodes, simply update it and 1752 // return. 1753 if (CurEdge) { 1754 CurEdge->ContextIds.insert(Edge->ContextIds.begin(), 1755 Edge->ContextIds.end()); 1756 CurEdge->AllocTypes |= Edge->AllocTypes; 1757 return; 1758 } 1759 // Otherwise, create a new edge and insert it into the caller and callee 1760 // lists. 1761 auto NewEdge = std::make_shared<ContextEdge>( 1762 Callee, Caller, Edge->AllocTypes, Edge->ContextIds); 1763 Callee->CallerEdges.push_back(NewEdge); 1764 if (Caller == Edge->Caller) { 1765 // If we are inserting the new edge into the current edge's caller, insert 1766 // the new edge before the current iterator position, and then increment 1767 // back to the current edge. 1768 EI = Caller->CalleeEdges.insert(EI, NewEdge); 1769 ++EI; 1770 assert(*EI == Edge && 1771 "Iterator position not restored after insert and increment"); 1772 } else 1773 Caller->CalleeEdges.push_back(NewEdge); 1774 }; 1775 1776 // Create new nodes for each found callee and connect in between the profiled 1777 // caller and callee. 1778 auto *CurCalleeNode = Edge->Callee; 1779 for (auto &[NewCall, Func] : FoundCalleeChain) { 1780 ContextNode *NewNode = nullptr; 1781 // First check if we have already synthesized a node for this tail call. 1782 if (TailCallToContextNodeMap.count(NewCall)) { 1783 NewNode = TailCallToContextNodeMap[NewCall]; 1784 NewNode->ContextIds.insert(Edge->ContextIds.begin(), 1785 Edge->ContextIds.end()); 1786 NewNode->AllocTypes |= Edge->AllocTypes; 1787 } else { 1788 FuncToCallsWithMetadata[Func].push_back({NewCall}); 1789 // Create Node and record node info. 1790 NodeOwner.push_back( 1791 std::make_unique<ContextNode>(/*IsAllocation=*/false, NewCall)); 1792 NewNode = NodeOwner.back().get(); 1793 NodeToCallingFunc[NewNode] = Func; 1794 TailCallToContextNodeMap[NewCall] = NewNode; 1795 NewNode->ContextIds = Edge->ContextIds; 1796 NewNode->AllocTypes = Edge->AllocTypes; 1797 } 1798 1799 // Hook up node to its callee node 1800 AddEdge(NewNode, CurCalleeNode); 1801 1802 CurCalleeNode = NewNode; 1803 } 1804 1805 // Hook up edge's original caller to new callee node. 
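  // Illustrative example: if the profiled context had an edge A -> C, but in
  // the IR/summary A calls B and B tail-calls C, the loop above synthesized a
  // node for B's tail call and hooked it up as a caller of C; the call below
  // adds the A -> B edge, completing the chain A -> B -> C before the original
  // A -> C edge is removed.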
1806 AddEdge(Edge->Caller, CurCalleeNode); 1807 1808 // Remove old edge 1809 Edge->Callee->eraseCallerEdge(Edge.get()); 1810 EI = Edge->Caller->CalleeEdges.erase(EI); 1811 1812 return true; 1813 } 1814 1815 bool ModuleCallsiteContextGraph::findProfiledCalleeThroughTailCalls( 1816 const Function *ProfiledCallee, Value *CurCallee, unsigned Depth, 1817 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain, 1818 bool &FoundMultipleCalleeChains) { 1819 // Stop recursive search if we have already explored the maximum specified 1820 // depth. 1821 if (Depth > TailCallSearchDepth) 1822 return false; 1823 1824 auto SaveCallsiteInfo = [&](Instruction *Callsite, Function *F) { 1825 FoundCalleeChain.push_back({Callsite, F}); 1826 }; 1827 1828 auto *CalleeFunc = dyn_cast<Function>(CurCallee); 1829 if (!CalleeFunc) { 1830 auto *Alias = dyn_cast<GlobalAlias>(CurCallee); 1831 assert(Alias); 1832 CalleeFunc = dyn_cast<Function>(Alias->getAliasee()); 1833 assert(CalleeFunc); 1834 } 1835 1836 // Look for tail calls in this function, and check if they either call the 1837 // profiled callee directly, or indirectly (via a recursive search). 1838 // Only succeed if there is a single unique tail call chain found between the 1839 // profiled caller and callee, otherwise we could perform incorrect cloning. 1840 bool FoundSingleCalleeChain = false; 1841 for (auto &BB : *CalleeFunc) { 1842 for (auto &I : BB) { 1843 auto *CB = dyn_cast<CallBase>(&I); 1844 if (!CB || !CB->isTailCall()) 1845 continue; 1846 auto *CalledValue = CB->getCalledOperand(); 1847 auto *CalledFunction = CB->getCalledFunction(); 1848 if (CalledValue && !CalledFunction) { 1849 CalledValue = CalledValue->stripPointerCasts(); 1850 // Stripping pointer casts can reveal a called function. 1851 CalledFunction = dyn_cast<Function>(CalledValue); 1852 } 1853 // Check if this is an alias to a function. If so, get the 1854 // called aliasee for the checks below. 
1855 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) { 1856 assert(!CalledFunction && 1857 "Expected null called function in callsite for alias"); 1858 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject()); 1859 } 1860 if (!CalledFunction) 1861 continue; 1862 if (CalledFunction == ProfiledCallee) { 1863 if (FoundSingleCalleeChain) { 1864 FoundMultipleCalleeChains = true; 1865 return false; 1866 } 1867 FoundSingleCalleeChain = true; 1868 FoundProfiledCalleeCount++; 1869 FoundProfiledCalleeDepth += Depth; 1870 if (Depth > FoundProfiledCalleeMaxDepth) 1871 FoundProfiledCalleeMaxDepth = Depth; 1872 SaveCallsiteInfo(&I, CalleeFunc); 1873 } else if (findProfiledCalleeThroughTailCalls( 1874 ProfiledCallee, CalledFunction, Depth + 1, 1875 FoundCalleeChain, FoundMultipleCalleeChains)) { 1876 if (FoundMultipleCalleeChains) 1877 return false; 1878 if (FoundSingleCalleeChain) { 1879 FoundMultipleCalleeChains = true; 1880 return false; 1881 } 1882 FoundSingleCalleeChain = true; 1883 SaveCallsiteInfo(&I, CalleeFunc); 1884 } 1885 } 1886 } 1887 1888 return FoundSingleCalleeChain; 1889 } 1890 1891 bool ModuleCallsiteContextGraph::calleeMatchesFunc( 1892 Instruction *Call, const Function *Func, const Function *CallerFunc, 1893 std::vector<std::pair<Instruction *, Function *>> &FoundCalleeChain) { 1894 auto *CB = dyn_cast<CallBase>(Call); 1895 if (!CB->getCalledOperand()) 1896 return false; 1897 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); 1898 auto *CalleeFunc = dyn_cast<Function>(CalleeVal); 1899 if (CalleeFunc == Func) 1900 return true; 1901 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal); 1902 if (Alias && Alias->getAliasee() == Func) 1903 return true; 1904 1905 // Recursively search for the profiled callee through tail calls starting with 1906 // the actual Callee. The discovered tail call chain is saved in 1907 // FoundCalleeChain, and we will fixup the graph to include these callsites 1908 // after returning. 1909 // FIXME: We will currently redo the same recursive walk if we find the same 1910 // mismatched callee from another callsite. We can improve this with more 1911 // bookkeeping of the created chain of new nodes for each mismatch. 1912 unsigned Depth = 1; 1913 bool FoundMultipleCalleeChains = false; 1914 if (!findProfiledCalleeThroughTailCalls(Func, CalleeVal, Depth, 1915 FoundCalleeChain, 1916 FoundMultipleCalleeChains)) { 1917 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " 1918 << Func->getName() << " from " << CallerFunc->getName() 1919 << " that actually called " << CalleeVal->getName() 1920 << (FoundMultipleCalleeChains 1921 ? " (found multiple possible chains)" 1922 : "") 1923 << "\n"); 1924 if (FoundMultipleCalleeChains) 1925 FoundProfiledCalleeNonUniquelyCount++; 1926 return false; 1927 } 1928 1929 return true; 1930 } 1931 1932 bool IndexCallsiteContextGraph::findProfiledCalleeThroughTailCalls( 1933 ValueInfo ProfiledCallee, ValueInfo CurCallee, unsigned Depth, 1934 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain, 1935 bool &FoundMultipleCalleeChains) { 1936 // Stop recursive search if we have already explored the maximum specified 1937 // depth. 1938 if (Depth > TailCallSearchDepth) 1939 return false; 1940 1941 auto CreateAndSaveCallsiteInfo = [&](ValueInfo Callee, FunctionSummary *FS) { 1942 // Make a CallsiteInfo for each discovered callee, if one hasn't already 1943 // been synthesized. 
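    // Note: the synthesized CallsiteInfo objects are owned by the
    // FunctionCalleesToSynthesizedCallsiteInfos map, so the raw pointers saved
    // on FoundCalleeChain below remain valid while the graph is in use.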
1944 if (!FunctionCalleesToSynthesizedCallsiteInfos.count(FS) || 1945 !FunctionCalleesToSynthesizedCallsiteInfos[FS].count(Callee)) 1946 // StackIds is empty (we don't have debug info available in the index for 1947 // these callsites) 1948 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee] = 1949 std::make_unique<CallsiteInfo>(Callee, SmallVector<unsigned>()); 1950 CallsiteInfo *NewCallsiteInfo = 1951 FunctionCalleesToSynthesizedCallsiteInfos[FS][Callee].get(); 1952 FoundCalleeChain.push_back({NewCallsiteInfo, FS}); 1953 }; 1954 1955 // Look for tail calls in this function, and check if they either call the 1956 // profiled callee directly, or indirectly (via a recursive search). 1957 // Only succeed if there is a single unique tail call chain found between the 1958 // profiled caller and callee, otherwise we could perform incorrect cloning. 1959 bool FoundSingleCalleeChain = false; 1960 for (auto &S : CurCallee.getSummaryList()) { 1961 if (!GlobalValue::isLocalLinkage(S->linkage()) && 1962 !isPrevailing(CurCallee.getGUID(), S.get())) 1963 continue; 1964 auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject()); 1965 if (!FS) 1966 continue; 1967 auto FSVI = CurCallee; 1968 auto *AS = dyn_cast<AliasSummary>(S.get()); 1969 if (AS) 1970 FSVI = AS->getAliaseeVI(); 1971 for (auto &CallEdge : FS->calls()) { 1972 if (!CallEdge.second.hasTailCall()) 1973 continue; 1974 if (CallEdge.first == ProfiledCallee) { 1975 if (FoundSingleCalleeChain) { 1976 FoundMultipleCalleeChains = true; 1977 return false; 1978 } 1979 FoundSingleCalleeChain = true; 1980 FoundProfiledCalleeCount++; 1981 FoundProfiledCalleeDepth += Depth; 1982 if (Depth > FoundProfiledCalleeMaxDepth) 1983 FoundProfiledCalleeMaxDepth = Depth; 1984 CreateAndSaveCallsiteInfo(CallEdge.first, FS); 1985 // Add FS to FSToVIMap in case it isn't already there. 1986 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); 1987 FSToVIMap[FS] = FSVI; 1988 } else if (findProfiledCalleeThroughTailCalls( 1989 ProfiledCallee, CallEdge.first, Depth + 1, 1990 FoundCalleeChain, FoundMultipleCalleeChains)) { 1991 if (FoundMultipleCalleeChains) 1992 return false; 1993 if (FoundSingleCalleeChain) { 1994 FoundMultipleCalleeChains = true; 1995 return false; 1996 } 1997 FoundSingleCalleeChain = true; 1998 CreateAndSaveCallsiteInfo(CallEdge.first, FS); 1999 // Add FS to FSToVIMap in case it isn't already there. 2000 assert(!FSToVIMap.count(FS) || FSToVIMap[FS] == FSVI); 2001 FSToVIMap[FS] = FSVI; 2002 } 2003 } 2004 } 2005 2006 return FoundSingleCalleeChain; 2007 } 2008 2009 bool IndexCallsiteContextGraph::calleeMatchesFunc( 2010 IndexCall &Call, const FunctionSummary *Func, 2011 const FunctionSummary *CallerFunc, 2012 std::vector<std::pair<IndexCall, FunctionSummary *>> &FoundCalleeChain) { 2013 ValueInfo Callee = 2014 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee; 2015 // If there is no summary list then this is a call to an externally defined 2016 // symbol. 2017 AliasSummary *Alias = 2018 Callee.getSummaryList().empty() 2019 ? nullptr 2020 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get()); 2021 assert(FSToVIMap.count(Func)); 2022 auto FuncVI = FSToVIMap[Func]; 2023 if (Callee == FuncVI || 2024 // If callee is an alias, check the aliasee, since only function 2025 // summary base objects will contain the stack node summaries and thus 2026 // get a context node. 2027 (Alias && Alias->getAliaseeVI() == FuncVI)) 2028 return true; 2029 2030 // Recursively search for the profiled callee through tail calls starting with 2031 // the actual Callee. 
The discovered tail call chain is saved in 2032 // FoundCalleeChain, and we will fixup the graph to include these callsites 2033 // after returning. 2034 // FIXME: We will currently redo the same recursive walk if we find the same 2035 // mismatched callee from another callsite. We can improve this with more 2036 // bookkeeping of the created chain of new nodes for each mismatch. 2037 unsigned Depth = 1; 2038 bool FoundMultipleCalleeChains = false; 2039 if (!findProfiledCalleeThroughTailCalls( 2040 FuncVI, Callee, Depth, FoundCalleeChain, FoundMultipleCalleeChains)) { 2041 LLVM_DEBUG(dbgs() << "Not found through unique tail call chain: " << FuncVI 2042 << " from " << FSToVIMap[CallerFunc] 2043 << " that actually called " << Callee 2044 << (FoundMultipleCalleeChains 2045 ? " (found multiple possible chains)" 2046 : "") 2047 << "\n"); 2048 if (FoundMultipleCalleeChains) 2049 FoundProfiledCalleeNonUniquelyCount++; 2050 return false; 2051 } 2052 2053 return true; 2054 } 2055 2056 static std::string getAllocTypeString(uint8_t AllocTypes) { 2057 if (!AllocTypes) 2058 return "None"; 2059 std::string Str; 2060 if (AllocTypes & (uint8_t)AllocationType::NotCold) 2061 Str += "NotCold"; 2062 if (AllocTypes & (uint8_t)AllocationType::Cold) 2063 Str += "Cold"; 2064 return Str; 2065 } 2066 2067 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2068 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump() 2069 const { 2070 print(dbgs()); 2071 dbgs() << "\n"; 2072 } 2073 2074 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2075 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( 2076 raw_ostream &OS) const { 2077 OS << "Node " << this << "\n"; 2078 OS << "\t"; 2079 printCall(OS); 2080 if (Recursive) 2081 OS << " (recursive)"; 2082 OS << "\n"; 2083 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; 2084 OS << "\tContextIds:"; 2085 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 2086 std::sort(SortedIds.begin(), SortedIds.end()); 2087 for (auto Id : SortedIds) 2088 OS << " " << Id; 2089 OS << "\n"; 2090 OS << "\tCalleeEdges:\n"; 2091 for (auto &Edge : CalleeEdges) 2092 OS << "\t\t" << *Edge << "\n"; 2093 OS << "\tCallerEdges:\n"; 2094 for (auto &Edge : CallerEdges) 2095 OS << "\t\t" << *Edge << "\n"; 2096 if (!Clones.empty()) { 2097 OS << "\tClones: "; 2098 FieldSeparator FS; 2099 for (auto *Clone : Clones) 2100 OS << FS << Clone; 2101 OS << "\n"; 2102 } else if (CloneOf) { 2103 OS << "\tClone of " << CloneOf << "\n"; 2104 } 2105 } 2106 2107 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2108 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump() 2109 const { 2110 print(dbgs()); 2111 dbgs() << "\n"; 2112 } 2113 2114 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2115 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print( 2116 raw_ostream &OS) const { 2117 OS << "Edge from Callee " << Callee << " to Caller: " << Caller 2118 << " AllocTypes: " << getAllocTypeString(AllocTypes); 2119 OS << " ContextIds:"; 2120 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 2121 std::sort(SortedIds.begin(), SortedIds.end()); 2122 for (auto Id : SortedIds) 2123 OS << " " << Id; 2124 } 2125 2126 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2127 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const { 2128 print(dbgs()); 2129 } 2130 2131 template <typename DerivedCCG, typename FuncTy, 
typename CallTy> 2132 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( 2133 raw_ostream &OS) const { 2134 OS << "Callsite Context Graph:\n"; 2135 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2136 for (const auto Node : nodes<GraphType>(this)) { 2137 if (Node->isRemoved()) 2138 continue; 2139 Node->print(OS); 2140 OS << "\n"; 2141 } 2142 } 2143 2144 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2145 static void checkEdge( 2146 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { 2147 // Confirm that alloc type is not None and that we have at least one context 2148 // id. 2149 assert(Edge->AllocTypes != (uint8_t)AllocationType::None); 2150 assert(!Edge->ContextIds.empty()); 2151 } 2152 2153 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2154 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, 2155 bool CheckEdges = true) { 2156 if (Node->isRemoved()) 2157 return; 2158 // Node's context ids should be the union of both its callee and caller edge 2159 // context ids. 2160 if (Node->CallerEdges.size()) { 2161 auto EI = Node->CallerEdges.begin(); 2162 auto &FirstEdge = *EI; 2163 EI++; 2164 DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds); 2165 for (; EI != Node->CallerEdges.end(); EI++) { 2166 const auto &Edge = *EI; 2167 if (CheckEdges) 2168 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 2169 set_union(CallerEdgeContextIds, Edge->ContextIds); 2170 } 2171 // Node can have more context ids than callers if some contexts terminate at 2172 // node and some are longer. 2173 assert(Node->ContextIds == CallerEdgeContextIds || 2174 set_is_subset(CallerEdgeContextIds, Node->ContextIds)); 2175 } 2176 if (Node->CalleeEdges.size()) { 2177 auto EI = Node->CalleeEdges.begin(); 2178 auto &FirstEdge = *EI; 2179 EI++; 2180 DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds); 2181 for (; EI != Node->CalleeEdges.end(); EI++) { 2182 const auto &Edge = *EI; 2183 if (CheckEdges) 2184 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 2185 set_union(CalleeEdgeContextIds, Edge->ContextIds); 2186 } 2187 assert(Node->ContextIds == CalleeEdgeContextIds); 2188 } 2189 } 2190 2191 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2192 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { 2193 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2194 for (const auto Node : nodes<GraphType>(this)) { 2195 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); 2196 for (auto &Edge : Node->CallerEdges) 2197 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 2198 } 2199 } 2200 2201 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2202 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> { 2203 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2204 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *; 2205 2206 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>; 2207 static NodeRef getNode(const NodePtrTy &P) { return P.get(); } 2208 2209 using nodes_iterator = 2210 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator, 2211 decltype(&getNode)>; 2212 2213 static nodes_iterator nodes_begin(GraphType G) { 2214 return nodes_iterator(G->NodeOwner.begin(), &getNode); 2215 } 2216 2217 static nodes_iterator nodes_end(GraphType G) { 2218 return nodes_iterator(G->NodeOwner.end(), &getNode); 2219 } 2220 2221 static NodeRef 
getEntryNode(GraphType G) { 2222 return G->NodeOwner.begin()->get(); 2223 } 2224 2225 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>; 2226 static const ContextNode<DerivedCCG, FuncTy, CallTy> * 2227 GetCallee(const EdgePtrTy &P) { 2228 return P->Callee; 2229 } 2230 2231 using ChildIteratorType = 2232 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge< 2233 DerivedCCG, FuncTy, CallTy>>>::const_iterator, 2234 decltype(&GetCallee)>; 2235 2236 static ChildIteratorType child_begin(NodeRef N) { 2237 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); 2238 } 2239 2240 static ChildIteratorType child_end(NodeRef N) { 2241 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); 2242 } 2243 }; 2244 2245 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2246 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> 2247 : public DefaultDOTGraphTraits { 2248 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} 2249 2250 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 2251 using GTraits = GraphTraits<GraphType>; 2252 using NodeRef = typename GTraits::NodeRef; 2253 using ChildIteratorType = typename GTraits::ChildIteratorType; 2254 2255 static std::string getNodeLabel(NodeRef Node, GraphType G) { 2256 std::string LabelString = 2257 (Twine("OrigId: ") + (Node->IsAllocation ? "Alloc" : "") + 2258 Twine(Node->OrigStackOrAllocId)) 2259 .str(); 2260 LabelString += "\n"; 2261 if (Node->hasCall()) { 2262 auto Func = G->NodeToCallingFunc.find(Node); 2263 assert(Func != G->NodeToCallingFunc.end()); 2264 LabelString += 2265 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); 2266 } else { 2267 LabelString += "null call"; 2268 if (Node->Recursive) 2269 LabelString += " (recursive)"; 2270 else 2271 LabelString += " (external)"; 2272 } 2273 return LabelString; 2274 } 2275 2276 static std::string getNodeAttributes(NodeRef Node, GraphType) { 2277 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " + 2278 getContextIds(Node->ContextIds) + "\"") 2279 .str(); 2280 AttributeString += 2281 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str(); 2282 AttributeString += ",style=\"filled\""; 2283 if (Node->CloneOf) { 2284 AttributeString += ",color=\"blue\""; 2285 AttributeString += ",style=\"filled,bold,dashed\""; 2286 } else 2287 AttributeString += ",style=\"filled\""; 2288 return AttributeString; 2289 } 2290 2291 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, 2292 GraphType) { 2293 auto &Edge = *(ChildIter.getCurrent()); 2294 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" + 2295 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"") 2296 .str(); 2297 } 2298 2299 // Since the NodeOwners list includes nodes that are no longer connected to 2300 // the graph, skip them here. 
2301 static bool isNodeHidden(NodeRef Node, GraphType) { 2302 return Node->isRemoved(); 2303 } 2304 2305 private: 2306 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) { 2307 std::string IdString = "ContextIds:"; 2308 if (ContextIds.size() < 100) { 2309 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 2310 std::sort(SortedIds.begin(), SortedIds.end()); 2311 for (auto Id : SortedIds) 2312 IdString += (" " + Twine(Id)).str(); 2313 } else { 2314 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str(); 2315 } 2316 return IdString; 2317 } 2318 2319 static std::string getColor(uint8_t AllocTypes) { 2320 if (AllocTypes == (uint8_t)AllocationType::NotCold) 2321 // Color "brown1" actually looks like a lighter red. 2322 return "brown1"; 2323 if (AllocTypes == (uint8_t)AllocationType::Cold) 2324 return "cyan"; 2325 if (AllocTypes == 2326 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) 2327 // Lighter purple. 2328 return "mediumorchid1"; 2329 return "gray"; 2330 } 2331 2332 static std::string getNodeId(NodeRef Node) { 2333 std::stringstream SStream; 2334 SStream << std::hex << "N0x" << (unsigned long long)Node; 2335 std::string Result = SStream.str(); 2336 return Result; 2337 } 2338 }; 2339 2340 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2341 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot( 2342 std::string Label) const { 2343 WriteGraph(this, "", false, Label, 2344 DotFilePathPrefix + "ccg." + Label + ".dot"); 2345 } 2346 2347 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2348 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 2349 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone( 2350 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI) { 2351 ContextNode *Node = Edge->Callee; 2352 NodeOwner.push_back( 2353 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call)); 2354 ContextNode *Clone = NodeOwner.back().get(); 2355 Node->addClone(Clone); 2356 assert(NodeToCallingFunc.count(Node)); 2357 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node]; 2358 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true); 2359 return Clone; 2360 } 2361 2362 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2363 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 2364 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, 2365 ContextNode *NewCallee, EdgeIter *CallerEdgeI, 2366 bool NewClone) { 2367 // NewCallee and Edge's current callee must be clones of the same original 2368 // node (Edge's current callee may be the original node too). 2369 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); 2370 auto &EdgeContextIds = Edge->getContextIds(); 2371 ContextNode *OldCallee = Edge->Callee; 2372 if (CallerEdgeI) 2373 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI); 2374 else 2375 OldCallee->eraseCallerEdge(Edge.get()); 2376 Edge->Callee = NewCallee; 2377 NewCallee->CallerEdges.push_back(Edge); 2378 // Don't need to update Edge's context ids since we are simply reconnecting 2379 // it. 2380 set_subtract(OldCallee->ContextIds, EdgeContextIds); 2381 NewCallee->ContextIds.insert(EdgeContextIds.begin(), EdgeContextIds.end()); 2382 NewCallee->AllocTypes |= Edge->AllocTypes; 2383 OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds); 2384 // OldCallee alloc type should be None iff its context id set is now empty. 
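  // (Assuming computeAllocType simply ORs the allocation types of the given
  // context ids, an empty set yields None, which is what the assert below
  // relies on.)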
2385   assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) ==
2386          OldCallee->ContextIds.empty());
2387   // Now walk the old callee node's callee edges and move Edge's context ids
2388   // over to the corresponding edge into the clone (which is created here if
2389   // this is a newly created clone).
2390   for (auto &OldCalleeEdge : OldCallee->CalleeEdges) {
2391     // The context ids moving to the new callee are the intersection of this
2392     // edge's context ids with the context ids on the caller edge being moved.
2393     DenseSet<uint32_t> EdgeContextIdsToMove =
2394         set_intersection(OldCalleeEdge->getContextIds(), EdgeContextIds);
2395     set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove);
2396     OldCalleeEdge->AllocTypes =
2397         computeAllocType(OldCalleeEdge->getContextIds());
2398     if (!NewClone) {
2399       // Update context ids / alloc type on corresponding edge to NewCallee.
2400       // There is a chance this may not exist if we are reusing an existing
2401       // clone, specifically during function assignment, where we would have
2402       // removed none type edges after creating the clone. If we can't find
2403       // a corresponding edge there, fall through to the cloning below.
2404       if (auto *NewCalleeEdge =
2405               NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2406         NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2407                                               EdgeContextIdsToMove.end());
2408         NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2409         continue;
2410       }
2411     }
2412     auto NewEdge = std::make_shared<ContextEdge>(
2413         OldCalleeEdge->Callee, NewCallee,
2414         computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2415     NewCallee->CalleeEdges.push_back(NewEdge);
2416     NewEdge->Callee->CallerEdges.push_back(NewEdge);
2417   }
2418   if (VerifyCCG) {
2419     checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2420     checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2421     for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2422       checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2423                                             /*CheckEdges=*/false);
2424     for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2425       checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2426                                             /*CheckEdges=*/false);
2427   }
2428 }
2429 
2430 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2431 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2432   DenseSet<const ContextNode *> Visited;
2433   for (auto &Entry : AllocationCallToContextNodeMap)
2434     identifyClones(Entry.second, Visited);
2435 }
2436 
2437 // Helper function to check if an AllocType is cold, notcold, or both.
2438 bool checkColdOrNotCold(uint8_t AllocType) {
2439   return (AllocType == (uint8_t)AllocationType::Cold) ||
2440          (AllocType == (uint8_t)AllocationType::NotCold) ||
2441          (AllocType ==
2442           ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2443 }
2444 
2445 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2446 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2447     ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2448   if (VerifyNodes)
2449     checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2450   assert(!Node->CloneOf);
2451 
2452   // If Node has a null call, then either it wasn't found in the module (regular
2453   // LTO) or summary index (ThinLTO), or there were other conditions blocking
2454   // cloning (e.g. recursion, calls multiple targets, etc).
2455   // Do this here so that we don't try to recursively clone callers below, which
2456   // isn't useful at least for this node.
2457   if (!Node->hasCall())
2458     return;
2459 
2460 #ifndef NDEBUG
2461   auto Insert =
2462 #endif
2463       Visited.insert(Node);
2464   // We should not have visited this node yet.
2465   assert(Insert.second);
2466   // The recursive call to identifyClones may delete the current edge from the
2467   // CallerEdges vector. Make a copy and iterate on that, which is simpler than
2468   // passing in an iterator and having the recursive call erase from it. Other
2469   // edges may also get removed during the recursion, which will have null
2470   // Callee and Caller pointers (and are deleted later), so we skip those below.
2471   {
2472     auto CallerEdges = Node->CallerEdges;
2473     for (auto &Edge : CallerEdges) {
2474       // Skip any that have been removed by an earlier recursive call.
2475       if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2476         assert(!llvm::count(Node->CallerEdges, Edge));
2477         continue;
2478       }
2479       // Ignore any caller we previously visited via another edge.
2480       if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2481         identifyClones(Edge->Caller, Visited);
2482       }
2483     }
2484   }
2485 
2486   // Check if we reached an unambiguous call or have only a single caller.
2487   if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2488     return;
2489 
2490   // We need to clone.
2491 
2492   // Try to keep the original version as alloc type NotCold. This will make
2493   // cases with indirect calls or any other situation with an unknown call to
2494   // the original function get the default behavior. We do this by sorting the
2495   // CallerEdges of the Node we will clone by alloc type.
2496   //
2497   // Give the NotCold edge the lowest sort priority so those edges are at the
2498   // end of the caller edges vector, and stay on the original version (since
2499   // the below code clones greedily until it finds all remaining edges have
2500   // the same type and leaves the remaining ones on the original Node).
2501   //
2502   // We shouldn't actually have any None type edges, so the sorting priority for
2503   // that is arbitrary, and we assert in that case below.
2504   const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2505                                                /*Cold*/ 1,
2506                                                /*NotColdCold*/ 2};
2507   std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2508                    [&](const std::shared_ptr<ContextEdge> &A,
2509                        const std::shared_ptr<ContextEdge> &B) {
2510                      assert(checkColdOrNotCold(A->AllocTypes) &&
2511                             checkColdOrNotCold(B->AllocTypes));
2512 
2513                      if (A->AllocTypes == B->AllocTypes)
2514                        // Use the first context id for each edge as a
2515                        // tie-breaker.
2516                        return *A->ContextIds.begin() < *B->ContextIds.begin();
2517                      return AllocTypeCloningPriority[A->AllocTypes] <
2518                             AllocTypeCloningPriority[B->AllocTypes];
2519                    });
2520 
2521   assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2522 
2523   // Iterate until we find no more opportunities for disambiguating the alloc
2524   // types via cloning. In most cases this loop will terminate once the Node
2525   // has a single allocation type, in which case no more cloning is needed.
2526   // We need to be able to remove Edge from CallerEdges, so we need to adjust
2527   // the iterator inside the loop.
2528   for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2529     auto CallerEdge = *EI;
2530 
2531     // See if cloning the prior caller edge left this node with a single alloc
2532     // type or a single caller.
In that case no more cloning of Node is needed. 2533 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1) 2534 break; 2535 2536 // Compute the node callee edge alloc types corresponding to the context ids 2537 // for this caller edge. 2538 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge; 2539 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size()); 2540 for (auto &CalleeEdge : Node->CalleeEdges) 2541 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes( 2542 CalleeEdge->getContextIds(), CallerEdge->getContextIds())); 2543 2544 // Don't clone if doing so will not disambiguate any alloc types amongst 2545 // caller edges (including the callee edges that would be cloned). 2546 // Otherwise we will simply move all edges to the clone. 2547 // 2548 // First check if by cloning we will disambiguate the caller allocation 2549 // type from node's allocation type. Query allocTypeToUse so that we don't 2550 // bother cloning to distinguish NotCold+Cold from NotCold. Note that 2551 // neither of these should be None type. 2552 // 2553 // Then check if by cloning node at least one of the callee edges will be 2554 // disambiguated by splitting out different context ids. 2555 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None); 2556 assert(Node->AllocTypes != (uint8_t)AllocationType::None); 2557 if (allocTypeToUse(CallerEdge->AllocTypes) == 2558 allocTypeToUse(Node->AllocTypes) && 2559 allocTypesMatch<DerivedCCG, FuncTy, CallTy>( 2560 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) { 2561 ++EI; 2562 continue; 2563 } 2564 2565 // First see if we can use an existing clone. Check each clone and its 2566 // callee edges for matching alloc types. 2567 ContextNode *Clone = nullptr; 2568 for (auto *CurClone : Node->Clones) { 2569 if (allocTypeToUse(CurClone->AllocTypes) != 2570 allocTypeToUse(CallerEdge->AllocTypes)) 2571 continue; 2572 2573 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>( 2574 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges)) 2575 continue; 2576 Clone = CurClone; 2577 break; 2578 } 2579 2580 // The edge iterator is adjusted when we move the CallerEdge to the clone. 2581 if (Clone) 2582 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI); 2583 else 2584 Clone = moveEdgeToNewCalleeClone(CallerEdge, &EI); 2585 2586 assert(EI == Node->CallerEdges.end() || 2587 Node->AllocTypes != (uint8_t)AllocationType::None); 2588 // Sanity check that no alloc types on clone or its edges are None. 2589 assert(Clone->AllocTypes != (uint8_t)AllocationType::None); 2590 assert(llvm::none_of( 2591 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 2592 return E->AllocTypes == (uint8_t)AllocationType::None; 2593 })); 2594 } 2595 2596 // Cloning may have resulted in some cloned callee edges with type None, 2597 // because they aren't carrying any contexts. Remove those edges. 2598 for (auto *Clone : Node->Clones) { 2599 removeNoneTypeCalleeEdges(Clone); 2600 if (VerifyNodes) 2601 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 2602 } 2603 // We should still have some context ids on the original Node. 2604 assert(!Node->ContextIds.empty()); 2605 2606 // Remove any callee edges that ended up with alloc type None after creating 2607 // clones and updating callee edges. 2608 removeNoneTypeCalleeEdges(Node); 2609 2610 // Sanity check that no alloc types on node or edges are None. 
2611 assert(Node->AllocTypes != (uint8_t)AllocationType::None); 2612 assert(llvm::none_of(Node->CalleeEdges, 2613 [&](const std::shared_ptr<ContextEdge> &E) { 2614 return E->AllocTypes == (uint8_t)AllocationType::None; 2615 })); 2616 assert(llvm::none_of(Node->CallerEdges, 2617 [&](const std::shared_ptr<ContextEdge> &E) { 2618 return E->AllocTypes == (uint8_t)AllocationType::None; 2619 })); 2620 2621 if (VerifyNodes) 2622 checkNode<DerivedCCG, FuncTy, CallTy>(Node); 2623 } 2624 2625 void ModuleCallsiteContextGraph::updateAllocationCall( 2626 CallInfo &Call, AllocationType AllocType) { 2627 std::string AllocTypeString = getAllocTypeAttributeString(AllocType); 2628 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(), 2629 "memprof", AllocTypeString); 2630 cast<CallBase>(Call.call())->addFnAttr(A); 2631 OREGetter(Call.call()->getFunction()) 2632 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call()) 2633 << ore::NV("AllocationCall", Call.call()) << " in clone " 2634 << ore::NV("Caller", Call.call()->getFunction()) 2635 << " marked with memprof allocation attribute " 2636 << ore::NV("Attribute", AllocTypeString)); 2637 } 2638 2639 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, 2640 AllocationType AllocType) { 2641 auto *AI = Call.call().dyn_cast<AllocInfo *>(); 2642 assert(AI); 2643 assert(AI->Versions.size() > Call.cloneNo()); 2644 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; 2645 } 2646 2647 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, 2648 FuncInfo CalleeFunc) { 2649 if (CalleeFunc.cloneNo() > 0) 2650 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func()); 2651 OREGetter(CallerCall.call()->getFunction()) 2652 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) 2653 << ore::NV("Call", CallerCall.call()) << " in clone " 2654 << ore::NV("Caller", CallerCall.call()->getFunction()) 2655 << " assigned to call function clone " 2656 << ore::NV("Callee", CalleeFunc.func())); 2657 } 2658 2659 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, 2660 FuncInfo CalleeFunc) { 2661 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>(); 2662 assert(CI && 2663 "Caller cannot be an allocation which should not have profiled calls"); 2664 assert(CI->Clones.size() > CallerCall.cloneNo()); 2665 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); 2666 } 2667 2668 CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 2669 Instruction *>::FuncInfo 2670 ModuleCallsiteContextGraph::cloneFunctionForCallsite( 2671 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 2672 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 2673 // Use existing LLVM facilities for cloning and obtaining Call in clone 2674 ValueToValueMapTy VMap; 2675 auto *NewFunc = CloneFunction(Func.func(), VMap); 2676 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo); 2677 assert(!Func.func()->getParent()->getFunction(Name)); 2678 NewFunc->setName(Name); 2679 for (auto &Inst : CallsWithMetadataInFunc) { 2680 // This map always has the initial version in it. 
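    // (Each original call is remapped through VMap to its counterpart in
    // NewFunc and recorded in CallMap tagged with the new CloneNo, so that
    // later updates can find the cloned call.)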
2681 assert(Inst.cloneNo() == 0); 2682 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo}; 2683 } 2684 OREGetter(Func.func()) 2685 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func()) 2686 << "created clone " << ore::NV("NewFunction", NewFunc)); 2687 return {NewFunc, CloneNo}; 2688 } 2689 2690 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 2691 IndexCall>::FuncInfo 2692 IndexCallsiteContextGraph::cloneFunctionForCallsite( 2693 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 2694 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 2695 // Check how many clones we have of Call (and therefore function). 2696 // The next clone number is the current size of versions array. 2697 // Confirm this matches the CloneNo provided by the caller, which is based on 2698 // the number of function clones we have. 2699 assert(CloneNo == 2700 (Call.call().is<AllocInfo *>() 2701 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size() 2702 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size())); 2703 // Walk all the instructions in this function. Create a new version for 2704 // each (by adding an entry to the Versions/Clones summary array), and copy 2705 // over the version being called for the function clone being cloned here. 2706 // Additionally, add an entry to the CallMap for the new function clone, 2707 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) 2708 // to the new call clone. 2709 for (auto &Inst : CallsWithMetadataInFunc) { 2710 // This map always has the initial version in it. 2711 assert(Inst.cloneNo() == 0); 2712 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) { 2713 assert(AI->Versions.size() == CloneNo); 2714 // We assign the allocation type later (in updateAllocationCall), just add 2715 // an entry for it here. 2716 AI->Versions.push_back(0); 2717 } else { 2718 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>(); 2719 assert(CI && CI->Clones.size() == CloneNo); 2720 // We assign the clone number later (in updateCall), just add an entry for 2721 // it here. 2722 CI->Clones.push_back(0); 2723 } 2724 CallMap[Inst] = {Inst.call(), CloneNo}; 2725 } 2726 return {Func.func(), CloneNo}; 2727 } 2728 2729 // This method assigns cloned callsites to functions, cloning the functions as 2730 // needed. 
The assignment is greedy and proceeds roughly as follows: 2731 // 2732 // For each function Func: 2733 // For each call with graph Node having clones: 2734 // Initialize ClonesWorklist to Node and its clones 2735 // Initialize NodeCloneCount to 0 2736 // While ClonesWorklist is not empty: 2737 // Clone = pop front ClonesWorklist 2738 // NodeCloneCount++ 2739 // If Func has been cloned less than NodeCloneCount times: 2740 // If NodeCloneCount is 1: 2741 // Assign Clone to original Func 2742 // Continue 2743 // Create a new function clone 2744 // If other callers not assigned to call a function clone yet: 2745 // Assign them to call new function clone 2746 // Continue 2747 // Assign any other caller calling the cloned version to new clone 2748 // 2749 // For each caller of Clone: 2750 // If caller is assigned to call a specific function clone: 2751 // If we cannot assign Clone to that function clone: 2752 // Create new callsite Clone NewClone 2753 // Add NewClone to ClonesWorklist 2754 // Continue 2755 // Assign Clone to existing caller's called function clone 2756 // Else: 2757 // If Clone not already assigned to a function clone: 2758 // Assign to first function clone without assignment 2759 // Assign caller to selected function clone 2760 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2761 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { 2762 bool Changed = false; 2763 2764 // Keep track of the assignment of nodes (callsites) to function clones they 2765 // call. 2766 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap; 2767 2768 // Update caller node to call function version CalleeFunc, by recording the 2769 // assignment in CallsiteToCalleeFuncCloneMap. 2770 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, 2771 const FuncInfo &CalleeFunc) { 2772 assert(Caller->hasCall()); 2773 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; 2774 }; 2775 2776 // Walk all functions for which we saw calls with memprof metadata, and handle 2777 // cloning for each of its calls. 2778 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { 2779 FuncInfo OrigFunc(Func); 2780 // Map from each clone of OrigFunc to a map of remappings of each call of 2781 // interest (from original uncloned call to the corresponding cloned call in 2782 // that function clone). 2783 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; 2784 for (auto &Call : CallsWithMetadata) { 2785 ContextNode *Node = getNodeForInst(Call); 2786 // Skip call if we do not have a node for it (all uses of its stack ids 2787 // were either on inlined chains or pruned from the MIBs), or if we did 2788 // not create any clones for it. 2789 if (!Node || Node->Clones.empty()) 2790 continue; 2791 assert(Node->hasCall() && 2792 "Not having a call should have prevented cloning"); 2793 2794 // Track the assignment of function clones to clones of the current 2795 // callsite Node being handled. 2796 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap; 2797 2798 // Assign callsite version CallsiteClone to function version FuncClone, 2799 // and also assign (possibly cloned) Call to CallsiteClone. 2800 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, 2801 CallInfo &Call, 2802 ContextNode *CallsiteClone, 2803 bool IsAlloc) { 2804 // Record the clone of callsite node assigned to this function clone. 
2805 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; 2806 2807 assert(FuncClonesToCallMap.count(FuncClone)); 2808 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; 2809 CallInfo CallClone(Call); 2810 if (CallMap.count(Call)) 2811 CallClone = CallMap[Call]; 2812 CallsiteClone->setCall(CallClone); 2813 }; 2814 2815 // Keep track of the clones of callsite Node that need to be assigned to 2816 // function clones. This list may be expanded in the loop body below if we 2817 // find additional cloning is required. 2818 std::deque<ContextNode *> ClonesWorklist; 2819 // Ignore original Node if we moved all of its contexts to clones. 2820 if (!Node->ContextIds.empty()) 2821 ClonesWorklist.push_back(Node); 2822 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(), 2823 Node->Clones.end()); 2824 2825 // Now walk through all of the clones of this callsite Node that we need, 2826 // and determine the assignment to a corresponding clone of the current 2827 // function (creating new function clones as needed). 2828 unsigned NodeCloneCount = 0; 2829 while (!ClonesWorklist.empty()) { 2830 ContextNode *Clone = ClonesWorklist.front(); 2831 ClonesWorklist.pop_front(); 2832 NodeCloneCount++; 2833 if (VerifyNodes) 2834 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 2835 2836 // Need to create a new function clone if we have more callsite clones 2837 // than existing function clones, which would have been assigned to an 2838 // earlier clone in the list (we assign callsite clones to function 2839 // clones greedily). 2840 if (FuncClonesToCallMap.size() < NodeCloneCount) { 2841 // If this is the first callsite copy, assign to original function. 2842 if (NodeCloneCount == 1) { 2843 // Since FuncClonesToCallMap is empty in this case, no clones have 2844 // been created for this function yet, and no callers should have 2845 // been assigned a function clone for this callee node yet. 2846 assert(llvm::none_of( 2847 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 2848 return CallsiteToCalleeFuncCloneMap.count(E->Caller); 2849 })); 2850 // Initialize with empty call map, assign Clone to original function 2851 // and its callers, and skip to the next clone. 2852 FuncClonesToCallMap[OrigFunc] = {}; 2853 AssignCallsiteCloneToFuncClone( 2854 OrigFunc, Call, Clone, 2855 AllocationCallToContextNodeMap.count(Call)); 2856 for (auto &CE : Clone->CallerEdges) { 2857 // Ignore any caller that does not have a recorded callsite Call. 2858 if (!CE->Caller->hasCall()) 2859 continue; 2860 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); 2861 } 2862 continue; 2863 } 2864 2865 // First locate which copy of OrigFunc to clone again. If a caller 2866 // of this callsite clone was already assigned to call a particular 2867 // function clone, we need to redirect all of those callers to the 2868 // new function clone, and update their other callees within this 2869 // function. 2870 FuncInfo PreviousAssignedFuncClone; 2871 auto EI = llvm::find_if( 2872 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 2873 return CallsiteToCalleeFuncCloneMap.count(E->Caller); 2874 }); 2875 bool CallerAssignedToCloneOfFunc = false; 2876 if (EI != Clone->CallerEdges.end()) { 2877 const std::shared_ptr<ContextEdge> &Edge = *EI; 2878 PreviousAssignedFuncClone = 2879 CallsiteToCalleeFuncCloneMap[Edge->Caller]; 2880 CallerAssignedToCloneOfFunc = true; 2881 } 2882 2883 // Clone function and save it along with the CallInfo map created 2884 // during cloning in the FuncClonesToCallMap. 
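          // The clone number is simply the number of function versions seen so far
          // for OrigFunc (the original occupies slot 0 in FuncClonesToCallMap). For
          // the IR graph the new copy is typically named <OrigFunc>.memprof.<CloneNo>,
          // while for the index graph a new entry is appended to each summary
          // Versions/Clones array instead.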
2885 std::map<CallInfo, CallInfo> NewCallMap; 2886 unsigned CloneNo = FuncClonesToCallMap.size(); 2887 assert(CloneNo > 0 && "Clone 0 is the original function, which " 2888 "should already exist in the map"); 2889 FuncInfo NewFuncClone = cloneFunctionForCallsite( 2890 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo); 2891 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); 2892 FunctionClonesAnalysis++; 2893 Changed = true; 2894 2895 // If no caller callsites were already assigned to a clone of this 2896 // function, we can simply assign this clone to the new func clone 2897 // and update all callers to it, then skip to the next clone. 2898 if (!CallerAssignedToCloneOfFunc) { 2899 AssignCallsiteCloneToFuncClone( 2900 NewFuncClone, Call, Clone, 2901 AllocationCallToContextNodeMap.count(Call)); 2902 for (auto &CE : Clone->CallerEdges) { 2903 // Ignore any caller that does not have a recorded callsite Call. 2904 if (!CE->Caller->hasCall()) 2905 continue; 2906 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); 2907 } 2908 continue; 2909 } 2910 2911 // We may need to do additional node cloning in this case. 2912 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers 2913 // that were previously assigned to call PreviousAssignedFuncClone, 2914 // to record that they now call NewFuncClone. 2915 for (auto CE : Clone->CallerEdges) { 2916 // Skip any that have been removed on an earlier iteration. 2917 if (!CE) 2918 continue; 2919 // Ignore any caller that does not have a recorded callsite Call. 2920 if (!CE->Caller->hasCall()) 2921 continue; 2922 2923 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || 2924 // We subsequently fall through to later handling that 2925 // will perform any additional cloning required for 2926 // callers that were calling other function clones. 2927 CallsiteToCalleeFuncCloneMap[CE->Caller] != 2928 PreviousAssignedFuncClone) 2929 continue; 2930 2931 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); 2932 2933 // If we are cloning a function that was already assigned to some 2934 // callers, then essentially we are creating new callsite clones 2935 // of the other callsites in that function that are reached by those 2936 // callers. Clone the other callees of the current callsite's caller 2937 // that were already assigned to PreviousAssignedFuncClone 2938 // accordingly. This is important since we subsequently update the 2939 // calls from the nodes in the graph and their assignments to callee 2940 // functions recorded in CallsiteToCalleeFuncCloneMap. 2941 for (auto CalleeEdge : CE->Caller->CalleeEdges) { 2942 // Skip any that have been removed on an earlier iteration when 2943 // cleaning up newly None type callee edges. 2944 if (!CalleeEdge) 2945 continue; 2946 ContextNode *Callee = CalleeEdge->Callee; 2947 // Skip the current callsite, we are looking for other 2948 // callsites Caller calls, as well as any that does not have a 2949 // recorded callsite Call. 2950 if (Callee == Clone || !Callee->hasCall()) 2951 continue; 2952 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge); 2953 removeNoneTypeCalleeEdges(NewClone); 2954 // Moving the edge may have resulted in some none type 2955 // callee edges on the original Callee. 2956 removeNoneTypeCalleeEdges(Callee); 2957 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); 2958 // If the Callee node was already assigned to call a specific 2959 // function version, make sure its new clone is assigned to call 2960 // that same function clone. 
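              // (That prior assignment concerns which function the Callee itself
              // calls; it is independent of the caller-side redirection to
              // NewFuncClone being done here, so it is simply propagated to the new
              // callee clone.)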
2961 if (CallsiteToCalleeFuncCloneMap.count(Callee)) 2962 RecordCalleeFuncOfCallsite( 2963 NewClone, CallsiteToCalleeFuncCloneMap[Callee]); 2964 // Update NewClone with the new Call clone of this callsite's Call 2965 // created for the new function clone created earlier. 2966 // Recall that we have already ensured when building the graph 2967 // that each caller can only call callsites within the same 2968 // function, so we are guaranteed that Callee Call is in the 2969 // current OrigFunc. 2970 // CallMap is set up as indexed by original Call at clone 0. 2971 CallInfo OrigCall(Callee->getOrigNode()->Call); 2972 OrigCall.setCloneNo(0); 2973 std::map<CallInfo, CallInfo> &CallMap = 2974 FuncClonesToCallMap[NewFuncClone]; 2975 assert(CallMap.count(OrigCall)); 2976 CallInfo NewCall(CallMap[OrigCall]); 2977 assert(NewCall); 2978 NewClone->setCall(NewCall); 2979 } 2980 } 2981 // Fall through to handling below to perform the recording of the 2982 // function for this callsite clone. This enables handling of cases 2983 // where the callers were assigned to different clones of a function. 2984 } 2985 2986 // See if we can use existing function clone. Walk through 2987 // all caller edges to see if any have already been assigned to 2988 // a clone of this callsite's function. If we can use it, do so. If not, 2989 // because that function clone is already assigned to a different clone 2990 // of this callsite, then we need to clone again. 2991 // Basically, this checking is needed to handle the case where different 2992 // caller functions/callsites may need versions of this function 2993 // containing different mixes of callsite clones across the different 2994 // callsites within the function. If that happens, we need to create 2995 // additional function clones to handle the various combinations. 2996 // 2997 // Keep track of any new clones of this callsite created by the 2998 // following loop, as well as any existing clone that we decided to 2999 // assign this clone to. 3000 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap; 3001 FuncInfo FuncCloneAssignedToCurCallsiteClone; 3002 // We need to be able to remove Edge from CallerEdges, so need to adjust 3003 // iterator in the loop. 3004 for (auto EI = Clone->CallerEdges.begin(); 3005 EI != Clone->CallerEdges.end();) { 3006 auto Edge = *EI; 3007 // Ignore any caller that does not have a recorded callsite Call. 3008 if (!Edge->Caller->hasCall()) { 3009 EI++; 3010 continue; 3011 } 3012 // If this caller already assigned to call a version of OrigFunc, need 3013 // to ensure we can assign this callsite clone to that function clone. 3014 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { 3015 FuncInfo FuncCloneCalledByCaller = 3016 CallsiteToCalleeFuncCloneMap[Edge->Caller]; 3017 // First we need to confirm that this function clone is available 3018 // for use by this callsite node clone. 3019 // 3020 // While FuncCloneToCurNodeCloneMap is built only for this Node and 3021 // its callsite clones, one of those callsite clones X could have 3022 // been assigned to the same function clone called by Edge's caller 3023 // - if Edge's caller calls another callsite within Node's original 3024 // function, and that callsite has another caller reaching clone X. 3025 // We need to clone Node again in this case. 
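            // Concretely: if this caller was earlier bound to a particular clone of
            // OrigFunc (say, because of cloning done for a different callsite it also
            // reaches), but that function clone already holds a different clone of
            // this callsite Node, then the current Clone cannot also live there and
            // must itself be cloned again.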
3026 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && 3027 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != 3028 Clone) || 3029 // Detect when we have multiple callers of this callsite that 3030 // have already been assigned to specific, and different, clones 3031 // of OrigFunc (due to other unrelated callsites in Func they 3032 // reach via call contexts). Is this Clone of callsite Node 3033 // assigned to a different clone of OrigFunc? If so, clone Node 3034 // again. 3035 (FuncCloneAssignedToCurCallsiteClone && 3036 FuncCloneAssignedToCurCallsiteClone != 3037 FuncCloneCalledByCaller)) { 3038 // We need to use a different newly created callsite clone, in 3039 // order to assign it to another new function clone on a 3040 // subsequent iteration over the Clones array (adjusted below). 3041 // Note we specifically do not reset the 3042 // CallsiteToCalleeFuncCloneMap entry for this caller, so that 3043 // when this new clone is processed later we know which version of 3044 // the function to copy (so that other callsite clones we have 3045 // assigned to that function clone are properly cloned over). See 3046 // comments in the function cloning handling earlier. 3047 3048 // Check if we already have cloned this callsite again while 3049 // walking through caller edges, for a caller calling the same 3050 // function clone. If so, we can move this edge to that new clone 3051 // rather than creating yet another new clone. 3052 if (FuncCloneToNewCallsiteCloneMap.count( 3053 FuncCloneCalledByCaller)) { 3054 ContextNode *NewClone = 3055 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; 3056 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI); 3057 // Cleanup any none type edges cloned over. 3058 removeNoneTypeCalleeEdges(NewClone); 3059 } else { 3060 // Create a new callsite clone. 3061 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI); 3062 removeNoneTypeCalleeEdges(NewClone); 3063 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = 3064 NewClone; 3065 // Add to list of clones and process later. 3066 ClonesWorklist.push_back(NewClone); 3067 assert(EI == Clone->CallerEdges.end() || 3068 Clone->AllocTypes != (uint8_t)AllocationType::None); 3069 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); 3070 } 3071 // Moving the caller edge may have resulted in some none type 3072 // callee edges. 3073 removeNoneTypeCalleeEdges(Clone); 3074 // We will handle the newly created callsite clone in a subsequent 3075 // iteration over this Node's Clones. Continue here since we 3076 // already adjusted iterator EI while moving the edge. 3077 continue; 3078 } 3079 3080 // Otherwise, we can use the function clone already assigned to this 3081 // caller. 3082 if (!FuncCloneAssignedToCurCallsiteClone) { 3083 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; 3084 // Assign Clone to FuncCloneCalledByCaller 3085 AssignCallsiteCloneToFuncClone( 3086 FuncCloneCalledByCaller, Call, Clone, 3087 AllocationCallToContextNodeMap.count(Call)); 3088 } else 3089 // Don't need to do anything - callsite is already calling this 3090 // function clone. 3091 assert(FuncCloneAssignedToCurCallsiteClone == 3092 FuncCloneCalledByCaller); 3093 3094 } else { 3095 // We have not already assigned this caller to a version of 3096 // OrigFunc. Do the assignment now. 3097 3098 // First check if we have already assigned this callsite clone to a 3099 // clone of OrigFunc for another caller during this iteration over 3100 // its caller edges. 
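            // Multiple callers may share the same callsite clone when their contexts
            // agree, and they must all end up calling the same function clone; only
            // the first such caller triggers the assignment below, later ones simply
            // reuse it.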
3101 if (!FuncCloneAssignedToCurCallsiteClone) { 3102 // Find first function in FuncClonesToCallMap without an assigned 3103 // clone of this callsite Node. We should always have one 3104 // available at this point due to the earlier cloning when the 3105 // FuncClonesToCallMap size was smaller than the clone number. 3106 for (auto &CF : FuncClonesToCallMap) { 3107 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { 3108 FuncCloneAssignedToCurCallsiteClone = CF.first; 3109 break; 3110 } 3111 } 3112 assert(FuncCloneAssignedToCurCallsiteClone); 3113 // Assign Clone to FuncCloneAssignedToCurCallsiteClone 3114 AssignCallsiteCloneToFuncClone( 3115 FuncCloneAssignedToCurCallsiteClone, Call, Clone, 3116 AllocationCallToContextNodeMap.count(Call)); 3117 } else 3118 assert(FuncCloneToCurNodeCloneMap 3119 [FuncCloneAssignedToCurCallsiteClone] == Clone); 3120 // Update callers to record function version called. 3121 RecordCalleeFuncOfCallsite(Edge->Caller, 3122 FuncCloneAssignedToCurCallsiteClone); 3123 } 3124 3125 EI++; 3126 } 3127 } 3128 if (VerifyCCG) { 3129 checkNode<DerivedCCG, FuncTy, CallTy>(Node); 3130 for (const auto &PE : Node->CalleeEdges) 3131 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); 3132 for (const auto &CE : Node->CallerEdges) 3133 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); 3134 for (auto *Clone : Node->Clones) { 3135 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 3136 for (const auto &PE : Clone->CalleeEdges) 3137 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); 3138 for (const auto &CE : Clone->CallerEdges) 3139 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); 3140 } 3141 } 3142 } 3143 } 3144 3145 auto UpdateCalls = [&](ContextNode *Node, 3146 DenseSet<const ContextNode *> &Visited, 3147 auto &&UpdateCalls) { 3148 auto Inserted = Visited.insert(Node); 3149 if (!Inserted.second) 3150 return; 3151 3152 for (auto *Clone : Node->Clones) 3153 UpdateCalls(Clone, Visited, UpdateCalls); 3154 3155 for (auto &Edge : Node->CallerEdges) 3156 UpdateCalls(Edge->Caller, Visited, UpdateCalls); 3157 3158 // Skip if either no call to update, or if we ended up with no context ids 3159 // (we moved all edges onto other clones). 3160 if (!Node->hasCall() || Node->ContextIds.empty()) 3161 return; 3162 3163 if (Node->IsAllocation) { 3164 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes)); 3165 return; 3166 } 3167 3168 if (!CallsiteToCalleeFuncCloneMap.count(Node)) 3169 return; 3170 3171 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; 3172 updateCall(Node->Call, CalleeFunc); 3173 }; 3174 3175 // Performs DFS traversal starting from allocation nodes to update calls to 3176 // reflect cloning decisions recorded earlier. For regular LTO this will 3177 // update the actual calls in the IR to call the appropriate function clone 3178 // (and add attributes to allocation calls), whereas for ThinLTO the decisions 3179 // are recorded in the summary entries. 3180 DenseSet<const ContextNode *> Visited; 3181 for (auto &Entry : AllocationCallToContextNodeMap) 3182 UpdateCalls(Entry.second, Visited, UpdateCalls); 3183 3184 return Changed; 3185 } 3186 3187 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones( 3188 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, 3189 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> 3190 &FuncToAliasMap) { 3191 // The first "clone" is the original copy, we should only call this if we 3192 // needed to create new clones. 
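  // For example, if the thin link decided an allocation in F needs both a
  // not-cold and a cold version, NumClones is 2: F itself keeps version 0 and a
  // single IR copy (typically named F.memprof.1) is created below, so VMaps ends
  // up holding one ValueToValueMapTy.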
3193 assert(NumClones > 1); 3194 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; 3195 VMaps.reserve(NumClones - 1); 3196 FunctionsClonedThinBackend++; 3197 for (unsigned I = 1; I < NumClones; I++) { 3198 VMaps.emplace_back(std::make_unique<ValueToValueMapTy>()); 3199 auto *NewF = CloneFunction(&F, *VMaps.back()); 3200 FunctionClonesThinBackend++; 3201 // Strip memprof and callsite metadata from clone as they are no longer 3202 // needed. 3203 for (auto &BB : *NewF) { 3204 for (auto &Inst : BB) { 3205 Inst.setMetadata(LLVMContext::MD_memprof, nullptr); 3206 Inst.setMetadata(LLVMContext::MD_callsite, nullptr); 3207 } 3208 } 3209 std::string Name = getMemProfFuncName(F.getName(), I); 3210 auto *PrevF = M.getFunction(Name); 3211 if (PrevF) { 3212 // We might have created this when adjusting callsite in another 3213 // function. It should be a declaration. 3214 assert(PrevF->isDeclaration()); 3215 NewF->takeName(PrevF); 3216 PrevF->replaceAllUsesWith(NewF); 3217 PrevF->eraseFromParent(); 3218 } else 3219 NewF->setName(Name); 3220 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F) 3221 << "created clone " << ore::NV("NewFunction", NewF)); 3222 3223 // Now handle aliases to this function, and clone those as well. 3224 if (!FuncToAliasMap.count(&F)) 3225 continue; 3226 for (auto *A : FuncToAliasMap[&F]) { 3227 std::string Name = getMemProfFuncName(A->getName(), I); 3228 auto *PrevA = M.getNamedAlias(Name); 3229 auto *NewA = GlobalAlias::create(A->getValueType(), 3230 A->getType()->getPointerAddressSpace(), 3231 A->getLinkage(), Name, NewF); 3232 NewA->copyAttributesFrom(A); 3233 if (PrevA) { 3234 // We might have created this when adjusting callsite in another 3235 // function. It should be a declaration. 3236 assert(PrevA->isDeclaration()); 3237 NewA->takeName(PrevA); 3238 PrevA->replaceAllUsesWith(NewA); 3239 PrevA->eraseFromParent(); 3240 } 3241 } 3242 } 3243 return VMaps; 3244 } 3245 3246 // Locate the summary for F. This is complicated by the fact that it might 3247 // have been internalized or promoted. 3248 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M, 3249 const ModuleSummaryIndex *ImportSummary) { 3250 // FIXME: Ideally we would retain the original GUID in some fashion on the 3251 // function (e.g. as metadata), but for now do our best to locate the 3252 // summary without that information. 3253 ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID()); 3254 if (!TheFnVI) 3255 // See if theFn was internalized, by checking index directly with 3256 // original name (this avoids the name adjustment done by getGUID() for 3257 // internal symbols). 3258 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName())); 3259 if (TheFnVI) 3260 return TheFnVI; 3261 // Now query with the original name before any promotion was performed. 3262 StringRef OrigName = 3263 ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName()); 3264 std::string OrigId = GlobalValue::getGlobalIdentifier( 3265 OrigName, GlobalValue::InternalLinkage, M.getSourceFileName()); 3266 TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId)); 3267 if (TheFnVI) 3268 return TheFnVI; 3269 // Could be a promoted local imported from another module. We need to pass 3270 // down more info here to find the original module id. For now, try with 3271 // the OrigName which might have been stored in the OidGuidMap in the 3272 // index. This would not work if there were same-named locals in multiple 3273 // modules, however. 
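  // The index records a mapping from the GUID of each symbol's original
  // (pre-promotion) name to its current GUID, so as a last resort we look the
  // function up through that mapping.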
3274 auto OrigGUID = 3275 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName)); 3276 if (OrigGUID) 3277 TheFnVI = ImportSummary->getValueInfo(OrigGUID); 3278 return TheFnVI; 3279 } 3280 3281 bool MemProfContextDisambiguation::applyImport(Module &M) { 3282 assert(ImportSummary); 3283 bool Changed = false; 3284 3285 auto IsMemProfClone = [](const Function &F) { 3286 return F.getName().contains(MemProfCloneSuffix); 3287 }; 3288 3289 // We also need to clone any aliases that reference cloned functions, because 3290 // the modified callsites may invoke via the alias. Keep track of the aliases 3291 // for each function. 3292 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> 3293 FuncToAliasMap; 3294 for (auto &A : M.aliases()) { 3295 auto *Aliasee = A.getAliaseeObject(); 3296 if (auto *F = dyn_cast<Function>(Aliasee)) 3297 FuncToAliasMap[F].insert(&A); 3298 } 3299 3300 for (auto &F : M) { 3301 if (F.isDeclaration() || IsMemProfClone(F)) 3302 continue; 3303 3304 OptimizationRemarkEmitter ORE(&F); 3305 3306 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; 3307 bool ClonesCreated = false; 3308 unsigned NumClonesCreated = 0; 3309 auto CloneFuncIfNeeded = [&](unsigned NumClones) { 3310 // We should at least have version 0 which is the original copy. 3311 assert(NumClones > 0); 3312 // If only one copy needed use original. 3313 if (NumClones == 1) 3314 return; 3315 // If we already performed cloning of this function, confirm that the 3316 // requested number of clones matches (the thin link should ensure the 3317 // number of clones for each constituent callsite is consistent within 3318 // each function), before returning. 3319 if (ClonesCreated) { 3320 assert(NumClonesCreated == NumClones); 3321 return; 3322 } 3323 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap); 3324 // The first "clone" is the original copy, which doesn't have a VMap. 3325 assert(VMaps.size() == NumClones - 1); 3326 Changed = true; 3327 ClonesCreated = true; 3328 NumClonesCreated = NumClones; 3329 }; 3330 3331 auto CloneCallsite = [&](const CallsiteInfo &StackNode, CallBase *CB, 3332 Function *CalledFunction) { 3333 // Perform cloning if not yet done. 3334 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size()); 3335 3336 // Should have skipped indirect calls via mayHaveMemprofSummary. 3337 assert(CalledFunction); 3338 assert(!IsMemProfClone(*CalledFunction)); 3339 3340 // Update the calls per the summary info. 3341 // Save orig name since it gets updated in the first iteration 3342 // below. 3343 auto CalleeOrigName = CalledFunction->getName(); 3344 for (unsigned J = 0; J < StackNode.Clones.size(); J++) { 3345 // Do nothing if this version calls the original version of its 3346 // callee. 3347 if (!StackNode.Clones[J]) 3348 continue; 3349 auto NewF = M.getOrInsertFunction( 3350 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]), 3351 CalledFunction->getFunctionType()); 3352 CallBase *CBClone; 3353 // Copy 0 is the original function. 3354 if (!J) 3355 CBClone = CB; 3356 else 3357 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); 3358 CBClone->setCalledFunction(NewF); 3359 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone) 3360 << ore::NV("Call", CBClone) << " in clone " 3361 << ore::NV("Caller", CBClone->getFunction()) 3362 << " assigned to call function clone " 3363 << ore::NV("Callee", NewF.getCallee())); 3364 } 3365 }; 3366 3367 // Locate the summary for F. 
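    // The summary found here carries the cloning decisions made by the thin link:
    // each CallsiteInfo holds a Clones array and each AllocInfo a Versions array,
    // indexed by function clone number, which the code below applies to this
    // module's IR (creating the function clones and rewriting calls/attributes).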
3368 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary); 3369 // If not found, this could be an imported local (see comment in 3370 // findValueInfoForFunc). Skip for now as it will be cloned in its original 3371 // module (where it would have been promoted to global scope so should 3372 // satisfy any reference in this module). 3373 if (!TheFnVI) 3374 continue; 3375 3376 auto *GVSummary = 3377 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier()); 3378 if (!GVSummary) 3379 // Must have been imported, use the first summary (might be multiple if 3380 // this was a linkonce_odr). 3381 GVSummary = TheFnVI.getSummaryList().front().get(); 3382 3383 // If this was an imported alias skip it as we won't have the function 3384 // summary, and it should be cloned in the original module. 3385 if (isa<AliasSummary>(GVSummary)) 3386 continue; 3387 3388 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject()); 3389 3390 if (FS->allocs().empty() && FS->callsites().empty()) 3391 continue; 3392 3393 auto SI = FS->callsites().begin(); 3394 auto AI = FS->allocs().begin(); 3395 3396 // To handle callsite infos synthesized for tail calls which have missing 3397 // frames in the profiled context, map callee VI to the synthesized callsite 3398 // info. 3399 DenseMap<ValueInfo, CallsiteInfo> MapTailCallCalleeVIToCallsite; 3400 // Iterate the callsites for this function in reverse, since we place all 3401 // those synthesized for tail calls at the end. 3402 for (auto CallsiteIt = FS->callsites().rbegin(); 3403 CallsiteIt != FS->callsites().rend(); CallsiteIt++) { 3404 auto &Callsite = *CallsiteIt; 3405 // Stop as soon as we see a non-synthesized callsite info (see comment 3406 // above loop). All the entries added for discovered tail calls have empty 3407 // stack ids. 3408 if (!Callsite.StackIdIndices.empty()) 3409 break; 3410 MapTailCallCalleeVIToCallsite.insert({Callsite.Callee, Callsite}); 3411 } 3412 3413 // Assume for now that the instructions are in the exact same order 3414 // as when the summary was created, but confirm this is correct by 3415 // matching the stack ids. 3416 for (auto &BB : F) { 3417 for (auto &I : BB) { 3418 auto *CB = dyn_cast<CallBase>(&I); 3419 // Same handling as when creating module summary. 3420 if (!mayHaveMemprofSummary(CB)) 3421 continue; 3422 3423 auto *CalledValue = CB->getCalledOperand(); 3424 auto *CalledFunction = CB->getCalledFunction(); 3425 if (CalledValue && !CalledFunction) { 3426 CalledValue = CalledValue->stripPointerCasts(); 3427 // Stripping pointer casts can reveal a called function. 3428 CalledFunction = dyn_cast<Function>(CalledValue); 3429 } 3430 // Check if this is an alias to a function. If so, get the 3431 // called aliasee for the checks below. 3432 if (auto *GA = dyn_cast<GlobalAlias>(CalledValue)) { 3433 assert(!CalledFunction && 3434 "Expected null called function in callsite for alias"); 3435 CalledFunction = dyn_cast<Function>(GA->getAliaseeObject()); 3436 } 3437 3438 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 3439 I.getMetadata(LLVMContext::MD_callsite)); 3440 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof); 3441 3442 // Include allocs that were already assigned a memprof function 3443 // attribute in the statistics. 3444 if (CB->getAttributes().hasFnAttr("memprof")) { 3445 assert(!MemProfMD); 3446 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" 3447 ? 
AllocTypeColdThinBackend++ 3448 : AllocTypeNotColdThinBackend++; 3449 OrigAllocsThinBackend++; 3450 AllocVersionsThinBackend++; 3451 if (!MaxAllocVersionsThinBackend) 3452 MaxAllocVersionsThinBackend = 1; 3453 // Remove any remaining callsite metadata and we can skip the rest of 3454 // the handling for this instruction, since no cloning needed. 3455 I.setMetadata(LLVMContext::MD_callsite, nullptr); 3456 continue; 3457 } 3458 3459 if (MemProfMD) { 3460 // Consult the next alloc node. 3461 assert(AI != FS->allocs().end()); 3462 auto &AllocNode = *(AI++); 3463 3464 // Sanity check that the MIB stack ids match between the summary and 3465 // instruction metadata. 3466 auto MIBIter = AllocNode.MIBs.begin(); 3467 for (auto &MDOp : MemProfMD->operands()) { 3468 assert(MIBIter != AllocNode.MIBs.end()); 3469 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter = 3470 MIBIter->StackIdIndices.begin(); 3471 auto *MIBMD = cast<const MDNode>(MDOp); 3472 MDNode *StackMDNode = getMIBStackNode(MIBMD); 3473 assert(StackMDNode); 3474 SmallVector<unsigned> StackIdsFromMetadata; 3475 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode); 3476 for (auto ContextIter = 3477 StackContext.beginAfterSharedPrefix(CallsiteContext); 3478 ContextIter != StackContext.end(); ++ContextIter) { 3479 // If this is a direct recursion, simply skip the duplicate 3480 // entries, to be consistent with how the summary ids were 3481 // generated during ModuleSummaryAnalysis. 3482 if (!StackIdsFromMetadata.empty() && 3483 StackIdsFromMetadata.back() == *ContextIter) 3484 continue; 3485 assert(StackIdIndexIter != MIBIter->StackIdIndices.end()); 3486 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == 3487 *ContextIter); 3488 StackIdIndexIter++; 3489 } 3490 MIBIter++; 3491 } 3492 3493 // Perform cloning if not yet done. 3494 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); 3495 3496 OrigAllocsThinBackend++; 3497 AllocVersionsThinBackend += AllocNode.Versions.size(); 3498 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size()) 3499 MaxAllocVersionsThinBackend = AllocNode.Versions.size(); 3500 3501 // If there is only one version that means we didn't end up 3502 // considering this function for cloning, and in that case the alloc 3503 // will still be none type or should have gotten the default NotCold. 3504 // Skip that after calling clone helper since that does some sanity 3505 // checks that confirm we haven't decided yet that we need cloning. 3506 if (AllocNode.Versions.size() == 1) { 3507 assert((AllocationType)AllocNode.Versions[0] == 3508 AllocationType::NotCold || 3509 (AllocationType)AllocNode.Versions[0] == 3510 AllocationType::None); 3511 UnclonableAllocsThinBackend++; 3512 continue; 3513 } 3514 3515 // All versions should have a singular allocation type. 3516 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { 3517 return Type == ((uint8_t)AllocationType::NotCold | 3518 (uint8_t)AllocationType::Cold); 3519 })); 3520 3521 // Update the allocation types per the summary info. 3522 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { 3523 // Ignore any that didn't get an assigned allocation type. 3524 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) 3525 continue; 3526 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; 3527 AllocTy == AllocationType::Cold ? 
AllocTypeColdThinBackend++ 3528 : AllocTypeNotColdThinBackend++; 3529 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy); 3530 auto A = llvm::Attribute::get(F.getContext(), "memprof", 3531 AllocTypeString); 3532 CallBase *CBClone; 3533 // Copy 0 is the original function. 3534 if (!J) 3535 CBClone = CB; 3536 else 3537 // Since VMaps are only created for new clones, we index with 3538 // clone J-1 (J==0 is the original clone and does not have a VMaps 3539 // entry). 3540 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); 3541 CBClone->addFnAttr(A); 3542 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone) 3543 << ore::NV("AllocationCall", CBClone) << " in clone " 3544 << ore::NV("Caller", CBClone->getFunction()) 3545 << " marked with memprof allocation attribute " 3546 << ore::NV("Attribute", AllocTypeString)); 3547 } 3548 } else if (!CallsiteContext.empty()) { 3549 // Consult the next callsite node. 3550 assert(SI != FS->callsites().end()); 3551 auto &StackNode = *(SI++); 3552 3553 #ifndef NDEBUG 3554 // Sanity check that the stack ids match between the summary and 3555 // instruction metadata. 3556 auto StackIdIndexIter = StackNode.StackIdIndices.begin(); 3557 for (auto StackId : CallsiteContext) { 3558 assert(StackIdIndexIter != StackNode.StackIdIndices.end()); 3559 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == 3560 StackId); 3561 StackIdIndexIter++; 3562 } 3563 #endif 3564 3565 CloneCallsite(StackNode, CB, CalledFunction); 3566 } else if (CB->isTailCall()) { 3567 // Locate the synthesized callsite info for the callee VI, if any was 3568 // created, and use that for cloning. 3569 ValueInfo CalleeVI = 3570 findValueInfoForFunc(*CalledFunction, M, ImportSummary); 3571 if (CalleeVI && MapTailCallCalleeVIToCallsite.count(CalleeVI)) { 3572 auto Callsite = MapTailCallCalleeVIToCallsite.find(CalleeVI); 3573 assert(Callsite != MapTailCallCalleeVIToCallsite.end()); 3574 CloneCallsite(Callsite->second, CB, CalledFunction); 3575 } 3576 } 3577 // Memprof and callsite metadata on memory allocations no longer needed. 3578 I.setMetadata(LLVMContext::MD_memprof, nullptr); 3579 I.setMetadata(LLVMContext::MD_callsite, nullptr); 3580 } 3581 } 3582 } 3583 3584 return Changed; 3585 } 3586 3587 template <typename DerivedCCG, typename FuncTy, typename CallTy> 3588 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() { 3589 if (DumpCCG) { 3590 dbgs() << "CCG before cloning:\n"; 3591 dbgs() << *this; 3592 } 3593 if (ExportToDot) 3594 exportToDot("postbuild"); 3595 3596 if (VerifyCCG) { 3597 check(); 3598 } 3599 3600 identifyClones(); 3601 3602 if (VerifyCCG) { 3603 check(); 3604 } 3605 3606 if (DumpCCG) { 3607 dbgs() << "CCG after cloning:\n"; 3608 dbgs() << *this; 3609 } 3610 if (ExportToDot) 3611 exportToDot("cloned"); 3612 3613 bool Changed = assignFunctions(); 3614 3615 if (DumpCCG) { 3616 dbgs() << "CCG after assigning function clones:\n"; 3617 dbgs() << *this; 3618 } 3619 if (ExportToDot) 3620 exportToDot("clonefuncassign"); 3621 3622 return Changed; 3623 } 3624 3625 bool MemProfContextDisambiguation::processModule( 3626 Module &M, 3627 llvm::function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) { 3628 3629 // If we have an import summary, then the cloning decisions were made during 3630 // the thin link on the index. Apply them and return. 
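  // (In that case no graph is built in this backend; all cloning decisions come
  // from the combined summary index produced during the whole-program step.)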
3631 if (ImportSummary) 3632 return applyImport(M); 3633 3634 // TODO: If/when other types of memprof cloning are enabled beyond just for 3635 // hot and cold, we will need to change this to individually control the 3636 // AllocationType passed to addStackNodesForMIB during CCG construction. 3637 // Note that we specifically check this after applying imports above, so that 3638 // the option isn't needed to be passed to distributed ThinLTO backend 3639 // clang processes, which won't necessarily have visibility into the linker 3640 // dependences. Instead the information is communicated from the LTO link to 3641 // the backends via the combined summary index. 3642 if (!SupportsHotColdNew) 3643 return false; 3644 3645 ModuleCallsiteContextGraph CCG(M, OREGetter); 3646 return CCG.process(); 3647 } 3648 3649 MemProfContextDisambiguation::MemProfContextDisambiguation( 3650 const ModuleSummaryIndex *Summary) 3651 : ImportSummary(Summary) { 3652 if (ImportSummary) { 3653 // The MemProfImportSummary should only be used for testing ThinLTO 3654 // distributed backend handling via opt, in which case we don't have a 3655 // summary from the pass pipeline. 3656 assert(MemProfImportSummary.empty()); 3657 return; 3658 } 3659 if (MemProfImportSummary.empty()) 3660 return; 3661 3662 auto ReadSummaryFile = 3663 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary)); 3664 if (!ReadSummaryFile) { 3665 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(), 3666 "Error loading file '" + MemProfImportSummary + 3667 "': "); 3668 return; 3669 } 3670 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile); 3671 if (!ImportSummaryForTestingOrErr) { 3672 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(), 3673 "Error parsing file '" + MemProfImportSummary + 3674 "': "); 3675 return; 3676 } 3677 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr); 3678 ImportSummary = ImportSummaryForTesting.get(); 3679 } 3680 3681 PreservedAnalyses MemProfContextDisambiguation::run(Module &M, 3682 ModuleAnalysisManager &AM) { 3683 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 3684 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { 3685 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); 3686 }; 3687 if (!processModule(M, OREGetter)) 3688 return PreservedAnalyses::all(); 3689 return PreservedAnalyses::none(); 3690 } 3691 3692 void MemProfContextDisambiguation::run( 3693 ModuleSummaryIndex &Index, 3694 llvm::function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 3695 isPrevailing) { 3696 // TODO: If/when other types of memprof cloning are enabled beyond just for 3697 // hot and cold, we will need to change this to individually control the 3698 // AllocationType passed to addStackNodesForMIB during CCG construction. 3699 // The index was set from the option, so these should be in sync. 3700 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew); 3701 if (!SupportsHotColdNew) 3702 return; 3703 3704 IndexCallsiteContextGraph CCG(Index, isPrevailing); 3705 CCG.process(); 3706 } 3707
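// Usage sketch for exercising this pass directly with opt (a minimal example,
// assuming the pass name matches DEBUG_TYPE and using hypothetical file names;
// the flags are the cl::opts defined at the top of this file):
//
//   # Regular LTO style: operate on IR carrying !memprof/!callsite metadata.
//   opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
//       -S input.ll -o output.ll
//
//   # Distributed ThinLTO backend testing: apply decisions recorded in a
//   # combined summary index bitcode file produced by the thin link.
//   opt -passes=memprof-context-disambiguation \
//       -memprof-import-summary=combined-index.bc -S input.ll -o output.ll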