//==-- MemProfContextDisambiguation.cpp - Disambiguate contexts -------------=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements support for context disambiguation of allocation
// calls for profile guided heap optimization. Specifically, it uses MemProf
// profiles which indicate context specific allocation behavior (currently
// distinguishing cold vs hot memory allocations). Cloning is performed to
// expose the cold allocation call contexts, and the allocation calls are
// subsequently annotated with an attribute for later transformation.
//
// The transformations can be performed either directly on IR (regular LTO), or
// on a ThinLTO index (and later applied to the IR during the ThinLTO backend).
// Both types of LTO operate on the same base graph representation, which
// uses CRTP to support either IR or Index formats.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/IPO/MemProfContextDisambiguation.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryProfileInfo.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include <sstream>
#include <vector>
using namespace llvm;
using namespace llvm::memprof;

#define DEBUG_TYPE "memprof-context-disambiguation"

STATISTIC(FunctionClonesAnalysis,
          "Number of function clones created during whole program analysis");
STATISTIC(FunctionClonesThinBackend,
          "Number of function clones created during ThinLTO backend");
STATISTIC(FunctionsClonedThinBackend,
          "Number of functions that had clones created during ThinLTO backend");
STATISTIC(AllocTypeNotCold, "Number of not cold static allocations (possibly "
                            "cloned) during whole program analysis");
STATISTIC(AllocTypeCold, "Number of cold static allocations (possibly cloned) "
                         "during whole program analysis");
STATISTIC(AllocTypeNotColdThinBackend,
          "Number of not cold static allocations (possibly cloned) during "
          "ThinLTO backend");
STATISTIC(AllocTypeColdThinBackend, "Number of cold static allocations "
                                    "(possibly cloned) during ThinLTO backend");
STATISTIC(OrigAllocsThinBackend,
          "Number of original (not cloned) allocations with memprof profiles "
          "during ThinLTO backend");
STATISTIC(
    AllocVersionsThinBackend,
    "Number of allocation versions (including clones) during ThinLTO backend");
STATISTIC(MaxAllocVersionsThinBackend,
          "Maximum number of allocation versions created for an original "
          "allocation during ThinLTO backend");
STATISTIC(UnclonableAllocsThinBackend,
          "Number of unclonable ambiguous allocations during ThinLTO backend");

static cl::opt<std::string> DotFilePathPrefix(
    "memprof-dot-file-path-prefix", cl::init(""), cl::Hidden,
    cl::value_desc("filename"),
    cl::desc("Specify the path prefix of the MemProf dot files."));

static cl::opt<bool> ExportToDot("memprof-export-to-dot", cl::init(false),
                                 cl::Hidden,
                                 cl::desc("Export graph to dot files."));

static cl::opt<bool>
    DumpCCG("memprof-dump-ccg", cl::init(false), cl::Hidden,
            cl::desc("Dump CallingContextGraph to stdout after each stage."));

static cl::opt<bool>
    VerifyCCG("memprof-verify-ccg", cl::init(false), cl::Hidden,
              cl::desc("Perform verification checks on CallingContextGraph."));

static cl::opt<bool>
    VerifyNodes("memprof-verify-nodes", cl::init(false), cl::Hidden,
                cl::desc("Perform frequent verification checks on nodes."));

static cl::opt<std::string> MemProfImportSummary(
    "memprof-import-summary",
    cl::desc("Import summary to use for testing the ThinLTO backend via opt"),
    cl::Hidden);

// Indicate we are linking with an allocator that supports hot/cold operator
// new interfaces.
cl::opt<bool> SupportsHotColdNew(
    "supports-hot-cold-new", cl::init(false), cl::Hidden,
    cl::desc("Linking with hot/cold operator new interfaces"));

namespace {
/// CRTP base for graphs built from either IR or ThinLTO summary index.
///
/// The graph represents the call contexts in all memprof metadata on allocation
/// calls, with nodes for the allocations themselves, as well as for the calls
/// in each context. The graph is initially built from the allocation memprof
/// metadata (or summary) MIBs. It is then updated to match calls with callsite
/// metadata onto the nodes, updating it to reflect any inlining performed on
/// those calls.
///
/// Each MIB (representing an allocation's call context with allocation
/// behavior) is assigned a unique context id during the graph build. The edges
/// and nodes in the graph are decorated with the context ids they carry. This
/// is used to correctly update the graph when cloning is performed so that we
/// can uniquify the context for a single (possibly cloned) allocation.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
class CallsiteContextGraph {
public:
  CallsiteContextGraph() = default;
  CallsiteContextGraph(const CallsiteContextGraph &) = default;
  CallsiteContextGraph(CallsiteContextGraph &&) = default;

  /// Main entry point to perform analysis and transformations on graph.
  bool process();

  /// Perform cloning on the graph necessary to uniquely identify the allocation
  /// behavior of an allocation based on its context.
  void identifyClones();

  /// Assign callsite clones to functions, cloning functions as needed to
  /// accommodate the combinations of their callsite clones reached by callers.
  /// For regular LTO this clones functions and callsites in the IR, but for
  /// ThinLTO the cloning decisions are noted in the summaries and later applied
  /// in applyImport.
  bool assignFunctions();

  void dump() const;
  void print(raw_ostream &OS) const;

  friend raw_ostream &operator<<(raw_ostream &OS,
                                 const CallsiteContextGraph &CCG) {
    CCG.print(OS);
    return OS;
  }

  friend struct GraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;
  friend struct DOTGraphTraits<
      const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *>;

  void exportToDot(std::string Label) const;

  /// Represents a function clone via FuncTy pointer and clone number pair.
  struct FuncInfo final
      : public std::pair<FuncTy *, unsigned /*Clone number*/> {
    using Base = std::pair<FuncTy *, unsigned>;
    FuncInfo(const Base &B) : Base(B) {}
    FuncInfo(FuncTy *F = nullptr, unsigned CloneNo = 0) : Base(F, CloneNo) {}
    explicit operator bool() const { return this->first != nullptr; }
    FuncTy *func() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
  };

  /// Represents a callsite clone via CallTy and clone number pair.
  struct CallInfo final : public std::pair<CallTy, unsigned /*Clone number*/> {
    using Base = std::pair<CallTy, unsigned>;
    CallInfo(const Base &B) : Base(B) {}
    CallInfo(CallTy Call = nullptr, unsigned CloneNo = 0)
        : Base(Call, CloneNo) {}
    explicit operator bool() const { return (bool)this->first; }
    CallTy call() const { return this->first; }
    unsigned cloneNo() const { return this->second; }
    void setCloneNo(unsigned N) { this->second = N; }
    void print(raw_ostream &OS) const {
      if (!operator bool()) {
        assert(!cloneNo());
        OS << "null Call";
        return;
      }
      call()->print(OS);
      OS << "\t(clone " << cloneNo() << ")";
    }
    void dump() const {
      print(dbgs());
      dbgs() << "\n";
    }
    friend raw_ostream &operator<<(raw_ostream &OS, const CallInfo &Call) {
      Call.print(OS);
      return OS;
    }
  };

  struct ContextEdge;

  /// Node in the Callsite Context Graph
  struct ContextNode {
    // Keep this for now since in the IR case where we have an Instruction* it
    // is not as immediately discoverable. Used for printing richer information
    // when dumping graph.
    bool IsAllocation;

    // Keeps track of when the Call was reset to null because there was
    // recursion.
    bool Recursive = false;

    // The corresponding allocation or interior call.
    CallInfo Call;

    // For alloc nodes this is a unique id assigned when constructed, and for
    // callsite stack nodes it is the original stack id when the node is
    // constructed from the memprof MIB metadata on the alloc nodes. Note that
    // this is only used when matching callsite metadata onto the stack nodes
    // created when processing the allocation memprof MIBs, and for labeling
    // nodes in the dot graph. Therefore we don't bother to assign a value for
    // clones.
    uint64_t OrigStackOrAllocId = 0;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this node.
    uint8_t AllocTypes = 0;

    // Edges to all callees in the profiled call stacks.
    // TODO: Should this be a map (from Callee node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CalleeEdges;

    // Edges to all callers in the profiled call stacks.
    // TODO: Should this be a map (from Caller node) for more efficient lookup?
    std::vector<std::shared_ptr<ContextEdge>> CallerEdges;

    // The set of IDs for contexts including this node.
    DenseSet<uint32_t> ContextIds;

    // List of clones of this ContextNode, initially empty.
    std::vector<ContextNode *> Clones;

    // If a clone, points to the original uncloned node.
    ContextNode *CloneOf = nullptr;

    ContextNode(bool IsAllocation) : IsAllocation(IsAllocation), Call() {}

    ContextNode(bool IsAllocation, CallInfo C)
        : IsAllocation(IsAllocation), Call(C) {}

    void addClone(ContextNode *Clone) {
      if (CloneOf) {
        CloneOf->Clones.push_back(Clone);
        Clone->CloneOf = CloneOf;
      } else {
        Clones.push_back(Clone);
        assert(!Clone->CloneOf);
        Clone->CloneOf = this;
      }
    }

    ContextNode *getOrigNode() {
      if (!CloneOf)
        return this;
      return CloneOf;
    }

    void addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                               unsigned int ContextId);

    ContextEdge *findEdgeFromCallee(const ContextNode *Callee);
    ContextEdge *findEdgeFromCaller(const ContextNode *Caller);
    void eraseCalleeEdge(const ContextEdge *Edge);
    void eraseCallerEdge(const ContextEdge *Edge);

    void setCall(CallInfo C) { Call = C; }

    bool hasCall() const { return (bool)Call.call(); }

    void printCall(raw_ostream &OS) const { Call.print(OS); }

    // True if this node was effectively removed from the graph, in which case
    // its context id set, caller edges, and callee edges should all be empty.
    bool isRemoved() const {
      assert(ContextIds.empty() ==
             (CalleeEdges.empty() && CallerEdges.empty()));
      return ContextIds.empty();
    }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextNode &Node) {
      Node.print(OS);
      return OS;
    }
  };

  /// Edge in the Callsite Context Graph from a ContextNode N to a caller or
  /// callee.
  struct ContextEdge {
    ContextNode *Callee;
    ContextNode *Caller;

    // This will be formed by ORing together the AllocationType enum values
    // for contexts including this edge.
    uint8_t AllocTypes = 0;

    // The set of IDs for contexts including this edge.
    DenseSet<uint32_t> ContextIds;

    ContextEdge(ContextNode *Callee, ContextNode *Caller, uint8_t AllocType,
                DenseSet<uint32_t> ContextIds)
        : Callee(Callee), Caller(Caller), AllocTypes(AllocType),
          ContextIds(ContextIds) {}

    DenseSet<uint32_t> &getContextIds() { return ContextIds; }

    void dump() const;
    void print(raw_ostream &OS) const;

    friend raw_ostream &operator<<(raw_ostream &OS, const ContextEdge &Edge) {
      Edge.print(OS);
      return OS;
    }
  };

  /// Helper to remove callee edges that have allocation type None (due to not
  /// carrying any context ids) after transformations.
  void removeNoneTypeCalleeEdges(ContextNode *Node);

protected:
  /// Get a list of nodes corresponding to the stack ids in the given callsite
  /// context.
  template <class NodeT, class IteratorT>
  std::vector<uint64_t>
  getStackIdsWithContextNodes(CallStack<NodeT, IteratorT> &CallsiteContext);

  /// Adds nodes for the given allocation and any stack ids on its memprof MIB
  /// metadata (or summary).
  ContextNode *addAllocNode(CallInfo Call, const FuncTy *F);

  /// Adds nodes for the given MIB stack ids.
  template <class NodeT, class IteratorT>
  void addStackNodesForMIB(ContextNode *AllocNode,
                           CallStack<NodeT, IteratorT> &StackContext,
                           CallStack<NodeT, IteratorT> &CallsiteContext,
                           AllocationType AllocType);

  /// Matches all callsite metadata (or summary) to the nodes created for
  /// allocation memprof MIB metadata, synthesizing new nodes to reflect any
  /// inlining performed on those callsite instructions.
  void updateStackNodes();

  /// Update graph to conservatively handle any callsite stack nodes that target
  /// multiple different callee target functions.
  void handleCallsitesWithMultipleTargets();

  /// Save lists of calls with MemProf metadata in each function, for faster
  /// iteration.
  std::vector<std::pair<FuncTy *, std::vector<CallInfo>>>
      FuncToCallsWithMetadata;

  /// Map from callsite node to the enclosing caller function.
  std::map<const ContextNode *, const FuncTy *> NodeToCallingFunc;

private:
  using EdgeIter = typename std::vector<std::shared_ptr<ContextEdge>>::iterator;

  using CallContextInfo = std::tuple<CallTy, std::vector<uint64_t>,
                                     const FuncTy *, DenseSet<uint32_t>>;

  /// Assigns the given Node to calls at or inlined into the location with
  /// the Node's stack id, after post order traversing and processing its
  /// caller nodes. Uses the call information recorded in the given
  /// StackIdToMatchingCalls map, and creates new nodes for inlined sequences
  /// as needed. Called by updateStackNodes which sets up the given
  /// StackIdToMatchingCalls map.
  void assignStackNodesPostOrder(
      ContextNode *Node, DenseSet<const ContextNode *> &Visited,
      DenseMap<uint64_t, std::vector<CallContextInfo>> &StackIdToMatchingCalls);

  /// Duplicates the given set of context ids, updating the provided
  /// map from each original id with the newly generated context ids,
  /// and returning the new duplicated id set.
  DenseSet<uint32_t> duplicateContextIds(
      const DenseSet<uint32_t> &StackSequenceContextIds,
      DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Propagates all duplicated context ids across the graph.
  void propagateDuplicateContextIds(
      const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds);

  /// Connect the NewNode to OrigNode's callees if TowardsCallee is true,
  /// else to its callers. Also updates OrigNode's edges to remove any context
  /// ids moved to the newly created edge.
  void connectNewNode(ContextNode *NewNode, ContextNode *OrigNode,
                      bool TowardsCallee);

  /// Get the stack id corresponding to the given Id or Index (for IR this will
  /// return itself, for a summary index this will return the id recorded in the
  /// index for that stack id index value).
  uint64_t getStackId(uint64_t IdOrIndex) const {
    return static_cast<const DerivedCCG *>(this)->getStackId(IdOrIndex);
  }

  /// Returns true if the given call targets the given function.
  bool calleeMatchesFunc(CallTy Call, const FuncTy *Func) {
    return static_cast<DerivedCCG *>(this)->calleeMatchesFunc(Call, Func);
  }

  /// Get a list of nodes corresponding to the stack ids in the given
  /// callsite's context.
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getStackIdsWithContextNodesForCall(
        Call);
  }

  /// Get the last stack id in the context for callsite.
  uint64_t getLastStackId(CallTy Call) {
    return static_cast<DerivedCCG *>(this)->getLastStackId(Call);
  }

  /// Update the allocation call to record type of allocated memory.
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType) {
    AllocType == AllocationType::Cold ? AllocTypeCold++ : AllocTypeNotCold++;
    static_cast<DerivedCCG *>(this)->updateAllocationCall(Call, AllocType);
  }

  /// Update non-allocation call to invoke (possibly cloned) function
  /// CalleeFunc.
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc) {
    static_cast<DerivedCCG *>(this)->updateCall(CallerCall, CalleeFunc);
  }

  /// Clone the given function for the given callsite, recording mapping of all
  /// of the function's tracked calls to their new versions in the CallMap.
  /// Assigns new clones to clone number CloneNo.
  FuncInfo cloneFunctionForCallsite(
      FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap,
      std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) {
    return static_cast<DerivedCCG *>(this)->cloneFunctionForCallsite(
        Func, Call, CallMap, CallsWithMetadataInFunc, CloneNo);
  }

  /// Gets a label to use in the dot graph for the given call clone in the given
  /// function.
  std::string getLabel(const FuncTy *Func, const CallTy Call,
                       unsigned CloneNo) const {
    return static_cast<const DerivedCCG *>(this)->getLabel(Func, Call, CloneNo);
  }

  /// Helpers to find the node corresponding to the given call or stackid.
  ContextNode *getNodeForInst(const CallInfo &C);
  ContextNode *getNodeForAlloc(const CallInfo &C);
  ContextNode *getNodeForStackId(uint64_t StackId);

  /// Removes the node information recorded for the given call.
  void unsetNodeForInst(const CallInfo &C);

  /// Computes the alloc type corresponding to the given context ids, by
  /// unioning their recorded alloc types.
  uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets), optimized for the case
  /// when Node1Ids is smaller than Node2Ids.
  uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                  const DenseSet<uint32_t> &Node2Ids);

  /// Returns the allocation type of the intersection of the contexts of two
  /// nodes (based on their provided context id sets).
  uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                              const DenseSet<uint32_t> &Node2Ids);

  /// Create a clone of Edge's callee and move Edge to that new callee node,
  /// performing the necessary context id and allocation type updates.
  /// If callee's caller edge iterator is supplied, it is updated when removing
  /// the edge from that list.
  ContextNode *
  moveEdgeToNewCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                           EdgeIter *CallerEdgeI = nullptr);

  /// Change the callee of Edge to existing callee clone NewCallee, performing
  /// the necessary context id and allocation type updates.
  /// If callee's caller edge iterator is supplied, it is updated when removing
  /// the edge from that list.
  void moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge,
                                     ContextNode *NewCallee,
                                     EdgeIter *CallerEdgeI = nullptr,
                                     bool NewClone = false);

  /// Recursively perform cloning on the graph for the given Node and its
  /// callers, in order to uniquely identify the allocation behavior of an
  /// allocation given its context.
  void identifyClones(ContextNode *Node,
                      DenseSet<const ContextNode *> &Visited);

  /// Map from each context ID to the AllocationType assigned to that context.
  std::map<uint32_t, AllocationType> ContextIdToAllocationType;

  /// Identifies the context node created for a stack id when adding the MIB
  /// contexts to the graph. This is used to locate the context nodes when
  /// trying to assign the corresponding callsites with those stack ids to these
  /// nodes.
  std::map<uint64_t, ContextNode *> StackEntryIdToContextNodeMap;

  /// Maps to track the calls to their corresponding nodes in the graph.
  MapVector<CallInfo, ContextNode *> AllocationCallToContextNodeMap;
  MapVector<CallInfo, ContextNode *> NonAllocationCallToContextNodeMap;

  /// Owner of all ContextNode unique_ptrs.
  std::vector<std::unique_ptr<ContextNode>> NodeOwner;

  /// Perform sanity checks on graph when requested.
  void check() const;

  /// Keeps track of the last unique context id assigned.
  unsigned int LastContextId = 0;
};

template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextNode =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using ContextEdge =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using FuncInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::FuncInfo;
template <typename DerivedCCG, typename FuncTy, typename CallTy>
using CallInfo =
    typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::CallInfo;

/// CRTP derived class for graphs built from IR (regular LTO).
class ModuleCallsiteContextGraph
    : public CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                                  Instruction *> {
public:
  ModuleCallsiteContextGraph(
      Module &M,
      function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter);

private:
  friend CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                              Instruction *>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  bool calleeMatchesFunc(Instruction *Call, const Function *Func);
  uint64_t getLastStackId(Instruction *Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(Instruction *Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<ModuleCallsiteContextGraph, Function,
                       Instruction *>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const Function *Func, const Instruction *Call,
                       unsigned CloneNo) const;

  const Module &Mod;
  function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
};

/// Represents a call in the summary index graph, which can either be an
/// allocation or an interior callsite node in an allocation's context.
/// Holds a pointer to the corresponding data structure in the index.
struct IndexCall : public PointerUnion<CallsiteInfo *, AllocInfo *> {
  IndexCall() : PointerUnion() {}
  IndexCall(std::nullptr_t) : IndexCall() {}
  IndexCall(CallsiteInfo *StackNode) : PointerUnion(StackNode) {}
  IndexCall(AllocInfo *AllocNode) : PointerUnion(AllocNode) {}
  IndexCall(PointerUnion PT) : PointerUnion(PT) {}

  IndexCall *operator->() { return this; }

  PointerUnion<CallsiteInfo *, AllocInfo *> getBase() const { return *this; }

  void print(raw_ostream &OS) const {
    if (auto *AI = llvm::dyn_cast_if_present<AllocInfo *>(getBase())) {
      OS << *AI;
    } else {
      auto *CI = llvm::dyn_cast_if_present<CallsiteInfo *>(getBase());
      assert(CI);
      OS << *CI;
    }
  }
};

/// CRTP derived class for graphs built from summary index (ThinLTO).
class IndexCallsiteContextGraph
    : public CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                                  IndexCall> {
public:
  IndexCallsiteContextGraph(
      ModuleSummaryIndex &Index,
      function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)>
          isPrevailing);

private:
  friend CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                              IndexCall>;

  uint64_t getStackId(uint64_t IdOrIndex) const;
  bool calleeMatchesFunc(IndexCall &Call, const FunctionSummary *Func);
  uint64_t getLastStackId(IndexCall &Call);
  std::vector<uint64_t> getStackIdsWithContextNodesForCall(IndexCall &Call);
  void updateAllocationCall(CallInfo &Call, AllocationType AllocType);
  void updateCall(CallInfo &CallerCall, FuncInfo CalleeFunc);
  CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary,
                       IndexCall>::FuncInfo
  cloneFunctionForCallsite(FuncInfo &Func, CallInfo &Call,
                           std::map<CallInfo, CallInfo> &CallMap,
                           std::vector<CallInfo> &CallsWithMetadataInFunc,
                           unsigned CloneNo);
  std::string getLabel(const FunctionSummary *Func, const IndexCall &Call,
                       unsigned CloneNo) const;

  // Saves mapping from function summaries containing memprof records back to
  // their VI, for use in checking and debugging.
  std::map<const FunctionSummary *, ValueInfo> FSToVIMap;

  const ModuleSummaryIndex &Index;
};
} // namespace

namespace llvm {
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    ModuleCallsiteContextGraph, Function, Instruction *>::CallInfo>
    : public DenseMapInfo<std::pair<Instruction *, unsigned>> {};
template <>
struct DenseMapInfo<typename CallsiteContextGraph<
    IndexCallsiteContextGraph, FunctionSummary, IndexCall>::CallInfo>
    : public DenseMapInfo<std::pair<IndexCall, unsigned>> {};
template <>
struct DenseMapInfo<IndexCall>
    : public DenseMapInfo<PointerUnion<CallsiteInfo *, AllocInfo *>> {};
} // end namespace llvm

namespace {

struct FieldSeparator {
  bool Skip = true;
  const char *Sep;

  FieldSeparator(const char *Sep = ", ") : Sep(Sep) {}
};

raw_ostream &operator<<(raw_ostream &OS, FieldSeparator &FS) {
  if (FS.Skip) {
    FS.Skip = false;
    return OS;
  }
  return OS << FS.Sep;
}

// Map the uint8_t alloc types (which may contain NotCold|Cold) to the alloc
// type we should actually use on the corresponding allocation.
// If we can't clone a node that has NotCold+Cold alloc type, we will fall
// back to using NotCold. So don't bother cloning to distinguish NotCold+Cold
// from NotCold.
AllocationType allocTypeToUse(uint8_t AllocTypes) {
  assert(AllocTypes != (uint8_t)AllocationType::None);
  if (AllocTypes ==
      ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold))
    return AllocationType::NotCold;
  else
    return (AllocationType)AllocTypes;
}

// Helper to check if the alloc types for all edges recorded in the
// InAllocTypes vector match the alloc types for all edges in the Edges
// vector.
template <typename DerivedCCG, typename FuncTy, typename CallTy>
bool allocTypesMatch(
    const std::vector<uint8_t> &InAllocTypes,
    const std::vector<std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>>
        &Edges) {
  return std::equal(
      InAllocTypes.begin(), InAllocTypes.end(), Edges.begin(),
      [](const uint8_t &l,
         const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &r) {
        // Can share if one of the edges is None type - don't
        // care about the type along that edge as it doesn't
        // exist for those context ids.
        if (l == (uint8_t)AllocationType::None ||
            r->AllocTypes == (uint8_t)AllocationType::None)
          return true;
        return allocTypeToUse(l) == allocTypeToUse(r->AllocTypes);
      });
}

} // end anonymous namespace

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForInst(
    const CallInfo &C) {
  ContextNode *Node = getNodeForAlloc(C);
  if (Node)
    return Node;

  return NonAllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForAlloc(
    const CallInfo &C) {
  return AllocationCallToContextNodeMap.lookup(C);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getNodeForStackId(
    uint64_t StackId) {
  auto StackEntryNode = StackEntryIdToContextNodeMap.find(StackId);
  if (StackEntryNode != StackEntryIdToContextNodeMap.end())
    return StackEntryNode->second;
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::unsetNodeForInst(
    const CallInfo &C) {
  AllocationCallToContextNodeMap.erase(C) ||
      NonAllocationCallToContextNodeMap.erase(C);
  assert(!AllocationCallToContextNodeMap.count(C) &&
         !NonAllocationCallToContextNodeMap.count(C));
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    addOrUpdateCallerEdge(ContextNode *Caller, AllocationType AllocType,
                          unsigned int ContextId) {
  for (auto &Edge : CallerEdges) {
    if (Edge->Caller == Caller) {
      Edge->AllocTypes |= (uint8_t)AllocType;
      Edge->getContextIds().insert(ContextId);
      return;
    }
  }
  std::shared_ptr<ContextEdge> Edge = std::make_shared<ContextEdge>(
      this, Caller, (uint8_t)AllocType, DenseSet<uint32_t>({ContextId}));
  CallerEdges.push_back(Edge);
  Caller->CalleeEdges.push_back(Edge);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<
    DerivedCCG, FuncTy, CallTy>::removeNoneTypeCalleeEdges(ContextNode *Node) {
  for (auto EI = Node->CalleeEdges.begin(); EI != Node->CalleeEdges.end();) {
    auto Edge = *EI;
    if (Edge->AllocTypes == (uint8_t)AllocationType::None) {
      assert(Edge->ContextIds.empty());
      Edge->Callee->eraseCallerEdge(Edge.get());
      EI = Node->CalleeEdges.erase(EI);
    } else
      ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCallee(const ContextNode *Callee) {
  for (const auto &Edge : CalleeEdges)
    if (Edge->Callee == Callee)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    findEdgeFromCaller(const ContextNode *Caller) {
  for (const auto &Edge : CallerEdges)
    if (Edge->Caller == Caller)
      return Edge.get();
  return nullptr;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCalleeEdge(const ContextEdge *Edge) {
  auto EI =
      std::find_if(CalleeEdges.begin(), CalleeEdges.end(),
                   [Edge](const std::shared_ptr<ContextEdge> &CalleeEdge) {
                     return CalleeEdge.get() == Edge;
                   });
  assert(EI != CalleeEdges.end());
  CalleeEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::
    eraseCallerEdge(const ContextEdge *Edge) {
  auto EI =
      std::find_if(CallerEdges.begin(), CallerEdges.end(),
                   [Edge](const std::shared_ptr<ContextEdge> &CallerEdge) {
                     return CallerEdge.get() == Edge;
                   });
  assert(EI != CallerEdges.end());
  CallerEdges.erase(EI);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::computeAllocType(
    DenseSet<uint32_t> &ContextIds) {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : ContextIds) {
    AllocType |= (uint8_t)ContextIdToAllocationType[Id];
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypesImpl(
    const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
  uint8_t BothTypes =
      (uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold;
  uint8_t AllocType = (uint8_t)AllocationType::None;
  for (auto Id : Node1Ids) {
    if (!Node2Ids.count(Id))
      continue;
    AllocType |= (uint8_t)ContextIdToAllocationType[Id];
    // Bail early if alloc type reached both, no further refinement.
    if (AllocType == BothTypes)
      return AllocType;
  }
  return AllocType;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
uint8_t CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::intersectAllocTypes(
    const DenseSet<uint32_t> &Node1Ids, const DenseSet<uint32_t> &Node2Ids) {
  if (Node1Ids.size() < Node2Ids.size())
    return intersectAllocTypesImpl(Node1Ids, Node2Ids);
  else
    return intersectAllocTypesImpl(Node2Ids, Node1Ids);
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode *
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addAllocNode(
    CallInfo Call, const FuncTy *F) {
  assert(!getNodeForAlloc(Call));
  NodeOwner.push_back(
      std::make_unique<ContextNode>(/*IsAllocation=*/true, Call));
  ContextNode *AllocNode = NodeOwner.back().get();
  AllocationCallToContextNodeMap[Call] = AllocNode;
  NodeToCallingFunc[AllocNode] = F;
  // Use LastContextId as a unique id for MIB allocation nodes.
  AllocNode->OrigStackOrAllocId = LastContextId;
  // Alloc type should be updated as we add in the MIBs. We should assert
  // afterwards that it is not still None.
  AllocNode->AllocTypes = (uint8_t)AllocationType::None;

  return AllocNode;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
template <class NodeT, class IteratorT>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::addStackNodesForMIB(
    ContextNode *AllocNode, CallStack<NodeT, IteratorT> &StackContext,
    CallStack<NodeT, IteratorT> &CallsiteContext, AllocationType AllocType) {
  // Treat the hot alloc type as NotCold before the disambiguation for "hot"
  // is done.
  if (AllocType == AllocationType::Hot)
    AllocType = AllocationType::NotCold;

  ContextIdToAllocationType[++LastContextId] = AllocType;

  // Update alloc type and context ids for this MIB.
  AllocNode->AllocTypes |= (uint8_t)AllocType;
  AllocNode->ContextIds.insert(LastContextId);

  // Now add or update nodes for each stack id in alloc's context.
  // Later when processing the stack ids on non-alloc callsites we will adjust
  // for any inlining in the context.
  ContextNode *PrevNode = AllocNode;
  // Look for recursion (direct recursion should have been collapsed by
  // module summary analysis, here we should just be detecting mutual
  // recursion). Mark these nodes so we don't try to clone.
  SmallSet<uint64_t, 8> StackIdSet;
  // Skip any on the allocation call (inlining).
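  // Illustrative example (assumed shapes, not taken from a real profile): if
  // the allocation call carries callsite context {A, B} because it was inlined
  // through frames A and B, and this MIB's stack context is {A, B, C, D}, the
  // loop below starts iterating at C, so only the frames beyond the
  // allocation call's own inlined frames get stack nodes here.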
  for (auto ContextIter = StackContext.beginAfterSharedPrefix(CallsiteContext);
       ContextIter != StackContext.end(); ++ContextIter) {
    auto StackId = getStackId(*ContextIter);
    ContextNode *StackNode = getNodeForStackId(StackId);
    if (!StackNode) {
      NodeOwner.push_back(
          std::make_unique<ContextNode>(/*IsAllocation=*/false));
      StackNode = NodeOwner.back().get();
      StackEntryIdToContextNodeMap[StackId] = StackNode;
      StackNode->OrigStackOrAllocId = StackId;
    }
    auto Ins = StackIdSet.insert(StackId);
    if (!Ins.second)
      StackNode->Recursive = true;
    StackNode->ContextIds.insert(LastContextId);
    StackNode->AllocTypes |= (uint8_t)AllocType;
    PrevNode->addOrUpdateCallerEdge(StackNode, AllocType, LastContextId);
    PrevNode = StackNode;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
DenseSet<uint32_t>
CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::duplicateContextIds(
    const DenseSet<uint32_t> &StackSequenceContextIds,
    DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  DenseSet<uint32_t> NewContextIds;
  for (auto OldId : StackSequenceContextIds) {
    NewContextIds.insert(++LastContextId);
    OldToNewContextIds[OldId].insert(LastContextId);
    assert(ContextIdToAllocationType.count(OldId));
    // The new context has the same allocation type as original.
    ContextIdToAllocationType[LastContextId] = ContextIdToAllocationType[OldId];
  }
  return NewContextIds;
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    propagateDuplicateContextIds(
        const DenseMap<uint32_t, DenseSet<uint32_t>> &OldToNewContextIds) {
  // Build a set of duplicated context ids corresponding to the input id set.
  auto GetNewIds = [&OldToNewContextIds](const DenseSet<uint32_t> &ContextIds) {
    DenseSet<uint32_t> NewIds;
    for (auto Id : ContextIds)
      if (auto NewId = OldToNewContextIds.find(Id);
          NewId != OldToNewContextIds.end())
        NewIds.insert(NewId->second.begin(), NewId->second.end());
    return NewIds;
  };

  // Recursively update context ids sets along caller edges.
  auto UpdateCallers = [&](ContextNode *Node,
                           DenseSet<const ContextEdge *> &Visited,
                           auto &&UpdateCallers) -> void {
    for (const auto &Edge : Node->CallerEdges) {
      auto Inserted = Visited.insert(Edge.get());
      if (!Inserted.second)
        continue;
      ContextNode *NextNode = Edge->Caller;
      DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Edge->getContextIds());
      // Only need to recursively iterate to NextNode via this caller edge if
      // it resulted in any added ids to NextNode.
      if (!NewIdsToAdd.empty()) {
        Edge->getContextIds().insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
        NextNode->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
        UpdateCallers(NextNode, Visited, UpdateCallers);
      }
    }
  };

  DenseSet<const ContextEdge *> Visited;
  for (auto &Entry : AllocationCallToContextNodeMap) {
    auto *Node = Entry.second;
    // Update ids on the allocation nodes before calling the recursive
    // update along caller edges, since this simplifies the logic during
    // that traversal.
    DenseSet<uint32_t> NewIdsToAdd = GetNewIds(Node->ContextIds);
    Node->ContextIds.insert(NewIdsToAdd.begin(), NewIdsToAdd.end());
    UpdateCallers(Node, Visited, UpdateCallers);
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::connectNewNode(
    ContextNode *NewNode, ContextNode *OrigNode, bool TowardsCallee) {
  // Make a copy of the context ids, since this will be adjusted below as they
  // are moved.
  DenseSet<uint32_t> RemainingContextIds = NewNode->ContextIds;
  auto &OrigEdges =
      TowardsCallee ? OrigNode->CalleeEdges : OrigNode->CallerEdges;
  // Increment iterator in loop so that we can remove edges as needed.
  for (auto EI = OrigEdges.begin(); EI != OrigEdges.end();) {
    auto Edge = *EI;
    // Remove any matching context ids from Edge and return the set that were
    // found and removed; these become the new edge's context ids. Also update
    // the remaining (not found) ids.
    DenseSet<uint32_t> NewEdgeContextIds, NotFoundContextIds;
    set_subtract(Edge->getContextIds(), RemainingContextIds, NewEdgeContextIds,
                 NotFoundContextIds);
    RemainingContextIds.swap(NotFoundContextIds);
    // If no matching context ids for this edge, skip it.
    if (NewEdgeContextIds.empty()) {
      ++EI;
      continue;
    }
    if (TowardsCallee) {
      auto NewEdge = std::make_shared<ContextEdge>(
          Edge->Callee, NewNode, computeAllocType(NewEdgeContextIds),
          NewEdgeContextIds);
      NewNode->CalleeEdges.push_back(NewEdge);
      NewEdge->Callee->CallerEdges.push_back(NewEdge);
    } else {
      auto NewEdge = std::make_shared<ContextEdge>(
          NewNode, Edge->Caller, computeAllocType(NewEdgeContextIds),
          NewEdgeContextIds);
      NewNode->CallerEdges.push_back(NewEdge);
      NewEdge->Caller->CalleeEdges.push_back(NewEdge);
    }
    // Remove old edge if context ids empty.
    if (Edge->getContextIds().empty()) {
      if (TowardsCallee) {
        Edge->Callee->eraseCallerEdge(Edge.get());
        EI = OrigNode->CalleeEdges.erase(EI);
      } else {
        Edge->Caller->eraseCalleeEdge(Edge.get());
        EI = OrigNode->CallerEdges.erase(EI);
      }
      continue;
    }
    ++EI;
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::
    assignStackNodesPostOrder(ContextNode *Node,
                              DenseSet<const ContextNode *> &Visited,
                              DenseMap<uint64_t, std::vector<CallContextInfo>>
                                  &StackIdToMatchingCalls) {
  auto Inserted = Visited.insert(Node);
  if (!Inserted.second)
    return;
  // Post order traversal. Iterate over a copy since we may add nodes and
  // therefore new callers during the recursive call, invalidating any
  // iterator over the original edge vector. We don't need to process these
  // new nodes as they were already processed on creation.
  auto CallerEdges = Node->CallerEdges;
  for (auto &Edge : CallerEdges) {
    // Skip any that have been removed during the recursion.
    if (!Edge)
      continue;
    assignStackNodesPostOrder(Edge->Caller, Visited, StackIdToMatchingCalls);
  }

  // If this node's stack id is in the map, update the graph to contain new
  // nodes representing any inlining at interior callsites. Note we move the
  // associated context ids over to the new nodes.

  // Ignore this node if it is for an allocation or we didn't record any
  // stack id lists ending at it.
  if (Node->IsAllocation ||
      !StackIdToMatchingCalls.count(Node->OrigStackOrAllocId))
    return;

  auto &Calls = StackIdToMatchingCalls[Node->OrigStackOrAllocId];
  // Handle the simple case first. A single call with a single stack id.
  // In this case there is no need to create any new context nodes, simply
  // assign the context node for stack id to this Call.
  if (Calls.size() == 1) {
    auto &[Call, Ids, Func, SavedContextIds] = Calls[0];
    if (Ids.size() == 1) {
      assert(SavedContextIds.empty());
      // It should be this Node
      assert(Node == getNodeForStackId(Ids[0]));
      if (Node->Recursive)
        return;
      Node->setCall(Call);
      NonAllocationCallToContextNodeMap[Call] = Node;
      NodeToCallingFunc[Node] = Func;
      return;
    }
  }

  // Find the node for the last stack id, which should be the same
  // across all calls recorded for this id, and is this node's id.
  uint64_t LastId = Node->OrigStackOrAllocId;
  ContextNode *LastNode = getNodeForStackId(LastId);
  // We should only have kept stack ids that had nodes.
  assert(LastNode);

  for (unsigned I = 0; I < Calls.size(); I++) {
    auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
    // Skip any for which we didn't assign any ids, these don't get a node in
    // the graph.
    if (SavedContextIds.empty())
      continue;

    assert(LastId == Ids.back());

    ContextNode *FirstNode = getNodeForStackId(Ids[0]);
    assert(FirstNode);

    // Recompute the context ids for this stack id sequence (the
    // intersection of the context ids of the corresponding nodes).
    // Start with the ids we saved in the map for this call, which could be
    // duplicated context ids. We have to recompute as we might have overlap
    // between the saved context ids for different last nodes, and removed
    // them already during the post order traversal.
    set_intersect(SavedContextIds, FirstNode->ContextIds);
    ContextNode *PrevNode = nullptr;
    for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(Id);
      // We should only have kept stack ids that had nodes and weren't
      // recursive.
      assert(CurNode);
      assert(!CurNode->Recursive);
      if (!PrevNode) {
        PrevNode = CurNode;
        continue;
      }
      auto *Edge = CurNode->findEdgeFromCallee(PrevNode);
      if (!Edge) {
        SavedContextIds.clear();
        break;
      }
      PrevNode = CurNode;
      set_intersect(SavedContextIds, Edge->getContextIds());

      // If we now have no context ids for clone, skip this call.
      if (SavedContextIds.empty())
        break;
    }
    if (SavedContextIds.empty())
      continue;

    // Create new context node.
    NodeOwner.push_back(
        std::make_unique<ContextNode>(/*IsAllocation=*/false, Call));
    ContextNode *NewNode = NodeOwner.back().get();
    NodeToCallingFunc[NewNode] = Func;
    NonAllocationCallToContextNodeMap[Call] = NewNode;
    NewNode->ContextIds = SavedContextIds;
    NewNode->AllocTypes = computeAllocType(NewNode->ContextIds);

    // Connect to callees of innermost stack frame in inlined call chain.
    // This updates context ids for FirstNode's callees to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, FirstNode, /*TowardsCallee=*/true);

    // Connect to callers of outermost stack frame in inlined call chain.
    // This updates context ids for LastNode's callers to reflect those
    // moved to NewNode.
    connectNewNode(NewNode, LastNode, /*TowardsCallee=*/false);

    // Now we need to remove context ids from edges/nodes between First and
    // Last Node.
    PrevNode = nullptr;
    for (auto Id : Ids) {
      ContextNode *CurNode = getNodeForStackId(Id);
      // We should only have kept stack ids that had nodes.
      assert(CurNode);

      // Remove the context ids moved to NewNode from CurNode, and the
      // edge from the prior node.
      set_subtract(CurNode->ContextIds, NewNode->ContextIds);
      if (PrevNode) {
        auto *PrevEdge = CurNode->findEdgeFromCallee(PrevNode);
        assert(PrevEdge);
        set_subtract(PrevEdge->getContextIds(), NewNode->ContextIds);
        if (PrevEdge->getContextIds().empty()) {
          PrevNode->eraseCallerEdge(PrevEdge);
          CurNode->eraseCalleeEdge(PrevEdge);
        }
      }
      PrevNode = CurNode;
    }
  }
}

template <typename DerivedCCG, typename FuncTy, typename CallTy>
void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::updateStackNodes() {
  // Map of stack id to all calls with that as the last (outermost caller)
  // callsite id that has a context node (some might not due to pruning
  // performed during matching of the allocation profile contexts).
  // The CallContextInfo contains the Call and a list of its stack ids with
  // ContextNodes, the function containing Call, and the set of context ids
  // the analysis will eventually identify for use in any new node created
  // for that callsite.
  DenseMap<uint64_t, std::vector<CallContextInfo>> StackIdToMatchingCalls;
  for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) {
    for (auto &Call : CallsWithMetadata) {
      // Ignore allocations, already handled.
      if (AllocationCallToContextNodeMap.count(Call))
        continue;
      auto StackIdsWithContextNodes =
          getStackIdsWithContextNodesForCall(Call.call());
      // If there were no nodes created for MIBs on allocs (maybe this was in
      // the unambiguous part of the MIB stack that was pruned), ignore.
      if (StackIdsWithContextNodes.empty())
        continue;
      // Otherwise, record this Call along with the list of ids for the last
      // (outermost caller) stack id with a node.
      StackIdToMatchingCalls[StackIdsWithContextNodes.back()].push_back(
          {Call.call(), StackIdsWithContextNodes, Func, {}});
    }
  }

  // First make a pass through all stack ids that correspond to a call,
  // as identified in the above loop. Compute the context ids corresponding to
  // each of these calls when they correspond to multiple stack ids due to
  // inlining. Perform any duplication of context ids required when
  // there is more than one call with the same stack ids. Their (possibly newly
  // duplicated) context ids are saved in the StackIdToMatchingCalls map.
  DenseMap<uint32_t, DenseSet<uint32_t>> OldToNewContextIds;
  for (auto &It : StackIdToMatchingCalls) {
    auto &Calls = It.getSecond();
    // Skip single calls with a single stack id. These don't need a new node.
    if (Calls.size() == 1) {
      auto &Ids = std::get<1>(Calls[0]);
      if (Ids.size() == 1)
        continue;
    }
    // In order to do the best and maximal matching of inlined calls to context
    // node sequences we will sort the vectors of stack ids in descending order
    // of length, and within each length, lexicographically by stack id. The
    // latter is so that we can specially handle calls that have identical stack
    // id sequences (either due to cloning or artificially because of the MIB
    // context pruning).
    std::stable_sort(Calls.begin(), Calls.end(),
                     [](const CallContextInfo &A, const CallContextInfo &B) {
                       auto &IdsA = std::get<1>(A);
                       auto &IdsB = std::get<1>(B);
                       return IdsA.size() > IdsB.size() ||
                              (IdsA.size() == IdsB.size() && IdsA < IdsB);
                     });

    // Find the node for the last stack id, which should be the same
    // across all calls recorded for this id, and is the id for this
    // entry in the StackIdToMatchingCalls map.
    uint64_t LastId = It.getFirst();
    ContextNode *LastNode = getNodeForStackId(LastId);
    // We should only have kept stack ids that had nodes.
    assert(LastNode);

    if (LastNode->Recursive)
      continue;

    // Initialize the context ids with the last node's. We will subsequently
    // refine the context ids by computing the intersection along all edges.
    DenseSet<uint32_t> LastNodeContextIds = LastNode->ContextIds;
    assert(!LastNodeContextIds.empty());

    for (unsigned I = 0; I < Calls.size(); I++) {
      auto &[Call, Ids, Func, SavedContextIds] = Calls[I];
      assert(SavedContextIds.empty());
      assert(LastId == Ids.back());

      // First compute the context ids for this stack id sequence (the
      // intersection of the context ids of the corresponding nodes).
      // Start with the remaining saved ids for the last node.
      assert(!LastNodeContextIds.empty());
      DenseSet<uint32_t> StackSequenceContextIds = LastNodeContextIds;

      ContextNode *PrevNode = LastNode;
      ContextNode *CurNode = LastNode;
      bool Skip = false;

      // Iterate backwards through the stack Ids, starting after the last Id
      // in the list, which was handled once outside for all Calls.
      for (auto IdIter = Ids.rbegin() + 1; IdIter != Ids.rend(); IdIter++) {
        auto Id = *IdIter;
        CurNode = getNodeForStackId(Id);
        // We should only have kept stack ids that had nodes.
        assert(CurNode);

        if (CurNode->Recursive) {
          Skip = true;
          break;
        }

        auto *Edge = CurNode->findEdgeFromCaller(PrevNode);
        // If there is no edge then the nodes belong to different MIB contexts,
        // and we should skip this inlined context sequence. For example, this
        // particular inlined context may include stack ids A->B, and we may
        // indeed have nodes for both A and B, but it is possible that they were
        // never profiled in sequence in a single MIB for any allocation (i.e.
        // we might have profiled an allocation that involves the callsite A,
        // but through a different one of its callee callsites, and we might
        // have profiled an allocation that involves callsite B, but reached
        // from a different caller callsite).
        if (!Edge) {
          Skip = true;
          break;
        }
        PrevNode = CurNode;

        // Update the context ids, which is the intersection of the ids along
        // all edges in the sequence.
        set_intersect(StackSequenceContextIds, Edge->getContextIds());

        // If we now have no context ids for clone, skip this call.
        if (StackSequenceContextIds.empty()) {
          Skip = true;
          break;
        }
      }
      if (Skip)
        continue;

      // If some of this call's stack ids did not have corresponding nodes (due
      // to pruning), don't include any context ids for contexts that extend
      // beyond these nodes. Otherwise we would be matching part of unrelated /
      // not fully matching stack contexts. To do this, subtract any context ids
      // found in caller nodes of the last node found above.
      if (Ids.back() != getLastStackId(Call)) {
        for (const auto &PE : CurNode->CallerEdges) {
          set_subtract(StackSequenceContextIds, PE->getContextIds());
          if (StackSequenceContextIds.empty())
            break;
        }
        // If we now have no context ids for clone, skip this call.
        if (StackSequenceContextIds.empty())
          continue;
      }

      // Check if the next set of stack ids is the same (since the Calls vector
      // of tuples is sorted by the stack ids we can just look at the next one).
      bool DuplicateContextIds = false;
      if (I + 1 < Calls.size()) {
        auto NextIds = std::get<1>(Calls[I + 1]);
        DuplicateContextIds = Ids == NextIds;
      }

      // If we don't have duplicate context ids, then we can assign all the
      // context ids computed for the original node sequence to this call.
      // If there are duplicate calls with the same stack ids then we synthesize
      // new context ids that are duplicates of the originals. These are
      // assigned to SavedContextIds, which is a reference into the map entry
      // for this call, allowing us to access these ids later on.
      OldToNewContextIds.reserve(OldToNewContextIds.size() +
                                 StackSequenceContextIds.size());
      SavedContextIds =
          DuplicateContextIds
              ? duplicateContextIds(StackSequenceContextIds, OldToNewContextIds)
              : StackSequenceContextIds;
      assert(!SavedContextIds.empty());

      if (!DuplicateContextIds) {
        // Update saved last node's context ids to remove those that are
        // assigned to other calls, so that it is ready for the next call at
        // this stack id.
        set_subtract(LastNodeContextIds, StackSequenceContextIds);
        if (LastNodeContextIds.empty())
          break;
      }
    }
  }

  // Propagate the duplicate context ids over the graph.
  propagateDuplicateContextIds(OldToNewContextIds);

  if (VerifyCCG)
    check();

  // Now perform a post-order traversal over the graph, starting with the
  // allocation nodes, essentially processing nodes from callers to callees.
  // For any that contains an id in the map, update the graph to contain new
  // nodes representing any inlining at interior callsites. Note we move the
  // associated context ids over to the new nodes.
  DenseSet<const ContextNode *> Visited;
  for (auto &Entry : AllocationCallToContextNodeMap)
    assignStackNodesPostOrder(Entry.second, Visited, StackIdToMatchingCalls);
}

uint64_t ModuleCallsiteContextGraph::getLastStackId(Instruction *Call) {
  CallStack<MDNode, MDNode::op_iterator> CallsiteContext(
      Call->getMetadata(LLVMContext::MD_callsite));
  return CallsiteContext.back();
}

uint64_t IndexCallsiteContextGraph::getLastStackId(IndexCall &Call) {
  assert(isa<CallsiteInfo *>(Call.getBase()));
  CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator>
      CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase()));
  // Need to convert index into stack id.
1387 return Index.getStackIdAtIndex(CallsiteContext.back()); 1388 } 1389 1390 static const std::string MemProfCloneSuffix = ".memprof."; 1391 1392 static std::string getMemProfFuncName(Twine Base, unsigned CloneNo) { 1393 // We use CloneNo == 0 to refer to the original version, which doesn't get 1394 // renamed with a suffix. 1395 if (!CloneNo) 1396 return Base.str(); 1397 return (Base + MemProfCloneSuffix + Twine(CloneNo)).str(); 1398 } 1399 1400 std::string ModuleCallsiteContextGraph::getLabel(const Function *Func, 1401 const Instruction *Call, 1402 unsigned CloneNo) const { 1403 return (Twine(Call->getFunction()->getName()) + " -> " + 1404 cast<CallBase>(Call)->getCalledFunction()->getName()) 1405 .str(); 1406 } 1407 1408 std::string IndexCallsiteContextGraph::getLabel(const FunctionSummary *Func, 1409 const IndexCall &Call, 1410 unsigned CloneNo) const { 1411 auto VI = FSToVIMap.find(Func); 1412 assert(VI != FSToVIMap.end()); 1413 if (isa<AllocInfo *>(Call.getBase())) 1414 return (VI->second.name() + " -> alloc").str(); 1415 else { 1416 auto *Callsite = dyn_cast_if_present<CallsiteInfo *>(Call.getBase()); 1417 return (VI->second.name() + " -> " + 1418 getMemProfFuncName(Callsite->Callee.name(), 1419 Callsite->Clones[CloneNo])) 1420 .str(); 1421 } 1422 } 1423 1424 std::vector<uint64_t> 1425 ModuleCallsiteContextGraph::getStackIdsWithContextNodesForCall( 1426 Instruction *Call) { 1427 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 1428 Call->getMetadata(LLVMContext::MD_callsite)); 1429 return getStackIdsWithContextNodes<MDNode, MDNode::op_iterator>( 1430 CallsiteContext); 1431 } 1432 1433 std::vector<uint64_t> 1434 IndexCallsiteContextGraph::getStackIdsWithContextNodesForCall(IndexCall &Call) { 1435 assert(isa<CallsiteInfo *>(Call.getBase())); 1436 CallStack<CallsiteInfo, SmallVector<unsigned>::const_iterator> 1437 CallsiteContext(dyn_cast_if_present<CallsiteInfo *>(Call.getBase())); 1438 return getStackIdsWithContextNodes<CallsiteInfo, 1439 SmallVector<unsigned>::const_iterator>( 1440 CallsiteContext); 1441 } 1442 1443 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1444 template <class NodeT, class IteratorT> 1445 std::vector<uint64_t> 1446 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::getStackIdsWithContextNodes( 1447 CallStack<NodeT, IteratorT> &CallsiteContext) { 1448 std::vector<uint64_t> StackIds; 1449 for (auto IdOrIndex : CallsiteContext) { 1450 auto StackId = getStackId(IdOrIndex); 1451 ContextNode *Node = getNodeForStackId(StackId); 1452 if (!Node) 1453 break; 1454 StackIds.push_back(StackId); 1455 } 1456 return StackIds; 1457 } 1458 1459 ModuleCallsiteContextGraph::ModuleCallsiteContextGraph( 1460 Module &M, function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) 1461 : Mod(M), OREGetter(OREGetter) { 1462 for (auto &F : M) { 1463 std::vector<CallInfo> CallsWithMetadata; 1464 for (auto &BB : F) { 1465 for (auto &I : BB) { 1466 if (!isa<CallBase>(I)) 1467 continue; 1468 if (auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof)) { 1469 CallsWithMetadata.push_back(&I); 1470 auto *AllocNode = addAllocNode(&I, &F); 1471 auto *CallsiteMD = I.getMetadata(LLVMContext::MD_callsite); 1472 assert(CallsiteMD); 1473 CallStack<MDNode, MDNode::op_iterator> CallsiteContext(CallsiteMD); 1474 // Add all of the MIBs and their stack nodes. 
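// Illustrative sketch (an assumption about the encoding produced by
// MemoryProfileInfo, not code from this pass) of the metadata shape walked
// below, for a single cold context:
//   %p = call ptr @malloc(i64 8), !memprof !1, !callsite !4
//   !1 = !{!2}                 ; list of MIB nodes
//   !2 = !{!3, !"cold"}        ; each MIB: (call stack, allocation type)
//   !3 = !{i64 123, i64 456}   ; stack ids for the MIB's full context
//   !4 = !{i64 123}            ; stack id(s) of the allocation call itself
// See MemoryProfileInfo for the authoritative format.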
1475 for (auto &MDOp : MemProfMD->operands()) { 1476 auto *MIBMD = cast<const MDNode>(MDOp); 1477 MDNode *StackNode = getMIBStackNode(MIBMD); 1478 assert(StackNode); 1479 CallStack<MDNode, MDNode::op_iterator> StackContext(StackNode); 1480 addStackNodesForMIB<MDNode, MDNode::op_iterator>( 1481 AllocNode, StackContext, CallsiteContext, 1482 getMIBAllocType(MIBMD)); 1483 } 1484 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); 1485 // Memprof and callsite metadata on memory allocations no longer 1486 // needed. 1487 I.setMetadata(LLVMContext::MD_memprof, nullptr); 1488 I.setMetadata(LLVMContext::MD_callsite, nullptr); 1489 } 1490 // For callsite metadata, add to list for this function for later use. 1491 else if (I.getMetadata(LLVMContext::MD_callsite)) 1492 CallsWithMetadata.push_back(&I); 1493 } 1494 } 1495 if (!CallsWithMetadata.empty()) 1496 FuncToCallsWithMetadata.push_back({&F, CallsWithMetadata}); 1497 } 1498 1499 if (DumpCCG) { 1500 dbgs() << "CCG before updating call stack chains:\n"; 1501 dbgs() << *this; 1502 } 1503 1504 if (ExportToDot) 1505 exportToDot("prestackupdate"); 1506 1507 updateStackNodes(); 1508 1509 handleCallsitesWithMultipleTargets(); 1510 1511 // Strip off remaining callsite metadata, no longer needed. 1512 for (auto &FuncEntry : FuncToCallsWithMetadata) 1513 for (auto &Call : FuncEntry.second) 1514 Call.call()->setMetadata(LLVMContext::MD_callsite, nullptr); 1515 } 1516 1517 IndexCallsiteContextGraph::IndexCallsiteContextGraph( 1518 ModuleSummaryIndex &Index, 1519 function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 1520 isPrevailing) 1521 : Index(Index) { 1522 for (auto &I : Index) { 1523 auto VI = Index.getValueInfo(I); 1524 for (auto &S : VI.getSummaryList()) { 1525 // We should only add the prevailing nodes. Otherwise we may try to clone 1526 // in a weak copy that won't be linked (and may be different than the 1527 // prevailing version). 1528 // We only keep the memprof summary on the prevailing copy now when 1529 // building the combined index, as a space optimization, however don't 1530 // rely on this optimization. The linker doesn't resolve local linkage 1531 // values so don't check whether those are prevailing. 1532 if (!GlobalValue::isLocalLinkage(S->linkage()) && 1533 !isPrevailing(VI.getGUID(), S.get())) 1534 continue; 1535 auto *FS = dyn_cast<FunctionSummary>(S.get()); 1536 if (!FS) 1537 continue; 1538 std::vector<CallInfo> CallsWithMetadata; 1539 if (!FS->allocs().empty()) { 1540 for (auto &AN : FS->mutableAllocs()) { 1541 // This can happen because of recursion elimination handling that 1542 // currently exists in ModuleSummaryAnalysis. Skip these for now. 1543 // We still added them to the summary because we need to be able to 1544 // correlate properly in applyImport in the backends. 1545 if (AN.MIBs.empty()) 1546 continue; 1547 CallsWithMetadata.push_back({&AN}); 1548 auto *AllocNode = addAllocNode({&AN}, FS); 1549 // Pass an empty CallStack to the CallsiteContext (second) 1550 // parameter, since for ThinLTO we already collapsed out the inlined 1551 // stack ids on the allocation call during ModuleSummaryAnalysis. 1552 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> 1553 EmptyContext; 1554 // Now add all of the MIBs and their stack nodes. 
1555 for (auto &MIB : AN.MIBs) { 1556 CallStack<MIBInfo, SmallVector<unsigned>::const_iterator> 1557 StackContext(&MIB); 1558 addStackNodesForMIB<MIBInfo, SmallVector<unsigned>::const_iterator>( 1559 AllocNode, StackContext, EmptyContext, MIB.AllocType); 1560 } 1561 assert(AllocNode->AllocTypes != (uint8_t)AllocationType::None); 1562 // Initialize version 0 on the summary alloc node to the current alloc 1563 // type, unless it has both types in which case make it default, so 1564 // that in the case where we aren't able to clone the original version 1565 // always ends up with the default allocation behavior. 1566 AN.Versions[0] = (uint8_t)allocTypeToUse(AllocNode->AllocTypes); 1567 } 1568 } 1569 // For callsite metadata, add to list for this function for later use. 1570 if (!FS->callsites().empty()) 1571 for (auto &SN : FS->mutableCallsites()) 1572 CallsWithMetadata.push_back({&SN}); 1573 1574 if (!CallsWithMetadata.empty()) 1575 FuncToCallsWithMetadata.push_back({FS, CallsWithMetadata}); 1576 1577 if (!FS->allocs().empty() || !FS->callsites().empty()) 1578 FSToVIMap[FS] = VI; 1579 } 1580 } 1581 1582 if (DumpCCG) { 1583 dbgs() << "CCG before updating call stack chains:\n"; 1584 dbgs() << *this; 1585 } 1586 1587 if (ExportToDot) 1588 exportToDot("prestackupdate"); 1589 1590 updateStackNodes(); 1591 1592 handleCallsitesWithMultipleTargets(); 1593 } 1594 1595 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1596 void CallsiteContextGraph<DerivedCCG, FuncTy, 1597 CallTy>::handleCallsitesWithMultipleTargets() { 1598 // Look for and workaround callsites that call multiple functions. 1599 // This can happen for indirect calls, which needs better handling, and in 1600 // more rare cases (e.g. macro expansion). 1601 // TODO: To fix this for indirect calls we will want to perform speculative 1602 // devirtualization using either the normal PGO info with ICP, or using the 1603 // information in the profiled MemProf contexts. We can do this prior to 1604 // this transformation for regular LTO, and for ThinLTO we can simulate that 1605 // effect in the summary and perform the actual speculative devirtualization 1606 // while cloning in the ThinLTO backend. 1607 for (auto Entry = NonAllocationCallToContextNodeMap.begin(); 1608 Entry != NonAllocationCallToContextNodeMap.end();) { 1609 auto *Node = Entry->second; 1610 assert(Node->Clones.empty()); 1611 // Check all node callees and see if in the same function. 1612 bool Removed = false; 1613 auto Call = Node->Call.call(); 1614 for (auto &Edge : Node->CalleeEdges) { 1615 if (!Edge->Callee->hasCall()) 1616 continue; 1617 assert(NodeToCallingFunc.count(Edge->Callee)); 1618 // Check if the called function matches that of the callee node. 1619 if (calleeMatchesFunc(Call, NodeToCallingFunc[Edge->Callee])) 1620 continue; 1621 // Work around by setting Node to have a null call, so it gets 1622 // skipped during cloning. Otherwise assignFunctions will assert 1623 // because its data structures are not designed to handle this case. 1624 Entry = NonAllocationCallToContextNodeMap.erase(Entry); 1625 Node->setCall(CallInfo()); 1626 Removed = true; 1627 break; 1628 } 1629 if (!Removed) 1630 Entry++; 1631 } 1632 } 1633 1634 uint64_t ModuleCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { 1635 // In the Module (IR) case this is already the Id. 
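// (The Index variant below differs: the ThinLTO summary stores an index into
// the combined index's stack id list, which getStackIdAtIndex converts back
// to the 64-bit stack id before use.)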
1636 return IdOrIndex; 1637 } 1638 1639 uint64_t IndexCallsiteContextGraph::getStackId(uint64_t IdOrIndex) const { 1640 // In the Index case this is an index into the stack id list in the summary 1641 // index, convert it to an Id. 1642 return Index.getStackIdAtIndex(IdOrIndex); 1643 } 1644 1645 bool ModuleCallsiteContextGraph::calleeMatchesFunc(Instruction *Call, 1646 const Function *Func) { 1647 auto *CB = dyn_cast<CallBase>(Call); 1648 if (!CB->getCalledOperand()) 1649 return false; 1650 auto *CalleeVal = CB->getCalledOperand()->stripPointerCasts(); 1651 auto *CalleeFunc = dyn_cast<Function>(CalleeVal); 1652 if (CalleeFunc == Func) 1653 return true; 1654 auto *Alias = dyn_cast<GlobalAlias>(CalleeVal); 1655 return Alias && Alias->getAliasee() == Func; 1656 } 1657 1658 bool IndexCallsiteContextGraph::calleeMatchesFunc(IndexCall &Call, 1659 const FunctionSummary *Func) { 1660 ValueInfo Callee = 1661 dyn_cast_if_present<CallsiteInfo *>(Call.getBase())->Callee; 1662 // If there is no summary list then this is a call to an externally defined 1663 // symbol. 1664 AliasSummary *Alias = 1665 Callee.getSummaryList().empty() 1666 ? nullptr 1667 : dyn_cast<AliasSummary>(Callee.getSummaryList()[0].get()); 1668 assert(FSToVIMap.count(Func)); 1669 return Callee == FSToVIMap[Func] || 1670 // If callee is an alias, check the aliasee, since only function 1671 // summary base objects will contain the stack node summaries and thus 1672 // get a context node. 1673 (Alias && Alias->getAliaseeVI() == FSToVIMap[Func]); 1674 } 1675 1676 static std::string getAllocTypeString(uint8_t AllocTypes) { 1677 if (!AllocTypes) 1678 return "None"; 1679 std::string Str; 1680 if (AllocTypes & (uint8_t)AllocationType::NotCold) 1681 Str += "NotCold"; 1682 if (AllocTypes & (uint8_t)AllocationType::Cold) 1683 Str += "Cold"; 1684 return Str; 1685 } 1686 1687 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1688 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::dump() 1689 const { 1690 print(dbgs()); 1691 dbgs() << "\n"; 1692 } 1693 1694 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1695 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode::print( 1696 raw_ostream &OS) const { 1697 OS << "Node " << this << "\n"; 1698 OS << "\t"; 1699 printCall(OS); 1700 if (Recursive) 1701 OS << " (recursive)"; 1702 OS << "\n"; 1703 OS << "\tAllocTypes: " << getAllocTypeString(AllocTypes) << "\n"; 1704 OS << "\tContextIds:"; 1705 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 1706 std::sort(SortedIds.begin(), SortedIds.end()); 1707 for (auto Id : SortedIds) 1708 OS << " " << Id; 1709 OS << "\n"; 1710 OS << "\tCalleeEdges:\n"; 1711 for (auto &Edge : CalleeEdges) 1712 OS << "\t\t" << *Edge << "\n"; 1713 OS << "\tCallerEdges:\n"; 1714 for (auto &Edge : CallerEdges) 1715 OS << "\t\t" << *Edge << "\n"; 1716 if (!Clones.empty()) { 1717 OS << "\tClones: "; 1718 FieldSeparator FS; 1719 for (auto *Clone : Clones) 1720 OS << FS << Clone; 1721 OS << "\n"; 1722 } else if (CloneOf) { 1723 OS << "\tClone of " << CloneOf << "\n"; 1724 } 1725 } 1726 1727 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1728 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::dump() 1729 const { 1730 print(dbgs()); 1731 dbgs() << "\n"; 1732 } 1733 1734 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1735 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextEdge::print( 1736 raw_ostream &OS) const { 1737 OS << "Edge from Callee " << 
Callee << " to Caller: " << Caller 1738 << " AllocTypes: " << getAllocTypeString(AllocTypes); 1739 OS << " ContextIds:"; 1740 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 1741 std::sort(SortedIds.begin(), SortedIds.end()); 1742 for (auto Id : SortedIds) 1743 OS << " " << Id; 1744 } 1745 1746 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1747 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::dump() const { 1748 print(dbgs()); 1749 } 1750 1751 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1752 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::print( 1753 raw_ostream &OS) const { 1754 OS << "Callsite Context Graph:\n"; 1755 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 1756 for (const auto Node : nodes<GraphType>(this)) { 1757 if (Node->isRemoved()) 1758 continue; 1759 Node->print(OS); 1760 OS << "\n"; 1761 } 1762 } 1763 1764 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1765 static void checkEdge( 1766 const std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>> &Edge) { 1767 // Confirm that alloc type is not None and that we have at least one context 1768 // id. 1769 assert(Edge->AllocTypes != (uint8_t)AllocationType::None); 1770 assert(!Edge->ContextIds.empty()); 1771 } 1772 1773 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1774 static void checkNode(const ContextNode<DerivedCCG, FuncTy, CallTy> *Node, 1775 bool CheckEdges = true) { 1776 if (Node->isRemoved()) 1777 return; 1778 // Node's context ids should be the union of both its callee and caller edge 1779 // context ids. 1780 if (Node->CallerEdges.size()) { 1781 auto EI = Node->CallerEdges.begin(); 1782 auto &FirstEdge = *EI; 1783 EI++; 1784 DenseSet<uint32_t> CallerEdgeContextIds(FirstEdge->ContextIds); 1785 for (; EI != Node->CallerEdges.end(); EI++) { 1786 const auto &Edge = *EI; 1787 if (CheckEdges) 1788 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 1789 set_union(CallerEdgeContextIds, Edge->ContextIds); 1790 } 1791 // Node can have more context ids than callers if some contexts terminate at 1792 // node and some are longer. 
1793 assert(Node->ContextIds == CallerEdgeContextIds || 1794 set_is_subset(CallerEdgeContextIds, Node->ContextIds)); 1795 } 1796 if (Node->CalleeEdges.size()) { 1797 auto EI = Node->CalleeEdges.begin(); 1798 auto &FirstEdge = *EI; 1799 EI++; 1800 DenseSet<uint32_t> CalleeEdgeContextIds(FirstEdge->ContextIds); 1801 for (; EI != Node->CalleeEdges.end(); EI++) { 1802 const auto &Edge = *EI; 1803 if (CheckEdges) 1804 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 1805 set_union(CalleeEdgeContextIds, Edge->ContextIds); 1806 } 1807 assert(Node->ContextIds == CalleeEdgeContextIds); 1808 } 1809 } 1810 1811 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1812 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::check() const { 1813 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 1814 for (const auto Node : nodes<GraphType>(this)) { 1815 checkNode<DerivedCCG, FuncTy, CallTy>(Node, /*CheckEdges=*/false); 1816 for (auto &Edge : Node->CallerEdges) 1817 checkEdge<DerivedCCG, FuncTy, CallTy>(Edge); 1818 } 1819 } 1820 1821 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1822 struct GraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> { 1823 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 1824 using NodeRef = const ContextNode<DerivedCCG, FuncTy, CallTy> *; 1825 1826 using NodePtrTy = std::unique_ptr<ContextNode<DerivedCCG, FuncTy, CallTy>>; 1827 static NodeRef getNode(const NodePtrTy &P) { return P.get(); } 1828 1829 using nodes_iterator = 1830 mapped_iterator<typename std::vector<NodePtrTy>::const_iterator, 1831 decltype(&getNode)>; 1832 1833 static nodes_iterator nodes_begin(GraphType G) { 1834 return nodes_iterator(G->NodeOwner.begin(), &getNode); 1835 } 1836 1837 static nodes_iterator nodes_end(GraphType G) { 1838 return nodes_iterator(G->NodeOwner.end(), &getNode); 1839 } 1840 1841 static NodeRef getEntryNode(GraphType G) { 1842 return G->NodeOwner.begin()->get(); 1843 } 1844 1845 using EdgePtrTy = std::shared_ptr<ContextEdge<DerivedCCG, FuncTy, CallTy>>; 1846 static const ContextNode<DerivedCCG, FuncTy, CallTy> * 1847 GetCallee(const EdgePtrTy &P) { 1848 return P->Callee; 1849 } 1850 1851 using ChildIteratorType = 1852 mapped_iterator<typename std::vector<std::shared_ptr<ContextEdge< 1853 DerivedCCG, FuncTy, CallTy>>>::const_iterator, 1854 decltype(&GetCallee)>; 1855 1856 static ChildIteratorType child_begin(NodeRef N) { 1857 return ChildIteratorType(N->CalleeEdges.begin(), &GetCallee); 1858 } 1859 1860 static ChildIteratorType child_end(NodeRef N) { 1861 return ChildIteratorType(N->CalleeEdges.end(), &GetCallee); 1862 } 1863 }; 1864 1865 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1866 struct DOTGraphTraits<const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *> 1867 : public DefaultDOTGraphTraits { 1868 DOTGraphTraits(bool IsSimple = false) : DefaultDOTGraphTraits(IsSimple) {} 1869 1870 using GraphType = const CallsiteContextGraph<DerivedCCG, FuncTy, CallTy> *; 1871 using GTraits = GraphTraits<GraphType>; 1872 using NodeRef = typename GTraits::NodeRef; 1873 using ChildIteratorType = typename GTraits::ChildIteratorType; 1874 1875 static std::string getNodeLabel(NodeRef Node, GraphType G) { 1876 std::string LabelString = 1877 (Twine("OrigId: ") + (Node->IsAllocation ? 
"Alloc" : "") + 1878 Twine(Node->OrigStackOrAllocId)) 1879 .str(); 1880 LabelString += "\n"; 1881 if (Node->hasCall()) { 1882 auto Func = G->NodeToCallingFunc.find(Node); 1883 assert(Func != G->NodeToCallingFunc.end()); 1884 LabelString += 1885 G->getLabel(Func->second, Node->Call.call(), Node->Call.cloneNo()); 1886 } else { 1887 LabelString += "null call"; 1888 if (Node->Recursive) 1889 LabelString += " (recursive)"; 1890 else 1891 LabelString += " (external)"; 1892 } 1893 return LabelString; 1894 } 1895 1896 static std::string getNodeAttributes(NodeRef Node, GraphType) { 1897 std::string AttributeString = (Twine("tooltip=\"") + getNodeId(Node) + " " + 1898 getContextIds(Node->ContextIds) + "\"") 1899 .str(); 1900 AttributeString += 1901 (Twine(",fillcolor=\"") + getColor(Node->AllocTypes) + "\"").str(); 1902 AttributeString += ",style=\"filled\""; 1903 if (Node->CloneOf) { 1904 AttributeString += ",color=\"blue\""; 1905 AttributeString += ",style=\"filled,bold,dashed\""; 1906 } else 1907 AttributeString += ",style=\"filled\""; 1908 return AttributeString; 1909 } 1910 1911 static std::string getEdgeAttributes(NodeRef, ChildIteratorType ChildIter, 1912 GraphType) { 1913 auto &Edge = *(ChildIter.getCurrent()); 1914 return (Twine("tooltip=\"") + getContextIds(Edge->ContextIds) + "\"" + 1915 Twine(",fillcolor=\"") + getColor(Edge->AllocTypes) + "\"") 1916 .str(); 1917 } 1918 1919 // Since the NodeOwners list includes nodes that are no longer connected to 1920 // the graph, skip them here. 1921 static bool isNodeHidden(NodeRef Node, GraphType) { 1922 return Node->isRemoved(); 1923 } 1924 1925 private: 1926 static std::string getContextIds(const DenseSet<uint32_t> &ContextIds) { 1927 std::string IdString = "ContextIds:"; 1928 if (ContextIds.size() < 100) { 1929 std::vector<uint32_t> SortedIds(ContextIds.begin(), ContextIds.end()); 1930 std::sort(SortedIds.begin(), SortedIds.end()); 1931 for (auto Id : SortedIds) 1932 IdString += (" " + Twine(Id)).str(); 1933 } else { 1934 IdString += (" (" + Twine(ContextIds.size()) + " ids)").str(); 1935 } 1936 return IdString; 1937 } 1938 1939 static std::string getColor(uint8_t AllocTypes) { 1940 if (AllocTypes == (uint8_t)AllocationType::NotCold) 1941 // Color "brown1" actually looks like a lighter red. 1942 return "brown1"; 1943 if (AllocTypes == (uint8_t)AllocationType::Cold) 1944 return "cyan"; 1945 if (AllocTypes == 1946 ((uint8_t)AllocationType::NotCold | (uint8_t)AllocationType::Cold)) 1947 // Lighter purple. 1948 return "mediumorchid1"; 1949 return "gray"; 1950 } 1951 1952 static std::string getNodeId(NodeRef Node) { 1953 std::stringstream SStream; 1954 SStream << std::hex << "N0x" << (unsigned long long)Node; 1955 std::string Result = SStream.str(); 1956 return Result; 1957 } 1958 }; 1959 1960 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1961 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::exportToDot( 1962 std::string Label) const { 1963 WriteGraph(this, "", false, Label, 1964 DotFilePathPrefix + "ccg." 
+ Label + ".dot"); 1965 } 1966 1967 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1968 typename CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::ContextNode * 1969 CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::moveEdgeToNewCalleeClone( 1970 const std::shared_ptr<ContextEdge> &Edge, EdgeIter *CallerEdgeI) { 1971 ContextNode *Node = Edge->Callee; 1972 NodeOwner.push_back( 1973 std::make_unique<ContextNode>(Node->IsAllocation, Node->Call)); 1974 ContextNode *Clone = NodeOwner.back().get(); 1975 Node->addClone(Clone); 1976 assert(NodeToCallingFunc.count(Node)); 1977 NodeToCallingFunc[Clone] = NodeToCallingFunc[Node]; 1978 moveEdgeToExistingCalleeClone(Edge, Clone, CallerEdgeI, /*NewClone=*/true); 1979 return Clone; 1980 } 1981 1982 template <typename DerivedCCG, typename FuncTy, typename CallTy> 1983 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>:: 1984 moveEdgeToExistingCalleeClone(const std::shared_ptr<ContextEdge> &Edge, 1985 ContextNode *NewCallee, EdgeIter *CallerEdgeI, 1986 bool NewClone) { 1987 // NewCallee and Edge's current callee must be clones of the same original 1988 // node (Edge's current callee may be the original node too). 1989 assert(NewCallee->getOrigNode() == Edge->Callee->getOrigNode()); 1990 auto &EdgeContextIds = Edge->getContextIds(); 1991 ContextNode *OldCallee = Edge->Callee; 1992 if (CallerEdgeI) 1993 *CallerEdgeI = OldCallee->CallerEdges.erase(*CallerEdgeI); 1994 else 1995 OldCallee->eraseCallerEdge(Edge.get()); 1996 Edge->Callee = NewCallee; 1997 NewCallee->CallerEdges.push_back(Edge); 1998 // Don't need to update Edge's context ids since we are simply reconnecting 1999 // it. 2000 set_subtract(OldCallee->ContextIds, EdgeContextIds); 2001 NewCallee->ContextIds.insert(EdgeContextIds.begin(), EdgeContextIds.end()); 2002 NewCallee->AllocTypes |= Edge->AllocTypes; 2003 OldCallee->AllocTypes = computeAllocType(OldCallee->ContextIds); 2004 // OldCallee alloc type should be None iff its context id set is now empty. 2005 assert((OldCallee->AllocTypes == (uint8_t)AllocationType::None) == 2006 OldCallee->ContextIds.empty()); 2007 // Now walk the old callee node's callee edges and move Edge's context ids 2008 // over to the corresponding edge into the clone (which is created here if 2009 // this is a newly created clone). 2010 for (auto &OldCalleeEdge : OldCallee->CalleeEdges) { 2011 // The context ids moving to the new callee are the subset of this edge's 2012 // context ids and the context ids on the caller edge being moved. 2013 DenseSet<uint32_t> EdgeContextIdsToMove = 2014 set_intersection(OldCalleeEdge->getContextIds(), EdgeContextIds); 2015 set_subtract(OldCalleeEdge->getContextIds(), EdgeContextIdsToMove); 2016 OldCalleeEdge->AllocTypes = 2017 computeAllocType(OldCalleeEdge->getContextIds()); 2018 if (!NewClone) { 2019 // Update context ids / alloc type on corresponding edge to NewCallee. 2020 // There is a chance this may not exist if we are reusing an existing 2021 // clone, specifically during function assignment, where we would have 2022 // removed none type edges after creating the clone. If we can't find 2023 // a corresponding edge there, fall through to the cloning below. 
2024 if (auto *NewCalleeEdge =
2025 NewCallee->findEdgeFromCallee(OldCalleeEdge->Callee)) {
2026 NewCalleeEdge->getContextIds().insert(EdgeContextIdsToMove.begin(),
2027 EdgeContextIdsToMove.end());
2028 NewCalleeEdge->AllocTypes |= computeAllocType(EdgeContextIdsToMove);
2029 continue;
2030 }
2031 }
2032 auto NewEdge = std::make_shared<ContextEdge>(
2033 OldCalleeEdge->Callee, NewCallee,
2034 computeAllocType(EdgeContextIdsToMove), EdgeContextIdsToMove);
2035 NewCallee->CalleeEdges.push_back(NewEdge);
2036 NewEdge->Callee->CallerEdges.push_back(NewEdge);
2037 }
2038 if (VerifyCCG) {
2039 checkNode<DerivedCCG, FuncTy, CallTy>(OldCallee, /*CheckEdges=*/false);
2040 checkNode<DerivedCCG, FuncTy, CallTy>(NewCallee, /*CheckEdges=*/false);
2041 for (const auto &OldCalleeEdge : OldCallee->CalleeEdges)
2042 checkNode<DerivedCCG, FuncTy, CallTy>(OldCalleeEdge->Callee,
2043 /*CheckEdges=*/false);
2044 for (const auto &NewCalleeEdge : NewCallee->CalleeEdges)
2045 checkNode<DerivedCCG, FuncTy, CallTy>(NewCalleeEdge->Callee,
2046 /*CheckEdges=*/false);
2047 }
2048 }
2049
2050 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2051 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones() {
2052 DenseSet<const ContextNode *> Visited;
2053 for (auto &Entry : AllocationCallToContextNodeMap)
2054 identifyClones(Entry.second, Visited);
2055 }
2056
2057 // Helper function to check whether an AllocType is cold, notcold, or both.
2058 bool checkColdOrNotCold(uint8_t AllocType) {
2059 return (AllocType == (uint8_t)AllocationType::Cold) ||
2060 (AllocType == (uint8_t)AllocationType::NotCold) ||
2061 (AllocType ==
2062 ((uint8_t)AllocationType::Cold | (uint8_t)AllocationType::NotCold));
2063 }
2064
2065 template <typename DerivedCCG, typename FuncTy, typename CallTy>
2066 void CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::identifyClones(
2067 ContextNode *Node, DenseSet<const ContextNode *> &Visited) {
2068 if (VerifyNodes)
2069 checkNode<DerivedCCG, FuncTy, CallTy>(Node);
2070 assert(!Node->CloneOf);
2071
2072 // If Node has a null call, then either it wasn't found in the module (regular
2073 // LTO) or summary index (ThinLTO), or there were other conditions blocking
2074 // cloning (e.g. recursion, calls multiple targets, etc).
2075 // Do this here so that we don't try to recursively clone callers below, which
2076 // isn't useful at least for this node.
2077 if (!Node->hasCall())
2078 return;
2079
2080 #ifndef NDEBUG
2081 auto Insert =
2082 #endif
2083 Visited.insert(Node);
2084 // We should not have visited this node yet.
2085 assert(Insert.second);
2086 // The recursive call to identifyClones may delete the current edge from the
2087 // CallerEdges vector. Make a copy and iterate on that, simpler than passing
2088 // in an iterator and having the recursive call erase from it. Other edges may
2089 // also get removed during the recursion, which will have null Callee and
2090 // Caller pointers (and are deleted later), so we skip those below.
2091 {
2092 auto CallerEdges = Node->CallerEdges;
2093 for (auto &Edge : CallerEdges) {
2094 // Skip any that have been removed by an earlier recursive call.
2095 if (Edge->Callee == nullptr && Edge->Caller == nullptr) {
2096 assert(!std::count(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2097 Edge));
2098 continue;
2099 }
2100 // Ignore any caller we previously visited via another edge.
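// (Note: the check below also skips callers that are themselves clones;
// clones are created during this traversal and identifyClones is only
// invoked on original nodes, per the assert at its entry.)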
if (!Visited.count(Edge->Caller) && !Edge->Caller->CloneOf) {
2102 identifyClones(Edge->Caller, Visited);
2103 }
2104 }
2105 }
2106
2107 // Check if we reached an unambiguous call or have only a single caller.
2108 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2109 return;
2110
2111 // We need to clone.
2112
2113 // Try to keep the original version as alloc type NotCold. This will make
2114 // cases with indirect calls or any other situation with an unknown call to
2115 // the original function get the default behavior. We do this by sorting the
2116 // CallerEdges of the Node we will clone by alloc type.
2117 //
2118 // Give NotCold edges the lowest sort priority so those edges are at the end of
2119 // the caller edges vector, and stay on the original version (since the below
2120 // code clones greedily until it finds all remaining edges have the same type
2121 // and leaves the remaining ones on the original Node).
2122 //
2123 // We shouldn't actually have any None type edges, so the sorting priority for
2124 // that is arbitrary, and we assert in that case below.
2125 const unsigned AllocTypeCloningPriority[] = {/*None*/ 3, /*NotCold*/ 4,
2126 /*Cold*/ 1,
2127 /*NotColdCold*/ 2};
2128 std::stable_sort(Node->CallerEdges.begin(), Node->CallerEdges.end(),
2129 [&](const std::shared_ptr<ContextEdge> &A,
2130 const std::shared_ptr<ContextEdge> &B) {
2131 assert(checkColdOrNotCold(A->AllocTypes) &&
2132 checkColdOrNotCold(B->AllocTypes));
2133
2134 if (A->AllocTypes == B->AllocTypes)
2135 // Use the first context id for each edge as a
2136 // tie-breaker.
2137 return *A->ContextIds.begin() < *B->ContextIds.begin();
2138 return AllocTypeCloningPriority[A->AllocTypes] <
2139 AllocTypeCloningPriority[B->AllocTypes];
2140 });
2141
2142 assert(Node->AllocTypes != (uint8_t)AllocationType::None);
2143
2144 // Iterate until we find no more opportunities for disambiguating the alloc
2145 // types via cloning. In most cases this loop will terminate once the Node
2146 // has a single allocation type, in which case no more cloning is needed.
2147 // We need to be able to remove Edge from CallerEdges, so need to adjust
2148 // iterator inside the loop.
2149 for (auto EI = Node->CallerEdges.begin(); EI != Node->CallerEdges.end();) {
2150 auto CallerEdge = *EI;
2151
2152 // See if cloning the prior caller edge left this node with a single alloc
2153 // type or a single caller. In that case no more cloning of Node is needed.
2154 if (hasSingleAllocType(Node->AllocTypes) || Node->CallerEdges.size() <= 1)
2155 break;
2156
2157 // Compute the node callee edge alloc types corresponding to the context ids
2158 // for this caller edge.
2159 std::vector<uint8_t> CalleeEdgeAllocTypesForCallerEdge;
2160 CalleeEdgeAllocTypesForCallerEdge.reserve(Node->CalleeEdges.size());
2161 for (auto &CalleeEdge : Node->CalleeEdges)
2162 CalleeEdgeAllocTypesForCallerEdge.push_back(intersectAllocTypes(
2163 CalleeEdge->getContextIds(), CallerEdge->getContextIds()));
2164
2165 // Don't clone if doing so will not disambiguate any alloc types amongst
2166 // caller edges (including the callee edges that would be cloned).
2167 // Otherwise we will simply move all edges to the clone.
2168 //
2169 // First check if by cloning we will disambiguate the caller allocation
2170 // type from node's allocation type. Query allocTypeToUse so that we don't
2171 // bother cloning to distinguish NotCold+Cold from NotCold. Note that
2172 // neither of these should be None type.
2173 // 2174 // Then check if by cloning node at least one of the callee edges will be 2175 // disambiguated by splitting out different context ids. 2176 assert(CallerEdge->AllocTypes != (uint8_t)AllocationType::None); 2177 assert(Node->AllocTypes != (uint8_t)AllocationType::None); 2178 if (allocTypeToUse(CallerEdge->AllocTypes) == 2179 allocTypeToUse(Node->AllocTypes) && 2180 allocTypesMatch<DerivedCCG, FuncTy, CallTy>( 2181 CalleeEdgeAllocTypesForCallerEdge, Node->CalleeEdges)) { 2182 ++EI; 2183 continue; 2184 } 2185 2186 // First see if we can use an existing clone. Check each clone and its 2187 // callee edges for matching alloc types. 2188 ContextNode *Clone = nullptr; 2189 for (auto *CurClone : Node->Clones) { 2190 if (allocTypeToUse(CurClone->AllocTypes) != 2191 allocTypeToUse(CallerEdge->AllocTypes)) 2192 continue; 2193 2194 if (!allocTypesMatch<DerivedCCG, FuncTy, CallTy>( 2195 CalleeEdgeAllocTypesForCallerEdge, CurClone->CalleeEdges)) 2196 continue; 2197 Clone = CurClone; 2198 break; 2199 } 2200 2201 // The edge iterator is adjusted when we move the CallerEdge to the clone. 2202 if (Clone) 2203 moveEdgeToExistingCalleeClone(CallerEdge, Clone, &EI); 2204 else 2205 Clone = moveEdgeToNewCalleeClone(CallerEdge, &EI); 2206 2207 assert(EI == Node->CallerEdges.end() || 2208 Node->AllocTypes != (uint8_t)AllocationType::None); 2209 // Sanity check that no alloc types on clone or its edges are None. 2210 assert(Clone->AllocTypes != (uint8_t)AllocationType::None); 2211 assert(llvm::none_of( 2212 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 2213 return E->AllocTypes == (uint8_t)AllocationType::None; 2214 })); 2215 } 2216 2217 // Cloning may have resulted in some cloned callee edges with type None, 2218 // because they aren't carrying any contexts. Remove those edges. 2219 for (auto *Clone : Node->Clones) { 2220 removeNoneTypeCalleeEdges(Clone); 2221 if (VerifyNodes) 2222 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 2223 } 2224 // We should still have some context ids on the original Node. 2225 assert(!Node->ContextIds.empty()); 2226 2227 // Remove any callee edges that ended up with alloc type None after creating 2228 // clones and updating callee edges. 2229 removeNoneTypeCalleeEdges(Node); 2230 2231 // Sanity check that no alloc types on node or edges are None. 
2232 assert(Node->AllocTypes != (uint8_t)AllocationType::None); 2233 assert(llvm::none_of(Node->CalleeEdges, 2234 [&](const std::shared_ptr<ContextEdge> &E) { 2235 return E->AllocTypes == (uint8_t)AllocationType::None; 2236 })); 2237 assert(llvm::none_of(Node->CallerEdges, 2238 [&](const std::shared_ptr<ContextEdge> &E) { 2239 return E->AllocTypes == (uint8_t)AllocationType::None; 2240 })); 2241 2242 if (VerifyNodes) 2243 checkNode<DerivedCCG, FuncTy, CallTy>(Node); 2244 } 2245 2246 void ModuleCallsiteContextGraph::updateAllocationCall( 2247 CallInfo &Call, AllocationType AllocType) { 2248 std::string AllocTypeString = getAllocTypeAttributeString(AllocType); 2249 auto A = llvm::Attribute::get(Call.call()->getFunction()->getContext(), 2250 "memprof", AllocTypeString); 2251 cast<CallBase>(Call.call())->addFnAttr(A); 2252 OREGetter(Call.call()->getFunction()) 2253 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", Call.call()) 2254 << ore::NV("AllocationCall", Call.call()) << " in clone " 2255 << ore::NV("Caller", Call.call()->getFunction()) 2256 << " marked with memprof allocation attribute " 2257 << ore::NV("Attribute", AllocTypeString)); 2258 } 2259 2260 void IndexCallsiteContextGraph::updateAllocationCall(CallInfo &Call, 2261 AllocationType AllocType) { 2262 auto *AI = Call.call().dyn_cast<AllocInfo *>(); 2263 assert(AI); 2264 assert(AI->Versions.size() > Call.cloneNo()); 2265 AI->Versions[Call.cloneNo()] = (uint8_t)AllocType; 2266 } 2267 2268 void ModuleCallsiteContextGraph::updateCall(CallInfo &CallerCall, 2269 FuncInfo CalleeFunc) { 2270 if (CalleeFunc.cloneNo() > 0) 2271 cast<CallBase>(CallerCall.call())->setCalledFunction(CalleeFunc.func()); 2272 OREGetter(CallerCall.call()->getFunction()) 2273 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CallerCall.call()) 2274 << ore::NV("Call", CallerCall.call()) << " in clone " 2275 << ore::NV("Caller", CallerCall.call()->getFunction()) 2276 << " assigned to call function clone " 2277 << ore::NV("Callee", CalleeFunc.func())); 2278 } 2279 2280 void IndexCallsiteContextGraph::updateCall(CallInfo &CallerCall, 2281 FuncInfo CalleeFunc) { 2282 auto *CI = CallerCall.call().dyn_cast<CallsiteInfo *>(); 2283 assert(CI && 2284 "Caller cannot be an allocation which should not have profiled calls"); 2285 assert(CI->Clones.size() > CallerCall.cloneNo()); 2286 CI->Clones[CallerCall.cloneNo()] = CalleeFunc.cloneNo(); 2287 } 2288 2289 CallsiteContextGraph<ModuleCallsiteContextGraph, Function, 2290 Instruction *>::FuncInfo 2291 ModuleCallsiteContextGraph::cloneFunctionForCallsite( 2292 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 2293 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 2294 // Use existing LLVM facilities for cloning and obtaining Call in clone 2295 ValueToValueMapTy VMap; 2296 auto *NewFunc = CloneFunction(Func.func(), VMap); 2297 std::string Name = getMemProfFuncName(Func.func()->getName(), CloneNo); 2298 assert(!Func.func()->getParent()->getFunction(Name)); 2299 NewFunc->setName(Name); 2300 for (auto &Inst : CallsWithMetadataInFunc) { 2301 // This map always has the initial version in it. 
2302 assert(Inst.cloneNo() == 0); 2303 CallMap[Inst] = {cast<Instruction>(VMap[Inst.call()]), CloneNo}; 2304 } 2305 OREGetter(Func.func()) 2306 .emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", Func.func()) 2307 << "created clone " << ore::NV("NewFunction", NewFunc)); 2308 return {NewFunc, CloneNo}; 2309 } 2310 2311 CallsiteContextGraph<IndexCallsiteContextGraph, FunctionSummary, 2312 IndexCall>::FuncInfo 2313 IndexCallsiteContextGraph::cloneFunctionForCallsite( 2314 FuncInfo &Func, CallInfo &Call, std::map<CallInfo, CallInfo> &CallMap, 2315 std::vector<CallInfo> &CallsWithMetadataInFunc, unsigned CloneNo) { 2316 // Check how many clones we have of Call (and therefore function). 2317 // The next clone number is the current size of versions array. 2318 // Confirm this matches the CloneNo provided by the caller, which is based on 2319 // the number of function clones we have. 2320 assert(CloneNo == 2321 (Call.call().is<AllocInfo *>() 2322 ? Call.call().dyn_cast<AllocInfo *>()->Versions.size() 2323 : Call.call().dyn_cast<CallsiteInfo *>()->Clones.size())); 2324 // Walk all the instructions in this function. Create a new version for 2325 // each (by adding an entry to the Versions/Clones summary array), and copy 2326 // over the version being called for the function clone being cloned here. 2327 // Additionally, add an entry to the CallMap for the new function clone, 2328 // mapping the original call (clone 0, what is in CallsWithMetadataInFunc) 2329 // to the new call clone. 2330 for (auto &Inst : CallsWithMetadataInFunc) { 2331 // This map always has the initial version in it. 2332 assert(Inst.cloneNo() == 0); 2333 if (auto *AI = Inst.call().dyn_cast<AllocInfo *>()) { 2334 assert(AI->Versions.size() == CloneNo); 2335 // We assign the allocation type later (in updateAllocationCall), just add 2336 // an entry for it here. 2337 AI->Versions.push_back(0); 2338 } else { 2339 auto *CI = Inst.call().dyn_cast<CallsiteInfo *>(); 2340 assert(CI && CI->Clones.size() == CloneNo); 2341 // We assign the clone number later (in updateCall), just add an entry for 2342 // it here. 2343 CI->Clones.push_back(0); 2344 } 2345 CallMap[Inst] = {Inst.call(), CloneNo}; 2346 } 2347 return {Func.func(), CloneNo}; 2348 } 2349 2350 // This method assigns cloned callsites to functions, cloning the functions as 2351 // needed. 
The assignment is greedy and proceeds roughly as follows: 2352 // 2353 // For each function Func: 2354 // For each call with graph Node having clones: 2355 // Initialize ClonesWorklist to Node and its clones 2356 // Initialize NodeCloneCount to 0 2357 // While ClonesWorklist is not empty: 2358 // Clone = pop front ClonesWorklist 2359 // NodeCloneCount++ 2360 // If Func has been cloned less than NodeCloneCount times: 2361 // If NodeCloneCount is 1: 2362 // Assign Clone to original Func 2363 // Continue 2364 // Create a new function clone 2365 // If other callers not assigned to call a function clone yet: 2366 // Assign them to call new function clone 2367 // Continue 2368 // Assign any other caller calling the cloned version to new clone 2369 // 2370 // For each caller of Clone: 2371 // If caller is assigned to call a specific function clone: 2372 // If we cannot assign Clone to that function clone: 2373 // Create new callsite Clone NewClone 2374 // Add NewClone to ClonesWorklist 2375 // Continue 2376 // Assign Clone to existing caller's called function clone 2377 // Else: 2378 // If Clone not already assigned to a function clone: 2379 // Assign to first function clone without assignment 2380 // Assign caller to selected function clone 2381 template <typename DerivedCCG, typename FuncTy, typename CallTy> 2382 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::assignFunctions() { 2383 bool Changed = false; 2384 2385 // Keep track of the assignment of nodes (callsites) to function clones they 2386 // call. 2387 DenseMap<ContextNode *, FuncInfo> CallsiteToCalleeFuncCloneMap; 2388 2389 // Update caller node to call function version CalleeFunc, by recording the 2390 // assignment in CallsiteToCalleeFuncCloneMap. 2391 auto RecordCalleeFuncOfCallsite = [&](ContextNode *Caller, 2392 const FuncInfo &CalleeFunc) { 2393 assert(Caller->hasCall()); 2394 CallsiteToCalleeFuncCloneMap[Caller] = CalleeFunc; 2395 }; 2396 2397 // Walk all functions for which we saw calls with memprof metadata, and handle 2398 // cloning for each of its calls. 2399 for (auto &[Func, CallsWithMetadata] : FuncToCallsWithMetadata) { 2400 FuncInfo OrigFunc(Func); 2401 // Map from each clone of OrigFunc to a map of remappings of each call of 2402 // interest (from original uncloned call to the corresponding cloned call in 2403 // that function clone). 2404 std::map<FuncInfo, std::map<CallInfo, CallInfo>> FuncClonesToCallMap; 2405 for (auto &Call : CallsWithMetadata) { 2406 ContextNode *Node = getNodeForInst(Call); 2407 // Skip call if we do not have a node for it (all uses of its stack ids 2408 // were either on inlined chains or pruned from the MIBs), or if we did 2409 // not create any clones for it. 2410 if (!Node || Node->Clones.empty()) 2411 continue; 2412 assert(Node->hasCall() && 2413 "Not having a call should have prevented cloning"); 2414 2415 // Track the assignment of function clones to clones of the current 2416 // callsite Node being handled. 2417 std::map<FuncInfo, ContextNode *> FuncCloneToCurNodeCloneMap; 2418 2419 // Assign callsite version CallsiteClone to function version FuncClone, 2420 // and also assign (possibly cloned) Call to CallsiteClone. 2421 auto AssignCallsiteCloneToFuncClone = [&](const FuncInfo &FuncClone, 2422 CallInfo &Call, 2423 ContextNode *CallsiteClone, 2424 bool IsAlloc) { 2425 // Record the clone of callsite node assigned to this function clone. 
2426 FuncCloneToCurNodeCloneMap[FuncClone] = CallsiteClone; 2427 2428 assert(FuncClonesToCallMap.count(FuncClone)); 2429 std::map<CallInfo, CallInfo> &CallMap = FuncClonesToCallMap[FuncClone]; 2430 CallInfo CallClone(Call); 2431 if (CallMap.count(Call)) 2432 CallClone = CallMap[Call]; 2433 CallsiteClone->setCall(CallClone); 2434 }; 2435 2436 // Keep track of the clones of callsite Node that need to be assigned to 2437 // function clones. This list may be expanded in the loop body below if we 2438 // find additional cloning is required. 2439 std::deque<ContextNode *> ClonesWorklist; 2440 // Ignore original Node if we moved all of its contexts to clones. 2441 if (!Node->ContextIds.empty()) 2442 ClonesWorklist.push_back(Node); 2443 ClonesWorklist.insert(ClonesWorklist.end(), Node->Clones.begin(), 2444 Node->Clones.end()); 2445 2446 // Now walk through all of the clones of this callsite Node that we need, 2447 // and determine the assignment to a corresponding clone of the current 2448 // function (creating new function clones as needed). 2449 unsigned NodeCloneCount = 0; 2450 while (!ClonesWorklist.empty()) { 2451 ContextNode *Clone = ClonesWorklist.front(); 2452 ClonesWorklist.pop_front(); 2453 NodeCloneCount++; 2454 if (VerifyNodes) 2455 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 2456 2457 // Need to create a new function clone if we have more callsite clones 2458 // than existing function clones, which would have been assigned to an 2459 // earlier clone in the list (we assign callsite clones to function 2460 // clones greedily). 2461 if (FuncClonesToCallMap.size() < NodeCloneCount) { 2462 // If this is the first callsite copy, assign to original function. 2463 if (NodeCloneCount == 1) { 2464 // Since FuncClonesToCallMap is empty in this case, no clones have 2465 // been created for this function yet, and no callers should have 2466 // been assigned a function clone for this callee node yet. 2467 assert(llvm::none_of( 2468 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 2469 return CallsiteToCalleeFuncCloneMap.count(E->Caller); 2470 })); 2471 // Initialize with empty call map, assign Clone to original function 2472 // and its callers, and skip to the next clone. 2473 FuncClonesToCallMap[OrigFunc] = {}; 2474 AssignCallsiteCloneToFuncClone( 2475 OrigFunc, Call, Clone, 2476 AllocationCallToContextNodeMap.count(Call)); 2477 for (auto &CE : Clone->CallerEdges) { 2478 // Ignore any caller that does not have a recorded callsite Call. 2479 if (!CE->Caller->hasCall()) 2480 continue; 2481 RecordCalleeFuncOfCallsite(CE->Caller, OrigFunc); 2482 } 2483 continue; 2484 } 2485 2486 // First locate which copy of OrigFunc to clone again. If a caller 2487 // of this callsite clone was already assigned to call a particular 2488 // function clone, we need to redirect all of those callers to the 2489 // new function clone, and update their other callees within this 2490 // function. 2491 FuncInfo PreviousAssignedFuncClone; 2492 auto EI = llvm::find_if( 2493 Clone->CallerEdges, [&](const std::shared_ptr<ContextEdge> &E) { 2494 return CallsiteToCalleeFuncCloneMap.count(E->Caller); 2495 }); 2496 bool CallerAssignedToCloneOfFunc = false; 2497 if (EI != Clone->CallerEdges.end()) { 2498 const std::shared_ptr<ContextEdge> &Edge = *EI; 2499 PreviousAssignedFuncClone = 2500 CallsiteToCalleeFuncCloneMap[Edge->Caller]; 2501 CallerAssignedToCloneOfFunc = true; 2502 } 2503 2504 // Clone function and save it along with the CallInfo map created 2505 // during cloning in the FuncClonesToCallMap. 
2506 std::map<CallInfo, CallInfo> NewCallMap; 2507 unsigned CloneNo = FuncClonesToCallMap.size(); 2508 assert(CloneNo > 0 && "Clone 0 is the original function, which " 2509 "should already exist in the map"); 2510 FuncInfo NewFuncClone = cloneFunctionForCallsite( 2511 OrigFunc, Call, NewCallMap, CallsWithMetadata, CloneNo); 2512 FuncClonesToCallMap.emplace(NewFuncClone, std::move(NewCallMap)); 2513 FunctionClonesAnalysis++; 2514 Changed = true; 2515 2516 // If no caller callsites were already assigned to a clone of this 2517 // function, we can simply assign this clone to the new func clone 2518 // and update all callers to it, then skip to the next clone. 2519 if (!CallerAssignedToCloneOfFunc) { 2520 AssignCallsiteCloneToFuncClone( 2521 NewFuncClone, Call, Clone, 2522 AllocationCallToContextNodeMap.count(Call)); 2523 for (auto &CE : Clone->CallerEdges) { 2524 // Ignore any caller that does not have a recorded callsite Call. 2525 if (!CE->Caller->hasCall()) 2526 continue; 2527 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); 2528 } 2529 continue; 2530 } 2531 2532 // We may need to do additional node cloning in this case. 2533 // Reset the CallsiteToCalleeFuncCloneMap entry for any callers 2534 // that were previously assigned to call PreviousAssignedFuncClone, 2535 // to record that they now call NewFuncClone. 2536 for (auto CE : Clone->CallerEdges) { 2537 // Ignore any caller that does not have a recorded callsite Call. 2538 if (!CE->Caller->hasCall()) 2539 continue; 2540 2541 if (!CallsiteToCalleeFuncCloneMap.count(CE->Caller) || 2542 // We subsequently fall through to later handling that 2543 // will perform any additional cloning required for 2544 // callers that were calling other function clones. 2545 CallsiteToCalleeFuncCloneMap[CE->Caller] != 2546 PreviousAssignedFuncClone) 2547 continue; 2548 2549 RecordCalleeFuncOfCallsite(CE->Caller, NewFuncClone); 2550 2551 // If we are cloning a function that was already assigned to some 2552 // callers, then essentially we are creating new callsite clones 2553 // of the other callsites in that function that are reached by those 2554 // callers. Clone the other callees of the current callsite's caller 2555 // that were already assigned to PreviousAssignedFuncClone 2556 // accordingly. This is important since we subsequently update the 2557 // calls from the nodes in the graph and their assignments to callee 2558 // functions recorded in CallsiteToCalleeFuncCloneMap. 2559 for (auto CalleeEdge : CE->Caller->CalleeEdges) { 2560 // Skip any that have been removed on an earlier iteration when 2561 // cleaning up newly None type callee edges. 2562 if (!CalleeEdge) 2563 continue; 2564 ContextNode *Callee = CalleeEdge->Callee; 2565 // Skip the current callsite, we are looking for other 2566 // callsites Caller calls, as well as any that does not have a 2567 // recorded callsite Call. 2568 if (Callee == Clone || !Callee->hasCall()) 2569 continue; 2570 ContextNode *NewClone = moveEdgeToNewCalleeClone(CalleeEdge); 2571 removeNoneTypeCalleeEdges(NewClone); 2572 // Moving the edge may have resulted in some none type 2573 // callee edges on the original Callee. 2574 removeNoneTypeCalleeEdges(Callee); 2575 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); 2576 // If the Callee node was already assigned to call a specific 2577 // function version, make sure its new clone is assigned to call 2578 // that same function clone. 
2579 if (CallsiteToCalleeFuncCloneMap.count(Callee)) 2580 RecordCalleeFuncOfCallsite( 2581 NewClone, CallsiteToCalleeFuncCloneMap[Callee]); 2582 // Update NewClone with the new Call clone of this callsite's Call 2583 // created for the new function clone created earlier. 2584 // Recall that we have already ensured when building the graph 2585 // that each caller can only call callsites within the same 2586 // function, so we are guaranteed that Callee Call is in the 2587 // current OrigFunc. 2588 // CallMap is set up as indexed by original Call at clone 0. 2589 CallInfo OrigCall(Callee->getOrigNode()->Call); 2590 OrigCall.setCloneNo(0); 2591 std::map<CallInfo, CallInfo> &CallMap = 2592 FuncClonesToCallMap[NewFuncClone]; 2593 assert(CallMap.count(OrigCall)); 2594 CallInfo NewCall(CallMap[OrigCall]); 2595 assert(NewCall); 2596 NewClone->setCall(NewCall); 2597 } 2598 } 2599 // Fall through to handling below to perform the recording of the 2600 // function for this callsite clone. This enables handling of cases 2601 // where the callers were assigned to different clones of a function. 2602 } 2603 2604 // See if we can use existing function clone. Walk through 2605 // all caller edges to see if any have already been assigned to 2606 // a clone of this callsite's function. If we can use it, do so. If not, 2607 // because that function clone is already assigned to a different clone 2608 // of this callsite, then we need to clone again. 2609 // Basically, this checking is needed to handle the case where different 2610 // caller functions/callsites may need versions of this function 2611 // containing different mixes of callsite clones across the different 2612 // callsites within the function. If that happens, we need to create 2613 // additional function clones to handle the various combinations. 2614 // 2615 // Keep track of any new clones of this callsite created by the 2616 // following loop, as well as any existing clone that we decided to 2617 // assign this clone to. 2618 std::map<FuncInfo, ContextNode *> FuncCloneToNewCallsiteCloneMap; 2619 FuncInfo FuncCloneAssignedToCurCallsiteClone; 2620 // We need to be able to remove Edge from CallerEdges, so need to adjust 2621 // iterator in the loop. 2622 for (auto EI = Clone->CallerEdges.begin(); 2623 EI != Clone->CallerEdges.end();) { 2624 auto Edge = *EI; 2625 // Ignore any caller that does not have a recorded callsite Call. 2626 if (!Edge->Caller->hasCall()) { 2627 EI++; 2628 continue; 2629 } 2630 // If this caller already assigned to call a version of OrigFunc, need 2631 // to ensure we can assign this callsite clone to that function clone. 2632 if (CallsiteToCalleeFuncCloneMap.count(Edge->Caller)) { 2633 FuncInfo FuncCloneCalledByCaller = 2634 CallsiteToCalleeFuncCloneMap[Edge->Caller]; 2635 // First we need to confirm that this function clone is available 2636 // for use by this callsite node clone. 2637 // 2638 // While FuncCloneToCurNodeCloneMap is built only for this Node and 2639 // its callsite clones, one of those callsite clones X could have 2640 // been assigned to the same function clone called by Edge's caller 2641 // - if Edge's caller calls another callsite within Node's original 2642 // function, and that callsite has another caller reaching clone X. 2643 // We need to clone Node again in this case. 
2644 if ((FuncCloneToCurNodeCloneMap.count(FuncCloneCalledByCaller) && 2645 FuncCloneToCurNodeCloneMap[FuncCloneCalledByCaller] != 2646 Clone) || 2647 // Detect when we have multiple callers of this callsite that 2648 // have already been assigned to specific, and different, clones 2649 // of OrigFunc (due to other unrelated callsites in Func they 2650 // reach via call contexts). Is this Clone of callsite Node 2651 // assigned to a different clone of OrigFunc? If so, clone Node 2652 // again. 2653 (FuncCloneAssignedToCurCallsiteClone && 2654 FuncCloneAssignedToCurCallsiteClone != 2655 FuncCloneCalledByCaller)) { 2656 // We need to use a different newly created callsite clone, in 2657 // order to assign it to another new function clone on a 2658 // subsequent iteration over the Clones array (adjusted below). 2659 // Note we specifically do not reset the 2660 // CallsiteToCalleeFuncCloneMap entry for this caller, so that 2661 // when this new clone is processed later we know which version of 2662 // the function to copy (so that other callsite clones we have 2663 // assigned to that function clone are properly cloned over). See 2664 // comments in the function cloning handling earlier. 2665 2666 // Check if we already have cloned this callsite again while 2667 // walking through caller edges, for a caller calling the same 2668 // function clone. If so, we can move this edge to that new clone 2669 // rather than creating yet another new clone. 2670 if (FuncCloneToNewCallsiteCloneMap.count( 2671 FuncCloneCalledByCaller)) { 2672 ContextNode *NewClone = 2673 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller]; 2674 moveEdgeToExistingCalleeClone(Edge, NewClone, &EI); 2675 // Cleanup any none type edges cloned over. 2676 removeNoneTypeCalleeEdges(NewClone); 2677 } else { 2678 // Create a new callsite clone. 2679 ContextNode *NewClone = moveEdgeToNewCalleeClone(Edge, &EI); 2680 removeNoneTypeCalleeEdges(NewClone); 2681 FuncCloneToNewCallsiteCloneMap[FuncCloneCalledByCaller] = 2682 NewClone; 2683 // Add to list of clones and process later. 2684 ClonesWorklist.push_back(NewClone); 2685 assert(EI == Clone->CallerEdges.end() || 2686 Clone->AllocTypes != (uint8_t)AllocationType::None); 2687 assert(NewClone->AllocTypes != (uint8_t)AllocationType::None); 2688 } 2689 // Moving the caller edge may have resulted in some none type 2690 // callee edges. 2691 removeNoneTypeCalleeEdges(Clone); 2692 // We will handle the newly created callsite clone in a subsequent 2693 // iteration over this Node's Clones. Continue here since we 2694 // already adjusted iterator EI while moving the edge. 2695 continue; 2696 } 2697 2698 // Otherwise, we can use the function clone already assigned to this 2699 // caller. 2700 if (!FuncCloneAssignedToCurCallsiteClone) { 2701 FuncCloneAssignedToCurCallsiteClone = FuncCloneCalledByCaller; 2702 // Assign Clone to FuncCloneCalledByCaller 2703 AssignCallsiteCloneToFuncClone( 2704 FuncCloneCalledByCaller, Call, Clone, 2705 AllocationCallToContextNodeMap.count(Call)); 2706 } else 2707 // Don't need to do anything - callsite is already calling this 2708 // function clone. 2709 assert(FuncCloneAssignedToCurCallsiteClone == 2710 FuncCloneCalledByCaller); 2711 2712 } else { 2713 // We have not already assigned this caller to a version of 2714 // OrigFunc. Do the assignment now. 2715 2716 // First check if we have already assigned this callsite clone to a 2717 // clone of OrigFunc for another caller during this iteration over 2718 // its caller edges. 
2719 if (!FuncCloneAssignedToCurCallsiteClone) { 2720 // Find first function in FuncClonesToCallMap without an assigned 2721 // clone of this callsite Node. We should always have one 2722 // available at this point due to the earlier cloning when the 2723 // FuncClonesToCallMap size was smaller than the clone number. 2724 for (auto &CF : FuncClonesToCallMap) { 2725 if (!FuncCloneToCurNodeCloneMap.count(CF.first)) { 2726 FuncCloneAssignedToCurCallsiteClone = CF.first; 2727 break; 2728 } 2729 } 2730 assert(FuncCloneAssignedToCurCallsiteClone); 2731 // Assign Clone to FuncCloneAssignedToCurCallsiteClone 2732 AssignCallsiteCloneToFuncClone( 2733 FuncCloneAssignedToCurCallsiteClone, Call, Clone, 2734 AllocationCallToContextNodeMap.count(Call)); 2735 } else 2736 assert(FuncCloneToCurNodeCloneMap 2737 [FuncCloneAssignedToCurCallsiteClone] == Clone); 2738 // Update callers to record function version called. 2739 RecordCalleeFuncOfCallsite(Edge->Caller, 2740 FuncCloneAssignedToCurCallsiteClone); 2741 } 2742 2743 EI++; 2744 } 2745 } 2746 if (VerifyCCG) { 2747 checkNode<DerivedCCG, FuncTy, CallTy>(Node); 2748 for (const auto &PE : Node->CalleeEdges) 2749 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); 2750 for (const auto &CE : Node->CallerEdges) 2751 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); 2752 for (auto *Clone : Node->Clones) { 2753 checkNode<DerivedCCG, FuncTy, CallTy>(Clone); 2754 for (const auto &PE : Clone->CalleeEdges) 2755 checkNode<DerivedCCG, FuncTy, CallTy>(PE->Callee); 2756 for (const auto &CE : Clone->CallerEdges) 2757 checkNode<DerivedCCG, FuncTy, CallTy>(CE->Caller); 2758 } 2759 } 2760 } 2761 } 2762 2763 auto UpdateCalls = [&](ContextNode *Node, 2764 DenseSet<const ContextNode *> &Visited, 2765 auto &&UpdateCalls) { 2766 auto Inserted = Visited.insert(Node); 2767 if (!Inserted.second) 2768 return; 2769 2770 for (auto *Clone : Node->Clones) 2771 UpdateCalls(Clone, Visited, UpdateCalls); 2772 2773 for (auto &Edge : Node->CallerEdges) 2774 UpdateCalls(Edge->Caller, Visited, UpdateCalls); 2775 2776 // Skip if either no call to update, or if we ended up with no context ids 2777 // (we moved all edges onto other clones). 2778 if (!Node->hasCall() || Node->ContextIds.empty()) 2779 return; 2780 2781 if (Node->IsAllocation) { 2782 updateAllocationCall(Node->Call, allocTypeToUse(Node->AllocTypes)); 2783 return; 2784 } 2785 2786 if (!CallsiteToCalleeFuncCloneMap.count(Node)) 2787 return; 2788 2789 auto CalleeFunc = CallsiteToCalleeFuncCloneMap[Node]; 2790 updateCall(Node->Call, CalleeFunc); 2791 }; 2792 2793 // Performs DFS traversal starting from allocation nodes to update calls to 2794 // reflect cloning decisions recorded earlier. For regular LTO this will 2795 // update the actual calls in the IR to call the appropriate function clone 2796 // (and add attributes to allocation calls), whereas for ThinLTO the decisions 2797 // are recorded in the summary entries. 2798 DenseSet<const ContextNode *> Visited; 2799 for (auto &Entry : AllocationCallToContextNodeMap) 2800 UpdateCalls(Entry.second, Visited, UpdateCalls); 2801 2802 return Changed; 2803 } 2804 2805 static SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> createFunctionClones( 2806 Function &F, unsigned NumClones, Module &M, OptimizationRemarkEmitter &ORE, 2807 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> 2808 &FuncToAliasMap) { 2809 // The first "clone" is the original copy, we should only call this if we 2810 // needed to create new clones. 
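  // The returned VMaps holds NumClones - 1 entries: VMaps[I - 1] maps original
  // values to their counterparts in clone I (the original copy is "clone" 0
  // and has no map). The ThinLTO backend handling below relies on this when
  // locating cloned call instructions.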
2811   assert(NumClones > 1);
2812   SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
2813   VMaps.reserve(NumClones - 1);
2814   FunctionsClonedThinBackend++;
2815   for (unsigned I = 1; I < NumClones; I++) {
2816     VMaps.emplace_back(std::make_unique<ValueToValueMapTy>());
2817     auto *NewF = CloneFunction(&F, *VMaps.back());
2818     FunctionClonesThinBackend++;
2819     // Strip memprof and callsite metadata from clone as they are no longer
2820     // needed.
2821     for (auto &BB : *NewF) {
2822       for (auto &Inst : BB) {
2823         Inst.setMetadata(LLVMContext::MD_memprof, nullptr);
2824         Inst.setMetadata(LLVMContext::MD_callsite, nullptr);
2825       }
2826     }
2827     std::string Name = getMemProfFuncName(F.getName(), I);
2828     auto *PrevF = M.getFunction(Name);
2829     if (PrevF) {
2830       // We might have created this when adjusting callsites in another
2831       // function. It should be a declaration.
2832       assert(PrevF->isDeclaration());
2833       NewF->takeName(PrevF);
2834       PrevF->replaceAllUsesWith(NewF);
2835       PrevF->eraseFromParent();
2836     } else
2837       NewF->setName(Name);
2838     ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofClone", &F)
2839              << "created clone " << ore::NV("NewFunction", NewF));
2840 
2841     // Now handle aliases to this function, and clone those as well.
2842     if (!FuncToAliasMap.count(&F))
2843       continue;
2844     for (auto *A : FuncToAliasMap[&F]) {
2845       std::string Name = getMemProfFuncName(A->getName(), I);
2846       auto *PrevA = M.getNamedAlias(Name);
2847       auto *NewA = GlobalAlias::create(A->getValueType(),
2848                                        A->getType()->getPointerAddressSpace(),
2849                                        A->getLinkage(), Name, NewF);
2850       NewA->copyAttributesFrom(A);
2851       if (PrevA) {
2852         // We might have created this when adjusting callsites in another
2853         // function. It should be a declaration.
2854         assert(PrevA->isDeclaration());
2855         NewA->takeName(PrevA);
2856         PrevA->replaceAllUsesWith(NewA);
2857         PrevA->eraseFromParent();
2858       }
2859     }
2860   }
2861   return VMaps;
2862 }
2863 
2864 // Locate the summary for F. This is complicated by the fact that it might
2865 // have been internalized or promoted.
2866 static ValueInfo findValueInfoForFunc(const Function &F, const Module &M,
2867                                       const ModuleSummaryIndex *ImportSummary) {
2868   // FIXME: Ideally we would retain the original GUID in some fashion on the
2869   // function (e.g. as metadata), but for now do our best to locate the
2870   // summary without that information.
2871   ValueInfo TheFnVI = ImportSummary->getValueInfo(F.getGUID());
2872   if (!TheFnVI)
2873     // See if the function was internalized, by checking index directly with
2874     // original name (this avoids the name adjustment done by getGUID() for
2875     // internal symbols).
2876     TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(F.getName()));
2877   if (TheFnVI)
2878     return TheFnVI;
2879   // Now query with the original name before any promotion was performed.
2880   StringRef OrigName =
2881       ModuleSummaryIndex::getOriginalNameBeforePromote(F.getName());
2882   std::string OrigId = GlobalValue::getGlobalIdentifier(
2883       OrigName, GlobalValue::InternalLinkage, M.getSourceFileName());
2884   TheFnVI = ImportSummary->getValueInfo(GlobalValue::getGUID(OrigId));
2885   if (TheFnVI)
2886     return TheFnVI;
2887   // Could be a promoted local imported from another module. We need to pass
2888   // down more info here to find the original module id. For now, try with
2889   // the OrigName which might have been stored in the OidGuidMap in the
2890   // index. This would not work if there were same-named locals in multiple
2891   // modules, however.
2892 auto OrigGUID = 2893 ImportSummary->getGUIDFromOriginalID(GlobalValue::getGUID(OrigName)); 2894 if (OrigGUID) 2895 TheFnVI = ImportSummary->getValueInfo(OrigGUID); 2896 return TheFnVI; 2897 } 2898 2899 bool MemProfContextDisambiguation::applyImport(Module &M) { 2900 assert(ImportSummary); 2901 bool Changed = false; 2902 2903 auto IsMemProfClone = [](const Function &F) { 2904 return F.getName().contains(MemProfCloneSuffix); 2905 }; 2906 2907 // We also need to clone any aliases that reference cloned functions, because 2908 // the modified callsites may invoke via the alias. Keep track of the aliases 2909 // for each function. 2910 std::map<const Function *, SmallPtrSet<const GlobalAlias *, 1>> 2911 FuncToAliasMap; 2912 for (auto &A : M.aliases()) { 2913 auto *Aliasee = A.getAliaseeObject(); 2914 if (auto *F = dyn_cast<Function>(Aliasee)) 2915 FuncToAliasMap[F].insert(&A); 2916 } 2917 2918 for (auto &F : M) { 2919 if (F.isDeclaration() || IsMemProfClone(F)) 2920 continue; 2921 2922 OptimizationRemarkEmitter ORE(&F); 2923 2924 SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps; 2925 bool ClonesCreated = false; 2926 unsigned NumClonesCreated = 0; 2927 auto CloneFuncIfNeeded = [&](unsigned NumClones) { 2928 // We should at least have version 0 which is the original copy. 2929 assert(NumClones > 0); 2930 // If only one copy needed use original. 2931 if (NumClones == 1) 2932 return; 2933 // If we already performed cloning of this function, confirm that the 2934 // requested number of clones matches (the thin link should ensure the 2935 // number of clones for each constituent callsite is consistent within 2936 // each function), before returning. 2937 if (ClonesCreated) { 2938 assert(NumClonesCreated == NumClones); 2939 return; 2940 } 2941 VMaps = createFunctionClones(F, NumClones, M, ORE, FuncToAliasMap); 2942 // The first "clone" is the original copy, which doesn't have a VMap. 2943 assert(VMaps.size() == NumClones - 1); 2944 Changed = true; 2945 ClonesCreated = true; 2946 NumClonesCreated = NumClones; 2947 }; 2948 2949 // Locate the summary for F. 2950 ValueInfo TheFnVI = findValueInfoForFunc(F, M, ImportSummary); 2951 // If not found, this could be an imported local (see comment in 2952 // findValueInfoForFunc). Skip for now as it will be cloned in its original 2953 // module (where it would have been promoted to global scope so should 2954 // satisfy any reference in this module). 2955 if (!TheFnVI) 2956 continue; 2957 2958 auto *GVSummary = 2959 ImportSummary->findSummaryInModule(TheFnVI, M.getModuleIdentifier()); 2960 if (!GVSummary) 2961 // Must have been imported, use the first summary (might be multiple if 2962 // this was a linkonce_odr). 2963 GVSummary = TheFnVI.getSummaryList().front().get(); 2964 2965 // If this was an imported alias skip it as we won't have the function 2966 // summary, and it should be cloned in the original module. 2967 if (isa<AliasSummary>(GVSummary)) 2968 continue; 2969 2970 auto *FS = cast<FunctionSummary>(GVSummary->getBaseObject()); 2971 2972 if (FS->allocs().empty() && FS->callsites().empty()) 2973 continue; 2974 2975 auto SI = FS->callsites().begin(); 2976 auto AI = FS->allocs().begin(); 2977 2978 // Assume for now that the instructions are in the exact same order 2979 // as when the summary was created, but confirm this is correct by 2980 // matching the stack ids. 2981 for (auto &BB : F) { 2982 for (auto &I : BB) { 2983 auto *CB = dyn_cast<CallBase>(&I); 2984 // Same handling as when creating module summary. 
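        // Skipping the same set of calls that the summary builder skipped
        // keeps the AI and SI iterators in lock step with FS->allocs() and
        // FS->callsites().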
2985 if (!mayHaveMemprofSummary(CB)) 2986 continue; 2987 2988 CallStack<MDNode, MDNode::op_iterator> CallsiteContext( 2989 I.getMetadata(LLVMContext::MD_callsite)); 2990 auto *MemProfMD = I.getMetadata(LLVMContext::MD_memprof); 2991 2992 // Include allocs that were already assigned a memprof function 2993 // attribute in the statistics. 2994 if (CB->getAttributes().hasFnAttr("memprof")) { 2995 assert(!MemProfMD); 2996 CB->getAttributes().getFnAttr("memprof").getValueAsString() == "cold" 2997 ? AllocTypeColdThinBackend++ 2998 : AllocTypeNotColdThinBackend++; 2999 OrigAllocsThinBackend++; 3000 AllocVersionsThinBackend++; 3001 if (!MaxAllocVersionsThinBackend) 3002 MaxAllocVersionsThinBackend = 1; 3003 // Remove any remaining callsite metadata and we can skip the rest of 3004 // the handling for this instruction, since no cloning needed. 3005 I.setMetadata(LLVMContext::MD_callsite, nullptr); 3006 continue; 3007 } 3008 3009 if (MemProfMD) { 3010 // Consult the next alloc node. 3011 assert(AI != FS->allocs().end()); 3012 auto &AllocNode = *(AI++); 3013 3014 // Sanity check that the MIB stack ids match between the summary and 3015 // instruction metadata. 3016 auto MIBIter = AllocNode.MIBs.begin(); 3017 for (auto &MDOp : MemProfMD->operands()) { 3018 assert(MIBIter != AllocNode.MIBs.end()); 3019 LLVM_ATTRIBUTE_UNUSED auto StackIdIndexIter = 3020 MIBIter->StackIdIndices.begin(); 3021 auto *MIBMD = cast<const MDNode>(MDOp); 3022 MDNode *StackMDNode = getMIBStackNode(MIBMD); 3023 assert(StackMDNode); 3024 SmallVector<unsigned> StackIdsFromMetadata; 3025 CallStack<MDNode, MDNode::op_iterator> StackContext(StackMDNode); 3026 for (auto ContextIter = 3027 StackContext.beginAfterSharedPrefix(CallsiteContext); 3028 ContextIter != StackContext.end(); ++ContextIter) { 3029 // If this is a direct recursion, simply skip the duplicate 3030 // entries, to be consistent with how the summary ids were 3031 // generated during ModuleSummaryAnalysis. 3032 if (!StackIdsFromMetadata.empty() && 3033 StackIdsFromMetadata.back() == *ContextIter) 3034 continue; 3035 assert(StackIdIndexIter != MIBIter->StackIdIndices.end()); 3036 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == 3037 *ContextIter); 3038 StackIdIndexIter++; 3039 } 3040 MIBIter++; 3041 } 3042 3043 // Perform cloning if not yet done. 3044 CloneFuncIfNeeded(/*NumClones=*/AllocNode.Versions.size()); 3045 3046 OrigAllocsThinBackend++; 3047 AllocVersionsThinBackend += AllocNode.Versions.size(); 3048 if (MaxAllocVersionsThinBackend < AllocNode.Versions.size()) 3049 MaxAllocVersionsThinBackend = AllocNode.Versions.size(); 3050 3051 // If there is only one version that means we didn't end up 3052 // considering this function for cloning, and in that case the alloc 3053 // will still be none type or should have gotten the default NotCold. 3054 // Skip that after calling clone helper since that does some sanity 3055 // checks that confirm we haven't decided yet that we need cloning. 3056 if (AllocNode.Versions.size() == 1) { 3057 assert((AllocationType)AllocNode.Versions[0] == 3058 AllocationType::NotCold || 3059 (AllocationType)AllocNode.Versions[0] == 3060 AllocationType::None); 3061 UnclonableAllocsThinBackend++; 3062 continue; 3063 } 3064 3065 // All versions should have a singular allocation type. 3066 assert(llvm::none_of(AllocNode.Versions, [](uint8_t Type) { 3067 return Type == ((uint8_t)AllocationType::NotCold | 3068 (uint8_t)AllocationType::Cold); 3069 })); 3070 3071 // Update the allocation types per the summary info. 
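            // For example (illustrative values): Versions == {NotCold, Cold}
            // means the allocation in the original copy gets a "notcold"
            // memprof attribute while the one in function clone 1 is marked
            // "cold".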
3072 for (unsigned J = 0; J < AllocNode.Versions.size(); J++) { 3073 // Ignore any that didn't get an assigned allocation type. 3074 if (AllocNode.Versions[J] == (uint8_t)AllocationType::None) 3075 continue; 3076 AllocationType AllocTy = (AllocationType)AllocNode.Versions[J]; 3077 AllocTy == AllocationType::Cold ? AllocTypeColdThinBackend++ 3078 : AllocTypeNotColdThinBackend++; 3079 std::string AllocTypeString = getAllocTypeAttributeString(AllocTy); 3080 auto A = llvm::Attribute::get(F.getContext(), "memprof", 3081 AllocTypeString); 3082 CallBase *CBClone; 3083 // Copy 0 is the original function. 3084 if (!J) 3085 CBClone = CB; 3086 else 3087 // Since VMaps are only created for new clones, we index with 3088 // clone J-1 (J==0 is the original clone and does not have a VMaps 3089 // entry). 3090 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); 3091 CBClone->addFnAttr(A); 3092 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofAttribute", CBClone) 3093 << ore::NV("AllocationCall", CBClone) << " in clone " 3094 << ore::NV("Caller", CBClone->getFunction()) 3095 << " marked with memprof allocation attribute " 3096 << ore::NV("Attribute", AllocTypeString)); 3097 } 3098 } else if (!CallsiteContext.empty()) { 3099 // Consult the next callsite node. 3100 assert(SI != FS->callsites().end()); 3101 auto &StackNode = *(SI++); 3102 3103 #ifndef NDEBUG 3104 // Sanity check that the stack ids match between the summary and 3105 // instruction metadata. 3106 auto StackIdIndexIter = StackNode.StackIdIndices.begin(); 3107 for (auto StackId : CallsiteContext) { 3108 assert(StackIdIndexIter != StackNode.StackIdIndices.end()); 3109 assert(ImportSummary->getStackIdAtIndex(*StackIdIndexIter) == 3110 StackId); 3111 StackIdIndexIter++; 3112 } 3113 #endif 3114 3115 // Perform cloning if not yet done. 3116 CloneFuncIfNeeded(/*NumClones=*/StackNode.Clones.size()); 3117 3118 // Should have skipped indirect calls via mayHaveMemprofSummary. 3119 assert(CB->getCalledFunction()); 3120 assert(!IsMemProfClone(*CB->getCalledFunction())); 3121 3122 // Update the calls per the summary info. 3123 // Save orig name since it gets updated in the first iteration 3124 // below. 3125 auto CalleeOrigName = CB->getCalledFunction()->getName(); 3126 for (unsigned J = 0; J < StackNode.Clones.size(); J++) { 3127 // Do nothing if this version calls the original version of its 3128 // callee. 3129 if (!StackNode.Clones[J]) 3130 continue; 3131 auto NewF = M.getOrInsertFunction( 3132 getMemProfFuncName(CalleeOrigName, StackNode.Clones[J]), 3133 CB->getCalledFunction()->getFunctionType()); 3134 CallBase *CBClone; 3135 // Copy 0 is the original function. 3136 if (!J) 3137 CBClone = CB; 3138 else 3139 CBClone = cast<CallBase>((*VMaps[J - 1])[CB]); 3140 CBClone->setCalledFunction(NewF); 3141 ORE.emit(OptimizationRemark(DEBUG_TYPE, "MemprofCall", CBClone) 3142 << ore::NV("Call", CBClone) << " in clone " 3143 << ore::NV("Caller", CBClone->getFunction()) 3144 << " assigned to call function clone " 3145 << ore::NV("Callee", NewF.getCallee())); 3146 } 3147 } 3148 // Memprof and callsite metadata on memory allocations no longer needed. 
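        // (The clones created in createFunctionClones already had this
        // metadata stripped; only the original copy needs cleanup here.)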
3149 I.setMetadata(LLVMContext::MD_memprof, nullptr); 3150 I.setMetadata(LLVMContext::MD_callsite, nullptr); 3151 } 3152 } 3153 } 3154 3155 return Changed; 3156 } 3157 3158 template <typename DerivedCCG, typename FuncTy, typename CallTy> 3159 bool CallsiteContextGraph<DerivedCCG, FuncTy, CallTy>::process() { 3160 if (DumpCCG) { 3161 dbgs() << "CCG before cloning:\n"; 3162 dbgs() << *this; 3163 } 3164 if (ExportToDot) 3165 exportToDot("postbuild"); 3166 3167 if (VerifyCCG) { 3168 check(); 3169 } 3170 3171 identifyClones(); 3172 3173 if (VerifyCCG) { 3174 check(); 3175 } 3176 3177 if (DumpCCG) { 3178 dbgs() << "CCG after cloning:\n"; 3179 dbgs() << *this; 3180 } 3181 if (ExportToDot) 3182 exportToDot("cloned"); 3183 3184 bool Changed = assignFunctions(); 3185 3186 if (DumpCCG) { 3187 dbgs() << "CCG after assigning function clones:\n"; 3188 dbgs() << *this; 3189 } 3190 if (ExportToDot) 3191 exportToDot("clonefuncassign"); 3192 3193 return Changed; 3194 } 3195 3196 bool MemProfContextDisambiguation::processModule( 3197 Module &M, 3198 function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) { 3199 3200 // If we have an import summary, then the cloning decisions were made during 3201 // the thin link on the index. Apply them and return. 3202 if (ImportSummary) 3203 return applyImport(M); 3204 3205 // TODO: If/when other types of memprof cloning are enabled beyond just for 3206 // hot and cold, we will need to change this to individually control the 3207 // AllocationType passed to addStackNodesForMIB during CCG construction. 3208 // Note that we specifically check this after applying imports above, so that 3209 // the option isn't needed to be passed to distributed ThinLTO backend 3210 // clang processes, which won't necessarily have visibility into the linker 3211 // dependences. Instead the information is communicated from the LTO link to 3212 // the backends via the combined summary index. 3213 if (!SupportsHotColdNew) 3214 return false; 3215 3216 ModuleCallsiteContextGraph CCG(M, OREGetter); 3217 return CCG.process(); 3218 } 3219 3220 MemProfContextDisambiguation::MemProfContextDisambiguation( 3221 const ModuleSummaryIndex *Summary) 3222 : ImportSummary(Summary) { 3223 if (ImportSummary) { 3224 // The MemProfImportSummary should only be used for testing ThinLTO 3225 // distributed backend handling via opt, in which case we don't have a 3226 // summary from the pass pipeline. 
3227 assert(MemProfImportSummary.empty()); 3228 return; 3229 } 3230 if (MemProfImportSummary.empty()) 3231 return; 3232 3233 auto ReadSummaryFile = 3234 errorOrToExpected(MemoryBuffer::getFile(MemProfImportSummary)); 3235 if (!ReadSummaryFile) { 3236 logAllUnhandledErrors(ReadSummaryFile.takeError(), errs(), 3237 "Error loading file '" + MemProfImportSummary + 3238 "': "); 3239 return; 3240 } 3241 auto ImportSummaryForTestingOrErr = getModuleSummaryIndex(**ReadSummaryFile); 3242 if (!ImportSummaryForTestingOrErr) { 3243 logAllUnhandledErrors(ImportSummaryForTestingOrErr.takeError(), errs(), 3244 "Error parsing file '" + MemProfImportSummary + 3245 "': "); 3246 return; 3247 } 3248 ImportSummaryForTesting = std::move(*ImportSummaryForTestingOrErr); 3249 ImportSummary = ImportSummaryForTesting.get(); 3250 } 3251 3252 PreservedAnalyses MemProfContextDisambiguation::run(Module &M, 3253 ModuleAnalysisManager &AM) { 3254 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager(); 3255 auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & { 3256 return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F); 3257 }; 3258 if (!processModule(M, OREGetter)) 3259 return PreservedAnalyses::all(); 3260 return PreservedAnalyses::none(); 3261 } 3262 3263 void MemProfContextDisambiguation::run( 3264 ModuleSummaryIndex &Index, 3265 function_ref<bool(GlobalValue::GUID, const GlobalValueSummary *)> 3266 isPrevailing) { 3267 // TODO: If/when other types of memprof cloning are enabled beyond just for 3268 // hot and cold, we will need to change this to individually control the 3269 // AllocationType passed to addStackNodesForMIB during CCG construction. 3270 // The index was set from the option, so these should be in sync. 3271 assert(Index.withSupportsHotColdNew() == SupportsHotColdNew); 3272 if (!SupportsHotColdNew) 3273 return; 3274 3275 IndexCallsiteContextGraph CCG(Index, isPrevailing); 3276 CCG.process(); 3277 } 3278
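// Illustrative opt invocations (file names hypothetical; assumes the pass is
// registered under the DEBUG_TYPE name "memprof-context-disambiguation"):
//
//   Whole-program analysis directly on IR, which requires linking with an
//   allocator providing the hot/cold operator new interfaces:
//     opt -passes=memprof-context-disambiguation -supports-hot-cold-new \
//         module.bc -S
//
//   Testing the distributed ThinLTO backend handling via an import summary:
//     opt -passes=memprof-context-disambiguation \
//         -memprof-import-summary=module.bc.thinlto.bc module.bc -S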