1 //===- MemProfUse.cpp - memory allocation profile use pass --*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the MemProfUsePass which reads memory profiling data 10 // and uses it to add metadata to instructions to guide optimization. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Transforms/Instrumentation/MemProfUse.h" 15 #include "llvm/ADT/SmallVector.h" 16 #include "llvm/ADT/Statistic.h" 17 #include "llvm/ADT/StringRef.h" 18 #include "llvm/Analysis/MemoryProfileInfo.h" 19 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 20 #include "llvm/Analysis/TargetLibraryInfo.h" 21 #include "llvm/IR/DiagnosticInfo.h" 22 #include "llvm/IR/Function.h" 23 #include "llvm/IR/IntrinsicInst.h" 24 #include "llvm/IR/Module.h" 25 #include "llvm/ProfileData/InstrProf.h" 26 #include "llvm/ProfileData/InstrProfReader.h" 27 #include "llvm/ProfileData/MemProfCommon.h" 28 #include "llvm/Support/BLAKE3.h" 29 #include "llvm/Support/CommandLine.h" 30 #include "llvm/Support/Debug.h" 31 #include "llvm/Support/HashBuilder.h" 32 #include "llvm/Support/VirtualFileSystem.h" 33 #include "llvm/Transforms/Utils/LongestCommonSequence.h" 34 #include <map> 35 #include <set> 36 37 using namespace llvm; 38 using namespace llvm::memprof; 39 40 #define DEBUG_TYPE "memprof" 41 42 namespace llvm { 43 extern cl::opt<bool> PGOWarnMissing; 44 extern cl::opt<bool> NoPGOWarnMismatch; 45 extern cl::opt<bool> NoPGOWarnMismatchComdatWeak; 46 } // namespace llvm 47 48 // By default disable matching of allocation profiles onto operator new that 49 // already explicitly pass a hot/cold hint, since we don't currently 50 // override these hints anyway. 
static cl::opt<bool> ClMemProfMatchHotColdNew(
    "memprof-match-hot-cold-new",
    cl::desc(
        "Match allocation profiles onto existing hot/cold operator new calls"),
    cl::Hidden, cl::init(false));

// Print, to stderr, a line for every matched allocation context and call site
// (consumed by tests and by profile-quality tooling).
static cl::opt<bool>
    ClPrintMemProfMatchInfo("memprof-print-match-info",
                            cl::desc("Print matching stats for each allocation "
                                     "context in this module's profiles"),
                            cl::Hidden, cl::init(false));

// When enabled, source locations in the profile are "undrifted" (remapped to
// the current IR's locations) before matching, to tolerate stale profiles.
static cl::opt<bool>
    SalvageStaleProfile("memprof-salvage-stale-profile",
                        cl::desc("Salvage stale MemProf profile"),
                        cl::init(false), cl::Hidden);

// When enabled, potential indirect-call targets recorded in callsite profile
// entries are attached as value-profile metadata (see addVPMetadata below).
static cl::opt<bool> ClMemProfAttachCalleeGuids(
    "memprof-attach-calleeguids",
    cl::desc(
        "Attach calleeguids as value profile metadata for indirect calls."),
    cl::init(true), cl::Hidden);

// Threshold (percent) of profiled bytes that must be cold before an allocation
// with mixed-behavior contexts is hinted cold without context sensitivity.
// At the default of 100 this fallback hinting is effectively disabled.
static cl::opt<unsigned> MinMatchedColdBytePercent(
    "memprof-matching-cold-threshold", cl::init(100), cl::Hidden,
    cl::desc("Min percent of cold bytes matched to hint allocation cold"));

// Matching statistics
STATISTIC(NumOfMemProfMissing, "Number of functions without memory profile.");
STATISTIC(NumOfMemProfMismatch,
          "Number of functions having mismatched memory profile hash.");
STATISTIC(NumOfMemProfFunc, "Number of functions having valid memory profile.");
STATISTIC(NumOfMemProfAllocContextProfiles,
          "Number of alloc contexts in memory profile.");
STATISTIC(NumOfMemProfCallSiteProfiles,
          "Number of callsites in memory profile.");
STATISTIC(NumOfMemProfMatchedAllocContexts,
          "Number of matched memory profile alloc contexts.");
STATISTIC(NumOfMemProfMatchedAllocs,
          "Number of matched memory profile allocs.");
STATISTIC(NumOfMemProfMatchedCallSites,
          "Number of matched memory profile callsites.");

// Attach !callsite metadata to \p I encoding the stack ids of its inlined
// call stack (leaf first).
static void addCallsiteMetadata(Instruction &I,
                                ArrayRef<uint64_t> InlinedCallStack,
                                LLVMContext &Ctx) {
  I.setMetadata(LLVMContext::MD_callsite,
                buildCallstackMetadata(InlinedCallStack, Ctx));
}

// Compute a 64-bit id for a stack frame by hashing the function GUID, line
// offset, and column with an 8-byte truncated BLAKE3 (little-endian input
// encoding). This must agree for frames built from the profile and frames
// built from IR debug locations so that the two can be matched by id.
static uint64_t computeStackId(GlobalValue::GUID Function, uint32_t LineOffset,
                               uint32_t Column) {
  llvm::HashBuilder<llvm::TruncatedBLAKE3<8>, llvm::endianness::little>
      HashBuilder;
  HashBuilder.add(Function, LineOffset, Column);
  llvm::BLAKE3Result<8> Hash = HashBuilder.final();
  uint64_t Id;
  std::memcpy(&Id, Hash.data(), sizeof(Hash));
  return Id;
}

// Convenience overload: compute the stack id of a profile Frame.
static uint64_t computeStackId(const memprof::Frame &Frame) {
  return computeStackId(Frame.Function, Frame.LineOffset, Frame.Column);
}

// Add one allocation context (the full call stack of \p AllocInfo) to
// \p AllocTrie, classified by the allocation type derived from its profiled
// lifetime access density, allocation count, and total lifetime. Returns the
// computed allocation type.
static AllocationType addCallStack(CallStackTrie &AllocTrie,
                                   const AllocationInfo *AllocInfo,
                                   uint64_t FullStackId) {
  SmallVector<uint64_t> StackIds;
  for (const auto &StackFrame : AllocInfo->CallStack)
    StackIds.push_back(computeStackId(StackFrame));
  auto AllocType = getAllocType(AllocInfo->Info.getTotalLifetimeAccessDensity(),
                                AllocInfo->Info.getAllocCount(),
                                AllocInfo->Info.getTotalLifetime());
  std::vector<ContextTotalSize> ContextSizeInfo;
  if (recordContextSizeInfoForAnalysis()) {
    // Record (full stack id, total profiled bytes) for this context so later
    // analysis/reporting can attribute sizes per context.
    auto TotalSize = AllocInfo->Info.getTotalSize();
    assert(TotalSize);
    assert(FullStackId != 0);
    ContextSizeInfo.push_back({FullStackId, TotalSize});
  }
  AllocTrie.addCallStack(AllocType, StackIds, std::move(ContextSizeInfo));
  return AllocType;
}

// Return true if InlinedCallStack, computed from a call instruction's debug
// info, is a prefix of ProfileCallStack, a list of Frames from profile data
// (either the allocation data or a callsite).
static bool
stackFrameIncludesInlinedCallStack(ArrayRef<Frame> ProfileCallStack,
                                   ArrayRef<uint64_t> InlinedCallStack) {
  // Prefix match: each IR-derived stack id must equal the id computed from
  // the corresponding leading profile Frame.
  return ProfileCallStack.size() >= InlinedCallStack.size() &&
         llvm::equal(ProfileCallStack.take_front(InlinedCallStack.size()),
                     InlinedCallStack, [](const Frame &F, uint64_t StackId) {
                       return computeStackId(F) == StackId;
                     });
}

// Return true if \p Callee is an operator new / operator new[] /
// size-returning new variant that has a hot/cold counterpart we could hint.
// Variants that already take a __hot_cold_t parameter are only considered
// when -memprof-match-hot-cold-new is set.
static bool isAllocationWithHotColdVariant(const Function *Callee,
                                           const TargetLibraryInfo &TLI) {
  if (!Callee)
    return false;
  LibFunc Func;
  if (!TLI.getLibFunc(*Callee, Func))
    return false;
  switch (Func) {
  case LibFunc_Znwm:
  case LibFunc_ZnwmRKSt9nothrow_t:
  case LibFunc_ZnwmSt11align_val_t:
  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t:
  case LibFunc_Znam:
  case LibFunc_ZnamRKSt9nothrow_t:
  case LibFunc_ZnamSt11align_val_t:
  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t:
  case LibFunc_size_returning_new:
  case LibFunc_size_returning_new_aligned:
    return true;
  case LibFunc_Znwm12__hot_cold_t:
  case LibFunc_ZnwmRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_ZnwmSt11align_val_t12__hot_cold_t:
  case LibFunc_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_Znam12__hot_cold_t:
  case LibFunc_ZnamRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_ZnamSt11align_val_t12__hot_cold_t:
  case LibFunc_ZnamSt11align_val_tRKSt9nothrow_t12__hot_cold_t:
  case LibFunc_size_returning_new_hot_cold:
  case LibFunc_size_returning_new_aligned_hot_cold:
    return ClMemProfMatchHotColdNew;
  default:
    return false;
  }
}

// Summary of one matched allocation context, keyed elsewhere by (full stack
// id, matched frame count); used for -memprof-print-match-info reporting.
struct AllocMatchInfo {
  uint64_t TotalSize = 0;
  AllocationType AllocType = AllocationType::None;
};

// Extract (LineLocation, callee GUID) call edges per caller GUID from the
// IR's debug info, walking each call's inlined-at chain from leaf to root.
// Calls in the inline stack leading to a hot/cold-hintable allocation get a
// synthetic callee GUID of 0 until a callee known to IsPresentInProfile is
// seen; this mirrors how the profile records allocation call stacks.
DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>>
memprof::extractCallsFromIR(Module &M, const TargetLibraryInfo &TLI,
                            function_ref<bool(uint64_t)> IsPresentInProfile) {
  DenseMap<uint64_t, SmallVector<CallEdgeTy, 0>> Calls;

  // Line offset of DIL from the start of its enclosing subprogram, masked to
  // 16 bits.
  auto GetOffset = [](const DILocation *DIL) {
    return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
           0xffff;
  };

  for (Function &F : M) {
    if (F.isDeclaration())
      continue;

    for (auto &BB : F) {
      for (auto &I : BB) {
        if (!isa<CallBase>(&I) || isa<IntrinsicInst>(&I))
          continue;

        auto *CB = dyn_cast<CallBase>(&I);
        auto *CalledFunction = CB->getCalledFunction();
        // Disregard indirect calls and intrinsics.
        if (!CalledFunction || CalledFunction->isIntrinsic())
          continue;

        StringRef CalleeName = CalledFunction->getName();
        // True if we are calling a heap allocation function that supports
        // hot/cold variants.
        bool IsAlloc = isAllocationWithHotColdVariant(CalledFunction, TLI);
        // True for the first iteration below, indicating that we are looking at
        // a leaf node.
        bool IsLeaf = true;
        for (const DILocation *DIL = I.getDebugLoc(); DIL;
             DIL = DIL->getInlinedAt()) {
          StringRef CallerName = DIL->getSubprogramLinkageName();
          assert(!CallerName.empty() &&
                 "Be sure to enable -fdebug-info-for-profiling");
          uint64_t CallerGUID = memprof::getGUID(CallerName);
          uint64_t CalleeGUID = memprof::getGUID(CalleeName);
          // Pretend that we are calling a function with GUID == 0 if we are
          // in the inline stack leading to a heap allocation function.
          if (IsAlloc) {
            if (IsLeaf) {
              // For leaf nodes, set CalleeGUID to 0 without consulting
              // IsPresentInProfile.
              CalleeGUID = 0;
            } else if (!IsPresentInProfile(CalleeGUID)) {
              // In addition to the leaf case above, continue to set CalleeGUID
              // to 0 as long as we don't see CalleeGUID in the profile.
              CalleeGUID = 0;
            } else {
              // Once we encounter a callee that exists in the profile, stop
              // setting CalleeGUID to 0.
              IsAlloc = false;
            }
          }

          LineLocation Loc = {GetOffset(DIL), DIL->getColumn()};
          Calls[CallerGUID].emplace_back(Loc, CalleeGUID);
          CalleeName = CallerName;
          IsLeaf = false;
        }
      }
    }
  }

  // Sort each call list by the source location and drop duplicate edges.
  for (auto &[CallerGUID, CallList] : Calls) {
    llvm::sort(CallList);
    CallList.erase(llvm::unique(CallList), CallList.end());
  }

  return Calls;
}

// Build, for each caller GUID, a map from profile source location to current
// IR source location by running a longest-common-sequence match between the
// call-edge anchors recorded in the profile and those extracted from the IR.
DenseMap<uint64_t, LocToLocMap>
memprof::computeUndriftMap(Module &M, IndexedInstrProfReader *MemProfReader,
                           const TargetLibraryInfo &TLI) {
  DenseMap<uint64_t, LocToLocMap> UndriftMaps;

  DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromProfile =
      MemProfReader->getMemProfCallerCalleePairs();
  DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> CallsFromIR =
      extractCallsFromIR(M, TLI, [&](uint64_t GUID) {
        return CallsFromProfile.contains(GUID);
      });

  // Compute an undrift map for each CallerGUID.
  for (const auto &[CallerGUID, IRAnchors] : CallsFromIR) {
    auto It = CallsFromProfile.find(CallerGUID);
    if (It == CallsFromProfile.end())
      continue;
    const auto &ProfileAnchors = It->second;

    LocToLocMap Matchings;
    longestCommonSequence<LineLocation, GlobalValue::GUID>(
        ProfileAnchors, IRAnchors, std::equal_to<GlobalValue::GUID>(),
        [&](LineLocation A, LineLocation B) { Matchings.try_emplace(A, B); });
    [[maybe_unused]] bool Inserted =
        UndriftMaps.try_emplace(CallerGUID, std::move(Matchings)).second;

    // The insertion must succeed because we visit each GUID exactly once.
    assert(Inserted);
  }

  return UndriftMaps;
}

// Given a MemProfRecord, undrift all the source locations present in the
// record in place.
static void
undriftMemProfRecord(const DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
                     memprof::MemProfRecord &MemProfRec) {
  // Undrift a call stack in place. Frames with no mapping (unknown function,
  // or a location the LCS match didn't pair up) are left untouched.
  auto UndriftCallStack = [&](std::vector<Frame> &CallStack) {
    for (auto &F : CallStack) {
      auto I = UndriftMaps.find(F.Function);
      if (I == UndriftMaps.end())
        continue;
      auto J = I->second.find(LineLocation(F.LineOffset, F.Column));
      if (J == I->second.end())
        continue;
      auto &NewLoc = J->second;
      F.LineOffset = NewLoc.LineOffset;
      F.Column = NewLoc.Column;
    }
  };

  for (auto &AS : MemProfRec.AllocSites)
    UndriftCallStack(AS.CallStack);

  for (auto &CS : MemProfRec.CallSites)
    UndriftCallStack(CS.Frames);
}

// Helper function to process CalleeGuids and create value profile metadata
// (indirect-call target profile) on \p I. Existing value profile data, if any,
// is preserved and nothing new is attached, since merging is not implemented.
static void addVPMetadata(Module &M, Instruction &I,
                          ArrayRef<GlobalValue::GUID> CalleeGuids) {
  if (!ClMemProfAttachCalleeGuids || CalleeGuids.empty())
    return;

  if (I.getMetadata(LLVMContext::MD_prof)) {
    uint64_t Unused;
    // TODO: When merging is implemented, increase this to a typical ICP value
    // (e.g., 3-6) For now, we only need to check if existing data exists, so 1
    // is sufficient
    auto ExistingVD = getValueProfDataFromInst(I, IPVK_IndirectCallTarget,
                                               /*MaxNumValueData=*/1, Unused);
    // We don't know how to merge value profile data yet.
    if (!ExistingVD.empty()) {
      return;
    }
  }

  SmallVector<InstrProfValueData, 4> VDs;
  uint64_t TotalCount = 0;

  for (const GlobalValue::GUID CalleeGUID : CalleeGuids) {
    InstrProfValueData VD;
    VD.Value = CalleeGUID;
    // For MemProf, we don't have actual call counts, so we assign
    // a weight of 1 to each potential target.
    // TODO: Consider making this weight configurable or increasing it to
    // improve effectiveness for ICP.
    VD.Count = 1;
    VDs.push_back(VD);
    TotalCount += VD.Count;
  }

  if (!VDs.empty()) {
    annotateValueSite(M, I, VDs, TotalCount, IPVK_IndirectCallTarget,
                      VDs.size());
  }
}

// Match the MemProf record for \p F against its IR: attach !memprof (MIB) and
// !callsite metadata, or hot/cold attributes, to allocation calls, and
// !callsite (plus optional value-profile) metadata to interior call sites.
// Matching statistics are accumulated into the out-parameters for optional
// printing by the caller.
static void readMemprof(Module &M, Function &F,
                        IndexedInstrProfReader *MemProfReader,
                        const TargetLibraryInfo &TLI,
                        std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo>
                            &FullStackIdToAllocMatchInfo,
                        std::set<std::vector<uint64_t>> &MatchedCallSites,
                        DenseMap<uint64_t, LocToLocMap> &UndriftMaps,
                        OptimizationRemarkEmitter &ORE, uint64_t MaxColdSize) {
  auto &Ctx = M.getContext();
  // Previously we used getIRPGOFuncName() here. If F is local linkage,
  // getIRPGOFuncName() returns FuncName with prefix 'FileName;'. But
  // llvm-profdata uses FuncName in dwarf to create GUID which doesn't
  // contain FileName's prefix. It caused local linkage function can't
  // find MemProfRecord. So we use getName() now.
  // 'unique-internal-linkage-names' can make MemProf work better for local
  // linkage function.
  auto FuncName = F.getName();
  auto FuncGUID = Function::getGUIDAssumingExternalLinkage(FuncName);
  std::optional<memprof::MemProfRecord> MemProfRec;
  auto Err = MemProfReader->getMemProfRecord(FuncGUID).moveInto(MemProfRec);
  if (Err) {
    // No record (or a mismatched one) for this function: bump statistics and
    // optionally emit a diagnostic, gated by the PGO warning options.
    handleAllErrors(std::move(Err), [&](const InstrProfError &IPE) {
      auto Err = IPE.get();
      bool SkipWarning = false;
      LLVM_DEBUG(dbgs() << "Error in reading profile for Func " << FuncName
                        << ": ");
      if (Err == instrprof_error::unknown_function) {
        NumOfMemProfMissing++;
        SkipWarning = !PGOWarnMissing;
        LLVM_DEBUG(dbgs() << "unknown function");
      } else if (Err == instrprof_error::hash_mismatch) {
        NumOfMemProfMismatch++;
        SkipWarning =
            NoPGOWarnMismatch ||
            (NoPGOWarnMismatchComdatWeak &&
             (F.hasComdat() ||
              F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
        LLVM_DEBUG(dbgs() << "hash mismatch (skip=" << SkipWarning << ")");
      }

      if (SkipWarning)
        return;

      std::string Msg = (IPE.message() + Twine(" ") + F.getName().str() +
                         Twine(" Hash = ") + std::to_string(FuncGUID))
                            .str();

      Ctx.diagnose(
          DiagnosticInfoPGOProfile(M.getName().data(), Msg, DS_Warning));
    });
    return;
  }

  NumOfMemProfFunc++;

  // If requested, undrift MemProfRecord so that the source locations in it
  // match those in the IR.
  if (SalvageStaleProfile)
    undriftMemProfRecord(UndriftMaps, *MemProfRec);

  // Detect if there are non-zero column numbers in the profile. If not,
  // treat all column numbers as 0 when matching (i.e. ignore any non-zero
  // columns in the IR). The profiled binary might have been built with
  // column numbers disabled, for example.
  bool ProfileHasColumns = false;

  // Build maps of the location hash to all profile data with that leaf location
  // (allocation info and the callsites).
  std::map<uint64_t, std::set<const AllocationInfo *>> LocHashToAllocInfo;

  // Helper struct for maintaining refs to callsite data. As an alternative we
  // could store a pointer to the CallSiteInfo struct but we also need the frame
  // index. Using ArrayRefs instead makes it a little easier to read.
  struct CallSiteEntry {
    // Subset of frames for the corresponding CallSiteInfo.
    ArrayRef<Frame> Frames;
    // Potential targets for indirect calls.
    ArrayRef<GlobalValue::GUID> CalleeGuids;

    // Only compare Frame contents.
    // Use pointer-based equality instead of ArrayRef's operator== which does
    // element-wise comparison. We want to check if it's the same slice of the
    // underlying array, not just equivalent content.
    bool operator==(const CallSiteEntry &Other) const {
      return Frames.data() == Other.Frames.data() &&
             Frames.size() == Other.Frames.size();
    }
  };

  struct CallSiteEntryHash {
    size_t operator()(const CallSiteEntry &Entry) const {
      return computeFullStackId(Entry.Frames);
    }
  };

  // For the callsites we need to record slices of the frame array (see comments
  // below where the map entries are added) along with their CalleeGuids.
  std::map<uint64_t, std::unordered_set<CallSiteEntry, CallSiteEntryHash>>
      LocHashToCallSites;
  for (auto &AI : MemProfRec->AllocSites) {
    NumOfMemProfAllocContextProfiles++;
    // Associate the allocation info with the leaf frame. The later matching
    // code will match any inlined call sequences in the IR with a longer prefix
    // of call stack frames.
    uint64_t StackId = computeStackId(AI.CallStack[0]);
    LocHashToAllocInfo[StackId].insert(&AI);
    ProfileHasColumns |= AI.CallStack[0].Column;
  }
  for (auto &CS : MemProfRec->CallSites) {
    NumOfMemProfCallSiteProfiles++;
    // Need to record all frames from leaf up to and including this function,
    // as any of these may or may not have been inlined at this point.
    unsigned Idx = 0;
    for (auto &StackFrame : CS.Frames) {
      uint64_t StackId = computeStackId(StackFrame);
      ArrayRef<Frame> FrameSlice = ArrayRef<Frame>(CS.Frames).drop_front(Idx++);
      ArrayRef<GlobalValue::GUID> CalleeGuids(CS.CalleeGuids);
      LocHashToCallSites[StackId].insert({FrameSlice, CalleeGuids});

      ProfileHasColumns |= StackFrame.Column;
      // Once we find this function, we can stop recording.
      if (StackFrame.Function == FuncGUID)
        break;
    }
    assert(Idx <= CS.Frames.size() && CS.Frames[Idx - 1].Function == FuncGUID);
  }

  // Line offset of DIL from the start of its enclosing subprogram, masked to
  // 16 bits (must match the frame encoding used when building the profile).
  auto GetOffset = [](const DILocation *DIL) {
    return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
           0xffff;
  };

  // Now walk the instructions, looking up the associated profile data using
  // debug locations.
  for (auto &BB : F) {
    for (auto &I : BB) {
      if (I.isDebugOrPseudoInst())
        continue;
      // We are only interested in calls (allocation or interior call stack
      // context calls).
      auto *CI = dyn_cast<CallBase>(&I);
      if (!CI)
        continue;
      auto *CalledFunction = CI->getCalledFunction();
      if (CalledFunction && CalledFunction->isIntrinsic())
        continue;
      // List of call stack ids computed from the location hashes on debug
      // locations (leaf to inlined at root).
      SmallVector<uint64_t, 8> InlinedCallStack;
      // Was the leaf location found in one of the profile maps?
      bool LeafFound = false;
      // If leaf was found in a map, iterators pointing to its location in both
      // of the maps. It might exist in neither, one, or both (the latter case
      // can happen because we don't currently have discriminators to
      // distinguish the case when a single line/col maps to both an allocation
      // and another callsite).
      auto AllocInfoIter = LocHashToAllocInfo.end();
      auto CallSitesIter = LocHashToCallSites.end();
      for (const DILocation *DIL = I.getDebugLoc(); DIL != nullptr;
           DIL = DIL->getInlinedAt()) {
        // Use C++ linkage name if possible. Need to compile with
        // -fdebug-info-for-profiling to get linkage name.
        StringRef Name = DIL->getScope()->getSubprogram()->getLinkageName();
        if (Name.empty())
          Name = DIL->getScope()->getSubprogram()->getName();
        auto CalleeGUID = Function::getGUIDAssumingExternalLinkage(Name);
        auto StackId = computeStackId(CalleeGUID, GetOffset(DIL),
                                      ProfileHasColumns ? DIL->getColumn() : 0);
        // Check if we have found the profile's leaf frame. If yes, collect
        // the rest of the call's inlined context starting here. If not, see if
        // we find a match further up the inlined context (in case the profile
        // was missing debug frames at the leaf).
        if (!LeafFound) {
          AllocInfoIter = LocHashToAllocInfo.find(StackId);
          CallSitesIter = LocHashToCallSites.find(StackId);
          if (AllocInfoIter != LocHashToAllocInfo.end() ||
              CallSitesIter != LocHashToCallSites.end())
            LeafFound = true;
        }
        if (LeafFound)
          InlinedCallStack.push_back(StackId);
      }
      // If leaf not in either of the maps, skip inst.
      if (!LeafFound)
        continue;

      // First add !memprof metadata from allocation info, if we found the
      // instruction's leaf location in that map, and if the rest of the
      // instruction's locations match the prefix Frame locations on an
      // allocation context with the same leaf.
      if (AllocInfoIter != LocHashToAllocInfo.end() &&
          // Only consider allocations which support hinting.
          isAllocationWithHotColdVariant(CI->getCalledFunction(), TLI)) {
        // We may match this instruction's location list to multiple MIB
        // contexts. Add them to a Trie specialized for trimming the contexts to
        // the minimal needed to disambiguate contexts with unique behavior.
        CallStackTrie AllocTrie(&ORE, MaxColdSize);
        uint64_t TotalSize = 0;
        uint64_t TotalColdSize = 0;
        for (auto *AllocInfo : AllocInfoIter->second) {
          // Check the full inlined call stack against this one.
          // If we found and thus matched all frames on the call, include
          // this MIB.
          if (stackFrameIncludesInlinedCallStack(AllocInfo->CallStack,
                                                 InlinedCallStack)) {
            NumOfMemProfMatchedAllocContexts++;
            uint64_t FullStackId = 0;
            if (ClPrintMemProfMatchInfo || recordContextSizeInfoForAnalysis())
              FullStackId = computeFullStackId(AllocInfo->CallStack);
            auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
            TotalSize += AllocInfo->Info.getTotalSize();
            if (AllocType == AllocationType::Cold)
              TotalColdSize += AllocInfo->Info.getTotalSize();
            // Record information about the allocation if match info printing
            // was requested.
            if (ClPrintMemProfMatchInfo) {
              assert(FullStackId != 0);
              FullStackIdToAllocMatchInfo[std::make_pair(
                  FullStackId, InlinedCallStack.size())] = {
                  AllocInfo->Info.getTotalSize(), AllocType};
            }
          }
        }
        // If the threshold for the percent of cold bytes is less than 100%,
        // and not all bytes are cold, see if we should still hint this
        // allocation as cold without context sensitivity.
        if (TotalColdSize < TotalSize && MinMatchedColdBytePercent < 100 &&
            TotalColdSize * 100 >= MinMatchedColdBytePercent * TotalSize) {
          AllocTrie.addSingleAllocTypeAttribute(CI, AllocationType::Cold,
                                                "dominant");
          continue;
        }

        // We might not have matched any to the full inlined call stack.
        // But if we did, create and attach metadata, or a function attribute if
        // all contexts have identical profiled behavior.
        if (!AllocTrie.empty()) {
          NumOfMemProfMatchedAllocs++;
          // MemprofMDAttached will be false if a function attribute was
          // attached.
          bool MemprofMDAttached = AllocTrie.buildAndAttachMIBMetadata(CI);
          assert(MemprofMDAttached == I.hasMetadata(LLVMContext::MD_memprof));
          if (MemprofMDAttached) {
            // Add callsite metadata for the instruction's location list so that
            // it is simpler later on to identify which part of the MIB contexts
            // are from this particular instruction (including during inlining,
            // when the callsite metadata will be updated appropriately).
            // FIXME: can this be changed to strip out the matching stack
            // context ids from the MIB contexts and not add any callsite
            // metadata here to save space?
            addCallsiteMetadata(I, InlinedCallStack, Ctx);
          }
        }
        continue;
      }

      if (CallSitesIter == LocHashToCallSites.end())
        continue;

      // Otherwise, add callsite metadata. If we reach here then we found the
      // instruction's leaf location in the callsites map and not the allocation
      // map.
      for (const auto &CallSiteEntry : CallSitesIter->second) {
        // If we found and thus matched all frames on the call, create and
        // attach call stack metadata.
        if (stackFrameIncludesInlinedCallStack(CallSiteEntry.Frames,
                                               InlinedCallStack)) {
          NumOfMemProfMatchedCallSites++;
          addCallsiteMetadata(I, InlinedCallStack, Ctx);

          // Try to attach indirect call metadata if possible.
          if (!CalledFunction)
            addVPMetadata(M, I, CallSiteEntry.CalleeGuids);

          // Only need to find one with a matching call stack and add a single
          // callsite metadata.

          // Accumulate call site matching information upon request.
          if (ClPrintMemProfMatchInfo) {
            std::vector<uint64_t> CallStack;
            append_range(CallStack, InlinedCallStack);
            MatchedCallSites.insert(std::move(CallStack));
          }
          break;
        }
      }
    }
  }
}

// Construct the pass with the profile path and an optional virtual file
// system; falls back to the real file system when none is provided.
MemProfUsePass::MemProfUsePass(std::string MemoryProfileFile,
                               IntrusiveRefCntPtr<vfs::FileSystem> FS)
    : MemoryProfileFileName(MemoryProfileFile), FS(FS) {
  if (!FS)
    this->FS = vfs::getRealFileSystem();
}

// Pass entry point: open and validate the indexed profile, optionally compute
// undrift maps, then match the profile onto each function definition.
PreservedAnalyses MemProfUsePass::run(Module &M, ModuleAnalysisManager &AM) {
  // Return immediately if the module doesn't contain any function.
  if (M.empty())
    return PreservedAnalyses::all();

  LLVM_DEBUG(dbgs() << "Read in memory profile:");
  auto &Ctx = M.getContext();
  auto ReaderOrErr = IndexedInstrProfReader::create(MemoryProfileFileName, *FS);
  if (Error E = ReaderOrErr.takeError()) {
    handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
      Ctx.diagnose(
          DiagnosticInfoPGOProfile(MemoryProfileFileName.data(), EI.message()));
    });
    return PreservedAnalyses::all();
  }

  std::unique_ptr<IndexedInstrProfReader> MemProfReader =
      std::move(ReaderOrErr.get());
  if (!MemProfReader) {
    Ctx.diagnose(DiagnosticInfoPGOProfile(
        MemoryProfileFileName.data(), StringRef("Cannot get MemProfReader")));
    return PreservedAnalyses::all();
  }

  if (!MemProfReader->hasMemoryProfile()) {
    Ctx.diagnose(DiagnosticInfoPGOProfile(MemoryProfileFileName.data(),
                                          "Not a memory profile"));
    return PreservedAnalyses::all();
  }

  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();

  // TLI from the first function is used only for undrift-map computation;
  // per-function TLI is re-queried in the loop below.
  TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(*M.begin());
  DenseMap<uint64_t, LocToLocMap> UndriftMaps;
  if (SalvageStaleProfile)
    UndriftMaps = computeUndriftMap(M, MemProfReader.get(), TLI);

  // Map from the stack hash and matched frame count of each allocation context
  // in the function profiles to the total profiled size (bytes) and allocation
  // type.
  std::map<std::pair<uint64_t, unsigned>, AllocMatchInfo>
      FullStackIdToAllocMatchInfo;

  // Set of the matched call sites, each expressed as a sequence of an inline
  // call stack.
  std::set<std::vector<uint64_t>> MatchedCallSites;

  uint64_t MaxColdSize = 0;
  if (auto *MemProfSum = MemProfReader->getMemProfSummary())
    MaxColdSize = MemProfSum->getMaxColdTotalSize();

  for (auto &F : M) {
    if (F.isDeclaration())
      continue;

    const TargetLibraryInfo &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
    auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
    readMemprof(M, F, MemProfReader.get(), TLI, FullStackIdToAllocMatchInfo,
                MatchedCallSites, UndriftMaps, ORE, MaxColdSize);
  }

  if (ClPrintMemProfMatchInfo) {
    for (const auto &[IdLengthPair, Info] : FullStackIdToAllocMatchInfo) {
      auto [Id, Length] = IdLengthPair;
      errs() << "MemProf " << getAllocTypeAttributeString(Info.AllocType)
             << " context with id " << Id << " has total profiled size "
             << Info.TotalSize << " is matched with " << Length << " frames\n";
    }

    for (const auto &CallStack : MatchedCallSites) {
      errs() << "MemProf callsite match for inline call stack";
      for (uint64_t StackId : CallStack)
        errs() << " " << StackId;
      errs() << "\n";
    }
  }

  return PreservedAnalyses::none();
}