//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "RootAutoDetector.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_atomic.h"
#include "sanitizer_common/sanitizer_atomic_clang.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch.
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

__sanitizer::atomic_uintptr_t AllFunctionsData = {};

// Keep all the functions for which we collect a flat profile in a linked list.
__sanitizer::SpinMutex FlatCtxArenaMutex;
SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
Arena *FlatCtxArenaHead = nullptr;
SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
Arena *FlatCtxArena = nullptr;

// Set to true when we enter a root, and false when we exit - regardless of
// whether this thread collects a contextual profile for that root.
__thread bool IsUnderContext = false;
__sanitizer::atomic_uint8_t ProfilingStarted = {};

__sanitizer::atomic_uintptr_t RootDetector = {};
RootAutoDetector *getRootDetector() {
  return reinterpret_cast<RootAutoDetector *>(
      __sanitizer::atomic_load_relaxed(&RootDetector));
}

// Utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption).
// "Scratch contexts" are buffers that we return in certain cases - they are
// large enough to allow for memory-safe counter access, but they don't link
// subcontexts below them (the runtime recognizes them and enforces that).
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}

// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
// large.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}
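// For illustration, a minimal sketch of how the LSB-tainting convention above
// is meant to be consumed (the authoritative isScratch predicate lives in
// CtxInstrProfiling.h; this merely spells out the assumption it encodes):
//
//   bool looksLikeScratch(const ContextNode *Ctx) {
//     return reinterpret_cast<uint64_t>(Ctx) & 1; // LSB set => scratch
//   }
//
// Worked sizing examples for getArenaAllocSize: a typical request of a few
// KiB gets the full kBuffSize (1 MiB), while an outsized request of, say,
// 3 MiB gets 6 MiB (2 * Needed).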
// Verify the structural integrity of the context.
bool validate(const ContextRoot *Root) {
  // All contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate that all
  // non-null subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;

      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NumCounters,
                                     uint32_t NumCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NumCounters, NumCallsites, Next);
}

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}

ContextNode *onContextEnter(ContextNode &Node) {
  ++Node.counters()[0];
  return &Node;
}

} // namespace

// The scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, à la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
// the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))

// Init the TLSes.
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                 nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;

Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}
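// A sketch of the intended bump-allocation flow (assuming the Arena interface
// from CtxInstrProfiling.h, where tryBumpAllocate hands out consecutive chunks
// starting at start() and returns null once fewer than the requested bytes
// remain):
//
//   Arena *A = Arena::allocateNewArena(kBuffSize);
//   char *P = A->tryBumpAllocate(64); // first 64 bytes of A->start()
//   char *Q = A->tryBumpAllocate(64); // next 64 bytes, i.e. P + 64
//   // ... eventually nullptr; callers then chain a fresh arena via
//   // allocateNewArena(..., /*Prev=*/A), as getCallsiteSlow does below.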
// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

// If this is the first time we hit a callsite with this particular (Guid)
// callee, we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NumCounters, uint32_t NumCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // If we failed to allocate on the current arena, allocate a new arena,
    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
    // find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NumCounters, NumCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}

ContextNode *getFlatProfile(FunctionData &Data, void *Callee, GUID Guid,
                            uint32_t NumCounters) {
  if (ContextNode *Existing = Data.FlatCtx)
    return Existing;
  {
    // We could instead try to take the lock and, if that fails, return
    // TheScratchContext. But that could leave message pump loops more sparsely
    // profiled than everything else. Maybe that doesn't matter, and we can
    // optimize this later.
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Data.Mutex);
    if (ContextNode *Existing = Data.FlatCtx)
      return Existing;

    auto NeededSize = ContextNode::getAllocSize(NumCounters, 0);
    char *AllocBuff = nullptr;
    {
      __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> FL(
          &FlatCtxArenaMutex);
      if (FlatCtxArena)
        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
      if (!AllocBuff) {
        FlatCtxArena = Arena::allocateNewArena(getArenaAllocSize(NeededSize),
                                               FlatCtxArena);
        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
      }
      if (!FlatCtxArenaHead)
        FlatCtxArenaHead = FlatCtxArena;
    }
    auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
    Data.FlatCtx = Ret;

    Data.EntryAddress = Callee;
    Data.Next = reinterpret_cast<FunctionData *>(
        __sanitizer::atomic_load_relaxed(&AllFunctionsData));
    while (!__sanitizer::atomic_compare_exchange_strong(
        &AllFunctionsData, reinterpret_cast<uintptr_t *>(&Data.Next),
        reinterpret_cast<uintptr_t>(&Data),
        __sanitizer::memory_order_release)) {
    }
  }

  return Data.FlatCtx;
}
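// Note on the registration loop at the end of getFlatProfile: it's the classic
// lock-free list prepend. atomic_compare_exchange_strong reloads the observed
// head into Data.Next on failure, so the empty retry loop is equivalent to
// this sketch:
//
//   do {
//     Data.Next = load(AllFunctionsData);   // snapshot the current head
//   } while (!CAS(&AllFunctionsData, /*expected=*/Data.Next,
//                 /*desired=*/&Data));      // publish with release ordering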
// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters,
                  uint32_t NumCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken the lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NumCounters, NumCallsites);
  AllContextRoots.PushBack(Root);
}

ContextRoot *FunctionData::getOrAllocateContextRoot() {
  auto *Root = CtxRoot;
  if (!canBeRoot(Root))
    return Root;
  if (Root)
    return Root;
  __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Mutex);
  Root = CtxRoot;
  if (!Root) {
    Root = new (__sanitizer::InternalAlloc(sizeof(ContextRoot))) ContextRoot();
    CtxRoot = Root;
  }

  assert(Root);
  return Root;
}

ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid,
                                      uint32_t Counters, uint32_t Callsites)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  IsUnderContext = true;
  __sanitizer::atomic_fetch_add(&Root->TotalEntries, 1,
                                __sanitizer::memory_order_relaxed);
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}
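// Both setupContext (the FirstMemBlock re-check) and getOrAllocateContextRoot
// above follow the same double-checked initialization pattern; schematically:
//
//   if (!Field) {          // cheap, unlocked fast path
//     lock(Mutex);
//     if (!Field)          // re-check: another thread may have won the race
//       Field = initialize();
//   }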
ContextNode *getUnhandledContext(FunctionData &Data, void *Callee, GUID Guid,
                                 uint32_t NumCounters, uint32_t NumCallsites,
                                 ContextRoot *CtxRoot) {

  // 1) if we are currently collecting a contextual profile, fetch a ContextNode
  // in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
  // to (hopefully) offset the penalty of creating these contexts to before
  // profiling.
  //
  // 2) if we are under a root (regardless of whether this thread is collecting
  // a contextual profile for that root), do not collect a flat profile. We want
  // to keep flat profiles only for activations that can't happen under a root,
  // to avoid confusing profiles. We can then, for example, combine flattened
  // and flat profiles meaningfully, as we wouldn't double-count anything.
  //
  // 3) to avoid lengthy startup, don't bother with flat profiles until the
  // profiling has started. We would reset them anyway when profiling starts.
  // HOWEVER. This does lose profiling for message pumps: those functions are
  // entered once and never exit. They should be assumed to be entered before
  // profiling starts - because profiling should start after the server is up
  // and running (which is equivalent to "message pumps are set up").
  if (!CtxRoot) {
    if (auto *RAD = getRootDetector())
      RAD->sample();
    else if (auto *CR = Data.CtxRoot) {
      if (canBeRoot(CR))
        return tryStartContextGivenRoot(CR, Guid, NumCounters, NumCallsites);
    }
    if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
      return TheScratchContext;
    else
      return markAsScratch(
          onContextEnter(*getFlatProfile(Data, Callee, Guid, NumCounters)));
  }
  auto [Iter, Ins] = CtxRoot->Unhandled.insert({Guid, nullptr});
  if (Ins)
    Iter->second = getCallsiteSlow(Guid, &CtxRoot->FirstUnhandledCalleeNode,
                                   NumCounters, 0);
  return markAsScratch(onContextEnter(*Iter->second));
}
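// For orientation, a hedged sketch of the caller-side handshake that
// __llvm_ctx_profile_get_context (below) expects - the authoritative contract
// is in CtxInstrProfiling.h. Conceptually, instrumentation in the caller does
// something like:
//
//   __llvm_ctx_profile_expected_callee[0] = Target;
//   __llvm_ctx_profile_callsite[0] = &CallerCtx->subContexts()[CallsiteIdx];
//   Target(...); // the callee consume()s both TLS slots in get_context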
ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
                                            GUID Guid, uint32_t NumCounters,
                                            uint32_t NumCallsites) {
  auto *CtxRoot = __llvm_ctx_profile_current_context_root;
  // Fast "out" if we're not even doing contextual collection.
  if (!CtxRoot)
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               nullptr);

  // Also fast "out" if the caller is scratch. We can see if it's scratch by
  // looking at the interior pointer into the subcontexts vector that the caller
  // provided: if the context is scratch, so is that interior pointer (because
  // all the address calculations use even - or, more precisely, 8-aligned -
  // values).
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               CtxRoot);

  // If the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
  // Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we would lose counter values for
  // the normal subgraph, this time around. That should be very unlikely, but if
  // it happens too frequently, we should be able to detect discrepancies in
  // entry counts (caller-callee). At the moment, the design goes on the
  // assumption that this is so infrequent that it's not worth doing more for
  // that case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               CtxRoot);

  auto *Callsite = *CallsiteContext;
  // In the case of indirect calls, we will have all seen targets forming a
  // linked list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite ? Callsite
                       : getCallsiteSlow(Guid, CallsiteContext, NumCounters,
                                         NumCallsites);
  if (Ret->callsites_size() != NumCallsites ||
      Ret->counters_size() != NumCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NumCallsites,
                        NumCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}

ContextNode *__llvm_ctx_profile_start_context(FunctionData *FData, GUID Guid,
                                              uint32_t Counters,
                                              uint32_t Callsites) {
  auto *Root = FData->getOrAllocateContextRoot();
  assert(canBeRoot(Root));
  return tryStartContextGivenRoot(Root, Guid, Counters, Callsites);
}

void __llvm_ctx_profile_release_context(FunctionData *FData)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  const auto *CurrentRoot = __llvm_ctx_profile_current_context_root;
  auto *CR = FData->CtxRoot;
  if (!CurrentRoot || CR != CurrentRoot)
    return;
  IsUnderContext = false;
  assert(CR && canBeRoot(CR));
  __llvm_ctx_profile_current_context_root = nullptr;
  CR->Taken.Unlock();
}
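// A hedged sketch of how the start/release pair above is meant to bracket a
// root function (the actual expansion is emitted by the compiler's
// instrumentation lowering, not written by hand):
//
//   ContextNode *Ctx = __llvm_ctx_profile_start_context(
//       &FData, Guid, NumCounters, NumCallsites);
//   // ... function body, bumping Ctx counters / callsite TLS as it goes ...
//   __llvm_ctx_profile_release_context(&FData);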
void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration) {
  size_t NumMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NumMemUnits;

    resetContextNode(*Root->FirstNode);
    if (Root->FirstUnhandledCalleeNode)
      resetContextNode(*Root->FirstUnhandledCalleeNode);
    __sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
  }
  if (AutodetectDuration) {
    // We leak RD intentionally. Knowing when to free it is tricky; there's a
    // race condition with functions observing the `RootDetector` as non-null.
    // This can be addressed, but the alternatives have some added complexity
    // and it's not (yet) worth it.
    auto *RD = new (__sanitizer::InternalAlloc(sizeof(RootAutoDetector)))
        RootAutoDetector(AllFunctionsData, RootDetector, AutodetectDuration);
    RD->start();
  } else {
    __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
  }
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
}

bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
  if (auto *RD = getRootDetector()) {
    __sanitizer::Printf("[ctxprof] Expected the root autodetector to have "
                        "finished well before attempting to fetch a context");
    RD->join();
  }

  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  Writer.startContextSection();
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    Writer.writeContextual(
        *Root->FirstNode, Root->FirstUnhandledCalleeNode,
        __sanitizer::atomic_load_relaxed(&Root->TotalEntries));
  }
  Writer.endContextSection();
  Writer.startFlatSection();
  // The list progresses behind the head, so taking this snapshot allows the
  // list to grow concurrently without racing with our traversal of it.
  const auto *Pos = reinterpret_cast<const FunctionData *>(
      __sanitizer::atomic_load_relaxed(&AllFunctionsData));
  for (; Pos; Pos = Pos->Next) {
    const auto *CR = Pos->CtxRoot;
    if (!CR && canBeRoot(CR)) {
      const auto *FP = Pos->FlatCtx;
      Writer.writeFlat(FP->guid(), FP->counters(), FP->counters_size());
    }
  }
  Writer.endFlatSection();
  return true;
}

void __llvm_ctx_profile_free() {
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
  {
    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
        &AllContextsMutex);
    for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
      for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
        auto *C = A;
        A = A->next();
        __sanitizer::InternalFree(C);
      }
    AllContextRoots.Reset();
  }
  __sanitizer::atomic_store_relaxed(&AllFunctionsData, 0U);
  {
    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
        &FlatCtxArenaMutex);
    FlatCtxArena = nullptr;
    for (auto *A = FlatCtxArenaHead; A;) {
      auto *C = A;
      A = C->next();
      __sanitizer::InternalFree(C);
    }

    FlatCtxArenaHead = nullptr;
  }
}
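// A hedged end-to-end sketch of a collection cycle using the APIs above,
// assuming a hypothetical ProfileWriter implementation (MyWriter) supplied by
// the embedder:
//
//   __llvm_ctx_profile_start_collection(/*AutodetectDuration=*/0);
//   // ... run the workload ...
//   MyWriter W; // implements ProfileWriter
//   if (__llvm_ctx_profile_fetch(W)) {
//     // W now holds the serialized contextual and flat sections.
//   }
//   __llvm_ctx_profile_free();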