//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

// utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption)
// "scratch contexts" are buffers that we return in certain cases - they are
// large enough to allow for memory safe counter access, but they don't link
// subcontexts below them (the runtime recognizes them and enforces that)
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}

// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
// large.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}

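// For example (illustrative arithmetic only): with kBuffSize = 1 << 20 (1 MiB),
// a context needing a few hundred bytes gets a 1 MiB arena, while a context
// needing 3 MiB - per the comment above, highly unlikely - would get a
// 6 MiB arena (2 * Needed).
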
// verify the structural integrity of the context
bool validate(const ContextRoot *Root) {
  // all contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate that all
  // non-null subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;

      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NrCounters, uint32_t NrCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
}

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}

// Entering a context increments its first counter, which is the entry count.
void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }

} // namespace

// the scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, a la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
// the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))

// init the TLSes
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                 nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;

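// A rough sketch of how the two TLS arrays above (expected_callee / callsite)
// are filled in, as implied by how __llvm_ctx_profile_get_context consumes
// them. The real stores are emitted by the compiler's instrumentation, and
// CallerCtx / CallsiteIdx below are purely illustrative names:
//
//   // immediately before a call in an instrumented caller:
//   __llvm_ctx_profile_expected_callee[0] = reinterpret_cast<void *>(Callee);
//   __llvm_ctx_profile_callsite[0] = &CallerCtx->subContexts()[CallsiteIdx];
//   Callee(...);
//
// __llvm_ctx_profile_get_context then consume()s slot 0 of both arrays; see
// the comment in that function about index 1 and signal handlers.
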
Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

// If this is the first time we hit a callsite with this (Guid) particular
// callee, we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NrCounters, uint32_t NrCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // if we failed to allocate on the current arena, allocate a new arena,
    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so
    // we find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list
    // of Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NrCounters, NrCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}

ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
                                            uint32_t NrCounters,
                                            uint32_t NrCallsites) {
  // fast "out" if we're not even doing contextual collection.
  if (!__llvm_ctx_profile_current_context_root)
    return TheScratchContext;

  // also fast "out" if the caller is scratch. We can tell by looking at the
  // interior pointer into the subcontexts vector that the caller provided: if
  // the caller's context is scratch, so is that interior pointer, because all
  // the address calculations use even (more precisely, 8-aligned) values.
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return TheScratchContext;

  // if the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had the handler been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed
  // here. Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler ran
  // somewhere before we read the TLS here, we'd see the TLS reset and we'd
  // also fail here. That would just mean we lose counter values for the
  // normal subgraph, this time around. That should be very unlikely, but if
  // it happens too frequently, we should be able to detect discrepancies in
  // entry counts (caller-callee). At the moment, the design assumes this is
  // so infrequent that it's not worth doing more for that case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return TheScratchContext;

  auto *Callsite = *CallsiteContext;
  // in the case of indirect calls, we will have all seen targets forming a
  // linked list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite ? Callsite
                       : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
                                         NrCallsites);
  if (Ret->callsites_size() != NrCallsites ||
      Ret->counters_size() != NrCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NrCallsites,
                        NrCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}

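// Illustrative note: for an indirect callsite that has seen more than one
// target, the callsite's slot in the caller's subContexts() holds a list of
// ContextNode instances chained via next(), one per distinct callee GUID,
// with the most recently added target at the head (getCallsiteSlow prepends).
// Conceptually (CallerCtx and I are illustrative names):
//
//   CallerCtx->subContexts()[I] -> {guid B} -> {guid A} -> nullptr
//
// The guid()/next() walk above searches this list.
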
// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
                  uint32_t NrCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken the lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NrCounters, NrCallsites);
  AllContextRoots.PushBack(Root);
}

ContextNode *__llvm_ctx_profile_start_context(
    ContextRoot *Root, GUID Guid, uint32_t Counters,
    uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}

void __llvm_ctx_profile_release_context(ContextRoot *Root)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  if (__llvm_ctx_profile_current_context_root) {
    __llvm_ctx_profile_current_context_root = nullptr;
    Root->Taken.Unlock();
  }
}

void __llvm_ctx_profile_start_collection() {
  size_t NrMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NrMemUnits;

    resetContextNode(*Root->FirstNode);
  }
  __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
}

bool __llvm_ctx_profile_fetch(void *Data,
                              bool (*Writer)(void *W, const ContextNode &)) {
  assert(Writer);
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    if (!Writer(Data, *Root->FirstNode))
      return false;
  }
  return true;
}

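// Illustrative only (not part of this file): a Writer callback compatible with
// __llvm_ctx_profile_fetch could look like the sketch below, where
// MyOutputBuffer and serializeSubtree are hypothetical - the latter standing
// in for code that walks a root's tree via counters()/subContexts()/next():
//
//   bool MyWriter(void *Data, const ContextNode &Root) {
//     return serializeSubtree(*static_cast<MyOutputBuffer *>(Data), Root);
//   }
//   ...
//   MyOutputBuffer Buf;
//   if (!__llvm_ctx_profile_fetch(&Buf, MyWriter)) {
//     // the profile was invalid, or the writer bailed out.
//   }
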
void __llvm_ctx_profile_free() {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
      auto *C = A;
      A = A->next();
      __sanitizer::InternalFree(C);
    }
  AllContextRoots.Reset();
}

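// Summary of the runtime's lifecycle as implemented above (descriptive only,
// not additional API):
//  * __llvm_ctx_profile_start_context: lazily sets up a root's first arena and
//    ContextNode, then tries to take the root's Taken lock; on success the
//    root becomes this thread's current context root, otherwise the scratch
//    context is returned.
//  * __llvm_ctx_profile_get_context: returns (or lazily allocates) the
//    subcontext for the current callsite and callee GUID, or the scratch
//    context when collection isn't active or the handshake doesn't match.
//  * __llvm_ctx_profile_release_context: if this thread owns the root, clears
//    the current context root and releases the root's Taken lock.
//  * __llvm_ctx_profile_start_collection: zeroes the counters of every
//    registered root's context tree.
//  * __llvm_ctx_profile_fetch: validates each root's arenas and passes the
//    root's first ContextNode to the caller-provided Writer.
//  * __llvm_ctx_profile_free: releases all arenas and clears the root list.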