/*===- CtxInstrProfiling.h- Contextual instrumentation-based PGO ---------===*\
|*
|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|* See https://llvm.org/LICENSE.txt for license information.
|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|*
\*===----------------------------------------------------------------------===*/

#ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
#define CTX_PROFILE_CTXINSTRPROFILING_H_

#include "CtxInstrContextNode.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include <sanitizer/common_interface_defs.h>

using namespace llvm::ctx_profile;

// Forward-declare for the one unittest checking Arena construction zeroes out
// its allocatable space.
class ArenaTest_ZeroInit_Test;
namespace __ctx_profile {

static constexpr size_t ExpectedAlignment = 8;
// We really depend on this, see further below. We currently support x86_64.
// When we want to support other archs, we need to trace the places Alignment
// is used and adjust accordingly.
static_assert(sizeof(void *) == ExpectedAlignment);

/// Arena (bump allocator) forming a linked list. Intentionally not thread
/// safe. Allocation and de-allocation happen using sanitizer APIs. We make
/// that explicit.
class Arena final {
public:
  // When allocating a new Arena, optionally specify an existing one to append
  // to, assumed to be the last in the Arena list. We only need to support
  // appending to the arena list.
  static Arena *allocateNewArena(size_t Size, Arena *Prev = nullptr);
  static void freeArenaList(Arena *&A);

  uint64_t size() const { return Size; }

  // Allocate S bytes or return nullptr if we don't have that many available.
  char *tryBumpAllocate(size_t S) {
    if (Pos + S > Size)
      return nullptr;
    Pos += S;
    return start() + (Pos - S);
  }

  Arena *next() const { return Next; }

  // The beginning of allocatable memory.
  const char *start() const { return const_cast<Arena *>(this)->start(); }
  const char *pos() const { return start() + Pos; }

private:
  friend class ::ArenaTest_ZeroInit_Test;
  explicit Arena(uint32_t Size);
  ~Arena() = delete;

  char *start() { return reinterpret_cast<char *>(&this[1]); }

  Arena *Next = nullptr;
  uint64_t Pos = 0;
  const uint64_t Size;
};

// The memory available for allocation follows the Arena header, and we expect
// it to be thus aligned.
static_assert(alignof(Arena) == ExpectedAlignment);

// Verify maintenance to ContextNode doesn't change this invariant, which makes
// sure the inlined vectors are appropriately aligned.
static_assert(alignof(ContextNode) == ExpectedAlignment);
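
// A minimal usage sketch of the bump allocator above, for exposition only.
// The runtime manages Arenas internally (see ContextRoot below); the
// 4096-byte size and the 24-byte request are arbitrary:
//
//   Arena *A = Arena::allocateNewArena(/*Size=*/4096);
//   if (char *Mem = A->tryBumpAllocate(24)) {
//     // Use the 24 bytes at Mem; the Arena's allocatable space starts out
//     // zeroed (see ArenaTest_ZeroInit_Test above).
//   }
//   Arena::freeArenaList(A); // Frees A and any Arenas chained after it.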

/// ContextRoots are allocated by LLVM for entrypoints. LLVM is only concerned
/// with allocating and zero-initializing the global value (as in, GlobalValue)
/// for it.
struct ContextRoot {
  ContextNode *FirstNode = nullptr;
  Arena *FirstMemBlock = nullptr;
  Arena *CurrentMem = nullptr;
  // This is init-ed by the static zero initializer in LLVM.
  // Taken is used to ensure only one thread traverses the contextual graph -
  // either to read it or to write it. On server side, the same entrypoint will
  // be entered by numerous threads, but over time, the profile aggregated by
  // collecting sequentially on one thread at a time is expected to converge to
  // the aggregate profile that may have been observable on all the threads.
  // Note that this is node-by-node aggregation, i.e. summing counters of nodes
  // at the same position in the graph, not flattening.
  // Threads that cannot lock Taken (fail TryLock) are given a "scratch
  // context" - a buffer they can clobber, safely from a memory access
  // perspective.
  //
  // Note about "scratch"-ness: we currently ignore the data written to scratch
  // contexts (which is anyway clobbered). The design allows for that not to be
  // the case - because "scratch"-ness is first and foremost about not trying
  // to build subcontexts, and is captured by tainting the pointer value
  // (pointer to the memory treated as context) - but right now, we drop that
  // info.
  //
  // We could consider relaxing the one-thread-at-a-time constraint by holding
  // a few context trees per entrypoint and then aggregating them (as explained
  // above) at the end of the profile collection - it's a tradeoff between
  // collection time and memory use: higher precision can be obtained with
  // either fewer concurrent collections but more collection time, or with more
  // concurrent collections (==more memory) and less collection time. Note that
  // concurrent collection does happen for different entrypoints, regardless.
  ::__sanitizer::StaticSpinMutex Taken;

  // If (unlikely) StaticSpinMutex internals change, we need to modify the LLVM
  // instrumentation lowering side because it is responsible for allocating and
  // zero-initializing ContextRoots.
  static_assert(sizeof(Taken) == 1);
};

/// This API is exposed for testing. See the APIs below about the contract with
/// LLVM.
inline bool isScratch(const void *Ctx) {
  return (reinterpret_cast<uint64_t>(Ctx) & 1);
}

} // namespace __ctx_profile

extern "C" {

// LLVM fills these in when lowering a llvm.instrprof.callsite intrinsic.
// Position 0 is used when the current context isn't scratch, 1 when it is.
// They are volatile because of signal handlers - we mean to specifically
// control when the data is loaded.
//
/// TLS where LLVM stores the pointer of the called value, as part of lowering
/// a llvm.instrprof.callsite
extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
/// TLS where LLVM stores the pointer inside a caller's subcontexts vector that
/// corresponds to the callsite being lowered.
extern __thread ContextNode **volatile __llvm_ctx_profile_callsite[2];

// __llvm_ctx_profile_current_context_root is exposed for unit testing,
// otherwise it's only used internally by compiler-rt/ctx_profile.
extern __thread __ctx_profile::ContextRoot
    *volatile __llvm_ctx_profile_current_context_root;

/// called by LLVM in the entry BB of an "entry point" function. The returned
/// pointer may be "tainted" - its LSB set to 1 - to indicate it's scratch.
ContextNode *__llvm_ctx_profile_start_context(__ctx_profile::ContextRoot *Root,
                                              GUID Guid, uint32_t Counters,
                                              uint32_t Callsites);

/// paired with __llvm_ctx_profile_start_context, and called at the exit of the
/// entry point function.
void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);

/// called for any function other than entry points, in the entry BB of such a
/// function. Same consideration about the LSB of the returned value as for
/// __llvm_ctx_profile_start_context.
ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
                                            uint32_t NrCounters,
                                            uint32_t NrCallsites);
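
// An illustrative sketch of how instrumented code is expected to use the TLS
// slots and the three functions declared above. This is not the literal IR
// LLVM emits; `TheRoot`, `Target`, `CallsiteSlot`, `ThisFunction`, the GUIDs,
// and the counter/callsite counts are made-up names and values:
//
//   // Entry point prologue (2 counters, 1 instrumented callsite):
//   ContextNode *Ctx = __llvm_ctx_profile_start_context(
//       &TheRoot, /*Guid=*/1234, /*Counters=*/2, /*Callsites=*/1);
//   ...
//   // Immediately before an instrumented call to Target, record the expected
//   // callee and the slot for this callsite (index 0 here because Ctx is
//   // assumed not to be scratch; index 1 would be used otherwise):
//   __llvm_ctx_profile_expected_callee[0] = reinterpret_cast<void *>(Target);
//   __llvm_ctx_profile_callsite[0] = &CallsiteSlot; // slot in Ctx's subcontexts
//   Target(...);
//   ...
//   // Entry point epilogue:
//   __llvm_ctx_profile_release_context(&TheRoot);
//
//   // A non-entry-point function's prologue instead calls:
//   ContextNode *Ctx = __llvm_ctx_profile_get_context(
//       reinterpret_cast<void *>(&ThisFunction), /*Guid=*/5678, NrCounters,
//       NrCallsites);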

/// Prepares for collection. Currently this resets counter values but preserves
/// internal context tree structure.
void __llvm_ctx_profile_start_collection();

/// Completely free allocated memory.
void __llvm_ctx_profile_free();

/// Used to obtain the profile. The Writer is called for each root ContextNode,
/// with the ContextRoot::Taken taken. The Writer is responsible for traversing
/// the structure underneath.
/// The Writer's first parameter plays the role of closure for Writer, and is
/// what the caller of __llvm_ctx_profile_fetch passes as the Data parameter.
/// The second parameter is the root of a context tree.
bool __llvm_ctx_profile_fetch(void *Data,
                              bool (*Writer)(void *, const ContextNode &));
}
#endif // CTX_PROFILE_CTXINSTRPROFILING_H_
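
// For illustration, a sketch of a Writer callback as described for
// __llvm_ctx_profile_fetch above. `TreeSink` and `serialize` are hypothetical
// stand-ins for whatever the caller uses to consume each context tree:
//
//   static bool writeRoot(void *Data, const ContextNode &Root) {
//     auto *Sink = static_cast<TreeSink *>(Data); // the closure passed as Data
//     Sink->serialize(Root); // traverse Root and its subcontexts
//     return true;           // success; assumed to let fetching continue
//   }
//
//   TreeSink Sink;
//   bool Ok = __llvm_ctx_profile_fetch(&Sink, writeRoot);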