xref: /freebsd/contrib/llvm-project/compiler-rt/lib/ctx_profile/CtxInstrProfiling.h (revision 700637cbb5e582861067a11aaca4d053546871d2)
1 /*===- CtxInstrProfiling.h- Contextual instrumentation-based PGO  ---------===*\
2 |*
3 |* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 |* See https://llvm.org/LICENSE.txt for license information.
5 |* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 |*
7 \*===----------------------------------------------------------------------===*/
8 
9 #ifndef CTX_PROFILE_CTXINSTRPROFILING_H_
10 #define CTX_PROFILE_CTXINSTRPROFILING_H_
11 
12 #include "CtxInstrContextNode.h"
13 #include "sanitizer_common/sanitizer_dense_map.h"
14 #include "sanitizer_common/sanitizer_mutex.h"
15 #include <sanitizer/common_interface_defs.h>
16 
17 using namespace llvm::ctx_profile;
18 
19 // Forward-declare for the one unittest checking Arena construction zeroes out
20 // its allocatable space.
21 class ArenaTest_ZeroInit_Test;
22 namespace __ctx_profile {
23 
// Alignment (in bytes) that the data layout of this runtime is built around.
static constexpr size_t ExpectedAlignment = 8;
// This is a hard dependency - see the uses further below. Only x86_64 is
// supported for now; supporting another architecture means tracing every place
// this alignment is relied upon and adjusting accordingly.
static_assert(ExpectedAlignment == sizeof(void *),
              "ctx_profile expects pointers to be ExpectedAlignment bytes");
29 
30 /// Arena (bump allocator) forming a linked list. Intentionally not thread safe.
31 /// Allocation and de-allocation happen using sanitizer APIs. We make that
32 /// explicit.
33 class Arena final {
34 public:
35   // When allocating a new Arena, optionally specify an existing one to append
36   // to, assumed to be the last in the Arena list. We only need to support
37   // appending to the arena list.
38   static Arena *allocateNewArena(size_t Size, Arena *Prev = nullptr);
39   static void freeArenaList(Arena *&A);
40 
size()41   uint64_t size() const { return Size; }
42 
43   // Allocate S bytes or return nullptr if we don't have that many available.
tryBumpAllocate(size_t S)44   char *tryBumpAllocate(size_t S) {
45     if (Pos + S > Size)
46       return nullptr;
47     Pos += S;
48     return start() + (Pos - S);
49   }
50 
next()51   Arena *next() const { return Next; }
52 
53   // the beginning of allocatable memory.
start()54   const char *start() const { return const_cast<Arena *>(this)->start(); }
pos()55   const char *pos() const { return start() + Pos; }
56 
57 private:
58   friend class ::ArenaTest_ZeroInit_Test;
59   explicit Arena(uint32_t Size);
60   ~Arena() = delete;
61 
start()62   char *start() { return reinterpret_cast<char *>(&this[1]); }
63 
64   Arena *Next = nullptr;
65   uint64_t Pos = 0;
66   const uint64_t Size;
67 };
68 
69 // The memory available for allocation follows the Arena header, and we expect
70 // it to be thus aligned.
71 static_assert(alignof(Arena) == ExpectedAlignment);
72 
73 // Verify maintenance to ContextNode doesn't change this invariant, which makes
74 // sure the inlined vectors are appropriately aligned.
75 static_assert(alignof(ContextNode) == ExpectedAlignment);
76 
77 /// ContextRoots hold memory and the start of the contextual profile tree for a
78 /// root function.
79 struct ContextRoot {
80   ContextNode *FirstNode = nullptr;
81   Arena *FirstMemBlock = nullptr;
82   Arena *CurrentMem = nullptr;
83 
84   // Count the number of entries - regardless if we could take the `Taken` mutex
85   ::__sanitizer::atomic_uint64_t TotalEntries = {};
86 
87   // Profiles for functions we encounter when collecting a contexutal profile,
88   // that are not associated with a callsite. This is expected to happen for
89   // signal handlers, but it also - problematically - currently happens for
90   // call sites generated after profile instrumentation, primarily
91   // mem{memset|copy|move|set}.
92   // `Unhandled` serves 2 purposes:
93   //   1. identifying such cases (like the memops)
94   //   2. collecting a profile for them, which can be at least used as a flat
95   //   profile
96   ::__sanitizer::DenseMap<GUID, ContextNode *> Unhandled;
97   // Keep the unhandled contexts in a list, as we allocate them, as it makes it
98   // simpler to send to the writer when the profile is fetched.
99   ContextNode *FirstUnhandledCalleeNode = nullptr;
100 
101   // Taken is used to ensure only one thread traverses the contextual graph -
102   // either to read it or to write it. On server side, the same entrypoint will
103   // be entered by numerous threads, but over time, the profile aggregated by
104   // collecting sequentially on one thread at a time is expected to converge to
105   // the aggregate profile that may have been observable on all the threads.
106   // Note that this is node-by-node aggregation, i.e. summing counters of nodes
107   // at the same position in the graph, not flattening.
108   // Threads that cannot lock Taken (fail TryLock) are given a "scratch context"
109   // - a buffer they can clobber, safely from a memory access perspective.
110   //
111   // Note about "scratch"-ness: we currently ignore the data written in them
112   // (which is anyway clobbered). The design allows for that not be the case -
113   // because "scratch"-ness is first and foremost about not trying to build
114   // subcontexts, and is captured by tainting the pointer value (pointer to the
115   // memory treated as context), but right now, we drop that info.
116   //
117   // We could consider relaxing the requirement of more than one thread
118   // entering by holding a few context trees per entrypoint and then aggregating
119   // them (as explained above) at the end of the profile collection - it's a
120   // tradeoff between collection time and memory use: higher precision can be
121   // obtained with either less concurrent collections but more collection time,
122   // or with more concurrent collections (==more memory) and less collection
123   // time. Note that concurrent collection does happen for different
124   // entrypoints, regardless.
125   ::__sanitizer::SpinMutex Taken;
126 };
127 
128 // This is allocated and zero-initialized by the compiler, the in-place
129 // initialization serves mostly as self-documentation and for testing.
130 // The design is influenced by the observation that typically (at least for
131 // datacenter binaries, which is the motivating target of this profiler) less
132 // than 10% of functions in a binary even appear in a profile (of any kind).
133 //
134 // 1) We could pre-allocate the flat profile storage in the compiler, just like
135 // the flat instrumented profiling does. But that penalizes the static size of
136 // the binary for little reason
137 //
138 // 2) We could do the above but zero-initialize the buffers (which should place
139 // them in .bss), and dynamically populate them. This, though, would page-in
140 // more memory upfront for the binary's runtime
141 //
142 // The current design trades off a bit of overhead at the first time a function
143 // is encountered *for flat profiling* for avoiding size penalties.
144 struct FunctionData {
145 #define _PTRDECL(T, N) T *N = nullptr;
146 #define _VOLATILE_PTRDECL(T, N) T *volatile N = nullptr;
147 #define _MUTEXDECL(N) ::__sanitizer::SpinMutex N;
148 #define _CONTEXT_PTR ContextRoot *CtxRoot = nullptr;
149   CTXPROF_FUNCTION_DATA(_PTRDECL, _CONTEXT_PTR, _VOLATILE_PTRDECL, _MUTEXDECL)
150 #undef _CONTEXT_PTR
151 #undef _PTRDECL
152 #undef _VOLATILE_PTRDECL
153 #undef _MUTEXDECL
154 
155   // Constructor for test only - since this is expected to be
156   // initialized by the compiler.
157   FunctionData() = default;
158   ContextRoot *getOrAllocateContextRoot();
159 
160   // If (unlikely) StaticSpinMutex internals change, we need to modify the LLVM
161   // instrumentation lowering side because it is responsible for allocating and
162   // zero-initializing ContextRoots.
163   static_assert(sizeof(Mutex) == 1);
164 };
165 
/// This API is exposed for testing. See the APIs below about the contract with
/// LLVM.
///
/// Returns true if the least significant bit of the pointer value Ctx is set,
/// which is how a context pointer is marked ("tainted") as scratch.
inline bool isScratch(const void *Ctx) {
  // uintptr_t is the integer type meant for holding pointer values, and is
  // what canBeRoot uses as well.
  return (reinterpret_cast<uintptr_t>(Ctx) & 1U) != 0;
}
171 
172 // True if Ctx is either nullptr or not the 0x1 value.
canBeRoot(const ContextRoot * Ctx)173 inline bool canBeRoot(const ContextRoot *Ctx) {
174   return reinterpret_cast<uintptr_t>(Ctx) != 1U;
175 }
176 
177 } // namespace __ctx_profile
178 
179 extern "C" {
180 
181 // LLVM fills these in when lowering a llvm.instrprof.callsite intrinsic.
182 // position 0 is used when the current context isn't scratch, 1 when it is. They
183 // are volatile because of signal handlers - we mean to specifically control
184 // when the data is loaded.
185 //
186 /// TLS where LLVM stores the pointer of the called value, as part of lowering a
187 /// llvm.instrprof.callsite
188 extern __thread void *volatile __llvm_ctx_profile_expected_callee[2];
189 /// TLS where LLVM stores the pointer inside a caller's subcontexts vector that
190 /// corresponds to the callsite being lowered.
191 extern __thread ContextNode **volatile __llvm_ctx_profile_callsite[2];
192 
193 // __llvm_ctx_profile_current_context_root is exposed for unit testing,
194 // othwerise it's only used internally by compiler-rt/ctx_profile.
195 extern __thread __ctx_profile::ContextRoot
196     *volatile __llvm_ctx_profile_current_context_root;
197 
198 /// called by LLVM in the entry BB of a "entry point" function. The returned
199 /// pointer may be "tainted" - its LSB set to 1 - to indicate it's scratch.
200 ContextNode *
201 __llvm_ctx_profile_start_context(__ctx_profile::FunctionData *FData, GUID Guid,
202                                  uint32_t Counters, uint32_t Callsites);
203 
204 /// paired with __llvm_ctx_profile_start_context, and called at the exit of the
205 /// entry point function.
206 void __llvm_ctx_profile_release_context(__ctx_profile::FunctionData *FData);
207 
208 /// called for any other function than entry points, in the entry BB of such
209 /// function. Same consideration about LSB of returned value as .._start_context
210 ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *FData,
211                                             void *Callee, GUID Guid,
212                                             uint32_t NumCounters,
213                                             uint32_t NumCallsites);
214 
215 /// Prepares for collection. Currently this resets counter values but preserves
216 /// internal context tree structure.
217 void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration = 0);
218 
219 /// Completely free allocated memory.
220 void __llvm_ctx_profile_free();
221 
222 /// Used to obtain the profile. The Writer is called for each root ContextNode,
223 /// with the ContextRoot::Taken taken. The Writer is responsible for traversing
224 /// the structure underneath.
225 /// The Writer's first parameter plays the role of closure for Writer, and is
226 /// what the caller of __llvm_ctx_profile_fetch passes as the Data parameter.
227 /// The second parameter is the root of a context tree.
228 bool __llvm_ctx_profile_fetch(ProfileWriter &);
229 }
230 #endif // CTX_PROFILE_CTXINSTRPROFILING_H_
231