xref: /freebsd/contrib/llvm-project/compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp (revision 4b15965daa99044daf184221b7c283bf7f2d7e66)
1 //===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "CtxInstrProfiling.h"
10 #include "sanitizer_common/sanitizer_allocator_internal.h"
11 #include "sanitizer_common/sanitizer_common.h"
12 #include "sanitizer_common/sanitizer_dense_map.h"
13 #include "sanitizer_common/sanitizer_libc.h"
14 #include "sanitizer_common/sanitizer_mutex.h"
15 #include "sanitizer_common/sanitizer_placement_new.h"
16 #include "sanitizer_common/sanitizer_thread_safety.h"
17 #include "sanitizer_common/sanitizer_vector.h"
18 
19 #include <assert.h>
20 
21 using namespace __ctx_profile;
22 
23 namespace {
24 // Keep track of all the context roots we actually saw, so we can then traverse
25 // them when the user asks for the profile in __llvm_ctx_profile_fetch
26 __sanitizer::SpinMutex AllContextsMutex;
27 SANITIZER_GUARDED_BY(AllContextsMutex)
28 __sanitizer::Vector<ContextRoot *> AllContextRoots;
29 
30 // utility to taint a pointer by setting the LSB. There is an assumption
31 // throughout that the addresses of contexts are even (really, they should be
32 // align(8), but "even"-ness is the minimum assumption)
33 // "scratch contexts" are buffers that we return in certain cases - they are
34 // large enough to allow for memory safe counter access, but they don't link
35 // subcontexts below them (the runtime recognizes them and enforces that)
36 ContextNode *markAsScratch(const ContextNode *Ctx) {
37   return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
38 }
39 
40 // Used when getting the data from TLS. We don't *really* need to reset, but
41 // it's a simpler system if we do.
42 template <typename T> inline T consume(T &V) {
43   auto R = V;
44   V = {0};
45   return R;
46 }
47 
48 // We allocate at least kBuffSize Arena pages. The scratch buffer is also that
49 // large.
50 constexpr size_t kPower = 20;
51 constexpr size_t kBuffSize = 1 << kPower;
52 
53 // Highly unlikely we need more than kBuffSize for a context.
54 size_t getArenaAllocSize(size_t Needed) {
55   if (Needed >= kBuffSize)
56     return 2 * Needed;
57   return kBuffSize;
58 }
59 
60 // verify the structural integrity of the context
61 bool validate(const ContextRoot *Root) {
62   // all contexts should be laid out in some arena page. Go over each arena
63   // allocated for this Root, and jump over contained contexts based on
64   // self-reported sizes.
65   __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
66   for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
67     const auto *Pos = Mem->start();
68     while (Pos < Mem->pos()) {
69       const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
70       if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
71                .second)
72         return false;
73       Pos += Ctx->size();
74     }
75   }
76 
77   // Now traverse the contexts again the same way, but validate all nonull
78   // subcontext addresses appear in the set computed above.
79   for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
80     const auto *Pos = Mem->start();
81     while (Pos < Mem->pos()) {
82       const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
83       for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
84         for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
85           if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
86             return false;
87 
88       Pos += Ctx->size();
89     }
90   }
91   return true;
92 }
93 
94 inline ContextNode *allocContextNode(char *Place, GUID Guid,
95                                      uint32_t NrCounters, uint32_t NrCallsites,
96                                      ContextNode *Next = nullptr) {
97   assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
98   return new (Place) ContextNode(Guid, NrCounters, NrCallsites, Next);
99 }
100 
101 void resetContextNode(ContextNode &Node) {
102   // FIXME(mtrofin): this is std::memset, which we can probably use if we
103   // drop/reduce the dependency on sanitizer_common.
104   for (uint32_t I = 0; I < Node.counters_size(); ++I)
105     Node.counters()[I] = 0;
106   for (uint32_t I = 0; I < Node.callsites_size(); ++I)
107     for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
108       resetContextNode(*Next);
109 }
110 
111 void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }
112 
113 } // namespace
114 
115 // the scratch buffer - what we give when we can't produce a real context (the
116 // scratch isn't "real" in that it's expected to be clobbered carelessly - we
117 // don't read it). The other important thing is that the callees from a scratch
118 // context also get a scratch context.
119 // Eventually this can be replaced with per-function buffers, a'la the typical
120 // (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
121 // the part about determining the nature of the subcontexts does.
122 __thread char __Buffer[kBuffSize] = {0};
123 
124 #define TheScratchContext                                                      \
125   markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))
126 
127 // init the TLSes
128 __thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
129                                                                  nullptr};
130 __thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};
131 
132 __thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
133     nullptr;
134 
135 Arena::Arena(uint32_t Size) : Size(Size) {
136   __sanitizer::internal_memset(start(), 0, Size);
137 }
138 
139 // FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
140 // the dependency on the latter.
141 Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
142   assert(!Prev || Prev->Next == nullptr);
143   Arena *NewArena = new (__sanitizer::InternalAlloc(
144       Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
145       Arena(Size);
146   if (Prev)
147     Prev->Next = NewArena;
148   return NewArena;
149 }
150 
151 void Arena::freeArenaList(Arena *&A) {
152   assert(A);
153   for (auto *I = A; I != nullptr;) {
154     auto *Current = I;
155     I = I->Next;
156     __sanitizer::InternalFree(Current);
157   }
158   A = nullptr;
159 }
160 
161 // If this is the first time we hit a callsite with this (Guid) particular
162 // callee, we need to allocate.
163 ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
164                              uint32_t NrCounters, uint32_t NrCallsites) {
165   auto AllocSize = ContextNode::getAllocSize(NrCounters, NrCallsites);
166   auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
167   char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
168   if (!AllocPlace) {
169     // if we failed to allocate on the current arena, allocate a new arena,
170     // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
171     // find it from now on for other cases when we need to getCallsiteSlow.
172     // Note that allocateNewArena will link the allocated memory in the list of
173     // Arenas.
174     __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
175         Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
176     AllocPlace = Mem->tryBumpAllocate(AllocSize);
177   }
178   auto *Ret = allocContextNode(AllocPlace, Guid, NrCounters, NrCallsites,
179                                *InsertionPoint);
180   *InsertionPoint = Ret;
181   return Ret;
182 }
183 
184 ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
185                                             uint32_t NrCounters,
186                                             uint32_t NrCallsites) {
187   // fast "out" if we're not even doing contextual collection.
188   if (!__llvm_ctx_profile_current_context_root)
189     return TheScratchContext;
190 
191   // also fast "out" if the caller is scratch. We can see if it's scratch by
192   // looking at the interior pointer into the subcontexts vector that the caller
193   // provided, which, if the context is scratch, so is that interior pointer
194   // (because all the address calculations are using even values. Or more
195   // precisely, aligned - 8 values)
196   auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
197   if (!CallsiteContext || isScratch(CallsiteContext))
198     return TheScratchContext;
199 
200   // if the callee isn't the expected one, return scratch.
201   // Signal handler(s) could have been invoked at any point in the execution.
202   // Should that have happened, and had it (the handler) be built with
203   // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
204   // Its sub call graph would have then populated
205   // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
206   // The normal call graph may be impacted in that, if the signal handler
207   // happened somewhere before we read the TLS here, we'd see the TLS reset and
208   // we'd also fail here. That would just mean we would loose counter values for
209   // the normal subgraph, this time around. That should be very unlikely, but if
210   // it happens too frequently, we should be able to detect discrepancies in
211   // entry counts (caller-callee). At the moment, the design goes on the
212   // assumption that is so unfrequent, though, that it's not worth doing more
213   // for that case.
214   auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
215   if (ExpectedCallee != Callee)
216     return TheScratchContext;
217 
218   auto *Callsite = *CallsiteContext;
219   // in the case of indirect calls, we will have all seen targets forming a
220   // linked list here. Find the one corresponding to this callee.
221   while (Callsite && Callsite->guid() != Guid) {
222     Callsite = Callsite->next();
223   }
224   auto *Ret = Callsite ? Callsite
225                        : getCallsiteSlow(Guid, CallsiteContext, NrCounters,
226                                          NrCallsites);
227   if (Ret->callsites_size() != NrCallsites ||
228       Ret->counters_size() != NrCounters)
229     __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
230                         "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
231                         reinterpret_cast<void *>(Ret), Guid, NrCallsites,
232                         NrCounters, Ret->guid(), Ret->callsites_size(),
233                         Ret->counters_size());
234   onContextEnter(*Ret);
235   return Ret;
236 }
237 
238 // This should be called once for a Root. Allocate the first arena, set up the
239 // first context.
240 void setupContext(ContextRoot *Root, GUID Guid, uint32_t NrCounters,
241                   uint32_t NrCallsites) {
242   __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
243       &AllContextsMutex);
244   // Re-check - we got here without having had taken a lock.
245   if (Root->FirstMemBlock)
246     return;
247   const auto Needed = ContextNode::getAllocSize(NrCounters, NrCallsites);
248   auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
249   Root->FirstMemBlock = M;
250   Root->CurrentMem = M;
251   Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
252                                      NrCounters, NrCallsites);
253   AllContextRoots.PushBack(Root);
254 }
255 
256 ContextNode *__llvm_ctx_profile_start_context(
257     ContextRoot *Root, GUID Guid, uint32_t Counters,
258     uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
259   if (!Root->FirstMemBlock) {
260     setupContext(Root, Guid, Counters, Callsites);
261   }
262   if (Root->Taken.TryLock()) {
263     __llvm_ctx_profile_current_context_root = Root;
264     onContextEnter(*Root->FirstNode);
265     return Root->FirstNode;
266   }
267   // If this thread couldn't take the lock, return scratch context.
268   __llvm_ctx_profile_current_context_root = nullptr;
269   return TheScratchContext;
270 }
271 
272 void __llvm_ctx_profile_release_context(ContextRoot *Root)
273     SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
274   if (__llvm_ctx_profile_current_context_root) {
275     __llvm_ctx_profile_current_context_root = nullptr;
276     Root->Taken.Unlock();
277   }
278 }
279 
280 void __llvm_ctx_profile_start_collection() {
281   size_t NrMemUnits = 0;
282   __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
283       &AllContextsMutex);
284   for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
285     auto *Root = AllContextRoots[I];
286     __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
287         &Root->Taken);
288     for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
289       ++NrMemUnits;
290 
291     resetContextNode(*Root->FirstNode);
292   }
293   __sanitizer::Printf("[ctxprof] Initial NrMemUnits: %zu \n", NrMemUnits);
294 }
295 
296 bool __llvm_ctx_profile_fetch(void *Data,
297                               bool (*Writer)(void *W, const ContextNode &)) {
298   assert(Writer);
299   __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
300       &AllContextsMutex);
301 
302   for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
303     auto *Root = AllContextRoots[I];
304     __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
305         &Root->Taken);
306     if (!validate(Root)) {
307       __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
308       return false;
309     }
310     if (!Writer(Data, *Root->FirstNode))
311       return false;
312   }
313   return true;
314 }
315 
316 void __llvm_ctx_profile_free() {
317   __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
318       &AllContextsMutex);
319   for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
320     for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
321       auto *C = A;
322       A = A->next();
323       __sanitizer::InternalFree(C);
324     }
325   AllContextRoots.Reset();
326 }
327