//===- CtxInstrProfiling.cpp - contextual instrumented PGO ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CtxInstrProfiling.h"
#include "RootAutoDetector.h"
#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_atomic.h"
#include "sanitizer_common/sanitizer_atomic_clang.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_libc.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include "sanitizer_common/sanitizer_placement_new.h"
#include "sanitizer_common/sanitizer_thread_safety.h"
#include "sanitizer_common/sanitizer_vector.h"

#include <assert.h>

using namespace __ctx_profile;

namespace {
// Keep track of all the context roots we actually saw, so we can then traverse
// them when the user asks for the profile in __llvm_ctx_profile_fetch.
__sanitizer::SpinMutex AllContextsMutex;
SANITIZER_GUARDED_BY(AllContextsMutex)
__sanitizer::Vector<ContextRoot *> AllContextRoots;

__sanitizer::atomic_uintptr_t AllFunctionsData = {};

// Keep all the functions for which we collect a flat profile in a linked list.
__sanitizer::SpinMutex FlatCtxArenaMutex;
SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
Arena *FlatCtxArenaHead = nullptr;
SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
Arena *FlatCtxArena = nullptr;

// Set to true when we enter a root, and false when we exit - regardless of
// whether this thread collects a contextual profile for that root.
__thread bool IsUnderContext = false;
__sanitizer::atomic_uint8_t ProfilingStarted = {};

__sanitizer::atomic_uintptr_t RootDetector = {};
RootAutoDetector *getRootDetector() {
  return reinterpret_cast<RootAutoDetector *>(
      __sanitizer::atomic_load_relaxed(&RootDetector));
}

// Utility to taint a pointer by setting the LSB. There is an assumption
// throughout that the addresses of contexts are even (really, they should be
// align(8), but "even"-ness is the minimum assumption).
// "Scratch contexts" are buffers that we return in certain cases - they are
// large enough to allow for memory-safe counter access, but they don't link
// subcontexts below them (the runtime recognizes them and enforces that).
ContextNode *markAsScratch(const ContextNode *Ctx) {
  return reinterpret_cast<ContextNode *>(reinterpret_cast<uint64_t>(Ctx) | 1);
}
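
// Illustrative (not part of the runtime API): untagging and checking the tag
// are plain bit arithmetic; `isScratch`, used in
// __llvm_ctx_profile_get_context below, is the counterpart check:
//   ContextNode *Tagged = markAsScratch(Ctx);                  // LSB now set
//   bool IsTagged = reinterpret_cast<uint64_t>(Tagged) & 1;    // true
//   auto *Untagged = reinterpret_cast<ContextNode *>(
//       reinterpret_cast<uint64_t>(Tagged) & ~uint64_t(1));    // Ctx again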

// Used when getting the data from TLS. We don't *really* need to reset, but
// it's a simpler system if we do.
template <typename T> inline T consume(T &V) {
  auto R = V;
  V = {0};
  return R;
}
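
// Illustrative usage (as in __llvm_ctx_profile_get_context below): reading a
// TLS slot via consume also clears it, so a stale value can't be seen twice:
//   auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
//   // __llvm_ctx_profile_callsite[0] is now nullptr.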

// We allocate at least kBuffSize Arena pages. The scratch buffer is also that
// large.
constexpr size_t kPower = 20;
constexpr size_t kBuffSize = 1 << kPower;

// Highly unlikely we need more than kBuffSize for a context.
size_t getArenaAllocSize(size_t Needed) {
  if (Needed >= kBuffSize)
    return 2 * Needed;
  return kBuffSize;
}
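
// For example, with kBuffSize = 1 MiB (1 << 20): a 4 KiB request gets a 1 MiB
// arena, while an (unlikely) 1.5 MiB request gets a 3 MiB arena.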

// verify the structural integrity of the context
bool validate(const ContextRoot *Root) {
  // all contexts should be laid out in some arena page. Go over each arena
  // allocated for this Root, and jump over contained contexts based on
  // self-reported sizes.
  __sanitizer::DenseMap<uint64_t, bool> ContextStartAddrs;
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      if (!ContextStartAddrs.insert({reinterpret_cast<uint64_t>(Ctx), true})
               .second)
        return false;
      Pos += Ctx->size();
    }
  }

  // Now traverse the contexts again the same way, but validate that all
  // non-null subcontext addresses appear in the set computed above.
  for (const auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next()) {
    const auto *Pos = Mem->start();
    while (Pos < Mem->pos()) {
      const auto *Ctx = reinterpret_cast<const ContextNode *>(Pos);
      for (uint32_t I = 0; I < Ctx->callsites_size(); ++I)
        for (auto *Sub = Ctx->subContexts()[I]; Sub; Sub = Sub->next())
          if (!ContextStartAddrs.find(reinterpret_cast<uint64_t>(Sub)))
            return false;

      Pos += Ctx->size();
    }
  }
  return true;
}

inline ContextNode *allocContextNode(char *Place, GUID Guid,
                                     uint32_t NumCounters,
                                     uint32_t NumCallsites,
                                     ContextNode *Next = nullptr) {
  assert(reinterpret_cast<uint64_t>(Place) % ExpectedAlignment == 0);
  return new (Place) ContextNode(Guid, NumCounters, NumCallsites, Next);
}

void resetContextNode(ContextNode &Node) {
  // FIXME(mtrofin): this is std::memset, which we can probably use if we
  // drop/reduce the dependency on sanitizer_common.
  for (uint32_t I = 0; I < Node.counters_size(); ++I)
    Node.counters()[I] = 0;
  for (uint32_t I = 0; I < Node.callsites_size(); ++I)
    for (auto *Next = Node.subContexts()[I]; Next; Next = Next->next())
      resetContextNode(*Next);
}
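
// Note: resetContextNode (above) zeroes the counters of the entire subtree
// reachable from Node, not just Node's own counters.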

ContextNode *onContextEnter(ContextNode &Node) {
  ++Node.counters()[0];
  return &Node;
}

} // namespace

// The scratch buffer - what we give when we can't produce a real context (the
// scratch isn't "real" in that it's expected to be clobbered carelessly - we
// don't read it). The other important thing is that the callees from a scratch
// context also get a scratch context.
// Eventually this can be replaced with per-function buffers, à la the typical
// (flat) instrumented FDO buffers. The clobbering aspect won't apply there, but
// the part about determining the nature of the subcontexts does.
__thread char __Buffer[kBuffSize] = {0};

#define TheScratchContext                                                      \
  markAsScratch(reinterpret_cast<ContextNode *>(__Buffer))
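
// Note: TheScratchContext is a macro rather than a global because __Buffer is
// thread-local: each thread's scratch context has a different address, known
// only at the point of use.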

// Init the TLSes.
__thread void *volatile __llvm_ctx_profile_expected_callee[2] = {nullptr,
                                                                 nullptr};
__thread ContextNode **volatile __llvm_ctx_profile_callsite[2] = {0, 0};
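
// Note: index 0 of the pairs above is used on the normal execution path;
// index 1 is populated by signal handlers that interrupt it (see the
// discussion in __llvm_ctx_profile_get_context below).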

__thread ContextRoot *volatile __llvm_ctx_profile_current_context_root =
    nullptr;

Arena::Arena(uint32_t Size) : Size(Size) {
  __sanitizer::internal_memset(start(), 0, Size);
}

// FIXME(mtrofin): use malloc / mmap instead of sanitizer common APIs to reduce
// the dependency on the latter.
Arena *Arena::allocateNewArena(size_t Size, Arena *Prev) {
  assert(!Prev || Prev->Next == nullptr);
  Arena *NewArena = new (__sanitizer::InternalAlloc(
      Size + sizeof(Arena), /*cache=*/nullptr, /*alignment=*/ExpectedAlignment))
      Arena(Size);
  if (Prev)
    Prev->Next = NewArena;
  return NewArena;
}
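
// Illustrative layout of the allocation made above: an Arena header followed
// by Size payload bytes, which start() returns and tryBumpAllocate carves up:
//
//   | Arena header (Next, Size, ...) | Size zero-initialized payload bytes |
//   ^ InternalAlloc result           ^ start()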

void Arena::freeArenaList(Arena *&A) {
  assert(A);
  for (auto *I = A; I != nullptr;) {
    auto *Current = I;
    I = I->Next;
    __sanitizer::InternalFree(Current);
  }
  A = nullptr;
}

// If this is the first time we hit a callsite with this particular callee
// (Guid), we need to allocate.
ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
                             uint32_t NumCounters, uint32_t NumCallsites) {
  auto AllocSize = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *Mem = __llvm_ctx_profile_current_context_root->CurrentMem;
  char *AllocPlace = Mem->tryBumpAllocate(AllocSize);
  if (!AllocPlace) {
    // if we failed to allocate on the current arena, allocate a new arena,
    // and place it on __llvm_ctx_profile_current_context_root->CurrentMem so we
    // find it from now on for other cases when we need to getCallsiteSlow.
    // Note that allocateNewArena will link the allocated memory in the list of
    // Arenas.
    __llvm_ctx_profile_current_context_root->CurrentMem = Mem =
        Mem->allocateNewArena(getArenaAllocSize(AllocSize), Mem);
    AllocPlace = Mem->tryBumpAllocate(AllocSize);
  }
  auto *Ret = allocContextNode(AllocPlace, Guid, NumCounters, NumCallsites,
                               *InsertionPoint);
  *InsertionPoint = Ret;
  return Ret;
}
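
// Note: getCallsiteSlow allocates the new node with its `Next` pointing at the
// current head of the callsite's list (*InsertionPoint), then makes it the new
// head. For indirect calls this builds, per callsite, the linked list of seen
// targets that __llvm_ctx_profile_get_context walks by GUID.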

ContextNode *getFlatProfile(FunctionData &Data, void *Callee, GUID Guid,
                            uint32_t NumCounters) {
  if (ContextNode *Existing = Data.FlatCtx)
    return Existing;
  {
    // We could instead try to take the lock and, if that fails, return
    // TheScratchContext. But that could leave message pump loops more sparsely
    // profiled than everything else. Maybe that doesn't matter, and we can
    // optimize this later.
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Data.Mutex);
    if (ContextNode *Existing = Data.FlatCtx)
      return Existing;

    auto NeededSize = ContextNode::getAllocSize(NumCounters, 0);
    char *AllocBuff = nullptr;
    {
      __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> FL(
          &FlatCtxArenaMutex);
      if (FlatCtxArena)
        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
      if (!AllocBuff) {
        FlatCtxArena = Arena::allocateNewArena(getArenaAllocSize(NeededSize),
                                               FlatCtxArena);
        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
      }
      if (!FlatCtxArenaHead)
        FlatCtxArenaHead = FlatCtxArena;
    }
    auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
    Data.FlatCtx = Ret;

    Data.EntryAddress = Callee;
    Data.Next = reinterpret_cast<FunctionData *>(
        __sanitizer::atomic_load_relaxed(&AllFunctionsData));
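    // Lock-free push of `Data` onto the global AllFunctionsData list. On CAS
    // failure, atomic_compare_exchange_strong writes the freshly-observed head
    // back into Data.Next, so each retry re-links against the latest head.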
    while (!__sanitizer::atomic_compare_exchange_strong(
        &AllFunctionsData, reinterpret_cast<uintptr_t *>(&Data.Next),
        reinterpret_cast<uintptr_t>(&Data),
        __sanitizer::memory_order_release)) {
    }
  }

  return Data.FlatCtx;
}

// This should be called once for a Root. Allocate the first arena, set up the
// first context.
void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters,
                  uint32_t NumCallsites) {
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  // Re-check - we got here without having taken the lock.
  if (Root->FirstMemBlock)
    return;
  const auto Needed = ContextNode::getAllocSize(NumCounters, NumCallsites);
  auto *M = Arena::allocateNewArena(getArenaAllocSize(Needed));
  Root->FirstMemBlock = M;
  Root->CurrentMem = M;
  Root->FirstNode = allocContextNode(M->tryBumpAllocate(Needed), Guid,
                                     NumCounters, NumCallsites);
  AllContextRoots.PushBack(Root);
}

ContextRoot *FunctionData::getOrAllocateContextRoot() {
  auto *Root = CtxRoot;
  if (!canBeRoot(Root))
    return Root;
  if (Root)
    return Root;
  __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Mutex);
  Root = CtxRoot;
  if (!Root) {
    Root = new (__sanitizer::InternalAlloc(sizeof(ContextRoot))) ContextRoot();
    CtxRoot = Root;
  }

  assert(Root);
  return Root;
}
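
// Note: the above is the double-checked locking pattern: CtxRoot is read
// outside the FunctionData's Mutex on the fast path, and the ContextRoot is
// allocated at most once, under the Mutex, after a re-check.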

ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid,
                                      uint32_t Counters, uint32_t Callsites)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  IsUnderContext = true;
  __sanitizer::atomic_fetch_add(&Root->TotalEntries, 1,
                                __sanitizer::memory_order_relaxed);
  if (!Root->FirstMemBlock) {
    setupContext(Root, Guid, Counters, Callsites);
  }
  if (Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = Root;
    onContextEnter(*Root->FirstNode);
    return Root->FirstNode;
  }
  // If this thread couldn't take the lock, return scratch context.
  __llvm_ctx_profile_current_context_root = nullptr;
  return TheScratchContext;
}

ContextNode *getUnhandledContext(FunctionData &Data, void *Callee, GUID Guid,
                                 uint32_t NumCounters, uint32_t NumCallsites,
                                 ContextRoot *CtxRoot) {

  // 1) if we are currently collecting a contextual profile, fetch a ContextNode
  // in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
  // to (hopefully) offset the penalty of creating these contexts to before
  // profiling.
  //
  // 2) if we are under a root (regardless of whether this thread is collecting
  // a contextual profile for that root), do not collect a flat profile. We want
  // to keep flat profiles only for activations that can't happen under a root,
  // to avoid confusing profiles. We can, for example, combine flattened and
  // flat profiles meaningfully, as we wouldn't double-count anything.
  //
  // 3) to avoid lengthy startup, don't bother with flat profiles until the
  // profiling has started. We would reset them anyway when profiling starts.
  // HOWEVER. This does lose profiling for message pumps: those functions are
  // entered once and never exit. They should be assumed to be entered before
  // profiling starts - because profiling should start after the server is up
  // and running (which is equivalent to "message pumps are set up").
  if (!CtxRoot) {
    if (auto *RAD = getRootDetector())
      RAD->sample();
    else if (auto *CR = Data.CtxRoot) {
      if (canBeRoot(CR))
        return tryStartContextGivenRoot(CR, Guid, NumCounters, NumCallsites);
    }
    if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
      return TheScratchContext;
    else
      return markAsScratch(
          onContextEnter(*getFlatProfile(Data, Callee, Guid, NumCounters)));
  }
  auto [Iter, Ins] = CtxRoot->Unhandled.insert({Guid, nullptr});
  if (Ins)
    Iter->second = getCallsiteSlow(Guid, &CtxRoot->FirstUnhandledCalleeNode,
                                   NumCounters, 0);
  return markAsScratch(onContextEnter(*Iter->second));
}

ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
                                            GUID Guid, uint32_t NumCounters,
                                            uint32_t NumCallsites) {
  auto *CtxRoot = __llvm_ctx_profile_current_context_root;
  // fast "out" if we're not even doing contextual collection.
  if (!CtxRoot)
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               nullptr);

  // also fast "out" if the caller is scratch. We can see if it's scratch by
  // looking at the interior pointer into the subcontexts vector that the caller
  // provided: if the context is scratch, so is that interior pointer (because
  // all the address calculations use even values; more precisely, 8-aligned
  // values).
  auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
  if (!CallsiteContext || isScratch(CallsiteContext))
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               CtxRoot);

  // if the callee isn't the expected one, return scratch.
  // Signal handler(s) could have been invoked at any point in the execution.
  // Should that have happened, and had it (the handler) been built with
  // instrumentation, its __llvm_ctx_profile_get_context would have failed here.
  // Its sub call graph would have then populated
  // __llvm_ctx_profile_{expected_callee | callsite} at index 1.
  // The normal call graph may be impacted in that, if the signal handler
  // happened somewhere before we read the TLS here, we'd see the TLS reset and
  // we'd also fail here. That would just mean we would lose counter values for
  // the normal subgraph, this time around. That should be very unlikely, but if
  // it happens too frequently, we should be able to detect discrepancies in
  // entry counts (caller-callee). At the moment, the design goes on the
  // assumption that it is so infrequent, though, that it's not worth doing more
  // for that case.
  auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
  if (ExpectedCallee != Callee)
    return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
                               CtxRoot);

  auto *Callsite = *CallsiteContext;
  // in the case of indirect calls, we will have all seen targets forming a
  // linked list here. Find the one corresponding to this callee.
  while (Callsite && Callsite->guid() != Guid) {
    Callsite = Callsite->next();
  }
  auto *Ret = Callsite ? Callsite
                       : getCallsiteSlow(Guid, CallsiteContext, NumCounters,
                                         NumCallsites);
  if (Ret->callsites_size() != NumCallsites ||
      Ret->counters_size() != NumCounters)
    __sanitizer::Printf("[ctxprof] Returned ctx differs from what's asked: "
                        "Context: %p, Asked: %lu %u %u, Got: %lu %u %u \n",
                        reinterpret_cast<void *>(Ret), Guid, NumCallsites,
                        NumCounters, Ret->guid(), Ret->callsites_size(),
                        Ret->counters_size());
  onContextEnter(*Ret);
  return Ret;
}

ContextNode *__llvm_ctx_profile_start_context(FunctionData *FData, GUID Guid,
                                              uint32_t Counters,
                                              uint32_t Callsites) {
  auto *Root = FData->getOrAllocateContextRoot();
  assert(canBeRoot(Root));
  return tryStartContextGivenRoot(Root, Guid, Counters, Callsites);
}

void __llvm_ctx_profile_release_context(FunctionData *FData)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  const auto *CurrentRoot = __llvm_ctx_profile_current_context_root;
  auto *CR = FData->CtxRoot;
  if (!CurrentRoot || CR != CurrentRoot)
    return;
  IsUnderContext = false;
  assert(CR && canBeRoot(CR));
  __llvm_ctx_profile_current_context_root = nullptr;
  CR->Taken.Unlock();
}

void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration) {
  size_t NumMemUnits = 0;
  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);
  for (uint32_t I = 0; I < AllContextRoots.Size(); ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> Lock(
        &Root->Taken);
    for (auto *Mem = Root->FirstMemBlock; Mem; Mem = Mem->next())
      ++NumMemUnits;

    resetContextNode(*Root->FirstNode);
    if (Root->FirstUnhandledCalleeNode)
      resetContextNode(*Root->FirstUnhandledCalleeNode);
    __sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
  }
  if (AutodetectDuration) {
    // we leak RD intentionally. Knowing when to free it is tricky; there's a
    // race condition with functions observing the `RootDetector` as non-null.
    // This can be addressed but the alternatives have some added complexity and
    // it's not (yet) worth it.
    auto *RD = new (__sanitizer::InternalAlloc(sizeof(RootAutoDetector)))
        RootAutoDetector(AllFunctionsData, RootDetector, AutodetectDuration);
    RD->start();
  } else {
    __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
  }
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
}

bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
  if (auto *RD = getRootDetector()) {
    __sanitizer::Printf("[ctxprof] Expected the root autodetector to have "
                        "finished well before attempting to fetch a context");
    RD->join();
  }

  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
      &AllContextsMutex);

  Writer.startContextSection();
  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I) {
    auto *Root = AllContextRoots[I];
    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> TakenLock(
        &Root->Taken);
    if (!validate(Root)) {
      __sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
      return false;
    }
    Writer.writeContextual(
        *Root->FirstNode, Root->FirstUnhandledCalleeNode,
        __sanitizer::atomic_load_relaxed(&Root->TotalEntries));
  }
  Writer.endContextSection();
  Writer.startFlatSection();
  // The list progresses behind the head, so taking this snapshot allows the
  // list to grow concurrently without causing a race condition with our
  // traversing it.
  const auto *Pos = reinterpret_cast<const FunctionData *>(
      __sanitizer::atomic_load_relaxed(&AllFunctionsData));
  for (; Pos; Pos = Pos->Next) {
    const auto *CR = Pos->CtxRoot;
    if (!CR && canBeRoot(CR)) {
      const auto *FP = Pos->FlatCtx;
      Writer.writeFlat(FP->guid(), FP->counters(), FP->counters_size());
    }
  }
  Writer.endFlatSection();
  return true;
}

void __llvm_ctx_profile_free() {
  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
  {
    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
        &AllContextsMutex);
    for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
      for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
        auto *C = A;
        A = A->next();
        __sanitizer::InternalFree(C);
      }
    AllContextRoots.Reset();
  }
  __sanitizer::atomic_store_relaxed(&AllFunctionsData, 0U);
  {
    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
        &FlatCtxArenaMutex);
    FlatCtxArena = nullptr;
    for (auto *A = FlatCtxArenaHead; A;) {
      auto *C = A;
      A = C->next();
      __sanitizer::InternalFree(C);
    }

    FlatCtxArenaHead = nullptr;
  }
}
527