xref: /freebsd/contrib/llvm-project/compiler-rt/lib/ctx_profile/RootAutoDetector.cpp (revision 770cf0a5f02dc8983a89c6568d741fbc25baa999)
1 //===- RootAutodetector.cpp - detect contextual profiling roots -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "RootAutoDetector.h"
10 
11 #include "CtxInstrProfiling.h"
12 #include "sanitizer_common/sanitizer_common.h"
13 #include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap)
14 #include <assert.h>
15 #include <dlfcn.h>
16 #include <pthread.h>
17 
18 using namespace __ctx_profile;
19 template <typename T> using Set = DenseMap<T, bool>;
20 
21 namespace __sanitizer {
22 void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
23                                     bool request_fast, u32 max_depth) {
24   // We can't implement the fast variant. The fast variant ends up invoking an
25   // external allocator, because of pthread_attr_getstack. If this happens
26   // during an allocation of the program being instrumented, a non-reentrant
27   // lock may be taken (this was observed). The allocator called by
28   // pthread_attr_getstack will also try to take that lock.
29   UnwindSlow(pc, max_depth);
30 }
31 } // namespace __sanitizer
32 
33 RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) {
34   GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex);
35   Parent.AllSamples.PushBack(this);
36 }
37 
38 void RootAutoDetector::start() {
39   atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this));
40   pthread_create(
41       &WorkerThread, nullptr,
42       +[](void *Ctx) -> void * {
43         RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx);
44         SleepForSeconds(RAD->WaitSeconds);
45         // To avoid holding the AllSamplesMutex, make a snapshot of all the
46         // thread samples collected so far
47         Vector<PerThreadSamples *> SamplesSnapshot;
48         {
49           GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex);
50           SamplesSnapshot.Resize(RAD->AllSamples.Size());
51           for (uptr I = 0; I < RAD->AllSamples.Size(); ++I)
52             SamplesSnapshot[I] = RAD->AllSamples[I];
53         }
54         DenseMap<uptr, uint64_t> AllRoots;
55         for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) {
56           GenericScopedLock<SpinMutex>(&SamplesSnapshot[I]->M);
57           SamplesSnapshot[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) {
58             auto [FAddr, Count] = KVP;
59             AllRoots[FAddr] += Count;
60             return true;
61           });
62         }
63         // FIXME: as a next step, establish a minimum relative nr of samples
64         // per root that would qualify it as a root.
65         for (auto *FD = reinterpret_cast<FunctionData *>(
66                  atomic_load_relaxed(&RAD->FunctionDataListHead));
67              FD; FD = FD->Next) {
68           if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) {
69             if (canBeRoot(FD->CtxRoot)) {
70               FD->getOrAllocateContextRoot();
71             } else {
72               // FIXME: address this by informing the root detection algorithm
73               // to skip over such functions and pick the next down in the
74               // stack. At that point, this becomes an assert.
75               Printf("[ctxprof] Root auto-detector selected a musttail "
76                      "function for root (%p). Ignoring\n",
77                      FD->EntryAddress);
78             }
79           }
80         }
81         atomic_store_relaxed(&RAD->Self, 0);
82         return nullptr;
83       },
84       this);
85 }
86 
87 void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); }
88 
89 void RootAutoDetector::sample() {
90   // tracking reentry in case we want to re-explore fast stack unwind - which
91   // does potentially re-enter the runtime because it calls the instrumented
92   // allocator because of pthread_attr_getstack. See the notes also on
93   // UnwindImpl above.
94   static thread_local bool Entered = false;
95   static thread_local uint64_t Entries = 0;
96   if (Entered || (++Entries % SampleRate))
97     return;
98   Entered = true;
99   collectStack();
100   Entered = false;
101 }
102 
103 void RootAutoDetector::collectStack() {
104   GET_CALLER_PC_BP;
105   BufferedStackTrace CurrentStack;
106   CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false);
107   // 2 stack frames would be very unlikely to mean anything, since at least the
108   // compiler-rt frame - which can't be inlined - should be observable, which
109   // counts as 1; we can be even more aggressive with this number.
110   if (CurrentStack.size <= 2)
111     return;
112   static thread_local PerThreadSamples *ThisThreadSamples =
113       new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples)))
114           PerThreadSamples(*this);
115 
116   if (!ThisThreadSamples->M.TryLock())
117     return;
118 
119   ThisThreadSamples->TrieRoot.insertStack(CurrentStack);
120   ThisThreadSamples->M.Unlock();
121 }
122 
123 uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
124   // this requires --linkopt=-Wl,--export-dynamic
125   Dl_info Info;
126   if (dladdr(reinterpret_cast<const void *>(CallsiteAddress), &Info) != 0)
127     return reinterpret_cast<uptr>(Info.dli_saddr);
128   return 0;
129 }
130 
131 void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) {
132   ++TheTrie.Count;
133   auto *Current = &TheTrie;
134   // the stack is backwards - the first callsite is at the top.
135   for (int I = ST.size - 1; I >= 0; --I) {
136     uptr ChildAddr = ST.trace[I];
137     auto [Iter, _] = Current->Children.insert({ChildAddr, Trie(ChildAddr)});
138     ++Iter->second.Count;
139     Current = &Iter->second;
140   }
141 }
142 
143 DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const {
144   // Assuming a message pump design, roots are those functions called by the
145   // message pump. The message pump is an infinite loop (for all practical
146   // considerations) fetching data from a queue. The root functions return -
147   // otherwise the message pump doesn't work. This function detects roots as the
148   // first place in the trie (starting from the root) where a function calls 2
149   // or more functions.
150   //
151   // We start with a callsite trie - the nodes are callsites. Different child
152   // nodes may actually correspond to the same function.
153   //
154   // For example: using function(callsite)
155   // f1(csf1_1) -> f2(csf2_1) -> f3
156   //            -> f2(csf2_2) -> f4
157   //
158   // would be represented in our trie as:
159   // csf1_1 -> csf2_1 -> f3
160   //        -> csf2_2 -> f4
161   //
162   // While we can assert the control flow returns to f2, we don't know if it
163   // ever returns to f1. f2 could be the message pump.
164   //
165   // We need to convert our callsite tree into a function tree. We can also,
166   // more economically, just see how many distinct functions there are at a
167   // certain depth. When that count is greater than 1, we got to potential roots
168   // and everything above should be considered as non-roots.
169   DenseMap<uptr, uint64_t> Result;
170   Set<const Trie *> Worklist;
171   Worklist.insert({&TheTrie, {}});
172 
173   while (!Worklist.empty()) {
174     Set<const Trie *> NextWorklist;
175     DenseMap<uptr, uint64_t> Candidates;
176     Worklist.forEach([&](const auto &KVP) {
177       auto [Node, _] = KVP;
178       auto SA = getFctStartAddr(Node->CallsiteAddress);
179       Candidates[SA] += Node->Count;
180       Node->Children.forEach([&](auto &ChildKVP) {
181         NextWorklist.insert({&ChildKVP.second, true});
182         return true;
183       });
184       return true;
185     });
186     if (Candidates.size() > 1) {
187       Result.swap(Candidates);
188       break;
189     }
190     Worklist.swap(NextWorklist);
191   }
192   return Result;
193 }
194