1 //===- RootAutodetector.cpp - detect contextual profiling roots -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "RootAutoDetector.h"
10
11 #include "CtxInstrProfiling.h"
12 #include "sanitizer_common/sanitizer_common.h"
13 #include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap)
14 #include <assert.h>
15 #include <dlfcn.h>
16 #include <pthread.h>
17
using namespace __ctx_profile;
// The sanitizer runtime's DenseMap has no companion set type; emulate a set by
// mapping each key to a throwaway bool value.
template <typename T> using Set = DenseMap<T, bool>;
20
21 namespace __sanitizer {
UnwindImpl(uptr pc,uptr bp,void * context,bool request_fast,u32 max_depth)22 void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
23 bool request_fast, u32 max_depth) {
24 // We can't implement the fast variant. The fast variant ends up invoking an
25 // external allocator, because of pthread_attr_getstack. If this happens
26 // during an allocation of the program being instrumented, a non-reentrant
27 // lock may be taken (this was observed). The allocator called by
28 // pthread_attr_getstack will also try to take that lock.
29 UnwindSlow(pc, max_depth);
30 }
31 } // namespace __sanitizer
32
// Register this thread's sample container with the parent detector, so the
// worker thread can later snapshot it. The parent's list of per-thread
// samples is guarded by AllSamplesMutex.
RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) {
  GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex);
  Parent.AllSamples.PushBack(this);
}
37
start()38 void RootAutoDetector::start() {
39 atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this));
40 pthread_create(
41 &WorkerThread, nullptr,
42 +[](void *Ctx) -> void * {
43 RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx);
44 SleepForSeconds(RAD->WaitSeconds);
45 // To avoid holding the AllSamplesMutex, make a snapshot of all the
46 // thread samples collected so far
47 Vector<PerThreadSamples *> SamplesSnapshot;
48 {
49 GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex);
50 SamplesSnapshot.Resize(RAD->AllSamples.Size());
51 for (uptr I = 0; I < RAD->AllSamples.Size(); ++I)
52 SamplesSnapshot[I] = RAD->AllSamples[I];
53 }
54 DenseMap<uptr, uint64_t> AllRoots;
55 for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) {
56 GenericScopedLock<SpinMutex>(&SamplesSnapshot[I]->M);
57 SamplesSnapshot[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) {
58 auto [FAddr, Count] = KVP;
59 AllRoots[FAddr] += Count;
60 return true;
61 });
62 }
63 // FIXME: as a next step, establish a minimum relative nr of samples
64 // per root that would qualify it as a root.
65 for (auto *FD = reinterpret_cast<FunctionData *>(
66 atomic_load_relaxed(&RAD->FunctionDataListHead));
67 FD; FD = FD->Next) {
68 if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) {
69 if (canBeRoot(FD->CtxRoot)) {
70 FD->getOrAllocateContextRoot();
71 } else {
72 // FIXME: address this by informing the root detection algorithm
73 // to skip over such functions and pick the next down in the
74 // stack. At that point, this becomes an assert.
75 Printf("[ctxprof] Root auto-detector selected a musttail "
76 "function for root (%p). Ignoring\n",
77 FD->EntryAddress);
78 }
79 }
80 }
81 atomic_store_relaxed(&RAD->Self, 0);
82 return nullptr;
83 },
84 this);
85 }
86
// Block until the detection worker thread started by start() has finished.
void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); }
88
sample()89 void RootAutoDetector::sample() {
90 // tracking reentry in case we want to re-explore fast stack unwind - which
91 // does potentially re-enter the runtime because it calls the instrumented
92 // allocator because of pthread_attr_getstack. See the notes also on
93 // UnwindImpl above.
94 static thread_local bool Entered = false;
95 static thread_local uint64_t Entries = 0;
96 if (Entered || (++Entries % SampleRate))
97 return;
98 Entered = true;
99 collectStack();
100 Entered = false;
101 }
102
// Unwind the current thread's stack and, if it's deep enough to be meaningful,
// record it in this thread's callsite trie.
void RootAutoDetector::collectStack() {
  GET_CALLER_PC_BP;
  BufferedStackTrace CurrentStack;
  CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false);
  // 2 stack frames would be very unlikely to mean anything, since at least the
  // compiler-rt frame - which can't be inlined - should be observable, which
  // counts as 1; we can be even more aggressive with this number.
  if (CurrentStack.size <= 2)
    return;
  // Lazily create this thread's sample container using the sanitizer-internal
  // allocator (placement new) rather than the instrumented allocator; the
  // constructor registers the container with the parent detector.
  static thread_local PerThreadSamples *ThisThreadSamples =
      new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples)))
      PerThreadSamples(*this);

  // Don't block on the worker thread if it currently holds our lock - just
  // drop this sample.
  if (!ThisThreadSamples->M.TryLock())
    return;

  ThisThreadSamples->TrieRoot.insertStack(CurrentStack);
  ThisThreadSamples->M.Unlock();
}
122
getFctStartAddr(uptr CallsiteAddress) const123 uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
124 // this requires --linkopt=-Wl,--export-dynamic
125 Dl_info Info;
126 if (dladdr(reinterpret_cast<const void *>(CallsiteAddress), &Info) != 0)
127 return reinterpret_cast<uptr>(Info.dli_saddr);
128 return 0;
129 }
130
insertStack(const StackTrace & ST)131 void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) {
132 ++TheTrie.Count;
133 auto *Current = &TheTrie;
134 // the stack is backwards - the first callsite is at the top.
135 for (int I = ST.size - 1; I >= 0; --I) {
136 uptr ChildAddr = ST.trace[I];
137 auto [Iter, _] = Current->Children.insert({ChildAddr, Trie(ChildAddr)});
138 ++Iter->second.Count;
139 Current = &Iter->second;
140 }
141 }
142
// Analyze this thread's callsite trie and return candidate root functions,
// keyed by function start address, with their accumulated sample counts.
DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const {
  // Assuming a message pump design, roots are those functions called by the
  // message pump. The message pump is an infinite loop (for all practical
  // considerations) fetching data from a queue. The root functions return -
  // otherwise the message pump doesn't work. This function detects roots as the
  // first place in the trie (starting from the root) where a function calls 2
  // or more functions.
  //
  // We start with a callsite trie - the nodes are callsites. Different child
  // nodes may actually correspond to the same function.
  //
  // For example: using function(callsite)
  // f1(csf1_1) -> f2(csf2_1) -> f3
  //            -> f2(csf2_2) -> f4
  //
  // would be represented in our trie as:
  // csf1_1 -> csf2_1 -> f3
  //        -> csf2_2 -> f4
  //
  // While we can assert the control flow returns to f2, we don't know if it
  // ever returns to f1. f2 could be the message pump.
  //
  // We need to convert our callsite tree into a function tree. We can also,
  // more economically, just see how many distinct functions there are at a
  // certain depth. When that count is greater than 1, we got to potential roots
  // and everything above should be considered as non-roots.
  DenseMap<uptr, uint64_t> Result;
  // Breadth-first traversal: Worklist holds all trie nodes at the current
  // depth (Set is a DenseMap used as a set - values are ignored).
  Set<const Trie *> Worklist;
  Worklist.insert({&TheTrie, {}});

  while (!Worklist.empty()) {
    Set<const Trie *> NextWorklist;
    // Collapse this depth's callsites into the functions containing them,
    // summing sample counts per function.
    DenseMap<uptr, uint64_t> Candidates;
    Worklist.forEach([&](const auto &KVP) {
      auto [Node, _] = KVP;
      auto SA = getFctStartAddr(Node->CallsiteAddress);
      Candidates[SA] += Node->Count;
      Node->Children.forEach([&](auto &ChildKVP) {
        NextWorklist.insert({&ChildKVP.second, true});
        return true;
      });
      return true;
    });
    // First depth with more than one distinct function: these are the
    // candidate roots; stop descending.
    if (Candidates.size() > 1) {
      Result.swap(Candidates);
      break;
    }
    Worklist.swap(NextWorklist);
  }
  return Result;
}
194