1 //===- RootAutodetector.cpp - detect contextual profiling roots -----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 9 #include "RootAutoDetector.h" 10 11 #include "CtxInstrProfiling.h" 12 #include "sanitizer_common/sanitizer_common.h" 13 #include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap) 14 #include <assert.h> 15 #include <dlfcn.h> 16 #include <pthread.h> 17 18 using namespace __ctx_profile; 19 template <typename T> using Set = DenseMap<T, bool>; 20 21 namespace __sanitizer { 22 void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context, 23 bool request_fast, u32 max_depth) { 24 // We can't implement the fast variant. The fast variant ends up invoking an 25 // external allocator, because of pthread_attr_getstack. If this happens 26 // during an allocation of the program being instrumented, a non-reentrant 27 // lock may be taken (this was observed). The allocator called by 28 // pthread_attr_getstack will also try to take that lock. 29 UnwindSlow(pc, max_depth); 30 } 31 } // namespace __sanitizer 32 33 RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) { 34 GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex); 35 Parent.AllSamples.PushBack(this); 36 } 37 38 void RootAutoDetector::start() { 39 atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this)); 40 pthread_create( 41 &WorkerThread, nullptr, 42 +[](void *Ctx) -> void * { 43 RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx); 44 SleepForSeconds(RAD->WaitSeconds); 45 // To avoid holding the AllSamplesMutex, make a snapshot of all the 46 // thread samples collected so far 47 Vector<PerThreadSamples *> SamplesSnapshot; 48 { 49 GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex); 50 SamplesSnapshot.Resize(RAD->AllSamples.Size()); 51 for (uptr I = 0; I < RAD->AllSamples.Size(); ++I) 52 SamplesSnapshot[I] = RAD->AllSamples[I]; 53 } 54 DenseMap<uptr, uint64_t> AllRoots; 55 for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) { 56 GenericScopedLock<SpinMutex>(&SamplesSnapshot[I]->M); 57 SamplesSnapshot[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) { 58 auto [FAddr, Count] = KVP; 59 AllRoots[FAddr] += Count; 60 return true; 61 }); 62 } 63 // FIXME: as a next step, establish a minimum relative nr of samples 64 // per root that would qualify it as a root. 65 for (auto *FD = reinterpret_cast<FunctionData *>( 66 atomic_load_relaxed(&RAD->FunctionDataListHead)); 67 FD; FD = FD->Next) { 68 if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) { 69 if (canBeRoot(FD->CtxRoot)) { 70 FD->getOrAllocateContextRoot(); 71 } else { 72 // FIXME: address this by informing the root detection algorithm 73 // to skip over such functions and pick the next down in the 74 // stack. At that point, this becomes an assert. 75 Printf("[ctxprof] Root auto-detector selected a musttail " 76 "function for root (%p). Ignoring\n", 77 FD->EntryAddress); 78 } 79 } 80 } 81 atomic_store_relaxed(&RAD->Self, 0); 82 return nullptr; 83 }, 84 this); 85 } 86 87 void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); } 88 89 void RootAutoDetector::sample() { 90 // tracking reentry in case we want to re-explore fast stack unwind - which 91 // does potentially re-enter the runtime because it calls the instrumented 92 // allocator because of pthread_attr_getstack. See the notes also on 93 // UnwindImpl above. 94 static thread_local bool Entered = false; 95 static thread_local uint64_t Entries = 0; 96 if (Entered || (++Entries % SampleRate)) 97 return; 98 Entered = true; 99 collectStack(); 100 Entered = false; 101 } 102 103 void RootAutoDetector::collectStack() { 104 GET_CALLER_PC_BP; 105 BufferedStackTrace CurrentStack; 106 CurrentStack.Unwind(pc, bp, /*context=*/nullptr, /*request_fast=*/false); 107 // 2 stack frames would be very unlikely to mean anything, since at least the 108 // compiler-rt frame - which can't be inlined - should be observable, which 109 // counts as 1; we can be even more aggressive with this number. 110 if (CurrentStack.size <= 2) 111 return; 112 static thread_local PerThreadSamples *ThisThreadSamples = 113 new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples))) 114 PerThreadSamples(*this); 115 116 if (!ThisThreadSamples->M.TryLock()) 117 return; 118 119 ThisThreadSamples->TrieRoot.insertStack(CurrentStack); 120 ThisThreadSamples->M.Unlock(); 121 } 122 123 uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const { 124 // this requires --linkopt=-Wl,--export-dynamic 125 Dl_info Info; 126 if (dladdr(reinterpret_cast<const void *>(CallsiteAddress), &Info) != 0) 127 return reinterpret_cast<uptr>(Info.dli_saddr); 128 return 0; 129 } 130 131 void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) { 132 ++TheTrie.Count; 133 auto *Current = &TheTrie; 134 // the stack is backwards - the first callsite is at the top. 135 for (int I = ST.size - 1; I >= 0; --I) { 136 uptr ChildAddr = ST.trace[I]; 137 auto [Iter, _] = Current->Children.insert({ChildAddr, Trie(ChildAddr)}); 138 ++Iter->second.Count; 139 Current = &Iter->second; 140 } 141 } 142 143 DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const { 144 // Assuming a message pump design, roots are those functions called by the 145 // message pump. The message pump is an infinite loop (for all practical 146 // considerations) fetching data from a queue. The root functions return - 147 // otherwise the message pump doesn't work. This function detects roots as the 148 // first place in the trie (starting from the root) where a function calls 2 149 // or more functions. 150 // 151 // We start with a callsite trie - the nodes are callsites. Different child 152 // nodes may actually correspond to the same function. 153 // 154 // For example: using function(callsite) 155 // f1(csf1_1) -> f2(csf2_1) -> f3 156 // -> f2(csf2_2) -> f4 157 // 158 // would be represented in our trie as: 159 // csf1_1 -> csf2_1 -> f3 160 // -> csf2_2 -> f4 161 // 162 // While we can assert the control flow returns to f2, we don't know if it 163 // ever returns to f1. f2 could be the message pump. 164 // 165 // We need to convert our callsite tree into a function tree. We can also, 166 // more economically, just see how many distinct functions there are at a 167 // certain depth. When that count is greater than 1, we got to potential roots 168 // and everything above should be considered as non-roots. 169 DenseMap<uptr, uint64_t> Result; 170 Set<const Trie *> Worklist; 171 Worklist.insert({&TheTrie, {}}); 172 173 while (!Worklist.empty()) { 174 Set<const Trie *> NextWorklist; 175 DenseMap<uptr, uint64_t> Candidates; 176 Worklist.forEach([&](const auto &KVP) { 177 auto [Node, _] = KVP; 178 auto SA = getFctStartAddr(Node->CallsiteAddress); 179 Candidates[SA] += Node->Count; 180 Node->Children.forEach([&](auto &ChildKVP) { 181 NextWorklist.insert({&ChildKVP.second, true}); 182 return true; 183 }); 184 return true; 185 }); 186 if (Candidates.size() > 1) { 187 Result.swap(Candidates); 188 break; 189 } 190 Worklist.swap(NextWorklist); 191 } 192 return Result; 193 } 194