xref: /freebsd/contrib/llvm-project/compiler-rt/lib/xray/xray_profile_collector.cpp (revision 35c0a8c449fd2b7f75029ebed5e10852240f0865)
1 //===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file is a part of XRay, a dynamic runtime instrumentation system.
10 //
11 // This implements the interface for the profileCollectorService.
12 //
13 //===----------------------------------------------------------------------===//
14 #include "xray_profile_collector.h"
15 #include "sanitizer_common/sanitizer_common.h"
16 #include "xray_allocator.h"
17 #include "xray_defs.h"
18 #include "xray_profiling_flags.h"
19 #include "xray_segmented_array.h"
20 #include <memory>
21 #include <pthread.h>
22 #include <utility>
23 
24 namespace __xray {
25 namespace profileCollectorService {
26 
27 namespace {
28 
// Guards the collector's global state below. post(), serialize(), reset() and
// nextBuffer() all hold this lock while touching that state.
SpinMutex GlobalMutex;
// Pairs a thread id with raw storage that is sized and aligned to hold a
// FunctionCallTrie.
// NOTE(review): nothing in this file appears to instantiate this type (the
// loop variable of the same name in serialize() is a ThreadData) -- confirm
// whether it is still needed.
struct ThreadTrie {
  tid_t TId;
  alignas(FunctionCallTrie) std::byte TrieStorage[sizeof(FunctionCallTrie)];
};
34 
// One serialized profile block: serialize() allocates `Size` bytes for `Data`
// and fills them, and nextBuffer() hands the pair back to the caller.
struct ProfileBuffer {
  void *Data;  // Start of the serialized block.
  size_t Size; // Length of the block in bytes (block header included).
};
39 
// Current version of the profile format; stored in every file header.
constexpr u64 XRayProfilingVersion = 0x20180424;

// Identifier for XRay profiling files: the ASCII string 'xrayprof' in hex.
constexpr u64 XRayMagicBytes = 0x7872617970726f66;
45 
// Header that nextBuffer() returns ahead of the first profile block. The magic
// and version fields are fixed; nextBuffer() fills in the timestamp and the
// process id each time it hands the header out.
struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};
52 
// Header written at the start of every serialized per-thread block.
struct BlockHeader {
  u32 BlockSize; // Size of the whole block in bytes, this header included.
  u32 BlockNum;  // Index of the block; nextBuffer() uses it to find the next.
  u64 ThreadId;  // Id of the thread whose call trie the block describes.
};
58 
// Everything post() receives for one thread, kept together so that reset() can
// later hand the buffers back to the queue they came from.
struct ThreadData {
  BufferQueue *BQ; // Queue that owns the buffers listed in `Buffers`.
  FunctionCallTrie::Allocators::Buffers Buffers;
  FunctionCallTrie::Allocators Allocators;
  FunctionCallTrie FCT; // The thread's call trie, walked by serialize().
  tid_t TId;            // Id of the thread that produced the trie.
};
66 
using ThreadDataArray = Array<ThreadData>;
using ThreadDataAllocator = ThreadDataArray::AllocatorType;

// We use a separate buffer queue for the backing store for the allocator used
// by the ThreadData array. This lets us host the buffers, allocators, and tries
// associated with a thread by moving the data into the array instead of
// attempting to copy the data to a separately backed set of tries.
alignas(BufferQueue) static std::byte BufferQueueStorage[sizeof(BufferQueue)];
// Points into BufferQueueStorage once reset() has constructed the queue.
static BufferQueue *BQ = nullptr;
// The single buffer obtained from BQ; it backs the ThreadData allocator.
static BufferQueue::Buffer Buffer;
alignas(ThreadDataAllocator) static std::byte
    ThreadDataAllocatorStorage[sizeof(ThreadDataAllocator)];
alignas(ThreadDataArray) static std::byte
    ThreadDataArrayStorage[sizeof(ThreadDataArray)];

// Both pointers are null until reset() placement-news the objects into the
// storage above, and are set back to null at the start of the next reset().
static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;
84 
using ProfileBufferArray = Array<ProfileBuffer>;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be global aligned storage to avoid dynamic initialization. We
// need these to be aligned to allow us to placement new objects into the
// storage, and have pointers to those objects be appropriately aligned.
alignas(ProfileBufferArray) static std::byte
    ProfileBuffersStorage[sizeof(ProfileBufferArray)];
alignas(ProfileBufferArrayAllocator) static std::byte
    ProfileBufferArrayAllocatorStorage[sizeof(ProfileBufferArrayAllocator)];

// Set by reset() to point into the storage above. ProfileBuffers holds the
// blocks produced by the most recent serialize() call.
static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;
98 
// Use a global flag to determine whether the collector implementation has been
// initialized. reset() clears it on entry and sets it again only after every
// global above has been rebuilt; post() and serialize() bail out while it is 0.
static atomic_uint8_t CollectorInitialized{0};
102 
103 } // namespace
104 
105 void post(BufferQueue *Q, FunctionCallTrie &&T,
106           FunctionCallTrie::Allocators &&A,
107           FunctionCallTrie::Allocators::Buffers &&B,
108           tid_t TId) XRAY_NEVER_INSTRUMENT {
109   DCHECK_NE(Q, nullptr);
110 
111   // Bail out early if the collector has not been initialized.
112   if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
113     T.~FunctionCallTrie();
114     A.~Allocators();
115     Q->releaseBuffer(B.NodeBuffer);
116     Q->releaseBuffer(B.RootsBuffer);
117     Q->releaseBuffer(B.ShadowStackBuffer);
118     Q->releaseBuffer(B.NodeIdPairBuffer);
119     B.~Buffers();
120     return;
121   }
122 
123   {
124     SpinMutexLock Lock(&GlobalMutex);
125     DCHECK_NE(TDAllocator, nullptr);
126     DCHECK_NE(TDArray, nullptr);
127 
128     if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
129                                TId) == nullptr) {
130       // If we fail to add the data to the array, we should destroy the objects
131       // handed us.
132       T.~FunctionCallTrie();
133       A.~Allocators();
134       Q->releaseBuffer(B.NodeBuffer);
135       Q->releaseBuffer(B.RootsBuffer);
136       Q->releaseBuffer(B.ShadowStackBuffer);
137       Q->releaseBuffer(B.NodeIdPairBuffer);
138       B.~Buffers();
139     }
140   }
141 }
142 
// A PathArray holds the function ids that make up a stack trace. In this
// context a path is almost always stored from the leaf function in a call
// stack up to a root of the call trie.
using PathArray = Array<int32_t>;

// One entry of the flattened profile: a trie node plus the path leading to it.
struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the function ids from the leaf to the root of
  // the function call stack as represented from a FunctionCallTrie.
  PathArray Path;
  // The trie node whose call count and cumulative local time get serialized.
  const FunctionCallTrie::Node *Node;
};
156 
157 namespace {
158 
159 using ProfileRecordArray = Array<ProfileRecord>;
160 
161 // Walk a depth-first traversal of each root of the FunctionCallTrie to generate
162 // the path(s) and the data associated with the path.
163 static void
164 populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
165                 const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
166   using StackArray = Array<const FunctionCallTrie::Node *>;
167   using StackAllocator = typename StackArray::AllocatorType;
168   StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
169   StackArray DFSStack(StackAlloc);
170   for (const auto *R : Trie.getRoots()) {
171     DFSStack.Append(R);
172     while (!DFSStack.empty()) {
173       auto *Node = DFSStack.back();
174       DFSStack.trim(1);
175       if (Node == nullptr)
176         continue;
177       auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
178       if (Record == nullptr)
179         return;
180       DCHECK_NE(Record, nullptr);
181 
182       // Traverse the Node's parents and as we're doing so, get the FIds in
183       // the order they appear.
184       for (auto N = Node; N != nullptr; N = N->Parent)
185         Record->Path.Append(N->FId);
186       DCHECK(!Record->Path.empty());
187 
188       for (const auto C : Node->Callees)
189         DFSStack.Append(C.NodePtr);
190     }
191   }
192 }
193 
194 static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
195                              const ProfileRecordArray &ProfileRecords)
196     XRAY_NEVER_INSTRUMENT {
197   auto NextPtr = static_cast<uint8_t *>(
198                      internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
199                  sizeof(Header);
200   for (const auto &Record : ProfileRecords) {
201     // List of IDs follow:
202     for (const auto FId : Record.Path)
203       NextPtr =
204           static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
205           sizeof(FId);
206 
207     // Add the sentinel here.
208     constexpr int32_t SentinelFId = 0;
209     NextPtr = static_cast<uint8_t *>(
210                   internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
211               sizeof(SentinelFId);
212 
213     // Add the node data here.
214     NextPtr =
215         static_cast<uint8_t *>(internal_memcpy(
216             NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
217         sizeof(Record.Node->CallCount);
218     NextPtr = static_cast<uint8_t *>(
219                   internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
220                                   sizeof(Record.Node->CumulativeLocalTime))) +
221               sizeof(Record.Node->CumulativeLocalTime);
222   }
223 
224   DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
225 }
226 
227 } // namespace
228 
229 void serialize() XRAY_NEVER_INSTRUMENT {
230   if (!atomic_load(&CollectorInitialized, memory_order_acquire))
231     return;
232 
233   SpinMutexLock Lock(&GlobalMutex);
234 
235   // Clear out the global ProfileBuffers, if it's not empty.
236   for (auto &B : *ProfileBuffers)
237     deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
238   ProfileBuffers->trim(ProfileBuffers->size());
239 
240   DCHECK_NE(TDArray, nullptr);
241   if (TDArray->empty())
242     return;
243 
244   // Then repopulate the global ProfileBuffers.
245   u32 I = 0;
246   auto MaxSize = profilingFlags()->global_allocator_max;
247   auto ProfileArena = allocateBuffer(MaxSize);
248   if (ProfileArena == nullptr)
249     return;
250 
251   auto ProfileArenaCleanup = at_scope_exit(
252       [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });
253 
254   auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
255   if (PathArena == nullptr)
256     return;
257 
258   auto PathArenaCleanup = at_scope_exit(
259       [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });
260 
261   for (const auto &ThreadTrie : *TDArray) {
262     using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
263     ProfileRecordAllocator PRAlloc(ProfileArena,
264                                    profilingFlags()->global_allocator_max);
265     ProfileRecord::PathAllocator PathAlloc(
266         PathArena, profilingFlags()->global_allocator_max);
267     ProfileRecordArray ProfileRecords(PRAlloc);
268 
269     // First, we want to compute the amount of space we're going to need. We'll
270     // use a local allocator and an __xray::Array<...> to store the intermediary
271     // data, then compute the size as we're going along. Then we'll allocate the
272     // contiguous space to contain the thread buffer data.
273     if (ThreadTrie.FCT.getRoots().empty())
274       continue;
275 
276     populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
277     DCHECK(!ThreadTrie.FCT.getRoots().empty());
278     DCHECK(!ProfileRecords.empty());
279 
280     // Go through each record, to compute the sizes.
281     //
282     // header size = block size (4 bytes)
283     //   + block number (4 bytes)
284     //   + thread id (8 bytes)
285     // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
286     //   + call count (8 bytes)
287     //   + local time (8 bytes)
288     //   + end of record (8 bytes)
289     u32 CumulativeSizes = 0;
290     for (const auto &Record : ProfileRecords)
291       CumulativeSizes += 20 + (4 * Record.Path.size());
292 
293     BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
294     auto B = ProfileBuffers->Append({});
295     B->Size = sizeof(Header) + CumulativeSizes;
296     B->Data = allocateBuffer(B->Size);
297     DCHECK_NE(B->Data, nullptr);
298     serializeRecords(B, Header, ProfileRecords);
299   }
300 }
301 
302 void reset() XRAY_NEVER_INSTRUMENT {
303   atomic_store(&CollectorInitialized, 0, memory_order_release);
304   SpinMutexLock Lock(&GlobalMutex);
305 
306   if (ProfileBuffers != nullptr) {
307     // Clear out the profile buffers that have been serialized.
308     for (auto &B : *ProfileBuffers)
309       deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
310     ProfileBuffers->trim(ProfileBuffers->size());
311     ProfileBuffers = nullptr;
312   }
313 
314   if (TDArray != nullptr) {
315     // Release the resources as required.
316     for (auto &TD : *TDArray) {
317       TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
318       TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
319       TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
320       TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
321     }
322     // We don't bother destroying the array here because we've already
323     // potentially freed the backing store for the array. Instead we're going to
324     // reset the pointer to nullptr, and re-use the storage later instead
325     // (placement-new'ing into the storage as-is).
326     TDArray = nullptr;
327   }
328 
329   if (TDAllocator != nullptr) {
330     TDAllocator->~Allocator();
331     TDAllocator = nullptr;
332   }
333 
334   if (Buffer.Data != nullptr) {
335     BQ->releaseBuffer(Buffer);
336   }
337 
338   if (BQ == nullptr) {
339     bool Success = false;
340     new (&BufferQueueStorage)
341         BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
342     if (!Success)
343       return;
344     BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
345   } else {
346     BQ->finalize();
347 
348     if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
349         BufferQueue::ErrorCode::Ok)
350       return;
351   }
352 
353   if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
354     return;
355 
356   new (&ProfileBufferArrayAllocatorStorage)
357       ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
358   ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
359       &ProfileBufferArrayAllocatorStorage);
360 
361   new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
362   ProfileBuffers =
363       reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
364 
365   new (&ThreadDataAllocatorStorage)
366       ThreadDataAllocator(Buffer.Data, Buffer.Size);
367   TDAllocator =
368       reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
369   new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
370   TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);
371 
372   atomic_store(&CollectorInitialized, 1, memory_order_release);
373 }
374 
375 XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
376   SpinMutexLock Lock(&GlobalMutex);
377 
378   if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
379     return {nullptr, 0};
380 
381   static pthread_once_t Once = PTHREAD_ONCE_INIT;
382   alignas(XRayProfilingFileHeader) static std::byte
383       FileHeaderStorage[sizeof(XRayProfilingFileHeader)];
384   pthread_once(
385       &Once, +[]() XRAY_NEVER_INSTRUMENT {
386         new (&FileHeaderStorage) XRayProfilingFileHeader{};
387       });
388 
389   if (UNLIKELY(B.Data == nullptr)) {
390     // The first buffer should always contain the file header information.
391     auto &FileHeader =
392         *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
393     FileHeader.Timestamp = NanoTime();
394     FileHeader.PID = internal_getpid();
395     return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
396   }
397 
398   if (UNLIKELY(B.Data == &FileHeaderStorage))
399     return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};
400 
401   BlockHeader Header;
402   internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
403   auto NextBlock = Header.BlockNum + 1;
404   if (NextBlock < ProfileBuffers->size())
405     return {(*ProfileBuffers)[NextBlock].Data,
406             (*ProfileBuffers)[NextBlock].Size};
407   return {nullptr, 0};
408 }
409 
410 } // namespace profileCollectorService
411 } // namespace __xray
412