//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_common.h"
#include "xray_allocator.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <memory>
#include <pthread.h>
#include <utility>

namespace __xray {
namespace profileCollectorService {

namespace {

SpinMutex GlobalMutex;
struct ThreadTrie {
  tid_t TId;
  alignas(FunctionCallTrie) std::byte TrieStorage[sizeof(FunctionCallTrie)];
};

struct ProfileBuffer {
  void *Data;
  size_t Size;
};

// Current version of the profile format.
constexpr u64 XRayProfilingVersion = 0x20180424;

// Identifier for XRay profiling files: 'xrayprof' in hex.
constexpr u64 XRayMagicBytes = 0x7872617970726f66;

struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};

struct BlockHeader {
  u32 BlockSize;
  u32 BlockNum;
  u64 ThreadId;
};

struct ThreadData {
  BufferQueue *BQ;
  FunctionCallTrie::Allocators::Buffers Buffers;
  FunctionCallTrie::Allocators Allocators;
  FunctionCallTrie FCT;
  tid_t TId;
};

using ThreadDataArray = Array<ThreadData>;
using ThreadDataAllocator = ThreadDataArray::AllocatorType;

// We use a separate buffer queue as the backing store for the allocator used
// by the ThreadData array. This lets us take ownership of the buffers,
// allocators, and tries associated with a thread by moving the data into the
// array, instead of attempting to copy the data into a separately backed set
// of tries.
alignas(BufferQueue) static std::byte BufferQueueStorage[sizeof(BufferQueue)];
static BufferQueue *BQ = nullptr;
static BufferQueue::Buffer Buffer;
alignas(ThreadDataAllocator) static std::byte
    ThreadDataAllocatorStorage[sizeof(ThreadDataAllocator)];
alignas(ThreadDataArray) static std::byte
    ThreadDataArrayStorage[sizeof(ThreadDataArray)];

static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;

using ProfileBufferArray = Array<ProfileBuffer>;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be globals with aligned storage to avoid dynamic
// initialization. The alignment lets us placement-new objects into the
// storage and have pointers to those objects be appropriately aligned.
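//
// For illustration only: the idiom used throughout this file looks roughly
// like the following, for a hypothetical type `T`:
//
//   alignas(T) static std::byte TStorage[sizeof(T)];
//   static T *TPtr = nullptr;
//   ...
//   new (&TStorage) T(...); // placement-new; no static constructor runs
//   TPtr = reinterpret_cast<T *>(&TStorage);
//
// The storage is constant-initialized, while the objects themselves are
// constructed (and re-constructed) lazily, e.g. in reset() below.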
alignas(ProfileBufferArray) static std::byte
    ProfileBuffersStorage[sizeof(ProfileBufferArray)];
alignas(ProfileBufferArrayAllocator) static std::byte
    ProfileBufferArrayAllocatorStorage[sizeof(ProfileBufferArrayAllocator)];

static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;

// Use a global flag to determine whether the collector implementation has
// been initialized.
static atomic_uint8_t CollectorInitialized{0};

} // namespace

void post(BufferQueue *Q, FunctionCallTrie &&T,
          FunctionCallTrie::Allocators &&A,
          FunctionCallTrie::Allocators::Buffers &&B,
          tid_t TId) XRAY_NEVER_INSTRUMENT {
  DCHECK_NE(Q, nullptr);

  // Bail out early if the collector has not been initialized.
  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
    T.~FunctionCallTrie();
    A.~Allocators();
    Q->releaseBuffer(B.NodeBuffer);
    Q->releaseBuffer(B.RootsBuffer);
    Q->releaseBuffer(B.ShadowStackBuffer);
    Q->releaseBuffer(B.NodeIdPairBuffer);
    B.~Buffers();
    return;
  }

  {
    SpinMutexLock Lock(&GlobalMutex);
    DCHECK_NE(TDAllocator, nullptr);
    DCHECK_NE(TDArray, nullptr);

    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
                               TId) == nullptr) {
      // If we fail to add the data to the array, we should destroy the
      // objects handed to us.
      T.~FunctionCallTrie();
      A.~Allocators();
      Q->releaseBuffer(B.NodeBuffer);
      Q->releaseBuffer(B.RootsBuffer);
      Q->releaseBuffer(B.ShadowStackBuffer);
      Q->releaseBuffer(B.NodeIdPairBuffer);
      B.~Buffers();
    }
  }
}
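
// For reference, a sketch of how a caller hands a thread's data off to the
// collector; `TLD` and its members are hypothetical names standing in for the
// profiling mode's thread-local state:
//
//   profileCollectorService::post(TLD.BQ, std::move(*TLD.FCT),
//                                 std::move(*TLD.Allocators),
//                                 std::move(*TLD.Buffers), GetTid());
//
// After the call, the trie, allocators, and buffers are owned by the
// collector (or have already been cleaned up), so the caller must not touch
// them again.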

// A PathArray holds the function ids that make up a stack trace. In this
// context, a path is almost always represented from the leaf function in a
// call stack to a root of the call trie.
using PathArray = Array<int32_t>;

struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the list of function ids from the leaf to the
  // root of the function call stack, as represented in a FunctionCallTrie.
  PathArray Path;
  const FunctionCallTrie::Node *Node;
};

namespace {

using ProfileRecordArray = Array<ProfileRecord>;

// Perform a depth-first traversal from each root of the FunctionCallTrie to
// generate the path(s) and the data associated with each path.
static void
populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
  using StackArray = Array<const FunctionCallTrie::Node *>;
  using StackAllocator = typename StackArray::AllocatorType;
  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
  StackArray DFSStack(StackAlloc);
  for (const auto *R : Trie.getRoots()) {
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto *Node = DFSStack.back();
      DFSStack.trim(1);
      if (Node == nullptr)
        continue;
      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
      if (Record == nullptr)
        return;

      // Traverse the Node's parents and, as we do so, collect the FIds in
      // the order they appear, yielding a leaf-to-root path.
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
    }
  }
}

static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords)
    XRAY_NEVER_INSTRUMENT {
  auto NextPtr = static_cast<uint8_t *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // The list of function ids in the path comes first.
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel terminating the path.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<uint8_t *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Then add the node data.
    NextPtr =
        static_cast<uint8_t *>(internal_memcpy(
            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
        sizeof(Record.Node->CallCount);
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }

  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
}

} // namespace
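
// Putting the pieces together, each per-thread block emitted by
// serializeRecords above has the following layout (sizes in bytes):
//
//   BlockHeader:  u32 BlockSize | u32 BlockNum | u64 ThreadId   -> 16
//   per record:   int32_t FId, repeated for each path entry     -> 4 * N
//                 int32_t sentinel (0) terminating the path     -> 4
//                 u64 CallCount                                 -> 8
//                 u64 CumulativeLocalTime                       -> 8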

void serialize() XRAY_NEVER_INSTRUMENT {
  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
    return;

  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers, if it's not empty.
  for (auto &B : *ProfileBuffers)
    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());

  DCHECK_NE(TDArray, nullptr);
  if (TDArray->empty())
    return;

  // Then repopulate the global ProfileBuffers.
  u32 I = 0;
  auto MaxSize = profilingFlags()->global_allocator_max;
  auto ProfileArena = allocateBuffer(MaxSize);
  if (ProfileArena == nullptr)
    return;

  auto ProfileArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });

  auto PathArena = allocateBuffer(MaxSize);
  if (PathArena == nullptr)
    return;

  auto PathArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });

  for (const auto &ThreadTrie : *TDArray) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    ProfileRecordAllocator PRAlloc(ProfileArena, MaxSize);
    ProfileRecord::PathAllocator PathAlloc(PathArena, MaxSize);
    ProfileRecordArray ProfileRecords(PRAlloc);

    // First, compute the amount of space we're going to need. We use a local
    // allocator and an __xray::Array<...> to store the intermediary data,
    // computing the size as we go. Then we allocate the contiguous space to
    // contain the thread buffer data.
    if (ThreadTrie.FCT.getRoots().empty())
      continue;

    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
    DCHECK(!ThreadTrie.FCT.getRoots().empty());
    DCHECK(!ProfileRecords.empty());

    // Go through each record to compute the sizes.
    //
    //   header size = block size (4 bytes)
    //     + block number (4 bytes)
    //     + thread id (8 bytes)
    //   record size = path ids (4 bytes * number of ids)
    //     + path sentinel (4 bytes)
    //     + call count (8 bytes)
    //     + local time (8 bytes)
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path.size());

    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
    auto B = ProfileBuffers->Append({});
    B->Size = sizeof(Header) + CumulativeSizes;
    B->Data = allocateBuffer(B->Size);
    DCHECK_NE(B->Data, nullptr);
    serializeRecords(B, Header, ProfileRecords);
  }
}

void reset() XRAY_NEVER_INSTRUMENT {
  atomic_store(&CollectorInitialized, 0, memory_order_release);
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
    ProfileBuffers = nullptr;
  }

  if (TDArray != nullptr) {
    // Release the resources as required.
    for (auto &TD : *TDArray) {
      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
    }
    // We don't bother destroying the array here because we may have already
    // freed the backing store for the array. Instead we reset the pointer to
    // nullptr and re-use the storage later (placement-new'ing into the
    // storage as-is).
    TDArray = nullptr;
  }

  if (TDAllocator != nullptr) {
    TDAllocator->~Allocator();
    TDAllocator = nullptr;
  }

  if (Buffer.Data != nullptr) {
    BQ->releaseBuffer(Buffer);
  }

  if (BQ == nullptr) {
    bool Success = false;
    new (&BufferQueueStorage)
        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
    if (!Success)
      return;
    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
  } else {
    BQ->finalize();

    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
        BufferQueue::ErrorCode::Ok)
      return;
  }

  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
    return;

  new (&ProfileBufferArrayAllocatorStorage)
      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);

  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
  ProfileBuffers =
      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);

  new (&ThreadDataAllocatorStorage)
      ThreadDataAllocator(Buffer.Data, Buffer.Size);
  TDAllocator =
      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);

  atomic_store(&CollectorInitialized, 1, memory_order_release);
}
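
// A consumer drains the collector by chaining calls to nextBuffer, roughly as
// in this sketch:
//
//   XRayBuffer B = nextBuffer({nullptr, 0}); // first call: the file header
//   while (B.Data != nullptr) {
//     // ... persist B.Size bytes starting at B.Data ...
//     B = nextBuffer(B); // hand back the previous buffer to get the next one
//   }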

XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
    return {nullptr, 0};

  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  alignas(XRayProfilingFileHeader) static std::byte
      FileHeaderStorage[sizeof(XRayProfilingFileHeader)];
  pthread_once(
      &Once, +[]() XRAY_NEVER_INSTRUMENT {
        new (&FileHeaderStorage) XRayProfilingFileHeader{};
      });

  if (UNLIKELY(B.Data == nullptr)) {
    // The first buffer should always contain the file header information.
    auto &FileHeader =
        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader.Timestamp = NanoTime();
    FileHeader.PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
  }

  if (UNLIKELY(B.Data == &FileHeaderStorage))
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};

  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
}

} // namespace profileCollectorService
} // namespace __xray