//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file is a part of XRay, a dynamic runtime instrumentation system.
//
// This implements the interface for the profileCollectorService.
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
#include "sanitizer_common/sanitizer_common.h"
#include "xray_allocator.h"
#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
#include <memory>
#include <pthread.h>
#include <utility>

namespace __xray {
namespace profileCollectorService {

namespace {

SpinMutex GlobalMutex;
struct ThreadTrie {
  tid_t TId;
  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
};

struct ProfileBuffer {
  void *Data;
  size_t Size;
};

// Current version of the profile format.
constexpr u64 XRayProfilingVersion = 0x20180424;

// Identifier for XRay profiling files 'xrayprof' in hex.
constexpr u64 XRayMagicBytes = 0x7872617970726f66;

struct XRayProfilingFileHeader {
  const u64 MagicBytes = XRayMagicBytes;
  const u64 Version = XRayProfilingVersion;
  u64 Timestamp = 0; // System time in nanoseconds.
  u64 PID = 0;       // Process ID.
};

struct BlockHeader {
  u32 BlockSize;
  u32 BlockNum;
  u64 ThreadId;
};

struct ThreadData {
  BufferQueue *BQ;
  FunctionCallTrie::Allocators::Buffers Buffers;
  FunctionCallTrie::Allocators Allocators;
  FunctionCallTrie FCT;
  tid_t TId;
};

using ThreadDataArray = Array<ThreadData>;
using ThreadDataAllocator = ThreadDataArray::AllocatorType;

// We use a separate buffer queue for the backing store for the allocator used
// by the ThreadData array. This lets us host the buffers, allocators, and
// tries associated with a thread by moving the data into the array instead of
// attempting to copy the data to a separately backed set of tries.
static typename std::aligned_storage<sizeof(BufferQueue),
                                     alignof(BufferQueue)>::type
    BufferQueueStorage;
static BufferQueue *BQ = nullptr;
static BufferQueue::Buffer Buffer;
static typename std::aligned_storage<sizeof(ThreadDataAllocator),
                                     alignof(ThreadDataAllocator)>::type
    ThreadDataAllocatorStorage;
static typename std::aligned_storage<sizeof(ThreadDataArray),
                                     alignof(ThreadDataArray)>::type
    ThreadDataArrayStorage;

static ThreadDataAllocator *TDAllocator = nullptr;
static ThreadDataArray *TDArray = nullptr;

using ProfileBufferArray = Array<ProfileBuffer>;
using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;

// These need to be global aligned storage to avoid dynamic initialization. We
// need these to be aligned to allow us to placement new objects into the
// storage, and have pointers to those objects be appropriately aligned.
static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
    ProfileBuffersStorage;
static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
    ProfileBufferArrayAllocatorStorage;

static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
static ProfileBufferArray *ProfileBuffers = nullptr;

// Use a global flag to determine whether the collector implementation has been
// initialized.
static atomic_uint8_t CollectorInitialized{0};

} // namespace

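// Takes ownership of a thread's FunctionCallTrie, along with the allocators
// and buffers backing it, and moves it into the global ThreadData array under
// GlobalMutex. If the collector has not been initialized, or if appending to
// the array fails, the handed-in objects are destroyed and their buffers are
// released back to the provided queue instead.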
void post(BufferQueue *Q, FunctionCallTrie &&T,
          FunctionCallTrie::Allocators &&A,
          FunctionCallTrie::Allocators::Buffers &&B,
          tid_t TId) XRAY_NEVER_INSTRUMENT {
  DCHECK_NE(Q, nullptr);

  // Bail out early if the collector has not been initialized.
  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
    T.~FunctionCallTrie();
    A.~Allocators();
    Q->releaseBuffer(B.NodeBuffer);
    Q->releaseBuffer(B.RootsBuffer);
    Q->releaseBuffer(B.ShadowStackBuffer);
    Q->releaseBuffer(B.NodeIdPairBuffer);
    B.~Buffers();
    return;
  }

  {
    SpinMutexLock Lock(&GlobalMutex);
    DCHECK_NE(TDAllocator, nullptr);
    DCHECK_NE(TDArray, nullptr);

    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
                               TId) == nullptr) {
      // If we fail to add the data to the array, we should destroy the objects
      // handed us.
      T.~FunctionCallTrie();
      A.~Allocators();
      Q->releaseBuffer(B.NodeBuffer);
      Q->releaseBuffer(B.RootsBuffer);
      Q->releaseBuffer(B.ShadowStackBuffer);
      Q->releaseBuffer(B.NodeIdPairBuffer);
      B.~Buffers();
    }
  }
}

// A PathArray represents the function IDs that make up a stack trace. In this
// context a path is almost always represented from the leaf function in a call
// stack to a root of the call trie.
using PathArray = Array<int32_t>;

struct ProfileRecord {
  using PathAllocator = typename PathArray::AllocatorType;

  // The Path in this record is the function IDs from the leaf to the root of
  // the function call stack, as represented in a FunctionCallTrie.
  PathArray Path;
  const FunctionCallTrie::Node *Node;
};

namespace {

using ProfileRecordArray = Array<ProfileRecord>;

// Walk a depth-first traversal of each root of the FunctionCallTrie to
// generate the path(s) and the data associated with the path.
static void
populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
  using StackArray = Array<const FunctionCallTrie::Node *>;
  using StackAllocator = typename StackArray::AllocatorType;
  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
  StackArray DFSStack(StackAlloc);
  for (const auto *R : Trie.getRoots()) {
    DFSStack.Append(R);
    while (!DFSStack.empty()) {
      auto *Node = DFSStack.back();
      DFSStack.trim(1);
      if (Node == nullptr)
        continue;
      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
      if (Record == nullptr)
        return;
      DCHECK_NE(Record, nullptr);

      // Traverse the Node's parents and as we're doing so, get the FIds in
      // the order they appear.
      for (auto N = Node; N != nullptr; N = N->Parent)
        Record->Path.Append(N->FId);
      DCHECK(!Record->Path.empty());

      for (const auto C : Node->Callees)
        DFSStack.Append(C.NodePtr);
    }
  }
}

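// Writes out a single profile block into Buffer->Data: the BlockHeader first,
// then one entry per ProfileRecord consisting of the path's function IDs from
// leaf to root (4 bytes each), a 4-byte sentinel function ID of zero, the
// 8-byte call count, and the 8-byte cumulative local time. Buffer->Size is
// expected to have been computed to match this layout exactly (see the size
// accounting in serialize() below).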
static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
                             const ProfileRecordArray &ProfileRecords)
    XRAY_NEVER_INSTRUMENT {
  auto NextPtr = static_cast<uint8_t *>(
                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
                 sizeof(Header);
  for (const auto &Record : ProfileRecords) {
    // The list of function IDs follows:
    for (const auto FId : Record.Path)
      NextPtr =
          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
          sizeof(FId);

    // Add the sentinel here.
    constexpr int32_t SentinelFId = 0;
    NextPtr = static_cast<uint8_t *>(
                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
              sizeof(SentinelFId);

    // Add the node data here.
    NextPtr =
        static_cast<uint8_t *>(internal_memcpy(
            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
        sizeof(Record.Node->CallCount);
    NextPtr = static_cast<uint8_t *>(
                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
                                  sizeof(Record.Node->CumulativeLocalTime))) +
              sizeof(Record.Node->CumulativeLocalTime);
  }

  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
}

} // namespace

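// Rebuilds the global ProfileBuffers from the tries posted so far. For each
// thread's trie we flatten the nodes into ProfileRecords via a depth-first
// walk from every root, compute the exact block size from those records,
// allocate a buffer of that size, and serialize the records into it. The
// resulting per-thread blocks are later streamed out one at a time through
// nextBuffer().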
void serialize() XRAY_NEVER_INSTRUMENT {
  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
    return;

  SpinMutexLock Lock(&GlobalMutex);

  // Clear out the global ProfileBuffers, if it's not empty.
  for (auto &B : *ProfileBuffers)
    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
  ProfileBuffers->trim(ProfileBuffers->size());

  DCHECK_NE(TDArray, nullptr);
  if (TDArray->empty())
    return;

  // Then repopulate the global ProfileBuffers.
  u32 I = 0;
  auto MaxSize = profilingFlags()->global_allocator_max;
  auto ProfileArena = allocateBuffer(MaxSize);
  if (ProfileArena == nullptr)
    return;

  auto ProfileArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });

  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
  if (PathArena == nullptr)
    return;

  auto PathArenaCleanup = at_scope_exit(
      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });

  for (const auto &ThreadTrie : *TDArray) {
    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
    ProfileRecordAllocator PRAlloc(ProfileArena,
                                   profilingFlags()->global_allocator_max);
    ProfileRecord::PathAllocator PathAlloc(
        PathArena, profilingFlags()->global_allocator_max);
    ProfileRecordArray ProfileRecords(PRAlloc);

    // First, we want to compute the amount of space we're going to need. We'll
    // use a local allocator and an __xray::Array<...> to store the
    // intermediary data, then compute the size as we're going along. Then
    // we'll allocate the contiguous space to contain the thread buffer data.
    if (ThreadTrie.FCT.getRoots().empty())
      continue;

    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
    DCHECK(!ThreadTrie.FCT.getRoots().empty());
    DCHECK(!ProfileRecords.empty());

    // Go through each record, to compute the sizes.
    //
    // header size = block size (4 bytes)
    //   + block number (4 bytes)
    //   + thread id (8 bytes)
    // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
    //   + call count (8 bytes)
    //   + local time (8 bytes)
    u32 CumulativeSizes = 0;
    for (const auto &Record : ProfileRecords)
      CumulativeSizes += 20 + (4 * Record.Path.size());

    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
    auto B = ProfileBuffers->Append({});
    B->Size = sizeof(Header) + CumulativeSizes;
    B->Data = allocateBuffer(B->Size);
    DCHECK_NE(B->Data, nullptr);
    serializeRecords(B, Header, ProfileRecords);
  }
}

void reset() XRAY_NEVER_INSTRUMENT {
  atomic_store(&CollectorInitialized, 0, memory_order_release);
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers != nullptr) {
    // Clear out the profile buffers that have been serialized.
    for (auto &B : *ProfileBuffers)
      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
    ProfileBuffers->trim(ProfileBuffers->size());
    ProfileBuffers = nullptr;
  }

  if (TDArray != nullptr) {
    // Release the resources as required.
    for (auto &TD : *TDArray) {
      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
    }
    // We don't bother destroying the array here because we've already
    // potentially freed the backing store for the array. Instead we're going
    // to reset the pointer to nullptr, and re-use the storage later instead
    // (placement-new'ing into the storage as-is).
    TDArray = nullptr;
  }

  if (TDAllocator != nullptr) {
    TDAllocator->~Allocator();
    TDAllocator = nullptr;
  }

  if (Buffer.Data != nullptr) {
    BQ->releaseBuffer(Buffer);
  }

  if (BQ == nullptr) {
    bool Success = false;
    new (&BufferQueueStorage)
        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
    if (!Success)
      return;
    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
  } else {
    BQ->finalize();

    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
        BufferQueue::ErrorCode::Ok)
      return;
  }

  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
    return;

  new (&ProfileBufferArrayAllocatorStorage)
      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
      &ProfileBufferArrayAllocatorStorage);

  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
  ProfileBuffers =
      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);

  new (&ThreadDataAllocatorStorage)
      ThreadDataAllocator(Buffer.Data, Buffer.Size);
  TDAllocator =
      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);

  atomic_store(&CollectorInitialized, 1, memory_order_release);
}

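// Hands out the serialized data one buffer at a time: passing {nullptr, 0}
// yields the file header, passing the header back yields block 0, and passing
// block N yields block N + 1, until {nullptr, 0} marks the end. A consumer
// (for example, one registered through the __xray_log_process_buffers API)
// could conceptually drain the collector like this, where writeOut is a
// hypothetical sink:
//
//   XRayBuffer B = nextBuffer({nullptr, 0});
//   while (B.Data != nullptr) {
//     writeOut(B.Data, B.Size);
//     B = nextBuffer(B);
//   }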
XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
  SpinMutexLock Lock(&GlobalMutex);

  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
    return {nullptr, 0};

  static pthread_once_t Once = PTHREAD_ONCE_INIT;
  static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
      FileHeaderStorage;
  pthread_once(
      &Once, +[]() XRAY_NEVER_INSTRUMENT {
        new (&FileHeaderStorage) XRayProfilingFileHeader{};
      });

  if (UNLIKELY(B.Data == nullptr)) {
    // The first buffer should always contain the file header information.
    auto &FileHeader =
        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
    FileHeader.Timestamp = NanoTime();
    FileHeader.PID = internal_getpid();
    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
  }

  if (UNLIKELY(B.Data == &FileHeaderStorage))
    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};

  BlockHeader Header;
  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
  auto NextBlock = Header.BlockNum + 1;
  if (NextBlock < ProfileBuffers->size())
    return {(*ProfileBuffers)[NextBlock].Data,
            (*ProfileBuffers)[NextBlock].Size};
  return {nullptr, 0};
}

} // namespace profileCollectorService
} // namespace __xray