1 //===- GsymCreator.cpp ----------------------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 //===----------------------------------------------------------------------===// 7 8 #include "llvm/DebugInfo/GSYM/GsymCreator.h" 9 #include "llvm/DebugInfo/GSYM/FileWriter.h" 10 #include "llvm/DebugInfo/GSYM/Header.h" 11 #include "llvm/DebugInfo/GSYM/LineTable.h" 12 #include "llvm/MC/StringTableBuilder.h" 13 #include "llvm/Support/raw_ostream.h" 14 15 #include <algorithm> 16 #include <cassert> 17 #include <functional> 18 #include <vector> 19 20 using namespace llvm; 21 using namespace gsym; 22 23 GsymCreator::GsymCreator(bool Quiet) 24 : StrTab(StringTableBuilder::ELF), Quiet(Quiet) { 25 insertFile(StringRef()); 26 } 27 28 uint32_t GsymCreator::insertFile(StringRef Path, llvm::sys::path::Style Style) { 29 llvm::StringRef directory = llvm::sys::path::parent_path(Path, Style); 30 llvm::StringRef filename = llvm::sys::path::filename(Path, Style); 31 // We must insert the strings first, then call the FileEntry constructor. 32 // If we inline the insertString() function call into the constructor, the 33 // call order is undefined due to parameter lists not having any ordering 34 // requirements. 35 const uint32_t Dir = insertString(directory); 36 const uint32_t Base = insertString(filename); 37 return insertFileEntry(FileEntry(Dir, Base)); 38 } 39 40 uint32_t GsymCreator::insertFileEntry(FileEntry FE) { 41 std::lock_guard<std::mutex> Guard(Mutex); 42 const auto NextIndex = Files.size(); 43 // Find FE in hash map and insert if not present. 44 auto R = FileEntryToIndex.insert(std::make_pair(FE, NextIndex)); 45 if (R.second) 46 Files.emplace_back(FE); 47 return R.first->second; 48 } 49 50 uint32_t GsymCreator::copyFile(const GsymCreator &SrcGC, uint32_t FileIdx) { 51 // File index zero is reserved for a FileEntry with no directory and no 52 // filename. Any other file and we need to copy the strings for the directory 53 // and filename. 54 if (FileIdx == 0) 55 return 0; 56 const FileEntry SrcFE = SrcGC.Files[FileIdx]; 57 // Copy the strings for the file and then add the newly converted file entry. 58 uint32_t Dir = StrTab.add(SrcGC.StringOffsetMap.find(SrcFE.Dir)->second); 59 uint32_t Base = StrTab.add(SrcGC.StringOffsetMap.find(SrcFE.Base)->second); 60 FileEntry DstFE(Dir, Base); 61 return insertFileEntry(DstFE); 62 } 63 64 65 llvm::Error GsymCreator::save(StringRef Path, 66 llvm::support::endianness ByteOrder, 67 std::optional<uint64_t> SegmentSize) const { 68 if (SegmentSize) 69 return saveSegments(Path, ByteOrder, *SegmentSize); 70 std::error_code EC; 71 raw_fd_ostream OutStrm(Path, EC); 72 if (EC) 73 return llvm::errorCodeToError(EC); 74 FileWriter O(OutStrm, ByteOrder); 75 return encode(O); 76 } 77 78 llvm::Error GsymCreator::encode(FileWriter &O) const { 79 std::lock_guard<std::mutex> Guard(Mutex); 80 if (Funcs.empty()) 81 return createStringError(std::errc::invalid_argument, 82 "no functions to encode"); 83 if (!Finalized) 84 return createStringError(std::errc::invalid_argument, 85 "GsymCreator wasn't finalized prior to encoding"); 86 87 if (Funcs.size() > UINT32_MAX) 88 return createStringError(std::errc::invalid_argument, 89 "too many FunctionInfos"); 90 91 std::optional<uint64_t> BaseAddress = getBaseAddress(); 92 // Base address should be valid if we have any functions. 93 if (!BaseAddress) 94 return createStringError(std::errc::invalid_argument, 95 "invalid base address"); 96 Header Hdr; 97 Hdr.Magic = GSYM_MAGIC; 98 Hdr.Version = GSYM_VERSION; 99 Hdr.AddrOffSize = getAddressOffsetSize(); 100 Hdr.UUIDSize = static_cast<uint8_t>(UUID.size()); 101 Hdr.BaseAddress = *BaseAddress; 102 Hdr.NumAddresses = static_cast<uint32_t>(Funcs.size()); 103 Hdr.StrtabOffset = 0; // We will fix this up later. 104 Hdr.StrtabSize = 0; // We will fix this up later. 105 memset(Hdr.UUID, 0, sizeof(Hdr.UUID)); 106 if (UUID.size() > sizeof(Hdr.UUID)) 107 return createStringError(std::errc::invalid_argument, 108 "invalid UUID size %u", (uint32_t)UUID.size()); 109 // Copy the UUID value if we have one. 110 if (UUID.size() > 0) 111 memcpy(Hdr.UUID, UUID.data(), UUID.size()); 112 // Write out the header. 113 llvm::Error Err = Hdr.encode(O); 114 if (Err) 115 return Err; 116 117 const uint64_t MaxAddressOffset = getMaxAddressOffset(); 118 // Write out the address offsets. 119 O.alignTo(Hdr.AddrOffSize); 120 for (const auto &FuncInfo : Funcs) { 121 uint64_t AddrOffset = FuncInfo.startAddress() - Hdr.BaseAddress; 122 // Make sure we calculated the address offsets byte size correctly by 123 // verifying the current address offset is within ranges. We have seen bugs 124 // introduced when the code changes that can cause problems here so it is 125 // good to catch this during testing. 126 assert(AddrOffset <= MaxAddressOffset); 127 (void)MaxAddressOffset; 128 switch (Hdr.AddrOffSize) { 129 case 1: 130 O.writeU8(static_cast<uint8_t>(AddrOffset)); 131 break; 132 case 2: 133 O.writeU16(static_cast<uint16_t>(AddrOffset)); 134 break; 135 case 4: 136 O.writeU32(static_cast<uint32_t>(AddrOffset)); 137 break; 138 case 8: 139 O.writeU64(AddrOffset); 140 break; 141 } 142 } 143 144 // Write out all zeros for the AddrInfoOffsets. 145 O.alignTo(4); 146 const off_t AddrInfoOffsetsOffset = O.tell(); 147 for (size_t i = 0, n = Funcs.size(); i < n; ++i) 148 O.writeU32(0); 149 150 // Write out the file table 151 O.alignTo(4); 152 assert(!Files.empty()); 153 assert(Files[0].Dir == 0); 154 assert(Files[0].Base == 0); 155 size_t NumFiles = Files.size(); 156 if (NumFiles > UINT32_MAX) 157 return createStringError(std::errc::invalid_argument, "too many files"); 158 O.writeU32(static_cast<uint32_t>(NumFiles)); 159 for (auto File : Files) { 160 O.writeU32(File.Dir); 161 O.writeU32(File.Base); 162 } 163 164 // Write out the string table. 165 const off_t StrtabOffset = O.tell(); 166 StrTab.write(O.get_stream()); 167 const off_t StrtabSize = O.tell() - StrtabOffset; 168 std::vector<uint32_t> AddrInfoOffsets; 169 170 // Write out the address infos for each function info. 171 for (const auto &FuncInfo : Funcs) { 172 if (Expected<uint64_t> OffsetOrErr = FuncInfo.encode(O)) 173 AddrInfoOffsets.push_back(OffsetOrErr.get()); 174 else 175 return OffsetOrErr.takeError(); 176 } 177 // Fixup the string table offset and size in the header 178 O.fixup32((uint32_t)StrtabOffset, offsetof(Header, StrtabOffset)); 179 O.fixup32((uint32_t)StrtabSize, offsetof(Header, StrtabSize)); 180 181 // Fixup all address info offsets 182 uint64_t Offset = 0; 183 for (auto AddrInfoOffset : AddrInfoOffsets) { 184 O.fixup32(AddrInfoOffset, AddrInfoOffsetsOffset + Offset); 185 Offset += 4; 186 } 187 return ErrorSuccess(); 188 } 189 190 // Similar to std::remove_if, but the predicate is binary and it is passed both 191 // the previous and the current element. 192 template <class ForwardIt, class BinaryPredicate> 193 static ForwardIt removeIfBinary(ForwardIt FirstIt, ForwardIt LastIt, 194 BinaryPredicate Pred) { 195 if (FirstIt != LastIt) { 196 auto PrevIt = FirstIt++; 197 FirstIt = std::find_if(FirstIt, LastIt, [&](const auto &Curr) { 198 return Pred(*PrevIt++, Curr); 199 }); 200 if (FirstIt != LastIt) 201 for (ForwardIt CurrIt = FirstIt; ++CurrIt != LastIt;) 202 if (!Pred(*PrevIt, *CurrIt)) { 203 PrevIt = FirstIt; 204 *FirstIt++ = std::move(*CurrIt); 205 } 206 } 207 return FirstIt; 208 } 209 210 llvm::Error GsymCreator::finalize(llvm::raw_ostream &OS) { 211 std::lock_guard<std::mutex> Guard(Mutex); 212 if (Finalized) 213 return createStringError(std::errc::invalid_argument, "already finalized"); 214 Finalized = true; 215 216 // Sort function infos so we can emit sorted functions. 217 llvm::sort(Funcs); 218 219 // Don't let the string table indexes change by finalizing in order. 220 StrTab.finalizeInOrder(); 221 222 // Remove duplicates function infos that have both entries from debug info 223 // (DWARF or Breakpad) and entries from the SymbolTable. 224 // 225 // Also handle overlapping function. Usually there shouldn't be any, but they 226 // can and do happen in some rare cases. 227 // 228 // (a) (b) (c) 229 // ^ ^ ^ ^ 230 // |X |Y |X ^ |X 231 // | | | |Y | ^ 232 // | | | v v |Y 233 // v v v v 234 // 235 // In (a) and (b), Y is ignored and X will be reported for the full range. 236 // In (c), both functions will be included in the result and lookups for an 237 // address in the intersection will return Y because of binary search. 238 // 239 // Note that in case of (b), we cannot include Y in the result because then 240 // we wouldn't find any function for range (end of Y, end of X) 241 // with binary search 242 auto NumBefore = Funcs.size(); 243 Funcs.erase( 244 removeIfBinary(Funcs.begin(), Funcs.end(), 245 [&](const auto &Prev, const auto &Curr) { 246 // Empty ranges won't intersect, but we still need to 247 // catch the case where we have multiple symbols at the 248 // same address and coalesce them. 249 const bool ranges_equal = Prev.Range == Curr.Range; 250 if (ranges_equal || Prev.Range.intersects(Curr.Range)) { 251 // Overlapping ranges or empty identical ranges. 252 if (ranges_equal) { 253 // Same address range. Check if one is from debug 254 // info and the other is from a symbol table. If 255 // so, then keep the one with debug info. Our 256 // sorting guarantees that entries with matching 257 // address ranges that have debug info are last in 258 // the sort. 259 if (Prev == Curr) { 260 // FunctionInfo entries match exactly (range, 261 // lines, inlines) 262 263 // We used to output a warning here, but this was 264 // so frequent on some binaries, in particular 265 // when those were built with GCC, that it slowed 266 // down processing extremely. 267 return true; 268 } else { 269 if (!Prev.hasRichInfo() && Curr.hasRichInfo()) { 270 // Same address range, one with no debug info 271 // (symbol) and the next with debug info. Keep 272 // the latter. 273 return true; 274 } else { 275 if (!Quiet) { 276 OS << "warning: same address range contains " 277 "different debug " 278 << "info. Removing:\n" 279 << Prev << "\nIn favor of this one:\n" 280 << Curr << "\n"; 281 } 282 return true; 283 } 284 } 285 } else { 286 if (!Quiet) { // print warnings about overlaps 287 OS << "warning: function ranges overlap:\n" 288 << Prev << "\n" 289 << Curr << "\n"; 290 } 291 } 292 } else if (Prev.Range.size() == 0 && 293 Curr.Range.contains(Prev.Range.start())) { 294 if (!Quiet) { 295 OS << "warning: removing symbol:\n" 296 << Prev << "\nKeeping:\n" 297 << Curr << "\n"; 298 } 299 return true; 300 } 301 302 return false; 303 }), 304 Funcs.end()); 305 306 // If our last function info entry doesn't have a size and if we have valid 307 // text ranges, we should set the size of the last entry since any search for 308 // a high address might match our last entry. By fixing up this size, we can 309 // help ensure we don't cause lookups to always return the last symbol that 310 // has no size when doing lookups. 311 if (!Funcs.empty() && Funcs.back().Range.size() == 0 && ValidTextRanges) { 312 if (auto Range = 313 ValidTextRanges->getRangeThatContains(Funcs.back().Range.start())) { 314 Funcs.back().Range = {Funcs.back().Range.start(), Range->end()}; 315 } 316 } 317 OS << "Pruned " << NumBefore - Funcs.size() << " functions, ended with " 318 << Funcs.size() << " total\n"; 319 return Error::success(); 320 } 321 322 uint32_t GsymCreator::copyString(const GsymCreator &SrcGC, uint32_t StrOff) { 323 // String offset at zero is always the empty string, no copying needed. 324 if (StrOff == 0) 325 return 0; 326 return StrTab.add(SrcGC.StringOffsetMap.find(StrOff)->second); 327 } 328 329 uint32_t GsymCreator::insertString(StringRef S, bool Copy) { 330 if (S.empty()) 331 return 0; 332 333 // The hash can be calculated outside the lock. 334 CachedHashStringRef CHStr(S); 335 std::lock_guard<std::mutex> Guard(Mutex); 336 if (Copy) { 337 // We need to provide backing storage for the string if requested 338 // since StringTableBuilder stores references to strings. Any string 339 // that comes from a section in an object file doesn't need to be 340 // copied, but any string created by code will need to be copied. 341 // This allows GsymCreator to be really fast when parsing DWARF and 342 // other object files as most strings don't need to be copied. 343 if (!StrTab.contains(CHStr)) 344 CHStr = CachedHashStringRef{StringStorage.insert(S).first->getKey(), 345 CHStr.hash()}; 346 } 347 const uint32_t StrOff = StrTab.add(CHStr); 348 // Save a mapping of string offsets to the cached string reference in case 349 // we need to segment the GSYM file and copy string from one string table to 350 // another. 351 if (StringOffsetMap.count(StrOff) == 0) 352 StringOffsetMap.insert(std::make_pair(StrOff, CHStr)); 353 return StrOff; 354 } 355 356 void GsymCreator::addFunctionInfo(FunctionInfo &&FI) { 357 std::lock_guard<std::mutex> Guard(Mutex); 358 Ranges.insert(FI.Range); 359 Funcs.emplace_back(std::move(FI)); 360 } 361 362 void GsymCreator::forEachFunctionInfo( 363 std::function<bool(FunctionInfo &)> const &Callback) { 364 std::lock_guard<std::mutex> Guard(Mutex); 365 for (auto &FI : Funcs) { 366 if (!Callback(FI)) 367 break; 368 } 369 } 370 371 void GsymCreator::forEachFunctionInfo( 372 std::function<bool(const FunctionInfo &)> const &Callback) const { 373 std::lock_guard<std::mutex> Guard(Mutex); 374 for (const auto &FI : Funcs) { 375 if (!Callback(FI)) 376 break; 377 } 378 } 379 380 size_t GsymCreator::getNumFunctionInfos() const { 381 std::lock_guard<std::mutex> Guard(Mutex); 382 return Funcs.size(); 383 } 384 385 bool GsymCreator::IsValidTextAddress(uint64_t Addr) const { 386 if (ValidTextRanges) 387 return ValidTextRanges->contains(Addr); 388 return true; // No valid text ranges has been set, so accept all ranges. 389 } 390 391 bool GsymCreator::hasFunctionInfoForAddress(uint64_t Addr) const { 392 std::lock_guard<std::mutex> Guard(Mutex); 393 return Ranges.contains(Addr); 394 } 395 396 std::optional<uint64_t> GsymCreator::getFirstFunctionAddress() const { 397 if (Finalized && !Funcs.empty()) 398 return std::optional<uint64_t>(Funcs.front().startAddress()); 399 // This code gets used by the segmentation of GSYM files to help determine the 400 // size of the GSYM header while continually adding new FunctionInfo objects 401 // to this object, so we haven't finalized this object yet. 402 if (Ranges.empty()) 403 return std::nullopt; 404 return std::optional<uint64_t>(Ranges.begin()->start()); 405 } 406 407 std::optional<uint64_t> GsymCreator::getLastFunctionAddress() const { 408 if (Finalized && !Funcs.empty()) 409 return std::optional<uint64_t>(Funcs.back().startAddress()); 410 // This code gets used by the segmentation of GSYM files to help determine the 411 // size of the GSYM header while continually adding new FunctionInfo objects 412 // to this object, so we haven't finalized this object yet. 413 if (Ranges.empty()) 414 return std::nullopt; 415 return std::optional<uint64_t>((Ranges.end() - 1)->end()); 416 } 417 418 std::optional<uint64_t> GsymCreator::getBaseAddress() const { 419 if (BaseAddress) 420 return BaseAddress; 421 return getFirstFunctionAddress(); 422 } 423 424 uint64_t GsymCreator::getMaxAddressOffset() const { 425 switch (getAddressOffsetSize()) { 426 case 1: return UINT8_MAX; 427 case 2: return UINT16_MAX; 428 case 4: return UINT32_MAX; 429 case 8: return UINT64_MAX; 430 } 431 llvm_unreachable("invalid address offset"); 432 } 433 434 uint8_t GsymCreator::getAddressOffsetSize() const { 435 const std::optional<uint64_t> BaseAddress = getBaseAddress(); 436 const std::optional<uint64_t> LastFuncAddr = getLastFunctionAddress(); 437 if (BaseAddress && LastFuncAddr) { 438 const uint64_t AddrDelta = *LastFuncAddr - *BaseAddress; 439 if (AddrDelta <= UINT8_MAX) 440 return 1; 441 else if (AddrDelta <= UINT16_MAX) 442 return 2; 443 else if (AddrDelta <= UINT32_MAX) 444 return 4; 445 return 8; 446 } 447 return 1; 448 } 449 450 uint64_t GsymCreator::calculateHeaderAndTableSize() const { 451 uint64_t Size = sizeof(Header); 452 const size_t NumFuncs = Funcs.size(); 453 // Add size of address offset table 454 Size += NumFuncs * getAddressOffsetSize(); 455 // Add size of address info offsets which are 32 bit integers in version 1. 456 Size += NumFuncs * sizeof(uint32_t); 457 // Add file table size 458 Size += Files.size() * sizeof(FileEntry); 459 // Add string table size 460 Size += StrTab.getSize(); 461 462 return Size; 463 } 464 465 // This function takes a InlineInfo class that was copy constructed from an 466 // InlineInfo from the \a SrcGC and updates all members that point to strings 467 // and files to point to strings and files from this GsymCreator. 468 void GsymCreator::fixupInlineInfo(const GsymCreator &SrcGC, InlineInfo &II) { 469 II.Name = copyString(SrcGC, II.Name); 470 II.CallFile = copyFile(SrcGC, II.CallFile); 471 for (auto &ChildII: II.Children) 472 fixupInlineInfo(SrcGC, ChildII); 473 } 474 475 uint64_t GsymCreator::copyFunctionInfo(const GsymCreator &SrcGC, size_t FuncIdx) { 476 // To copy a function info we need to copy any files and strings over into 477 // this GsymCreator and then copy the function info and update the string 478 // table offsets to match the new offsets. 479 const FunctionInfo &SrcFI = SrcGC.Funcs[FuncIdx]; 480 Ranges.insert(SrcFI.Range); 481 482 FunctionInfo DstFI; 483 DstFI.Range = SrcFI.Range; 484 DstFI.Name = copyString(SrcGC, SrcFI.Name); 485 // Copy the line table if there is one. 486 if (SrcFI.OptLineTable) { 487 // Copy the entire line table. 488 DstFI.OptLineTable = LineTable(SrcFI.OptLineTable.value()); 489 // Fixup all LineEntry::File entries which are indexes in the the file table 490 // from SrcGC and must be converted to file indexes from this GsymCreator. 491 LineTable &DstLT = DstFI.OptLineTable.value(); 492 const size_t NumLines = DstLT.size(); 493 for (size_t I=0; I<NumLines; ++I) { 494 LineEntry &LE = DstLT.get(I); 495 LE.File = copyFile(SrcGC, LE.File); 496 } 497 } 498 // Copy the inline information if needed. 499 if (SrcFI.Inline) { 500 // Make a copy of the source inline information. 501 DstFI.Inline = SrcFI.Inline.value(); 502 // Fixup all strings and files in the copied inline information. 503 fixupInlineInfo(SrcGC, *DstFI.Inline); 504 } 505 std::lock_guard<std::mutex> Guard(Mutex); 506 Funcs.push_back(DstFI); 507 return Funcs.back().cacheEncoding(); 508 } 509 510 llvm::Error GsymCreator::saveSegments(StringRef Path, 511 llvm::support::endianness ByteOrder, 512 uint64_t SegmentSize) const { 513 if (SegmentSize == 0) 514 return createStringError(std::errc::invalid_argument, 515 "invalid segment size zero"); 516 517 size_t FuncIdx = 0; 518 const size_t NumFuncs = Funcs.size(); 519 while (FuncIdx < NumFuncs) { 520 llvm::Expected<std::unique_ptr<GsymCreator>> ExpectedGC = 521 createSegment(SegmentSize, FuncIdx); 522 if (ExpectedGC) { 523 GsymCreator *GC = ExpectedGC->get(); 524 if (GC == NULL) 525 break; // We had not more functions to encode. 526 raw_null_ostream ErrorStrm; 527 llvm::Error Err = GC->finalize(ErrorStrm); 528 if (Err) 529 return Err; 530 std::string SegmentedGsymPath; 531 raw_string_ostream SGP(SegmentedGsymPath); 532 std::optional<uint64_t> FirstFuncAddr = GC->getFirstFunctionAddress(); 533 if (FirstFuncAddr) { 534 SGP << Path << "-" << llvm::format_hex(*FirstFuncAddr, 1); 535 SGP.flush(); 536 Err = GC->save(SegmentedGsymPath, ByteOrder, std::nullopt); 537 if (Err) 538 return Err; 539 } 540 } else { 541 return ExpectedGC.takeError(); 542 } 543 } 544 return Error::success(); 545 } 546 547 llvm::Expected<std::unique_ptr<GsymCreator>> 548 GsymCreator::createSegment(uint64_t SegmentSize, size_t &FuncIdx) const { 549 // No function entries, return empty unique pointer 550 if (FuncIdx >= Funcs.size()) 551 return std::unique_ptr<GsymCreator>(); 552 553 std::unique_ptr<GsymCreator> GC(new GsymCreator(/*Quiet=*/true)); 554 // Set the base address if there is one. 555 if (BaseAddress) 556 GC->setBaseAddress(*BaseAddress); 557 // Copy the UUID value from this object into the new creator. 558 GC->setUUID(UUID); 559 const size_t NumFuncs = Funcs.size(); 560 // Track how big the function infos are for the current segment so we can 561 // emit segments that are close to the requested size. It is quick math to 562 // determine the current header and tables sizes, so we can do that each loop. 563 uint64_t SegmentFuncInfosSize = 0; 564 for (; FuncIdx < NumFuncs; ++FuncIdx) { 565 const uint64_t HeaderAndTableSize = GC->calculateHeaderAndTableSize(); 566 if (HeaderAndTableSize + SegmentFuncInfosSize >= SegmentSize) { 567 if (SegmentFuncInfosSize == 0) 568 return createStringError(std::errc::invalid_argument, 569 "a segment size of %" PRIu64 " is to small to " 570 "fit any function infos, specify a larger value", 571 SegmentSize); 572 573 break; 574 } 575 SegmentFuncInfosSize += alignTo(GC->copyFunctionInfo(*this, FuncIdx), 4); 576 } 577 return std::move(GC); 578 } 579