xref: /freebsd/contrib/llvm-project/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp (revision 0e8011faf58b743cc652e3b2ad0f7671227610df)
1 //===- GsymCreator.cpp ----------------------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //===----------------------------------------------------------------------===//
7 
8 #include "llvm/DebugInfo/GSYM/GsymCreator.h"
9 #include "llvm/DebugInfo/GSYM/FileWriter.h"
10 #include "llvm/DebugInfo/GSYM/Header.h"
11 #include "llvm/DebugInfo/GSYM/LineTable.h"
12 #include "llvm/DebugInfo/GSYM/OutputAggregator.h"
13 #include "llvm/MC/StringTableBuilder.h"
14 #include "llvm/Support/raw_ostream.h"
15 
16 #include <algorithm>
17 #include <cassert>
18 #include <functional>
19 #include <vector>
20 
21 using namespace llvm;
22 using namespace gsym;
23 
24 GsymCreator::GsymCreator(bool Quiet)
25     : StrTab(StringTableBuilder::ELF), Quiet(Quiet) {
26   insertFile(StringRef());
27 }
28 
29 uint32_t GsymCreator::insertFile(StringRef Path, llvm::sys::path::Style Style) {
30   llvm::StringRef directory = llvm::sys::path::parent_path(Path, Style);
31   llvm::StringRef filename = llvm::sys::path::filename(Path, Style);
32   // We must insert the strings first, then call the FileEntry constructor.
33   // If we inline the insertString() function call into the constructor, the
34   // call order is undefined due to parameter lists not having any ordering
35   // requirements.
36   const uint32_t Dir = insertString(directory);
37   const uint32_t Base = insertString(filename);
38   return insertFileEntry(FileEntry(Dir, Base));
39 }
40 
41 uint32_t GsymCreator::insertFileEntry(FileEntry FE) {
42   std::lock_guard<std::mutex> Guard(Mutex);
43   const auto NextIndex = Files.size();
44   // Find FE in hash map and insert if not present.
45   auto R = FileEntryToIndex.insert(std::make_pair(FE, NextIndex));
46   if (R.second)
47     Files.emplace_back(FE);
48   return R.first->second;
49 }
50 
51 uint32_t GsymCreator::copyFile(const GsymCreator &SrcGC, uint32_t FileIdx) {
52   // File index zero is reserved for a FileEntry with no directory and no
53   // filename. Any other file and we need to copy the strings for the directory
54   // and filename.
55   if (FileIdx == 0)
56     return 0;
57   const FileEntry SrcFE = SrcGC.Files[FileIdx];
58   // Copy the strings for the file and then add the newly converted file entry.
59   uint32_t Dir =
60       SrcFE.Dir == 0
61           ? 0
62           : StrTab.add(SrcGC.StringOffsetMap.find(SrcFE.Dir)->second);
63   uint32_t Base = StrTab.add(SrcGC.StringOffsetMap.find(SrcFE.Base)->second);
64   FileEntry DstFE(Dir, Base);
65   return insertFileEntry(DstFE);
66 }
67 
68 llvm::Error GsymCreator::save(StringRef Path, llvm::endianness ByteOrder,
69                               std::optional<uint64_t> SegmentSize) const {
70   if (SegmentSize)
71     return saveSegments(Path, ByteOrder, *SegmentSize);
72   std::error_code EC;
73   raw_fd_ostream OutStrm(Path, EC);
74   if (EC)
75     return llvm::errorCodeToError(EC);
76   FileWriter O(OutStrm, ByteOrder);
77   return encode(O);
78 }
79 
80 llvm::Error GsymCreator::encode(FileWriter &O) const {
81   std::lock_guard<std::mutex> Guard(Mutex);
82   if (Funcs.empty())
83     return createStringError(std::errc::invalid_argument,
84                              "no functions to encode");
85   if (!Finalized)
86     return createStringError(std::errc::invalid_argument,
87                              "GsymCreator wasn't finalized prior to encoding");
88 
89   if (Funcs.size() > UINT32_MAX)
90     return createStringError(std::errc::invalid_argument,
91                              "too many FunctionInfos");
92 
93   std::optional<uint64_t> BaseAddress = getBaseAddress();
94   // Base address should be valid if we have any functions.
95   if (!BaseAddress)
96     return createStringError(std::errc::invalid_argument,
97                              "invalid base address");
98   Header Hdr;
99   Hdr.Magic = GSYM_MAGIC;
100   Hdr.Version = GSYM_VERSION;
101   Hdr.AddrOffSize = getAddressOffsetSize();
102   Hdr.UUIDSize = static_cast<uint8_t>(UUID.size());
103   Hdr.BaseAddress = *BaseAddress;
104   Hdr.NumAddresses = static_cast<uint32_t>(Funcs.size());
105   Hdr.StrtabOffset = 0; // We will fix this up later.
106   Hdr.StrtabSize = 0;   // We will fix this up later.
107   memset(Hdr.UUID, 0, sizeof(Hdr.UUID));
108   if (UUID.size() > sizeof(Hdr.UUID))
109     return createStringError(std::errc::invalid_argument,
110                              "invalid UUID size %u", (uint32_t)UUID.size());
111   // Copy the UUID value if we have one.
112   if (UUID.size() > 0)
113     memcpy(Hdr.UUID, UUID.data(), UUID.size());
114   // Write out the header.
115   llvm::Error Err = Hdr.encode(O);
116   if (Err)
117     return Err;
118 
119   const uint64_t MaxAddressOffset = getMaxAddressOffset();
120   // Write out the address offsets.
121   O.alignTo(Hdr.AddrOffSize);
122   for (const auto &FuncInfo : Funcs) {
123     uint64_t AddrOffset = FuncInfo.startAddress() - Hdr.BaseAddress;
124     // Make sure we calculated the address offsets byte size correctly by
125     // verifying the current address offset is within ranges. We have seen bugs
126     // introduced when the code changes that can cause problems here so it is
127     // good to catch this during testing.
128     assert(AddrOffset <= MaxAddressOffset);
129     (void)MaxAddressOffset;
130     switch (Hdr.AddrOffSize) {
131     case 1:
132       O.writeU8(static_cast<uint8_t>(AddrOffset));
133       break;
134     case 2:
135       O.writeU16(static_cast<uint16_t>(AddrOffset));
136       break;
137     case 4:
138       O.writeU32(static_cast<uint32_t>(AddrOffset));
139       break;
140     case 8:
141       O.writeU64(AddrOffset);
142       break;
143     }
144   }
145 
146   // Write out all zeros for the AddrInfoOffsets.
147   O.alignTo(4);
148   const off_t AddrInfoOffsetsOffset = O.tell();
149   for (size_t i = 0, n = Funcs.size(); i < n; ++i)
150     O.writeU32(0);
151 
152   // Write out the file table
153   O.alignTo(4);
154   assert(!Files.empty());
155   assert(Files[0].Dir == 0);
156   assert(Files[0].Base == 0);
157   size_t NumFiles = Files.size();
158   if (NumFiles > UINT32_MAX)
159     return createStringError(std::errc::invalid_argument, "too many files");
160   O.writeU32(static_cast<uint32_t>(NumFiles));
161   for (auto File : Files) {
162     O.writeU32(File.Dir);
163     O.writeU32(File.Base);
164   }
165 
166   // Write out the string table.
167   const off_t StrtabOffset = O.tell();
168   StrTab.write(O.get_stream());
169   const off_t StrtabSize = O.tell() - StrtabOffset;
170   std::vector<uint32_t> AddrInfoOffsets;
171 
172   // Write out the address infos for each function info.
173   for (const auto &FuncInfo : Funcs) {
174     if (Expected<uint64_t> OffsetOrErr = FuncInfo.encode(O))
175       AddrInfoOffsets.push_back(OffsetOrErr.get());
176     else
177       return OffsetOrErr.takeError();
178   }
179   // Fixup the string table offset and size in the header
180   O.fixup32((uint32_t)StrtabOffset, offsetof(Header, StrtabOffset));
181   O.fixup32((uint32_t)StrtabSize, offsetof(Header, StrtabSize));
182 
183   // Fixup all address info offsets
184   uint64_t Offset = 0;
185   for (auto AddrInfoOffset : AddrInfoOffsets) {
186     O.fixup32(AddrInfoOffset, AddrInfoOffsetsOffset + Offset);
187     Offset += 4;
188   }
189   return ErrorSuccess();
190 }
191 
192 llvm::Error GsymCreator::finalize(OutputAggregator &Out) {
193   std::lock_guard<std::mutex> Guard(Mutex);
194   if (Finalized)
195     return createStringError(std::errc::invalid_argument, "already finalized");
196   Finalized = true;
197 
198   // Don't let the string table indexes change by finalizing in order.
199   StrTab.finalizeInOrder();
200 
201   // Remove duplicates function infos that have both entries from debug info
202   // (DWARF or Breakpad) and entries from the SymbolTable.
203   //
204   // Also handle overlapping function. Usually there shouldn't be any, but they
205   // can and do happen in some rare cases.
206   //
207   // (a)          (b)         (c)
208   //     ^  ^       ^            ^
209   //     |X |Y      |X ^         |X
210   //     |  |       |  |Y        |  ^
211   //     |  |       |  v         v  |Y
212   //     v  v       v               v
213   //
214   // In (a) and (b), Y is ignored and X will be reported for the full range.
215   // In (c), both functions will be included in the result and lookups for an
216   // address in the intersection will return Y because of binary search.
217   //
218   // Note that in case of (b), we cannot include Y in the result because then
219   // we wouldn't find any function for range (end of Y, end of X)
220   // with binary search
221 
222   const auto NumBefore = Funcs.size();
223   // Only sort and unique if this isn't a segment. If this is a segment we
224   // already finalized the main GsymCreator with all of the function infos
225   // and then the already sorted and uniqued function infos were added to this
226   // object.
227   if (!IsSegment) {
228     if (NumBefore > 1) {
229       // Sort function infos so we can emit sorted functions.
230       llvm::sort(Funcs);
231       std::vector<FunctionInfo> FinalizedFuncs;
232       FinalizedFuncs.reserve(Funcs.size());
233       FinalizedFuncs.emplace_back(std::move(Funcs.front()));
234       for (size_t Idx=1; Idx < NumBefore; ++Idx) {
235         FunctionInfo &Prev = FinalizedFuncs.back();
236         FunctionInfo &Curr = Funcs[Idx];
237         // Empty ranges won't intersect, but we still need to
238         // catch the case where we have multiple symbols at the
239         // same address and coalesce them.
240         const bool ranges_equal = Prev.Range == Curr.Range;
241         if (ranges_equal || Prev.Range.intersects(Curr.Range)) {
242           // Overlapping ranges or empty identical ranges.
243           if (ranges_equal) {
244             // Same address range. Check if one is from debug
245             // info and the other is from a symbol table. If
246             // so, then keep the one with debug info. Our
247             // sorting guarantees that entries with matching
248             // address ranges that have debug info are last in
249             // the sort.
250             if (!(Prev == Curr)) {
251               if (Prev.hasRichInfo() && Curr.hasRichInfo())
252                 Out.Report(
253                     "Duplicate address ranges with different debug info.",
254                     [&](raw_ostream &OS) {
255                       OS << "warning: same address range contains "
256                             "different debug "
257                          << "info. Removing:\n"
258                          << Prev << "\nIn favor of this one:\n"
259                          << Curr << "\n";
260                     });
261 
262               // We want to swap the current entry with the previous since
263               // later entries with the same range always have more debug info
264               // or different debug info.
265               std::swap(Prev, Curr);
266             }
267           } else {
268             Out.Report("Overlapping function ranges", [&](raw_ostream &OS) {
269               // print warnings about overlaps
270               OS << "warning: function ranges overlap:\n"
271                 << Prev << "\n"
272                 << Curr << "\n";
273             });
274             FinalizedFuncs.emplace_back(std::move(Curr));
275           }
276         } else {
277           if (Prev.Range.size() == 0 && Curr.Range.contains(Prev.Range.start())) {
278             // Symbols on macOS don't have address ranges, so if the range
279             // doesn't match and the size is zero, then we replace the empty
280             // symbol function info with the current one.
281             std::swap(Prev, Curr);
282           } else {
283             FinalizedFuncs.emplace_back(std::move(Curr));
284           }
285         }
286       }
287       std::swap(Funcs, FinalizedFuncs);
288     }
289     // If our last function info entry doesn't have a size and if we have valid
290     // text ranges, we should set the size of the last entry since any search for
291     // a high address might match our last entry. By fixing up this size, we can
292     // help ensure we don't cause lookups to always return the last symbol that
293     // has no size when doing lookups.
294     if (!Funcs.empty() && Funcs.back().Range.size() == 0 && ValidTextRanges) {
295       if (auto Range =
296               ValidTextRanges->getRangeThatContains(Funcs.back().Range.start())) {
297         Funcs.back().Range = {Funcs.back().Range.start(), Range->end()};
298       }
299     }
300     Out << "Pruned " << NumBefore - Funcs.size() << " functions, ended with "
301         << Funcs.size() << " total\n";
302   }
303   return Error::success();
304 }
305 
306 uint32_t GsymCreator::copyString(const GsymCreator &SrcGC, uint32_t StrOff) {
307   // String offset at zero is always the empty string, no copying needed.
308   if (StrOff == 0)
309     return 0;
310   return StrTab.add(SrcGC.StringOffsetMap.find(StrOff)->second);
311 }
312 
313 uint32_t GsymCreator::insertString(StringRef S, bool Copy) {
314   if (S.empty())
315     return 0;
316 
317   // The hash can be calculated outside the lock.
318   CachedHashStringRef CHStr(S);
319   std::lock_guard<std::mutex> Guard(Mutex);
320   if (Copy) {
321     // We need to provide backing storage for the string if requested
322     // since StringTableBuilder stores references to strings. Any string
323     // that comes from a section in an object file doesn't need to be
324     // copied, but any string created by code will need to be copied.
325     // This allows GsymCreator to be really fast when parsing DWARF and
326     // other object files as most strings don't need to be copied.
327     if (!StrTab.contains(CHStr))
328       CHStr = CachedHashStringRef{StringStorage.insert(S).first->getKey(),
329                                   CHStr.hash()};
330   }
331   const uint32_t StrOff = StrTab.add(CHStr);
332   // Save a mapping of string offsets to the cached string reference in case
333   // we need to segment the GSYM file and copy string from one string table to
334   // another.
335   if (StringOffsetMap.count(StrOff) == 0)
336     StringOffsetMap.insert(std::make_pair(StrOff, CHStr));
337   return StrOff;
338 }
339 
340 void GsymCreator::addFunctionInfo(FunctionInfo &&FI) {
341   std::lock_guard<std::mutex> Guard(Mutex);
342   Funcs.emplace_back(std::move(FI));
343 }
344 
345 void GsymCreator::forEachFunctionInfo(
346     std::function<bool(FunctionInfo &)> const &Callback) {
347   std::lock_guard<std::mutex> Guard(Mutex);
348   for (auto &FI : Funcs) {
349     if (!Callback(FI))
350       break;
351   }
352 }
353 
354 void GsymCreator::forEachFunctionInfo(
355     std::function<bool(const FunctionInfo &)> const &Callback) const {
356   std::lock_guard<std::mutex> Guard(Mutex);
357   for (const auto &FI : Funcs) {
358     if (!Callback(FI))
359       break;
360   }
361 }
362 
363 size_t GsymCreator::getNumFunctionInfos() const {
364   std::lock_guard<std::mutex> Guard(Mutex);
365   return Funcs.size();
366 }
367 
368 bool GsymCreator::IsValidTextAddress(uint64_t Addr) const {
369   if (ValidTextRanges)
370     return ValidTextRanges->contains(Addr);
371   return true; // No valid text ranges has been set, so accept all ranges.
372 }
373 
374 std::optional<uint64_t> GsymCreator::getFirstFunctionAddress() const {
375   // If we have finalized then Funcs are sorted. If we are a segment then
376   // Funcs will be sorted as well since function infos get added from an
377   // already finalized GsymCreator object where its functions were sorted and
378   // uniqued.
379   if ((Finalized || IsSegment) && !Funcs.empty())
380     return std::optional<uint64_t>(Funcs.front().startAddress());
381   return std::nullopt;
382 }
383 
384 std::optional<uint64_t> GsymCreator::getLastFunctionAddress() const {
385   // If we have finalized then Funcs are sorted. If we are a segment then
386   // Funcs will be sorted as well since function infos get added from an
387   // already finalized GsymCreator object where its functions were sorted and
388   // uniqued.
389   if ((Finalized || IsSegment) && !Funcs.empty())
390     return std::optional<uint64_t>(Funcs.back().startAddress());
391   return std::nullopt;
392 }
393 
394 std::optional<uint64_t> GsymCreator::getBaseAddress() const {
395   if (BaseAddress)
396     return BaseAddress;
397   return getFirstFunctionAddress();
398 }
399 
400 uint64_t GsymCreator::getMaxAddressOffset() const {
401   switch (getAddressOffsetSize()) {
402     case 1: return UINT8_MAX;
403     case 2: return UINT16_MAX;
404     case 4: return UINT32_MAX;
405     case 8: return UINT64_MAX;
406   }
407   llvm_unreachable("invalid address offset");
408 }
409 
410 uint8_t GsymCreator::getAddressOffsetSize() const {
411   const std::optional<uint64_t> BaseAddress = getBaseAddress();
412   const std::optional<uint64_t> LastFuncAddr = getLastFunctionAddress();
413   if (BaseAddress && LastFuncAddr) {
414     const uint64_t AddrDelta = *LastFuncAddr - *BaseAddress;
415     if (AddrDelta <= UINT8_MAX)
416       return 1;
417     else if (AddrDelta <= UINT16_MAX)
418       return 2;
419     else if (AddrDelta <= UINT32_MAX)
420       return 4;
421     return 8;
422   }
423   return 1;
424 }
425 
426 uint64_t GsymCreator::calculateHeaderAndTableSize() const {
427   uint64_t Size = sizeof(Header);
428   const size_t NumFuncs = Funcs.size();
429   // Add size of address offset table
430   Size += NumFuncs * getAddressOffsetSize();
431   // Add size of address info offsets which are 32 bit integers in version 1.
432   Size += NumFuncs * sizeof(uint32_t);
433   // Add file table size
434   Size += Files.size() * sizeof(FileEntry);
435   // Add string table size
436   Size += StrTab.getSize();
437 
438   return Size;
439 }
440 
441 // This function takes a InlineInfo class that was copy constructed from an
442 // InlineInfo from the \a SrcGC and updates all members that point to strings
443 // and files to point to strings and files from this GsymCreator.
444 void GsymCreator::fixupInlineInfo(const GsymCreator &SrcGC, InlineInfo &II) {
445   II.Name = copyString(SrcGC, II.Name);
446   II.CallFile = copyFile(SrcGC, II.CallFile);
447   for (auto &ChildII: II.Children)
448     fixupInlineInfo(SrcGC, ChildII);
449 }
450 
451 uint64_t GsymCreator::copyFunctionInfo(const GsymCreator &SrcGC, size_t FuncIdx) {
452   // To copy a function info we need to copy any files and strings over into
453   // this GsymCreator and then copy the function info and update the string
454   // table offsets to match the new offsets.
455   const FunctionInfo &SrcFI = SrcGC.Funcs[FuncIdx];
456 
457   FunctionInfo DstFI;
458   DstFI.Range = SrcFI.Range;
459   DstFI.Name = copyString(SrcGC, SrcFI.Name);
460   // Copy the line table if there is one.
461   if (SrcFI.OptLineTable) {
462     // Copy the entire line table.
463     DstFI.OptLineTable = LineTable(SrcFI.OptLineTable.value());
464     // Fixup all LineEntry::File entries which are indexes in the the file table
465     // from SrcGC and must be converted to file indexes from this GsymCreator.
466     LineTable &DstLT = DstFI.OptLineTable.value();
467     const size_t NumLines = DstLT.size();
468     for (size_t I=0; I<NumLines; ++I) {
469       LineEntry &LE = DstLT.get(I);
470       LE.File = copyFile(SrcGC, LE.File);
471     }
472   }
473   // Copy the inline information if needed.
474   if (SrcFI.Inline) {
475     // Make a copy of the source inline information.
476     DstFI.Inline = SrcFI.Inline.value();
477     // Fixup all strings and files in the copied inline information.
478     fixupInlineInfo(SrcGC, *DstFI.Inline);
479   }
480   std::lock_guard<std::mutex> Guard(Mutex);
481   Funcs.emplace_back(DstFI);
482   return Funcs.back().cacheEncoding();
483 }
484 
485 llvm::Error GsymCreator::saveSegments(StringRef Path,
486                                       llvm::endianness ByteOrder,
487                                       uint64_t SegmentSize) const {
488   if (SegmentSize == 0)
489     return createStringError(std::errc::invalid_argument,
490                              "invalid segment size zero");
491 
492   size_t FuncIdx = 0;
493   const size_t NumFuncs = Funcs.size();
494   while (FuncIdx < NumFuncs) {
495     llvm::Expected<std::unique_ptr<GsymCreator>> ExpectedGC =
496         createSegment(SegmentSize, FuncIdx);
497     if (ExpectedGC) {
498       GsymCreator *GC = ExpectedGC->get();
499       if (GC == NULL)
500         break; // We had not more functions to encode.
501       // Don't collect any messages at all
502       OutputAggregator Out(nullptr);
503       llvm::Error Err = GC->finalize(Out);
504       if (Err)
505         return Err;
506       std::string SegmentedGsymPath;
507       raw_string_ostream SGP(SegmentedGsymPath);
508       std::optional<uint64_t> FirstFuncAddr = GC->getFirstFunctionAddress();
509       if (FirstFuncAddr) {
510         SGP << Path << "-" << llvm::format_hex(*FirstFuncAddr, 1);
511         SGP.flush();
512         Err = GC->save(SegmentedGsymPath, ByteOrder, std::nullopt);
513         if (Err)
514           return Err;
515       }
516     } else {
517       return ExpectedGC.takeError();
518     }
519   }
520   return Error::success();
521 }
522 
523 llvm::Expected<std::unique_ptr<GsymCreator>>
524 GsymCreator::createSegment(uint64_t SegmentSize, size_t &FuncIdx) const {
525   // No function entries, return empty unique pointer
526   if (FuncIdx >= Funcs.size())
527     return std::unique_ptr<GsymCreator>();
528 
529   std::unique_ptr<GsymCreator> GC(new GsymCreator(/*Quiet=*/true));
530 
531   // Tell the creator that this is a segment.
532   GC->setIsSegment();
533 
534   // Set the base address if there is one.
535   if (BaseAddress)
536     GC->setBaseAddress(*BaseAddress);
537   // Copy the UUID value from this object into the new creator.
538   GC->setUUID(UUID);
539   const size_t NumFuncs = Funcs.size();
540   // Track how big the function infos are for the current segment so we can
541   // emit segments that are close to the requested size. It is quick math to
542   // determine the current header and tables sizes, so we can do that each loop.
543   uint64_t SegmentFuncInfosSize = 0;
544   for (; FuncIdx < NumFuncs; ++FuncIdx) {
545     const uint64_t HeaderAndTableSize = GC->calculateHeaderAndTableSize();
546     if (HeaderAndTableSize + SegmentFuncInfosSize >= SegmentSize) {
547       if (SegmentFuncInfosSize == 0)
548         return createStringError(std::errc::invalid_argument,
549                                  "a segment size of %" PRIu64 " is to small to "
550                                  "fit any function infos, specify a larger value",
551                                  SegmentSize);
552 
553       break;
554     }
555     SegmentFuncInfosSize += alignTo(GC->copyFunctionInfo(*this, FuncIdx), 4);
556   }
557   return std::move(GC);
558 }
559