xref: /freebsd/contrib/llvm-project/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp (revision 1342eb5a832fa10e689a29faab3acb6054e4778c)
1 //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // The data structures defined in this file are based on the reference
10 // implementation which is available at
11 // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
16 #include "llvm/DebugInfo/CodeView/RecordName.h"
17 #include "llvm/DebugInfo/CodeView/RecordSerialization.h"
18 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
19 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
20 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
21 #include "llvm/DebugInfo/MSF/MSFCommon.h"
22 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
23 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
24 #include "llvm/DebugInfo/PDB/Native/Hash.h"
25 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
26 #include "llvm/Support/BinaryItemStream.h"
27 #include "llvm/Support/BinaryStreamWriter.h"
28 #include "llvm/Support/FormatVariadic.h"
29 #include "llvm/Support/Parallel.h"
30 #include "llvm/Support/TimeProfiler.h"
31 #include "llvm/Support/xxhash.h"
32 #include <algorithm>
33 #include <vector>
34 
35 using namespace llvm;
36 using namespace llvm::msf;
37 using namespace llvm::pdb;
38 using namespace llvm::codeview;
39 
40 // Helper class for building the public and global PDB hash table buckets.
41 struct llvm::pdb::GSIHashStreamBuilder {
42   // Sum of the size of all public or global records.
43   uint64_t RecordByteSize = 0;
44 
45   std::vector<PSHashRecord> HashRecords;
46 
47   // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The
48   // reference implementation builds a hash table with IPHR_HASH buckets in it.
49   // The last bucket is used to link together free hash table cells in a linked
50   // list, but it is always empty in the compressed, on-disk format. However,
51   // the bitmap must have a bit for it.
52   std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
53 
54   std::vector<support::ulittle32_t> HashBuckets;
55 
56   uint32_t calculateSerializedLength() const;
57   Error commit(BinaryStreamWriter &Writer);
58 
59   void finalizePublicBuckets();
60   void finalizeGlobalBuckets(uint32_t RecordZeroOffset);
61 
62   // Assign public and global symbol records into hash table buckets.
63   // Modifies the list of records to store the bucket index, but does not
64   // change the order.
65   void finalizeBuckets(uint32_t RecordZeroOffset,
66                        MutableArrayRef<BulkPublic> Globals);
67 };
68 
69 // DenseMapInfo implementation for deduplicating symbol records.
70 struct llvm::pdb::SymbolDenseMapInfo {
71   static inline CVSymbol getEmptyKey() {
72     static CVSymbol Empty;
73     return Empty;
74   }
75   static inline CVSymbol getTombstoneKey() {
76     static CVSymbol Tombstone(
77         DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
78     return Tombstone;
79   }
80   static unsigned getHashValue(const CVSymbol &Val) {
81     return xxh3_64bits(Val.RecordData);
82   }
83   static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
84     return LHS.RecordData == RHS.RecordData;
85   }
86 };
87 
88 namespace {
89 LLVM_PACKED_START
90 struct PublicSym32Layout {
91   RecordPrefix Prefix;
92   PublicSym32Header Pub;
93   // char Name[];
94 };
95 LLVM_PACKED_END
96 } // namespace
97 
98 // Calculate how much memory this public needs when serialized.
99 static uint32_t sizeOfPublic(const BulkPublic &Pub) {
100   uint32_t NameLen = Pub.NameLen;
101   NameLen = std::min(NameLen,
102                      uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
103   return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
104 }
105 
106 static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
107   // Assume the caller has allocated sizeOfPublic bytes.
108   uint32_t NameLen = std::min(
109       Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
110   size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
111   assert(Size == sizeOfPublic(Pub));
112   auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
113   FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
114   FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
115   FixedMem->Pub.Flags = Pub.Flags;
116   FixedMem->Pub.Offset = Pub.Offset;
117   FixedMem->Pub.Segment = Pub.Segment;
118   char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
119   memcpy(NameMem, Pub.Name, NameLen);
120   // Zero the null terminator and remaining bytes.
121   memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen);
122   return CVSymbol(ArrayRef(Mem, Size));
123 }
124 
125 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
126   uint32_t Size = sizeof(GSIHashHeader);
127   Size += HashRecords.size() * sizeof(PSHashRecord);
128   Size += HashBitmap.size() * sizeof(uint32_t);
129   Size += HashBuckets.size() * sizeof(uint32_t);
130   return Size;
131 }
132 
133 Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
134   GSIHashHeader Header;
135   Header.VerSignature = GSIHashHeader::HdrSignature;
136   Header.VerHdr = GSIHashHeader::HdrVersion;
137   Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
138   Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;
139 
140   if (auto EC = Writer.writeObject(Header))
141     return EC;
142 
143   if (auto EC = Writer.writeArray(ArrayRef(HashRecords)))
144     return EC;
145   if (auto EC = Writer.writeArray(ArrayRef(HashBitmap)))
146     return EC;
147   if (auto EC = Writer.writeArray(ArrayRef(HashBuckets)))
148     return EC;
149   return Error::success();
150 }
151 
152 static bool isAsciiString(StringRef S) {
153   return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; });
154 }
155 
156 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
157 static int gsiRecordCmp(StringRef S1, StringRef S2) {
158   size_t LS = S1.size();
159   size_t RS = S2.size();
160   // Shorter strings always compare less than longer strings.
161   if (LS != RS)
162     return (LS > RS) - (LS < RS);
163 
164   // If either string contains non ascii characters, memcmp them.
165   if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
166     return memcmp(S1.data(), S2.data(), LS);
167 
168   // Both strings are ascii, perform a case-insensitive comparison.
169   return S1.compare_insensitive(S2.data());
170 }
171 
172 void GSIStreamBuilder::finalizePublicBuckets() {
173   PSH->finalizeBuckets(0, Publics);
174 }
175 
176 void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) {
177   // Build up a list of globals to be bucketed. Use the BulkPublic data
178   // structure for this purpose, even though these are global records, not
179   // public records. Most of the same fields are required:
180   // - Name
181   // - NameLen
182   // - SymOffset
183   // - BucketIdx
184   // The dead fields are Offset, Segment, and Flags.
185   std::vector<BulkPublic> Records;
186   Records.resize(Globals.size());
187   uint32_t SymOffset = RecordZeroOffset;
188   for (size_t I = 0, E = Globals.size(); I < E; ++I) {
189     StringRef Name = getSymbolName(Globals[I]);
190     Records[I].Name = Name.data();
191     Records[I].NameLen = Name.size();
192     Records[I].SymOffset = SymOffset;
193     SymOffset += Globals[I].length();
194   }
195 
196   GSH->finalizeBuckets(RecordZeroOffset, Records);
197 }
198 
199 void GSIHashStreamBuilder::finalizeBuckets(
200     uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) {
201   // Hash every name in parallel.
202   parallelFor(0, Records.size(), [&](size_t I) {
203     Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH);
204   });
205 
206   // Count up the size of each bucket. Then, use an exclusive prefix sum to
207   // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but
208   // we can't use it yet.
209   uint32_t BucketStarts[IPHR_HASH] = {0};
210   for (const BulkPublic &P : Records)
211     ++BucketStarts[P.BucketIdx];
212   uint32_t Sum = 0;
213   for (uint32_t &B : BucketStarts) {
214     uint32_t Size = B;
215     B = Sum;
216     Sum += Size;
217   }
218 
219   // Place globals into the hash table in bucket order. When placing a global,
220   // update the bucket start. Every hash table slot should be filled. Always use
221   // a refcount of one for now.
222   HashRecords.resize(Records.size());
223   uint32_t BucketCursors[IPHR_HASH];
224   memcpy(BucketCursors, BucketStarts, sizeof(BucketCursors));
225   for (int I = 0, E = Records.size(); I < E; ++I) {
226     uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++;
227     HashRecords[HashIdx].Off = I;
228     HashRecords[HashIdx].CRef = 1;
229   }
230 
231   // Within the buckets, sort each bucket by memcmp of the symbol's name.  It's
232   // important that we use the same sorting algorithm as is used by the
233   // reference implementation to ensure that the search for a record within a
234   // bucket can properly early-out when it detects the record won't be found.
235   // The algorithm used here corresponds to the function
236   // caseInsensitiveComparePchPchCchCch in the reference implementation.
237   parallelFor(0, IPHR_HASH, [&](size_t I) {
238     auto B = HashRecords.begin() + BucketStarts[I];
239     auto E = HashRecords.begin() + BucketCursors[I];
240     if (B == E)
241       return;
242     auto BucketCmp = [Records](const PSHashRecord &LHash,
243                                const PSHashRecord &RHash) {
244       const BulkPublic &L = Records[uint32_t(LHash.Off)];
245       const BulkPublic &R = Records[uint32_t(RHash.Off)];
246       assert(L.BucketIdx == R.BucketIdx);
247       int Cmp = gsiRecordCmp(L.getName(), R.getName());
248       if (Cmp != 0)
249         return Cmp < 0;
250       // This comparison is necessary to make the sorting stable in the presence
251       // of two static globals with the same name. The easiest way to observe
252       // this is with S_LDATA32 records.
253       return L.SymOffset < R.SymOffset;
254     };
255     llvm::sort(B, E, BucketCmp);
256 
257     // After we are done sorting, replace the global indices with the stream
258     // offsets of each global. Add one when writing symbol offsets to disk.
259     // See GSI1::fixSymRecs.
260     for (PSHashRecord &HRec : make_range(B, E))
261       HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1;
262   });
263 
264   // For each non-empty bucket, push the bucket start offset into HashBuckets
265   // and set a bit in the hash bitmap.
266   for (uint32_t I = 0; I < HashBitmap.size(); ++I) {
267     uint32_t Word = 0;
268     for (uint32_t J = 0; J < 32; ++J) {
269       // Skip empty buckets.
270       uint32_t BucketIdx = I * 32 + J;
271       if (BucketIdx >= IPHR_HASH ||
272           BucketStarts[BucketIdx] == BucketCursors[BucketIdx])
273         continue;
274       Word |= (1U << J);
275 
276       // Calculate what the offset of the first hash record in the chain would
277       // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
278       // each record would be 12 bytes. See HROffsetCalc in gsi.h.
279       const int SizeOfHROffsetCalc = 12;
280       ulittle32_t ChainStartOff =
281           ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc);
282       HashBuckets.push_back(ChainStartOff);
283     }
284     HashBitmap[I] = Word;
285   }
286 }
287 
288 GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
289     : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
290       GSH(std::make_unique<GSIHashStreamBuilder>()) {}
291 
292 GSIStreamBuilder::~GSIStreamBuilder() = default;
293 
294 uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
295   uint32_t Size = 0;
296   Size += sizeof(PublicsStreamHeader);
297   Size += PSH->calculateSerializedLength();
298   Size += Publics.size() * sizeof(uint32_t); // AddrMap
299   // FIXME: Add thunk map and section offsets for incremental linking.
300 
301   return Size;
302 }
303 
304 uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
305   return GSH->calculateSerializedLength();
306 }
307 
308 Error GSIStreamBuilder::finalizeMsfLayout() {
309   // First we write public symbol records, then we write global symbol records.
310   finalizePublicBuckets();
311   finalizeGlobalBuckets(PSH->RecordByteSize);
312 
313   Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
314   if (!Idx)
315     return Idx.takeError();
316   GlobalsStreamIndex = *Idx;
317 
318   Idx = Msf.addStream(calculatePublicsHashStreamSize());
319   if (!Idx)
320     return Idx.takeError();
321   PublicsStreamIndex = *Idx;
322 
323   uint64_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize;
324   if (RecordBytes > UINT32_MAX)
325     return make_error<StringError>(
326         formatv("the public symbols ({0} bytes) and global symbols ({1} bytes) "
327                 "are too large to fit in a PDB file; "
328                 "the maximum total is {2} bytes.",
329                 PSH->RecordByteSize, GSH->RecordByteSize, UINT32_MAX),
330         inconvertibleErrorCode());
331 
332   Idx = Msf.addStream(RecordBytes);
333   if (!Idx)
334     return Idx.takeError();
335   RecordStreamIndex = *Idx;
336   return Error::success();
337 }
338 
339 void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) {
340   assert(Publics.empty() && PSH->RecordByteSize == 0 &&
341          "publics can only be added once");
342   Publics = std::move(PublicsIn);
343 
344   // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
345   parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) {
346     return L.getName() < R.getName();
347   });
348 
349   // Assign offsets and calculate the length of the public symbol records.
350   uint32_t SymOffset = 0;
351   for (BulkPublic &Pub : Publics) {
352     Pub.SymOffset = SymOffset;
353     SymOffset += sizeOfPublic(Pub);
354   }
355 
356   // Remember the length of the public stream records.
357   PSH->RecordByteSize = SymOffset;
358 }
359 
360 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
361   serializeAndAddGlobal(Sym);
362 }
363 
364 void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
365   serializeAndAddGlobal(Sym);
366 }
367 
368 void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
369   serializeAndAddGlobal(Sym);
370 }
371 
372 template <typename T>
373 void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) {
374   T Copy(Symbol);
375   addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
376                                                    CodeViewContainer::Pdb));
377 }
378 
379 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) {
380   // Ignore duplicate typedefs and constants.
381   if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
382     auto Iter = GlobalsSeen.insert(Symbol);
383     if (!Iter.second)
384       return;
385   }
386   GSH->RecordByteSize += Symbol.length();
387   Globals.push_back(Symbol);
388 }
389 
390 // Serialize each public and write it.
391 static Error writePublics(BinaryStreamWriter &Writer,
392                           ArrayRef<BulkPublic> Publics) {
393   std::vector<uint8_t> Storage;
394   for (const BulkPublic &Pub : Publics) {
395     Storage.resize(sizeOfPublic(Pub));
396     serializePublic(Storage.data(), Pub);
397     if (Error E = Writer.writeBytes(Storage))
398       return E;
399   }
400   return Error::success();
401 }
402 
403 static Error writeRecords(BinaryStreamWriter &Writer,
404                           ArrayRef<CVSymbol> Records) {
405   BinaryItemStream<CVSymbol> ItemStream(llvm::endianness::little);
406   ItemStream.setItems(Records);
407   BinaryStreamRef RecordsRef(ItemStream);
408   return Writer.writeStreamRef(RecordsRef);
409 }
410 
411 Error GSIStreamBuilder::commitSymbolRecordStream(
412     WritableBinaryStreamRef Stream) {
413   BinaryStreamWriter Writer(Stream);
414 
415   // Write public symbol records first, followed by global symbol records.  This
416   // must match the order that we assume in finalizeMsfLayout when computing
417   // PSHZero and GSHZero.
418   if (auto EC = writePublics(Writer, Publics))
419     return EC;
420   if (auto EC = writeRecords(Writer, Globals))
421     return EC;
422 
423   return Error::success();
424 }
425 
426 static std::vector<support::ulittle32_t>
427 computeAddrMap(ArrayRef<BulkPublic> Publics) {
428   // Build a parallel vector of indices into the Publics vector, and sort it by
429   // address.
430   std::vector<ulittle32_t> PubAddrMap;
431   PubAddrMap.reserve(Publics.size());
432   for (int I = 0, E = Publics.size(); I < E; ++I)
433     PubAddrMap.push_back(ulittle32_t(I));
434 
435   auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) {
436     const BulkPublic &L = Publics[LIdx];
437     const BulkPublic &R = Publics[RIdx];
438     if (L.Segment != R.Segment)
439       return L.Segment < R.Segment;
440     if (L.Offset != R.Offset)
441       return L.Offset < R.Offset;
442     // parallelSort is unstable, so we have to do name comparison to ensure
443     // that two names for the same location come out in a deterministic order.
444     return L.getName() < R.getName();
445   };
446   parallelSort(PubAddrMap, AddrCmp);
447 
448   // Rewrite the public symbol indices into symbol offsets.
449   for (ulittle32_t &Entry : PubAddrMap)
450     Entry = Publics[Entry].SymOffset;
451   return PubAddrMap;
452 }
453 
454 Error GSIStreamBuilder::commitPublicsHashStream(
455     WritableBinaryStreamRef Stream) {
456   BinaryStreamWriter Writer(Stream);
457   PublicsStreamHeader Header;
458 
459   // FIXME: Fill these in. They are for incremental linking.
460   Header.SymHash = PSH->calculateSerializedLength();
461   Header.AddrMap = Publics.size() * 4;
462   Header.NumThunks = 0;
463   Header.SizeOfThunk = 0;
464   Header.ISectThunkTable = 0;
465   memset(Header.Padding, 0, sizeof(Header.Padding));
466   Header.OffThunkTable = 0;
467   Header.NumSections = 0;
468   if (auto EC = Writer.writeObject(Header))
469     return EC;
470 
471   if (auto EC = PSH->commit(Writer))
472     return EC;
473 
474   std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics);
475   assert(PubAddrMap.size() == Publics.size());
476   if (auto EC = Writer.writeArray(ArrayRef(PubAddrMap)))
477     return EC;
478 
479   return Error::success();
480 }
481 
482 Error GSIStreamBuilder::commitGlobalsHashStream(
483     WritableBinaryStreamRef Stream) {
484   BinaryStreamWriter Writer(Stream);
485   return GSH->commit(Writer);
486 }
487 
488 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
489                                WritableBinaryStreamRef Buffer) {
490   llvm::TimeTraceScope timeScope("Commit GSI stream");
491   auto GS = WritableMappedBlockStream::createIndexedStream(
492       Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator());
493   auto PS = WritableMappedBlockStream::createIndexedStream(
494       Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator());
495   auto PRS = WritableMappedBlockStream::createIndexedStream(
496       Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator());
497 
498   if (auto EC = commitSymbolRecordStream(*PRS))
499     return EC;
500   if (auto EC = commitGlobalsHashStream(*GS))
501     return EC;
502   if (auto EC = commitPublicsHashStream(*PS))
503     return EC;
504   return Error::success();
505 }
506