xref: /freebsd/contrib/llvm-project/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp (revision 3f0efe05432b1633991114ca4ca330102a561959)
1 //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // The data structures defined in this file are based on the reference
10 // implementation which is available at
11 // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
16 #include "llvm/DebugInfo/CodeView/RecordName.h"
17 #include "llvm/DebugInfo/CodeView/RecordSerialization.h"
18 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
19 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
20 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
21 #include "llvm/DebugInfo/MSF/MSFCommon.h"
22 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
23 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
24 #include "llvm/DebugInfo/PDB/Native/Hash.h"
25 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
26 #include "llvm/Support/BinaryItemStream.h"
27 #include "llvm/Support/BinaryStreamWriter.h"
28 #include "llvm/Support/Parallel.h"
29 #include "llvm/Support/TimeProfiler.h"
30 #include "llvm/Support/xxhash.h"
31 #include <algorithm>
32 #include <vector>
33 
34 using namespace llvm;
35 using namespace llvm::msf;
36 using namespace llvm::pdb;
37 using namespace llvm::codeview;
38 
39 // Helper class for building the public and global PDB hash table buckets.
40 struct llvm::pdb::GSIHashStreamBuilder {
41   // Sum of the size of all public or global records.
42   uint32_t RecordByteSize = 0;
43 
44   std::vector<PSHashRecord> HashRecords;
45 
46   // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The
47   // reference implementation builds a hash table with IPHR_HASH buckets in it.
48   // The last bucket is used to link together free hash table cells in a linked
49   // list, but it is always empty in the compressed, on-disk format. However,
50   // the bitmap must have a bit for it.
51   std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
52 
53   std::vector<support::ulittle32_t> HashBuckets;
54 
55   uint32_t calculateSerializedLength() const;
56   Error commit(BinaryStreamWriter &Writer);
57 
58   void finalizePublicBuckets();
59   void finalizeGlobalBuckets(uint32_t RecordZeroOffset);
60 
61   // Assign public and global symbol records into hash table buckets.
62   // Modifies the list of records to store the bucket index, but does not
63   // change the order.
64   void finalizeBuckets(uint32_t RecordZeroOffset,
65                        MutableArrayRef<BulkPublic> Globals);
66 };
67 
68 // DenseMapInfo implementation for deduplicating symbol records.
69 struct llvm::pdb::SymbolDenseMapInfo {
70   static inline CVSymbol getEmptyKey() {
71     static CVSymbol Empty;
72     return Empty;
73   }
74   static inline CVSymbol getTombstoneKey() {
75     static CVSymbol Tombstone(
76         DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
77     return Tombstone;
78   }
79   static unsigned getHashValue(const CVSymbol &Val) {
80     return xxh3_64bits(Val.RecordData);
81   }
82   static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
83     return LHS.RecordData == RHS.RecordData;
84   }
85 };
86 
87 namespace {
88 LLVM_PACKED_START
89 struct PublicSym32Layout {
90   RecordPrefix Prefix;
91   PublicSym32Header Pub;
92   // char Name[];
93 };
94 LLVM_PACKED_END
95 } // namespace
96 
97 // Calculate how much memory this public needs when serialized.
98 static uint32_t sizeOfPublic(const BulkPublic &Pub) {
99   uint32_t NameLen = Pub.NameLen;
100   NameLen = std::min(NameLen,
101                      uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
102   return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
103 }
104 
105 static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
106   // Assume the caller has allocated sizeOfPublic bytes.
107   uint32_t NameLen = std::min(
108       Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
109   size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
110   assert(Size == sizeOfPublic(Pub));
111   auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
112   FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
113   FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
114   FixedMem->Pub.Flags = Pub.Flags;
115   FixedMem->Pub.Offset = Pub.Offset;
116   FixedMem->Pub.Segment = Pub.Segment;
117   char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
118   memcpy(NameMem, Pub.Name, NameLen);
119   // Zero the null terminator and remaining bytes.
120   memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen);
121   return CVSymbol(ArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));
122 }
123 
124 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
125   uint32_t Size = sizeof(GSIHashHeader);
126   Size += HashRecords.size() * sizeof(PSHashRecord);
127   Size += HashBitmap.size() * sizeof(uint32_t);
128   Size += HashBuckets.size() * sizeof(uint32_t);
129   return Size;
130 }
131 
132 Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
133   GSIHashHeader Header;
134   Header.VerSignature = GSIHashHeader::HdrSignature;
135   Header.VerHdr = GSIHashHeader::HdrVersion;
136   Header.HrSize = HashRecords.size() * sizeof(PSHashRecord);
137   Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4;
138 
139   if (auto EC = Writer.writeObject(Header))
140     return EC;
141 
142   if (auto EC = Writer.writeArray(ArrayRef(HashRecords)))
143     return EC;
144   if (auto EC = Writer.writeArray(ArrayRef(HashBitmap)))
145     return EC;
146   if (auto EC = Writer.writeArray(ArrayRef(HashBuckets)))
147     return EC;
148   return Error::success();
149 }
150 
151 static bool isAsciiString(StringRef S) {
152   return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; });
153 }
154 
155 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
156 static int gsiRecordCmp(StringRef S1, StringRef S2) {
157   size_t LS = S1.size();
158   size_t RS = S2.size();
159   // Shorter strings always compare less than longer strings.
160   if (LS != RS)
161     return (LS > RS) - (LS < RS);
162 
163   // If either string contains non ascii characters, memcmp them.
164   if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
165     return memcmp(S1.data(), S2.data(), LS);
166 
167   // Both strings are ascii, perform a case-insensitive comparison.
168   return S1.compare_insensitive(S2.data());
169 }
170 
171 void GSIStreamBuilder::finalizePublicBuckets() {
172   PSH->finalizeBuckets(0, Publics);
173 }
174 
175 void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) {
176   // Build up a list of globals to be bucketed. Use the BulkPublic data
177   // structure for this purpose, even though these are global records, not
178   // public records. Most of the same fields are required:
179   // - Name
180   // - NameLen
181   // - SymOffset
182   // - BucketIdx
183   // The dead fields are Offset, Segment, and Flags.
184   std::vector<BulkPublic> Records;
185   Records.resize(Globals.size());
186   uint32_t SymOffset = RecordZeroOffset;
187   for (size_t I = 0, E = Globals.size(); I < E; ++I) {
188     StringRef Name = getSymbolName(Globals[I]);
189     Records[I].Name = Name.data();
190     Records[I].NameLen = Name.size();
191     Records[I].SymOffset = SymOffset;
192     SymOffset += Globals[I].length();
193   }
194 
195   GSH->finalizeBuckets(RecordZeroOffset, Records);
196 }
197 
198 void GSIHashStreamBuilder::finalizeBuckets(
199     uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) {
200   // Hash every name in parallel.
201   parallelFor(0, Records.size(), [&](size_t I) {
202     Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH);
203   });
204 
205   // Count up the size of each bucket. Then, use an exclusive prefix sum to
206   // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but
207   // we can't use it yet.
208   uint32_t BucketStarts[IPHR_HASH] = {0};
209   for (const BulkPublic &P : Records)
210     ++BucketStarts[P.BucketIdx];
211   uint32_t Sum = 0;
212   for (uint32_t &B : BucketStarts) {
213     uint32_t Size = B;
214     B = Sum;
215     Sum += Size;
216   }
217 
218   // Place globals into the hash table in bucket order. When placing a global,
219   // update the bucket start. Every hash table slot should be filled. Always use
220   // a refcount of one for now.
221   HashRecords.resize(Records.size());
222   uint32_t BucketCursors[IPHR_HASH];
223   memcpy(BucketCursors, BucketStarts, sizeof(BucketCursors));
224   for (int I = 0, E = Records.size(); I < E; ++I) {
225     uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++;
226     HashRecords[HashIdx].Off = I;
227     HashRecords[HashIdx].CRef = 1;
228   }
229 
230   // Within the buckets, sort each bucket by memcmp of the symbol's name.  It's
231   // important that we use the same sorting algorithm as is used by the
232   // reference implementation to ensure that the search for a record within a
233   // bucket can properly early-out when it detects the record won't be found.
234   // The algorithm used here corresponds to the function
235   // caseInsensitiveComparePchPchCchCch in the reference implementation.
236   parallelFor(0, IPHR_HASH, [&](size_t I) {
237     auto B = HashRecords.begin() + BucketStarts[I];
238     auto E = HashRecords.begin() + BucketCursors[I];
239     if (B == E)
240       return;
241     auto BucketCmp = [Records](const PSHashRecord &LHash,
242                                const PSHashRecord &RHash) {
243       const BulkPublic &L = Records[uint32_t(LHash.Off)];
244       const BulkPublic &R = Records[uint32_t(RHash.Off)];
245       assert(L.BucketIdx == R.BucketIdx);
246       int Cmp = gsiRecordCmp(L.getName(), R.getName());
247       if (Cmp != 0)
248         return Cmp < 0;
249       // This comparison is necessary to make the sorting stable in the presence
250       // of two static globals with the same name. The easiest way to observe
251       // this is with S_LDATA32 records.
252       return L.SymOffset < R.SymOffset;
253     };
254     llvm::sort(B, E, BucketCmp);
255 
256     // After we are done sorting, replace the global indices with the stream
257     // offsets of each global. Add one when writing symbol offsets to disk.
258     // See GSI1::fixSymRecs.
259     for (PSHashRecord &HRec : make_range(B, E))
260       HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1;
261   });
262 
263   // For each non-empty bucket, push the bucket start offset into HashBuckets
264   // and set a bit in the hash bitmap.
265   for (uint32_t I = 0; I < HashBitmap.size(); ++I) {
266     uint32_t Word = 0;
267     for (uint32_t J = 0; J < 32; ++J) {
268       // Skip empty buckets.
269       uint32_t BucketIdx = I * 32 + J;
270       if (BucketIdx >= IPHR_HASH ||
271           BucketStarts[BucketIdx] == BucketCursors[BucketIdx])
272         continue;
273       Word |= (1U << J);
274 
275       // Calculate what the offset of the first hash record in the chain would
276       // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
277       // each record would be 12 bytes. See HROffsetCalc in gsi.h.
278       const int SizeOfHROffsetCalc = 12;
279       ulittle32_t ChainStartOff =
280           ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc);
281       HashBuckets.push_back(ChainStartOff);
282     }
283     HashBitmap[I] = Word;
284   }
285 }
286 
287 GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf)
288     : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()),
289       GSH(std::make_unique<GSIHashStreamBuilder>()) {}
290 
291 GSIStreamBuilder::~GSIStreamBuilder() = default;
292 
293 uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
294   uint32_t Size = 0;
295   Size += sizeof(PublicsStreamHeader);
296   Size += PSH->calculateSerializedLength();
297   Size += Publics.size() * sizeof(uint32_t); // AddrMap
298   // FIXME: Add thunk map and section offsets for incremental linking.
299 
300   return Size;
301 }
302 
303 uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
304   return GSH->calculateSerializedLength();
305 }
306 
307 Error GSIStreamBuilder::finalizeMsfLayout() {
308   // First we write public symbol records, then we write global symbol records.
309   finalizePublicBuckets();
310   finalizeGlobalBuckets(PSH->RecordByteSize);
311 
312   Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
313   if (!Idx)
314     return Idx.takeError();
315   GlobalsStreamIndex = *Idx;
316 
317   Idx = Msf.addStream(calculatePublicsHashStreamSize());
318   if (!Idx)
319     return Idx.takeError();
320   PublicsStreamIndex = *Idx;
321 
322   uint32_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize;
323 
324   Idx = Msf.addStream(RecordBytes);
325   if (!Idx)
326     return Idx.takeError();
327   RecordStreamIndex = *Idx;
328   return Error::success();
329 }
330 
331 void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) {
332   assert(Publics.empty() && PSH->RecordByteSize == 0 &&
333          "publics can only be added once");
334   Publics = std::move(PublicsIn);
335 
336   // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
337   parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) {
338     return L.getName() < R.getName();
339   });
340 
341   // Assign offsets and calculate the length of the public symbol records.
342   uint32_t SymOffset = 0;
343   for (BulkPublic &Pub : Publics) {
344     Pub.SymOffset = SymOffset;
345     SymOffset += sizeOfPublic(Pub);
346   }
347 
348   // Remember the length of the public stream records.
349   PSH->RecordByteSize = SymOffset;
350 }
351 
352 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
353   serializeAndAddGlobal(Sym);
354 }
355 
356 void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
357   serializeAndAddGlobal(Sym);
358 }
359 
360 void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
361   serializeAndAddGlobal(Sym);
362 }
363 
364 template <typename T>
365 void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) {
366   T Copy(Symbol);
367   addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
368                                                    CodeViewContainer::Pdb));
369 }
370 
371 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) {
372   // Ignore duplicate typedefs and constants.
373   if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
374     auto Iter = GlobalsSeen.insert(Symbol);
375     if (!Iter.second)
376       return;
377   }
378   GSH->RecordByteSize += Symbol.length();
379   Globals.push_back(Symbol);
380 }
381 
382 // Serialize each public and write it.
383 static Error writePublics(BinaryStreamWriter &Writer,
384                           ArrayRef<BulkPublic> Publics) {
385   std::vector<uint8_t> Storage;
386   for (const BulkPublic &Pub : Publics) {
387     Storage.resize(sizeOfPublic(Pub));
388     serializePublic(Storage.data(), Pub);
389     if (Error E = Writer.writeBytes(Storage))
390       return E;
391   }
392   return Error::success();
393 }
394 
395 static Error writeRecords(BinaryStreamWriter &Writer,
396                           ArrayRef<CVSymbol> Records) {
397   BinaryItemStream<CVSymbol> ItemStream(llvm::endianness::little);
398   ItemStream.setItems(Records);
399   BinaryStreamRef RecordsRef(ItemStream);
400   return Writer.writeStreamRef(RecordsRef);
401 }
402 
403 Error GSIStreamBuilder::commitSymbolRecordStream(
404     WritableBinaryStreamRef Stream) {
405   BinaryStreamWriter Writer(Stream);
406 
407   // Write public symbol records first, followed by global symbol records.  This
408   // must match the order that we assume in finalizeMsfLayout when computing
409   // PSHZero and GSHZero.
410   if (auto EC = writePublics(Writer, Publics))
411     return EC;
412   if (auto EC = writeRecords(Writer, Globals))
413     return EC;
414 
415   return Error::success();
416 }
417 
418 static std::vector<support::ulittle32_t>
419 computeAddrMap(ArrayRef<BulkPublic> Publics) {
420   // Build a parallel vector of indices into the Publics vector, and sort it by
421   // address.
422   std::vector<ulittle32_t> PubAddrMap;
423   PubAddrMap.reserve(Publics.size());
424   for (int I = 0, E = Publics.size(); I < E; ++I)
425     PubAddrMap.push_back(ulittle32_t(I));
426 
427   auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) {
428     const BulkPublic &L = Publics[LIdx];
429     const BulkPublic &R = Publics[RIdx];
430     if (L.Segment != R.Segment)
431       return L.Segment < R.Segment;
432     if (L.Offset != R.Offset)
433       return L.Offset < R.Offset;
434     // parallelSort is unstable, so we have to do name comparison to ensure
435     // that two names for the same location come out in a deterministic order.
436     return L.getName() < R.getName();
437   };
438   parallelSort(PubAddrMap, AddrCmp);
439 
440   // Rewrite the public symbol indices into symbol offsets.
441   for (ulittle32_t &Entry : PubAddrMap)
442     Entry = Publics[Entry].SymOffset;
443   return PubAddrMap;
444 }
445 
446 Error GSIStreamBuilder::commitPublicsHashStream(
447     WritableBinaryStreamRef Stream) {
448   BinaryStreamWriter Writer(Stream);
449   PublicsStreamHeader Header;
450 
451   // FIXME: Fill these in. They are for incremental linking.
452   Header.SymHash = PSH->calculateSerializedLength();
453   Header.AddrMap = Publics.size() * 4;
454   Header.NumThunks = 0;
455   Header.SizeOfThunk = 0;
456   Header.ISectThunkTable = 0;
457   memset(Header.Padding, 0, sizeof(Header.Padding));
458   Header.OffThunkTable = 0;
459   Header.NumSections = 0;
460   if (auto EC = Writer.writeObject(Header))
461     return EC;
462 
463   if (auto EC = PSH->commit(Writer))
464     return EC;
465 
466   std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics);
467   assert(PubAddrMap.size() == Publics.size());
468   if (auto EC = Writer.writeArray(ArrayRef(PubAddrMap)))
469     return EC;
470 
471   return Error::success();
472 }
473 
474 Error GSIStreamBuilder::commitGlobalsHashStream(
475     WritableBinaryStreamRef Stream) {
476   BinaryStreamWriter Writer(Stream);
477   return GSH->commit(Writer);
478 }
479 
480 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
481                                WritableBinaryStreamRef Buffer) {
482   llvm::TimeTraceScope timeScope("Commit GSI stream");
483   auto GS = WritableMappedBlockStream::createIndexedStream(
484       Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator());
485   auto PS = WritableMappedBlockStream::createIndexedStream(
486       Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator());
487   auto PRS = WritableMappedBlockStream::createIndexedStream(
488       Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator());
489 
490   if (auto EC = commitSymbolRecordStream(*PRS))
491     return EC;
492   if (auto EC = commitGlobalsHashStream(*GS))
493     return EC;
494   if (auto EC = commitPublicsHashStream(*PS))
495     return EC;
496   return Error::success();
497 }
498