1 //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // The data structures defined in this file are based on the reference 10 // implementation which is available at 11 // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h" 16 #include "llvm/DebugInfo/CodeView/RecordName.h" 17 #include "llvm/DebugInfo/CodeView/RecordSerialization.h" 18 #include "llvm/DebugInfo/CodeView/SymbolRecord.h" 19 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h" 20 #include "llvm/DebugInfo/MSF/MSFBuilder.h" 21 #include "llvm/DebugInfo/MSF/MSFCommon.h" 22 #include "llvm/DebugInfo/MSF/MappedBlockStream.h" 23 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h" 24 #include "llvm/DebugInfo/PDB/Native/Hash.h" 25 #include "llvm/DebugInfo/PDB/Native/RawTypes.h" 26 #include "llvm/Support/BinaryItemStream.h" 27 #include "llvm/Support/BinaryStreamWriter.h" 28 #include "llvm/Support/FormatVariadic.h" 29 #include "llvm/Support/Parallel.h" 30 #include "llvm/Support/TimeProfiler.h" 31 #include "llvm/Support/xxhash.h" 32 #include <algorithm> 33 #include <vector> 34 35 using namespace llvm; 36 using namespace llvm::msf; 37 using namespace llvm::pdb; 38 using namespace llvm::codeview; 39 40 // Helper class for building the public and global PDB hash table buckets. 41 struct llvm::pdb::GSIHashStreamBuilder { 42 // Sum of the size of all public or global records. 43 uint64_t RecordByteSize = 0; 44 45 std::vector<PSHashRecord> HashRecords; 46 47 // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The 48 // reference implementation builds a hash table with IPHR_HASH buckets in it. 49 // The last bucket is used to link together free hash table cells in a linked 50 // list, but it is always empty in the compressed, on-disk format. However, 51 // the bitmap must have a bit for it. 52 std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap; 53 54 std::vector<support::ulittle32_t> HashBuckets; 55 56 uint32_t calculateSerializedLength() const; 57 Error commit(BinaryStreamWriter &Writer); 58 59 void finalizePublicBuckets(); 60 void finalizeGlobalBuckets(uint32_t RecordZeroOffset); 61 62 // Assign public and global symbol records into hash table buckets. 63 // Modifies the list of records to store the bucket index, but does not 64 // change the order. 65 void finalizeBuckets(uint32_t RecordZeroOffset, 66 MutableArrayRef<BulkPublic> Globals); 67 }; 68 69 // DenseMapInfo implementation for deduplicating symbol records. 70 struct llvm::pdb::SymbolDenseMapInfo { 71 static inline CVSymbol getEmptyKey() { 72 static CVSymbol Empty; 73 return Empty; 74 } 75 static inline CVSymbol getTombstoneKey() { 76 static CVSymbol Tombstone( 77 DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey()); 78 return Tombstone; 79 } 80 static unsigned getHashValue(const CVSymbol &Val) { 81 return xxh3_64bits(Val.RecordData); 82 } 83 static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) { 84 return LHS.RecordData == RHS.RecordData; 85 } 86 }; 87 88 namespace { 89 LLVM_PACKED_START 90 struct PublicSym32Layout { 91 RecordPrefix Prefix; 92 PublicSym32Header Pub; 93 // char Name[]; 94 }; 95 LLVM_PACKED_END 96 } // namespace 97 98 // Calculate how much memory this public needs when serialized. 99 static uint32_t sizeOfPublic(const BulkPublic &Pub) { 100 uint32_t NameLen = Pub.NameLen; 101 NameLen = std::min(NameLen, 102 uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1)); 103 return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4); 104 } 105 106 static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) { 107 // Assume the caller has allocated sizeOfPublic bytes. 108 uint32_t NameLen = std::min( 109 Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1)); 110 size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4); 111 assert(Size == sizeOfPublic(Pub)); 112 auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem); 113 FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32); 114 FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2); 115 FixedMem->Pub.Flags = Pub.Flags; 116 FixedMem->Pub.Offset = Pub.Offset; 117 FixedMem->Pub.Segment = Pub.Segment; 118 char *NameMem = reinterpret_cast<char *>(FixedMem + 1); 119 memcpy(NameMem, Pub.Name, NameLen); 120 // Zero the null terminator and remaining bytes. 121 memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen); 122 return CVSymbol(ArrayRef(Mem, Size)); 123 } 124 125 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const { 126 uint32_t Size = sizeof(GSIHashHeader); 127 Size += HashRecords.size() * sizeof(PSHashRecord); 128 Size += HashBitmap.size() * sizeof(uint32_t); 129 Size += HashBuckets.size() * sizeof(uint32_t); 130 return Size; 131 } 132 133 Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) { 134 GSIHashHeader Header; 135 Header.VerSignature = GSIHashHeader::HdrSignature; 136 Header.VerHdr = GSIHashHeader::HdrVersion; 137 Header.HrSize = HashRecords.size() * sizeof(PSHashRecord); 138 Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4; 139 140 if (auto EC = Writer.writeObject(Header)) 141 return EC; 142 143 if (auto EC = Writer.writeArray(ArrayRef(HashRecords))) 144 return EC; 145 if (auto EC = Writer.writeArray(ArrayRef(HashBitmap))) 146 return EC; 147 if (auto EC = Writer.writeArray(ArrayRef(HashBuckets))) 148 return EC; 149 return Error::success(); 150 } 151 152 static bool isAsciiString(StringRef S) { 153 return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; }); 154 } 155 156 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp 157 static int gsiRecordCmp(StringRef S1, StringRef S2) { 158 size_t LS = S1.size(); 159 size_t RS = S2.size(); 160 // Shorter strings always compare less than longer strings. 161 if (LS != RS) 162 return (LS > RS) - (LS < RS); 163 164 // If either string contains non ascii characters, memcmp them. 165 if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2))) 166 return memcmp(S1.data(), S2.data(), LS); 167 168 // Both strings are ascii, perform a case-insensitive comparison. 169 return S1.compare_insensitive(S2.data()); 170 } 171 172 void GSIStreamBuilder::finalizePublicBuckets() { 173 PSH->finalizeBuckets(0, Publics); 174 } 175 176 void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) { 177 // Build up a list of globals to be bucketed. Use the BulkPublic data 178 // structure for this purpose, even though these are global records, not 179 // public records. Most of the same fields are required: 180 // - Name 181 // - NameLen 182 // - SymOffset 183 // - BucketIdx 184 // The dead fields are Offset, Segment, and Flags. 185 std::vector<BulkPublic> Records; 186 Records.resize(Globals.size()); 187 uint32_t SymOffset = RecordZeroOffset; 188 for (size_t I = 0, E = Globals.size(); I < E; ++I) { 189 StringRef Name = getSymbolName(Globals[I]); 190 Records[I].Name = Name.data(); 191 Records[I].NameLen = Name.size(); 192 Records[I].SymOffset = SymOffset; 193 SymOffset += Globals[I].length(); 194 } 195 196 GSH->finalizeBuckets(RecordZeroOffset, Records); 197 } 198 199 void GSIHashStreamBuilder::finalizeBuckets( 200 uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) { 201 // Hash every name in parallel. 202 parallelFor(0, Records.size(), [&](size_t I) { 203 Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH); 204 }); 205 206 // Count up the size of each bucket. Then, use an exclusive prefix sum to 207 // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but 208 // we can't use it yet. 209 uint32_t BucketStarts[IPHR_HASH] = {0}; 210 for (const BulkPublic &P : Records) 211 ++BucketStarts[P.BucketIdx]; 212 uint32_t Sum = 0; 213 for (uint32_t &B : BucketStarts) { 214 uint32_t Size = B; 215 B = Sum; 216 Sum += Size; 217 } 218 219 // Place globals into the hash table in bucket order. When placing a global, 220 // update the bucket start. Every hash table slot should be filled. Always use 221 // a refcount of one for now. 222 HashRecords.resize(Records.size()); 223 uint32_t BucketCursors[IPHR_HASH]; 224 memcpy(BucketCursors, BucketStarts, sizeof(BucketCursors)); 225 for (int I = 0, E = Records.size(); I < E; ++I) { 226 uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++; 227 HashRecords[HashIdx].Off = I; 228 HashRecords[HashIdx].CRef = 1; 229 } 230 231 // Within the buckets, sort each bucket by memcmp of the symbol's name. It's 232 // important that we use the same sorting algorithm as is used by the 233 // reference implementation to ensure that the search for a record within a 234 // bucket can properly early-out when it detects the record won't be found. 235 // The algorithm used here corresponds to the function 236 // caseInsensitiveComparePchPchCchCch in the reference implementation. 237 parallelFor(0, IPHR_HASH, [&](size_t I) { 238 auto B = HashRecords.begin() + BucketStarts[I]; 239 auto E = HashRecords.begin() + BucketCursors[I]; 240 if (B == E) 241 return; 242 auto BucketCmp = [Records](const PSHashRecord &LHash, 243 const PSHashRecord &RHash) { 244 const BulkPublic &L = Records[uint32_t(LHash.Off)]; 245 const BulkPublic &R = Records[uint32_t(RHash.Off)]; 246 assert(L.BucketIdx == R.BucketIdx); 247 int Cmp = gsiRecordCmp(L.getName(), R.getName()); 248 if (Cmp != 0) 249 return Cmp < 0; 250 // This comparison is necessary to make the sorting stable in the presence 251 // of two static globals with the same name. The easiest way to observe 252 // this is with S_LDATA32 records. 253 return L.SymOffset < R.SymOffset; 254 }; 255 llvm::sort(B, E, BucketCmp); 256 257 // After we are done sorting, replace the global indices with the stream 258 // offsets of each global. Add one when writing symbol offsets to disk. 259 // See GSI1::fixSymRecs. 260 for (PSHashRecord &HRec : make_range(B, E)) 261 HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1; 262 }); 263 264 // For each non-empty bucket, push the bucket start offset into HashBuckets 265 // and set a bit in the hash bitmap. 266 for (uint32_t I = 0; I < HashBitmap.size(); ++I) { 267 uint32_t Word = 0; 268 for (uint32_t J = 0; J < 32; ++J) { 269 // Skip empty buckets. 270 uint32_t BucketIdx = I * 32 + J; 271 if (BucketIdx >= IPHR_HASH || 272 BucketStarts[BucketIdx] == BucketCursors[BucketIdx]) 273 continue; 274 Word |= (1U << J); 275 276 // Calculate what the offset of the first hash record in the chain would 277 // be if it were inflated to contain 32-bit pointers. On a 32-bit system, 278 // each record would be 12 bytes. See HROffsetCalc in gsi.h. 279 const int SizeOfHROffsetCalc = 12; 280 ulittle32_t ChainStartOff = 281 ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc); 282 HashBuckets.push_back(ChainStartOff); 283 } 284 HashBitmap[I] = Word; 285 } 286 } 287 288 GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf) 289 : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()), 290 GSH(std::make_unique<GSIHashStreamBuilder>()) {} 291 292 GSIStreamBuilder::~GSIStreamBuilder() = default; 293 294 uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const { 295 uint32_t Size = 0; 296 Size += sizeof(PublicsStreamHeader); 297 Size += PSH->calculateSerializedLength(); 298 Size += Publics.size() * sizeof(uint32_t); // AddrMap 299 // FIXME: Add thunk map and section offsets for incremental linking. 300 301 return Size; 302 } 303 304 uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const { 305 return GSH->calculateSerializedLength(); 306 } 307 308 Error GSIStreamBuilder::finalizeMsfLayout() { 309 // First we write public symbol records, then we write global symbol records. 310 finalizePublicBuckets(); 311 finalizeGlobalBuckets(PSH->RecordByteSize); 312 313 Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize()); 314 if (!Idx) 315 return Idx.takeError(); 316 GlobalsStreamIndex = *Idx; 317 318 Idx = Msf.addStream(calculatePublicsHashStreamSize()); 319 if (!Idx) 320 return Idx.takeError(); 321 PublicsStreamIndex = *Idx; 322 323 uint64_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize; 324 if (RecordBytes > UINT32_MAX) 325 return make_error<StringError>( 326 formatv("the public symbols ({0} bytes) and global symbols ({1} bytes) " 327 "are too large to fit in a PDB file; " 328 "the maximum total is {2} bytes.", 329 PSH->RecordByteSize, GSH->RecordByteSize, UINT32_MAX), 330 inconvertibleErrorCode()); 331 332 Idx = Msf.addStream(RecordBytes); 333 if (!Idx) 334 return Idx.takeError(); 335 RecordStreamIndex = *Idx; 336 return Error::success(); 337 } 338 339 void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) { 340 assert(Publics.empty() && PSH->RecordByteSize == 0 && 341 "publics can only be added once"); 342 Publics = std::move(PublicsIn); 343 344 // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism. 345 parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) { 346 return L.getName() < R.getName(); 347 }); 348 349 // Assign offsets and calculate the length of the public symbol records. 350 uint32_t SymOffset = 0; 351 for (BulkPublic &Pub : Publics) { 352 Pub.SymOffset = SymOffset; 353 SymOffset += sizeOfPublic(Pub); 354 } 355 356 // Remember the length of the public stream records. 357 PSH->RecordByteSize = SymOffset; 358 } 359 360 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) { 361 serializeAndAddGlobal(Sym); 362 } 363 364 void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) { 365 serializeAndAddGlobal(Sym); 366 } 367 368 void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) { 369 serializeAndAddGlobal(Sym); 370 } 371 372 template <typename T> 373 void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) { 374 T Copy(Symbol); 375 addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(), 376 CodeViewContainer::Pdb)); 377 } 378 379 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) { 380 // Ignore duplicate typedefs and constants. 381 if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) { 382 auto Iter = GlobalsSeen.insert(Symbol); 383 if (!Iter.second) 384 return; 385 } 386 GSH->RecordByteSize += Symbol.length(); 387 Globals.push_back(Symbol); 388 } 389 390 // Serialize each public and write it. 391 static Error writePublics(BinaryStreamWriter &Writer, 392 ArrayRef<BulkPublic> Publics) { 393 std::vector<uint8_t> Storage; 394 for (const BulkPublic &Pub : Publics) { 395 Storage.resize(sizeOfPublic(Pub)); 396 serializePublic(Storage.data(), Pub); 397 if (Error E = Writer.writeBytes(Storage)) 398 return E; 399 } 400 return Error::success(); 401 } 402 403 static Error writeRecords(BinaryStreamWriter &Writer, 404 ArrayRef<CVSymbol> Records) { 405 BinaryItemStream<CVSymbol> ItemStream(llvm::endianness::little); 406 ItemStream.setItems(Records); 407 BinaryStreamRef RecordsRef(ItemStream); 408 return Writer.writeStreamRef(RecordsRef); 409 } 410 411 Error GSIStreamBuilder::commitSymbolRecordStream( 412 WritableBinaryStreamRef Stream) { 413 BinaryStreamWriter Writer(Stream); 414 415 // Write public symbol records first, followed by global symbol records. This 416 // must match the order that we assume in finalizeMsfLayout when computing 417 // PSHZero and GSHZero. 418 if (auto EC = writePublics(Writer, Publics)) 419 return EC; 420 if (auto EC = writeRecords(Writer, Globals)) 421 return EC; 422 423 return Error::success(); 424 } 425 426 static std::vector<support::ulittle32_t> 427 computeAddrMap(ArrayRef<BulkPublic> Publics) { 428 // Build a parallel vector of indices into the Publics vector, and sort it by 429 // address. 430 std::vector<ulittle32_t> PubAddrMap; 431 PubAddrMap.reserve(Publics.size()); 432 for (int I = 0, E = Publics.size(); I < E; ++I) 433 PubAddrMap.push_back(ulittle32_t(I)); 434 435 auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) { 436 const BulkPublic &L = Publics[LIdx]; 437 const BulkPublic &R = Publics[RIdx]; 438 if (L.Segment != R.Segment) 439 return L.Segment < R.Segment; 440 if (L.Offset != R.Offset) 441 return L.Offset < R.Offset; 442 // parallelSort is unstable, so we have to do name comparison to ensure 443 // that two names for the same location come out in a deterministic order. 444 return L.getName() < R.getName(); 445 }; 446 parallelSort(PubAddrMap, AddrCmp); 447 448 // Rewrite the public symbol indices into symbol offsets. 449 for (ulittle32_t &Entry : PubAddrMap) 450 Entry = Publics[Entry].SymOffset; 451 return PubAddrMap; 452 } 453 454 Error GSIStreamBuilder::commitPublicsHashStream( 455 WritableBinaryStreamRef Stream) { 456 BinaryStreamWriter Writer(Stream); 457 PublicsStreamHeader Header; 458 459 // FIXME: Fill these in. They are for incremental linking. 460 Header.SymHash = PSH->calculateSerializedLength(); 461 Header.AddrMap = Publics.size() * 4; 462 Header.NumThunks = 0; 463 Header.SizeOfThunk = 0; 464 Header.ISectThunkTable = 0; 465 memset(Header.Padding, 0, sizeof(Header.Padding)); 466 Header.OffThunkTable = 0; 467 Header.NumSections = 0; 468 if (auto EC = Writer.writeObject(Header)) 469 return EC; 470 471 if (auto EC = PSH->commit(Writer)) 472 return EC; 473 474 std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics); 475 assert(PubAddrMap.size() == Publics.size()); 476 if (auto EC = Writer.writeArray(ArrayRef(PubAddrMap))) 477 return EC; 478 479 return Error::success(); 480 } 481 482 Error GSIStreamBuilder::commitGlobalsHashStream( 483 WritableBinaryStreamRef Stream) { 484 BinaryStreamWriter Writer(Stream); 485 return GSH->commit(Writer); 486 } 487 488 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout, 489 WritableBinaryStreamRef Buffer) { 490 llvm::TimeTraceScope timeScope("Commit GSI stream"); 491 auto GS = WritableMappedBlockStream::createIndexedStream( 492 Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator()); 493 auto PS = WritableMappedBlockStream::createIndexedStream( 494 Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator()); 495 auto PRS = WritableMappedBlockStream::createIndexedStream( 496 Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator()); 497 498 if (auto EC = commitSymbolRecordStream(*PRS)) 499 return EC; 500 if (auto EC = commitGlobalsHashStream(*GS)) 501 return EC; 502 if (auto EC = commitPublicsHashStream(*PS)) 503 return EC; 504 return Error::success(); 505 } 506