1 //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // The data structures defined in this file are based on the reference 10 // implementation which is available at 11 // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h" 16 #include "llvm/DebugInfo/CodeView/RecordName.h" 17 #include "llvm/DebugInfo/CodeView/RecordSerialization.h" 18 #include "llvm/DebugInfo/CodeView/SymbolRecord.h" 19 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h" 20 #include "llvm/DebugInfo/MSF/MSFBuilder.h" 21 #include "llvm/DebugInfo/MSF/MSFCommon.h" 22 #include "llvm/DebugInfo/MSF/MappedBlockStream.h" 23 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h" 24 #include "llvm/DebugInfo/PDB/Native/Hash.h" 25 #include "llvm/DebugInfo/PDB/Native/RawTypes.h" 26 #include "llvm/Support/BinaryItemStream.h" 27 #include "llvm/Support/BinaryStreamWriter.h" 28 #include "llvm/Support/Parallel.h" 29 #include "llvm/Support/xxhash.h" 30 #include <algorithm> 31 #include <vector> 32 33 using namespace llvm; 34 using namespace llvm::msf; 35 using namespace llvm::pdb; 36 using namespace llvm::codeview; 37 38 // Helper class for building the public and global PDB hash table buckets. 39 struct llvm::pdb::GSIHashStreamBuilder { 40 // Sum of the size of all public or global records. 41 uint32_t RecordByteSize = 0; 42 43 std::vector<PSHashRecord> HashRecords; 44 45 // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The 46 // reference implementation builds a hash table with IPHR_HASH buckets in it. 47 // The last bucket is used to link together free hash table cells in a linked 48 // list, but it is always empty in the compressed, on-disk format. However, 49 // the bitmap must have a bit for it. 50 std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap; 51 52 std::vector<support::ulittle32_t> HashBuckets; 53 54 uint32_t calculateSerializedLength() const; 55 Error commit(BinaryStreamWriter &Writer); 56 57 void finalizePublicBuckets(); 58 void finalizeGlobalBuckets(uint32_t RecordZeroOffset); 59 60 // Assign public and global symbol records into hash table buckets. 61 // Modifies the list of records to store the bucket index, but does not 62 // change the order. 63 void finalizeBuckets(uint32_t RecordZeroOffset, 64 MutableArrayRef<BulkPublic> Globals); 65 }; 66 67 // DenseMapInfo implementation for deduplicating symbol records. 68 struct llvm::pdb::SymbolDenseMapInfo { 69 static inline CVSymbol getEmptyKey() { 70 static CVSymbol Empty; 71 return Empty; 72 } 73 static inline CVSymbol getTombstoneKey() { 74 static CVSymbol Tombstone( 75 DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey()); 76 return Tombstone; 77 } 78 static unsigned getHashValue(const CVSymbol &Val) { 79 return xxh3_64bits(Val.RecordData); 80 } 81 static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) { 82 return LHS.RecordData == RHS.RecordData; 83 } 84 }; 85 86 namespace { 87 LLVM_PACKED_START 88 struct PublicSym32Layout { 89 RecordPrefix Prefix; 90 PublicSym32Header Pub; 91 // char Name[]; 92 }; 93 LLVM_PACKED_END 94 } // namespace 95 96 // Calculate how much memory this public needs when serialized. 97 static uint32_t sizeOfPublic(const BulkPublic &Pub) { 98 uint32_t NameLen = Pub.NameLen; 99 NameLen = std::min(NameLen, 100 uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1)); 101 return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4); 102 } 103 104 static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) { 105 // Assume the caller has allocated sizeOfPublic bytes. 106 uint32_t NameLen = std::min( 107 Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1)); 108 size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4); 109 assert(Size == sizeOfPublic(Pub)); 110 auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem); 111 FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32); 112 FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2); 113 FixedMem->Pub.Flags = Pub.Flags; 114 FixedMem->Pub.Offset = Pub.Offset; 115 FixedMem->Pub.Segment = Pub.Segment; 116 char *NameMem = reinterpret_cast<char *>(FixedMem + 1); 117 memcpy(NameMem, Pub.Name, NameLen); 118 // Zero the null terminator and remaining bytes. 119 memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen); 120 return CVSymbol(ArrayRef(reinterpret_cast<uint8_t *>(Mem), Size)); 121 } 122 123 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const { 124 uint32_t Size = sizeof(GSIHashHeader); 125 Size += HashRecords.size() * sizeof(PSHashRecord); 126 Size += HashBitmap.size() * sizeof(uint32_t); 127 Size += HashBuckets.size() * sizeof(uint32_t); 128 return Size; 129 } 130 131 Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) { 132 GSIHashHeader Header; 133 Header.VerSignature = GSIHashHeader::HdrSignature; 134 Header.VerHdr = GSIHashHeader::HdrVersion; 135 Header.HrSize = HashRecords.size() * sizeof(PSHashRecord); 136 Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4; 137 138 if (auto EC = Writer.writeObject(Header)) 139 return EC; 140 141 if (auto EC = Writer.writeArray(ArrayRef(HashRecords))) 142 return EC; 143 if (auto EC = Writer.writeArray(ArrayRef(HashBitmap))) 144 return EC; 145 if (auto EC = Writer.writeArray(ArrayRef(HashBuckets))) 146 return EC; 147 return Error::success(); 148 } 149 150 static bool isAsciiString(StringRef S) { 151 return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; }); 152 } 153 154 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp 155 static int gsiRecordCmp(StringRef S1, StringRef S2) { 156 size_t LS = S1.size(); 157 size_t RS = S2.size(); 158 // Shorter strings always compare less than longer strings. 159 if (LS != RS) 160 return (LS > RS) - (LS < RS); 161 162 // If either string contains non ascii characters, memcmp them. 163 if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2))) 164 return memcmp(S1.data(), S2.data(), LS); 165 166 // Both strings are ascii, perform a case-insensitive comparison. 167 return S1.compare_insensitive(S2.data()); 168 } 169 170 void GSIStreamBuilder::finalizePublicBuckets() { 171 PSH->finalizeBuckets(0, Publics); 172 } 173 174 void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) { 175 // Build up a list of globals to be bucketed. Use the BulkPublic data 176 // structure for this purpose, even though these are global records, not 177 // public records. Most of the same fields are required: 178 // - Name 179 // - NameLen 180 // - SymOffset 181 // - BucketIdx 182 // The dead fields are Offset, Segment, and Flags. 183 std::vector<BulkPublic> Records; 184 Records.resize(Globals.size()); 185 uint32_t SymOffset = RecordZeroOffset; 186 for (size_t I = 0, E = Globals.size(); I < E; ++I) { 187 StringRef Name = getSymbolName(Globals[I]); 188 Records[I].Name = Name.data(); 189 Records[I].NameLen = Name.size(); 190 Records[I].SymOffset = SymOffset; 191 SymOffset += Globals[I].length(); 192 } 193 194 GSH->finalizeBuckets(RecordZeroOffset, Records); 195 } 196 197 void GSIHashStreamBuilder::finalizeBuckets( 198 uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) { 199 // Hash every name in parallel. 200 parallelFor(0, Records.size(), [&](size_t I) { 201 Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH); 202 }); 203 204 // Count up the size of each bucket. Then, use an exclusive prefix sum to 205 // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but 206 // we can't use it yet. 207 uint32_t BucketStarts[IPHR_HASH] = {0}; 208 for (const BulkPublic &P : Records) 209 ++BucketStarts[P.BucketIdx]; 210 uint32_t Sum = 0; 211 for (uint32_t &B : BucketStarts) { 212 uint32_t Size = B; 213 B = Sum; 214 Sum += Size; 215 } 216 217 // Place globals into the hash table in bucket order. When placing a global, 218 // update the bucket start. Every hash table slot should be filled. Always use 219 // a refcount of one for now. 220 HashRecords.resize(Records.size()); 221 uint32_t BucketCursors[IPHR_HASH]; 222 memcpy(BucketCursors, BucketStarts, sizeof(BucketCursors)); 223 for (int I = 0, E = Records.size(); I < E; ++I) { 224 uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++; 225 HashRecords[HashIdx].Off = I; 226 HashRecords[HashIdx].CRef = 1; 227 } 228 229 // Within the buckets, sort each bucket by memcmp of the symbol's name. It's 230 // important that we use the same sorting algorithm as is used by the 231 // reference implementation to ensure that the search for a record within a 232 // bucket can properly early-out when it detects the record won't be found. 233 // The algorithm used here corresponds to the function 234 // caseInsensitiveComparePchPchCchCch in the reference implementation. 235 parallelFor(0, IPHR_HASH, [&](size_t I) { 236 auto B = HashRecords.begin() + BucketStarts[I]; 237 auto E = HashRecords.begin() + BucketCursors[I]; 238 if (B == E) 239 return; 240 auto BucketCmp = [Records](const PSHashRecord &LHash, 241 const PSHashRecord &RHash) { 242 const BulkPublic &L = Records[uint32_t(LHash.Off)]; 243 const BulkPublic &R = Records[uint32_t(RHash.Off)]; 244 assert(L.BucketIdx == R.BucketIdx); 245 int Cmp = gsiRecordCmp(L.getName(), R.getName()); 246 if (Cmp != 0) 247 return Cmp < 0; 248 // This comparison is necessary to make the sorting stable in the presence 249 // of two static globals with the same name. The easiest way to observe 250 // this is with S_LDATA32 records. 251 return L.SymOffset < R.SymOffset; 252 }; 253 llvm::sort(B, E, BucketCmp); 254 255 // After we are done sorting, replace the global indices with the stream 256 // offsets of each global. Add one when writing symbol offsets to disk. 257 // See GSI1::fixSymRecs. 258 for (PSHashRecord &HRec : make_range(B, E)) 259 HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1; 260 }); 261 262 // For each non-empty bucket, push the bucket start offset into HashBuckets 263 // and set a bit in the hash bitmap. 264 for (uint32_t I = 0; I < HashBitmap.size(); ++I) { 265 uint32_t Word = 0; 266 for (uint32_t J = 0; J < 32; ++J) { 267 // Skip empty buckets. 268 uint32_t BucketIdx = I * 32 + J; 269 if (BucketIdx >= IPHR_HASH || 270 BucketStarts[BucketIdx] == BucketCursors[BucketIdx]) 271 continue; 272 Word |= (1U << J); 273 274 // Calculate what the offset of the first hash record in the chain would 275 // be if it were inflated to contain 32-bit pointers. On a 32-bit system, 276 // each record would be 12 bytes. See HROffsetCalc in gsi.h. 277 const int SizeOfHROffsetCalc = 12; 278 ulittle32_t ChainStartOff = 279 ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc); 280 HashBuckets.push_back(ChainStartOff); 281 } 282 HashBitmap[I] = Word; 283 } 284 } 285 286 GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf) 287 : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()), 288 GSH(std::make_unique<GSIHashStreamBuilder>()) {} 289 290 GSIStreamBuilder::~GSIStreamBuilder() = default; 291 292 uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const { 293 uint32_t Size = 0; 294 Size += sizeof(PublicsStreamHeader); 295 Size += PSH->calculateSerializedLength(); 296 Size += Publics.size() * sizeof(uint32_t); // AddrMap 297 // FIXME: Add thunk map and section offsets for incremental linking. 298 299 return Size; 300 } 301 302 uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const { 303 return GSH->calculateSerializedLength(); 304 } 305 306 Error GSIStreamBuilder::finalizeMsfLayout() { 307 // First we write public symbol records, then we write global symbol records. 308 finalizePublicBuckets(); 309 finalizeGlobalBuckets(PSH->RecordByteSize); 310 311 Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize()); 312 if (!Idx) 313 return Idx.takeError(); 314 GlobalsStreamIndex = *Idx; 315 316 Idx = Msf.addStream(calculatePublicsHashStreamSize()); 317 if (!Idx) 318 return Idx.takeError(); 319 PublicsStreamIndex = *Idx; 320 321 uint32_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize; 322 323 Idx = Msf.addStream(RecordBytes); 324 if (!Idx) 325 return Idx.takeError(); 326 RecordStreamIndex = *Idx; 327 return Error::success(); 328 } 329 330 void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) { 331 assert(Publics.empty() && PSH->RecordByteSize == 0 && 332 "publics can only be added once"); 333 Publics = std::move(PublicsIn); 334 335 // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism. 336 parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) { 337 return L.getName() < R.getName(); 338 }); 339 340 // Assign offsets and calculate the length of the public symbol records. 341 uint32_t SymOffset = 0; 342 for (BulkPublic &Pub : Publics) { 343 Pub.SymOffset = SymOffset; 344 SymOffset += sizeOfPublic(Pub); 345 } 346 347 // Remember the length of the public stream records. 348 PSH->RecordByteSize = SymOffset; 349 } 350 351 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) { 352 serializeAndAddGlobal(Sym); 353 } 354 355 void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) { 356 serializeAndAddGlobal(Sym); 357 } 358 359 void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) { 360 serializeAndAddGlobal(Sym); 361 } 362 363 template <typename T> 364 void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) { 365 T Copy(Symbol); 366 addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(), 367 CodeViewContainer::Pdb)); 368 } 369 370 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) { 371 // Ignore duplicate typedefs and constants. 372 if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) { 373 auto Iter = GlobalsSeen.insert(Symbol); 374 if (!Iter.second) 375 return; 376 } 377 GSH->RecordByteSize += Symbol.length(); 378 Globals.push_back(Symbol); 379 } 380 381 // Serialize each public and write it. 382 static Error writePublics(BinaryStreamWriter &Writer, 383 ArrayRef<BulkPublic> Publics) { 384 std::vector<uint8_t> Storage; 385 for (const BulkPublic &Pub : Publics) { 386 Storage.resize(sizeOfPublic(Pub)); 387 serializePublic(Storage.data(), Pub); 388 if (Error E = Writer.writeBytes(Storage)) 389 return E; 390 } 391 return Error::success(); 392 } 393 394 static Error writeRecords(BinaryStreamWriter &Writer, 395 ArrayRef<CVSymbol> Records) { 396 BinaryItemStream<CVSymbol> ItemStream(support::endianness::little); 397 ItemStream.setItems(Records); 398 BinaryStreamRef RecordsRef(ItemStream); 399 return Writer.writeStreamRef(RecordsRef); 400 } 401 402 Error GSIStreamBuilder::commitSymbolRecordStream( 403 WritableBinaryStreamRef Stream) { 404 BinaryStreamWriter Writer(Stream); 405 406 // Write public symbol records first, followed by global symbol records. This 407 // must match the order that we assume in finalizeMsfLayout when computing 408 // PSHZero and GSHZero. 409 if (auto EC = writePublics(Writer, Publics)) 410 return EC; 411 if (auto EC = writeRecords(Writer, Globals)) 412 return EC; 413 414 return Error::success(); 415 } 416 417 static std::vector<support::ulittle32_t> 418 computeAddrMap(ArrayRef<BulkPublic> Publics) { 419 // Build a parallel vector of indices into the Publics vector, and sort it by 420 // address. 421 std::vector<ulittle32_t> PubAddrMap; 422 PubAddrMap.reserve(Publics.size()); 423 for (int I = 0, E = Publics.size(); I < E; ++I) 424 PubAddrMap.push_back(ulittle32_t(I)); 425 426 auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) { 427 const BulkPublic &L = Publics[LIdx]; 428 const BulkPublic &R = Publics[RIdx]; 429 if (L.Segment != R.Segment) 430 return L.Segment < R.Segment; 431 if (L.Offset != R.Offset) 432 return L.Offset < R.Offset; 433 // parallelSort is unstable, so we have to do name comparison to ensure 434 // that two names for the same location come out in a deterministic order. 435 return L.getName() < R.getName(); 436 }; 437 parallelSort(PubAddrMap, AddrCmp); 438 439 // Rewrite the public symbol indices into symbol offsets. 440 for (ulittle32_t &Entry : PubAddrMap) 441 Entry = Publics[Entry].SymOffset; 442 return PubAddrMap; 443 } 444 445 Error GSIStreamBuilder::commitPublicsHashStream( 446 WritableBinaryStreamRef Stream) { 447 BinaryStreamWriter Writer(Stream); 448 PublicsStreamHeader Header; 449 450 // FIXME: Fill these in. They are for incremental linking. 451 Header.SymHash = PSH->calculateSerializedLength(); 452 Header.AddrMap = Publics.size() * 4; 453 Header.NumThunks = 0; 454 Header.SizeOfThunk = 0; 455 Header.ISectThunkTable = 0; 456 memset(Header.Padding, 0, sizeof(Header.Padding)); 457 Header.OffThunkTable = 0; 458 Header.NumSections = 0; 459 if (auto EC = Writer.writeObject(Header)) 460 return EC; 461 462 if (auto EC = PSH->commit(Writer)) 463 return EC; 464 465 std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics); 466 assert(PubAddrMap.size() == Publics.size()); 467 if (auto EC = Writer.writeArray(ArrayRef(PubAddrMap))) 468 return EC; 469 470 return Error::success(); 471 } 472 473 Error GSIStreamBuilder::commitGlobalsHashStream( 474 WritableBinaryStreamRef Stream) { 475 BinaryStreamWriter Writer(Stream); 476 return GSH->commit(Writer); 477 } 478 479 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout, 480 WritableBinaryStreamRef Buffer) { 481 auto GS = WritableMappedBlockStream::createIndexedStream( 482 Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator()); 483 auto PS = WritableMappedBlockStream::createIndexedStream( 484 Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator()); 485 auto PRS = WritableMappedBlockStream::createIndexedStream( 486 Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator()); 487 488 if (auto EC = commitSymbolRecordStream(*PRS)) 489 return EC; 490 if (auto EC = commitGlobalsHashStream(*GS)) 491 return EC; 492 if (auto EC = commitPublicsHashStream(*PS)) 493 return EC; 494 return Error::success(); 495 } 496