1 //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // The data structures defined in this file are based on the reference 10 // implementation which is available at 11 // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h" 16 #include "llvm/DebugInfo/CodeView/RecordName.h" 17 #include "llvm/DebugInfo/CodeView/RecordSerialization.h" 18 #include "llvm/DebugInfo/CodeView/SymbolRecord.h" 19 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h" 20 #include "llvm/DebugInfo/MSF/MSFBuilder.h" 21 #include "llvm/DebugInfo/MSF/MSFCommon.h" 22 #include "llvm/DebugInfo/MSF/MappedBlockStream.h" 23 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h" 24 #include "llvm/DebugInfo/PDB/Native/Hash.h" 25 #include "llvm/DebugInfo/PDB/Native/RawTypes.h" 26 #include "llvm/Support/BinaryItemStream.h" 27 #include "llvm/Support/BinaryStreamWriter.h" 28 #include "llvm/Support/Parallel.h" 29 #include "llvm/Support/TimeProfiler.h" 30 #include "llvm/Support/xxhash.h" 31 #include <algorithm> 32 #include <vector> 33 34 using namespace llvm; 35 using namespace llvm::msf; 36 using namespace llvm::pdb; 37 using namespace llvm::codeview; 38 39 // Helper class for building the public and global PDB hash table buckets. 40 struct llvm::pdb::GSIHashStreamBuilder { 41 // Sum of the size of all public or global records. 42 uint32_t RecordByteSize = 0; 43 44 std::vector<PSHashRecord> HashRecords; 45 46 // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The 47 // reference implementation builds a hash table with IPHR_HASH buckets in it. 48 // The last bucket is used to link together free hash table cells in a linked 49 // list, but it is always empty in the compressed, on-disk format. However, 50 // the bitmap must have a bit for it. 51 std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap; 52 53 std::vector<support::ulittle32_t> HashBuckets; 54 55 uint32_t calculateSerializedLength() const; 56 Error commit(BinaryStreamWriter &Writer); 57 58 void finalizePublicBuckets(); 59 void finalizeGlobalBuckets(uint32_t RecordZeroOffset); 60 61 // Assign public and global symbol records into hash table buckets. 62 // Modifies the list of records to store the bucket index, but does not 63 // change the order. 64 void finalizeBuckets(uint32_t RecordZeroOffset, 65 MutableArrayRef<BulkPublic> Globals); 66 }; 67 68 // DenseMapInfo implementation for deduplicating symbol records. 69 struct llvm::pdb::SymbolDenseMapInfo { 70 static inline CVSymbol getEmptyKey() { 71 static CVSymbol Empty; 72 return Empty; 73 } 74 static inline CVSymbol getTombstoneKey() { 75 static CVSymbol Tombstone( 76 DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey()); 77 return Tombstone; 78 } 79 static unsigned getHashValue(const CVSymbol &Val) { 80 return xxh3_64bits(Val.RecordData); 81 } 82 static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) { 83 return LHS.RecordData == RHS.RecordData; 84 } 85 }; 86 87 namespace { 88 LLVM_PACKED_START 89 struct PublicSym32Layout { 90 RecordPrefix Prefix; 91 PublicSym32Header Pub; 92 // char Name[]; 93 }; 94 LLVM_PACKED_END 95 } // namespace 96 97 // Calculate how much memory this public needs when serialized. 98 static uint32_t sizeOfPublic(const BulkPublic &Pub) { 99 uint32_t NameLen = Pub.NameLen; 100 NameLen = std::min(NameLen, 101 uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1)); 102 return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4); 103 } 104 105 static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) { 106 // Assume the caller has allocated sizeOfPublic bytes. 107 uint32_t NameLen = std::min( 108 Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1)); 109 size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4); 110 assert(Size == sizeOfPublic(Pub)); 111 auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem); 112 FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32); 113 FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2); 114 FixedMem->Pub.Flags = Pub.Flags; 115 FixedMem->Pub.Offset = Pub.Offset; 116 FixedMem->Pub.Segment = Pub.Segment; 117 char *NameMem = reinterpret_cast<char *>(FixedMem + 1); 118 memcpy(NameMem, Pub.Name, NameLen); 119 // Zero the null terminator and remaining bytes. 120 memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen); 121 return CVSymbol(ArrayRef(reinterpret_cast<uint8_t *>(Mem), Size)); 122 } 123 124 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const { 125 uint32_t Size = sizeof(GSIHashHeader); 126 Size += HashRecords.size() * sizeof(PSHashRecord); 127 Size += HashBitmap.size() * sizeof(uint32_t); 128 Size += HashBuckets.size() * sizeof(uint32_t); 129 return Size; 130 } 131 132 Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) { 133 GSIHashHeader Header; 134 Header.VerSignature = GSIHashHeader::HdrSignature; 135 Header.VerHdr = GSIHashHeader::HdrVersion; 136 Header.HrSize = HashRecords.size() * sizeof(PSHashRecord); 137 Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4; 138 139 if (auto EC = Writer.writeObject(Header)) 140 return EC; 141 142 if (auto EC = Writer.writeArray(ArrayRef(HashRecords))) 143 return EC; 144 if (auto EC = Writer.writeArray(ArrayRef(HashBitmap))) 145 return EC; 146 if (auto EC = Writer.writeArray(ArrayRef(HashBuckets))) 147 return EC; 148 return Error::success(); 149 } 150 151 static bool isAsciiString(StringRef S) { 152 return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; }); 153 } 154 155 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp 156 static int gsiRecordCmp(StringRef S1, StringRef S2) { 157 size_t LS = S1.size(); 158 size_t RS = S2.size(); 159 // Shorter strings always compare less than longer strings. 160 if (LS != RS) 161 return (LS > RS) - (LS < RS); 162 163 // If either string contains non ascii characters, memcmp them. 164 if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2))) 165 return memcmp(S1.data(), S2.data(), LS); 166 167 // Both strings are ascii, perform a case-insensitive comparison. 168 return S1.compare_insensitive(S2.data()); 169 } 170 171 void GSIStreamBuilder::finalizePublicBuckets() { 172 PSH->finalizeBuckets(0, Publics); 173 } 174 175 void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) { 176 // Build up a list of globals to be bucketed. Use the BulkPublic data 177 // structure for this purpose, even though these are global records, not 178 // public records. Most of the same fields are required: 179 // - Name 180 // - NameLen 181 // - SymOffset 182 // - BucketIdx 183 // The dead fields are Offset, Segment, and Flags. 184 std::vector<BulkPublic> Records; 185 Records.resize(Globals.size()); 186 uint32_t SymOffset = RecordZeroOffset; 187 for (size_t I = 0, E = Globals.size(); I < E; ++I) { 188 StringRef Name = getSymbolName(Globals[I]); 189 Records[I].Name = Name.data(); 190 Records[I].NameLen = Name.size(); 191 Records[I].SymOffset = SymOffset; 192 SymOffset += Globals[I].length(); 193 } 194 195 GSH->finalizeBuckets(RecordZeroOffset, Records); 196 } 197 198 void GSIHashStreamBuilder::finalizeBuckets( 199 uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) { 200 // Hash every name in parallel. 201 parallelFor(0, Records.size(), [&](size_t I) { 202 Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH); 203 }); 204 205 // Count up the size of each bucket. Then, use an exclusive prefix sum to 206 // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but 207 // we can't use it yet. 208 uint32_t BucketStarts[IPHR_HASH] = {0}; 209 for (const BulkPublic &P : Records) 210 ++BucketStarts[P.BucketIdx]; 211 uint32_t Sum = 0; 212 for (uint32_t &B : BucketStarts) { 213 uint32_t Size = B; 214 B = Sum; 215 Sum += Size; 216 } 217 218 // Place globals into the hash table in bucket order. When placing a global, 219 // update the bucket start. Every hash table slot should be filled. Always use 220 // a refcount of one for now. 221 HashRecords.resize(Records.size()); 222 uint32_t BucketCursors[IPHR_HASH]; 223 memcpy(BucketCursors, BucketStarts, sizeof(BucketCursors)); 224 for (int I = 0, E = Records.size(); I < E; ++I) { 225 uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++; 226 HashRecords[HashIdx].Off = I; 227 HashRecords[HashIdx].CRef = 1; 228 } 229 230 // Within the buckets, sort each bucket by memcmp of the symbol's name. It's 231 // important that we use the same sorting algorithm as is used by the 232 // reference implementation to ensure that the search for a record within a 233 // bucket can properly early-out when it detects the record won't be found. 234 // The algorithm used here corresponds to the function 235 // caseInsensitiveComparePchPchCchCch in the reference implementation. 236 parallelFor(0, IPHR_HASH, [&](size_t I) { 237 auto B = HashRecords.begin() + BucketStarts[I]; 238 auto E = HashRecords.begin() + BucketCursors[I]; 239 if (B == E) 240 return; 241 auto BucketCmp = [Records](const PSHashRecord &LHash, 242 const PSHashRecord &RHash) { 243 const BulkPublic &L = Records[uint32_t(LHash.Off)]; 244 const BulkPublic &R = Records[uint32_t(RHash.Off)]; 245 assert(L.BucketIdx == R.BucketIdx); 246 int Cmp = gsiRecordCmp(L.getName(), R.getName()); 247 if (Cmp != 0) 248 return Cmp < 0; 249 // This comparison is necessary to make the sorting stable in the presence 250 // of two static globals with the same name. The easiest way to observe 251 // this is with S_LDATA32 records. 252 return L.SymOffset < R.SymOffset; 253 }; 254 llvm::sort(B, E, BucketCmp); 255 256 // After we are done sorting, replace the global indices with the stream 257 // offsets of each global. Add one when writing symbol offsets to disk. 258 // See GSI1::fixSymRecs. 259 for (PSHashRecord &HRec : make_range(B, E)) 260 HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1; 261 }); 262 263 // For each non-empty bucket, push the bucket start offset into HashBuckets 264 // and set a bit in the hash bitmap. 265 for (uint32_t I = 0; I < HashBitmap.size(); ++I) { 266 uint32_t Word = 0; 267 for (uint32_t J = 0; J < 32; ++J) { 268 // Skip empty buckets. 269 uint32_t BucketIdx = I * 32 + J; 270 if (BucketIdx >= IPHR_HASH || 271 BucketStarts[BucketIdx] == BucketCursors[BucketIdx]) 272 continue; 273 Word |= (1U << J); 274 275 // Calculate what the offset of the first hash record in the chain would 276 // be if it were inflated to contain 32-bit pointers. On a 32-bit system, 277 // each record would be 12 bytes. See HROffsetCalc in gsi.h. 278 const int SizeOfHROffsetCalc = 12; 279 ulittle32_t ChainStartOff = 280 ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc); 281 HashBuckets.push_back(ChainStartOff); 282 } 283 HashBitmap[I] = Word; 284 } 285 } 286 287 GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf) 288 : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()), 289 GSH(std::make_unique<GSIHashStreamBuilder>()) {} 290 291 GSIStreamBuilder::~GSIStreamBuilder() = default; 292 293 uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const { 294 uint32_t Size = 0; 295 Size += sizeof(PublicsStreamHeader); 296 Size += PSH->calculateSerializedLength(); 297 Size += Publics.size() * sizeof(uint32_t); // AddrMap 298 // FIXME: Add thunk map and section offsets for incremental linking. 299 300 return Size; 301 } 302 303 uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const { 304 return GSH->calculateSerializedLength(); 305 } 306 307 Error GSIStreamBuilder::finalizeMsfLayout() { 308 // First we write public symbol records, then we write global symbol records. 309 finalizePublicBuckets(); 310 finalizeGlobalBuckets(PSH->RecordByteSize); 311 312 Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize()); 313 if (!Idx) 314 return Idx.takeError(); 315 GlobalsStreamIndex = *Idx; 316 317 Idx = Msf.addStream(calculatePublicsHashStreamSize()); 318 if (!Idx) 319 return Idx.takeError(); 320 PublicsStreamIndex = *Idx; 321 322 uint32_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize; 323 324 Idx = Msf.addStream(RecordBytes); 325 if (!Idx) 326 return Idx.takeError(); 327 RecordStreamIndex = *Idx; 328 return Error::success(); 329 } 330 331 void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) { 332 assert(Publics.empty() && PSH->RecordByteSize == 0 && 333 "publics can only be added once"); 334 Publics = std::move(PublicsIn); 335 336 // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism. 337 parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) { 338 return L.getName() < R.getName(); 339 }); 340 341 // Assign offsets and calculate the length of the public symbol records. 342 uint32_t SymOffset = 0; 343 for (BulkPublic &Pub : Publics) { 344 Pub.SymOffset = SymOffset; 345 SymOffset += sizeOfPublic(Pub); 346 } 347 348 // Remember the length of the public stream records. 349 PSH->RecordByteSize = SymOffset; 350 } 351 352 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) { 353 serializeAndAddGlobal(Sym); 354 } 355 356 void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) { 357 serializeAndAddGlobal(Sym); 358 } 359 360 void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) { 361 serializeAndAddGlobal(Sym); 362 } 363 364 template <typename T> 365 void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) { 366 T Copy(Symbol); 367 addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(), 368 CodeViewContainer::Pdb)); 369 } 370 371 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) { 372 // Ignore duplicate typedefs and constants. 373 if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) { 374 auto Iter = GlobalsSeen.insert(Symbol); 375 if (!Iter.second) 376 return; 377 } 378 GSH->RecordByteSize += Symbol.length(); 379 Globals.push_back(Symbol); 380 } 381 382 // Serialize each public and write it. 383 static Error writePublics(BinaryStreamWriter &Writer, 384 ArrayRef<BulkPublic> Publics) { 385 std::vector<uint8_t> Storage; 386 for (const BulkPublic &Pub : Publics) { 387 Storage.resize(sizeOfPublic(Pub)); 388 serializePublic(Storage.data(), Pub); 389 if (Error E = Writer.writeBytes(Storage)) 390 return E; 391 } 392 return Error::success(); 393 } 394 395 static Error writeRecords(BinaryStreamWriter &Writer, 396 ArrayRef<CVSymbol> Records) { 397 BinaryItemStream<CVSymbol> ItemStream(llvm::endianness::little); 398 ItemStream.setItems(Records); 399 BinaryStreamRef RecordsRef(ItemStream); 400 return Writer.writeStreamRef(RecordsRef); 401 } 402 403 Error GSIStreamBuilder::commitSymbolRecordStream( 404 WritableBinaryStreamRef Stream) { 405 BinaryStreamWriter Writer(Stream); 406 407 // Write public symbol records first, followed by global symbol records. This 408 // must match the order that we assume in finalizeMsfLayout when computing 409 // PSHZero and GSHZero. 410 if (auto EC = writePublics(Writer, Publics)) 411 return EC; 412 if (auto EC = writeRecords(Writer, Globals)) 413 return EC; 414 415 return Error::success(); 416 } 417 418 static std::vector<support::ulittle32_t> 419 computeAddrMap(ArrayRef<BulkPublic> Publics) { 420 // Build a parallel vector of indices into the Publics vector, and sort it by 421 // address. 422 std::vector<ulittle32_t> PubAddrMap; 423 PubAddrMap.reserve(Publics.size()); 424 for (int I = 0, E = Publics.size(); I < E; ++I) 425 PubAddrMap.push_back(ulittle32_t(I)); 426 427 auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) { 428 const BulkPublic &L = Publics[LIdx]; 429 const BulkPublic &R = Publics[RIdx]; 430 if (L.Segment != R.Segment) 431 return L.Segment < R.Segment; 432 if (L.Offset != R.Offset) 433 return L.Offset < R.Offset; 434 // parallelSort is unstable, so we have to do name comparison to ensure 435 // that two names for the same location come out in a deterministic order. 436 return L.getName() < R.getName(); 437 }; 438 parallelSort(PubAddrMap, AddrCmp); 439 440 // Rewrite the public symbol indices into symbol offsets. 441 for (ulittle32_t &Entry : PubAddrMap) 442 Entry = Publics[Entry].SymOffset; 443 return PubAddrMap; 444 } 445 446 Error GSIStreamBuilder::commitPublicsHashStream( 447 WritableBinaryStreamRef Stream) { 448 BinaryStreamWriter Writer(Stream); 449 PublicsStreamHeader Header; 450 451 // FIXME: Fill these in. They are for incremental linking. 452 Header.SymHash = PSH->calculateSerializedLength(); 453 Header.AddrMap = Publics.size() * 4; 454 Header.NumThunks = 0; 455 Header.SizeOfThunk = 0; 456 Header.ISectThunkTable = 0; 457 memset(Header.Padding, 0, sizeof(Header.Padding)); 458 Header.OffThunkTable = 0; 459 Header.NumSections = 0; 460 if (auto EC = Writer.writeObject(Header)) 461 return EC; 462 463 if (auto EC = PSH->commit(Writer)) 464 return EC; 465 466 std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics); 467 assert(PubAddrMap.size() == Publics.size()); 468 if (auto EC = Writer.writeArray(ArrayRef(PubAddrMap))) 469 return EC; 470 471 return Error::success(); 472 } 473 474 Error GSIStreamBuilder::commitGlobalsHashStream( 475 WritableBinaryStreamRef Stream) { 476 BinaryStreamWriter Writer(Stream); 477 return GSH->commit(Writer); 478 } 479 480 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout, 481 WritableBinaryStreamRef Buffer) { 482 llvm::TimeTraceScope timeScope("Commit GSI stream"); 483 auto GS = WritableMappedBlockStream::createIndexedStream( 484 Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator()); 485 auto PS = WritableMappedBlockStream::createIndexedStream( 486 Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator()); 487 auto PRS = WritableMappedBlockStream::createIndexedStream( 488 Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator()); 489 490 if (auto EC = commitSymbolRecordStream(*PRS)) 491 return EC; 492 if (auto EC = commitGlobalsHashStream(*GS)) 493 return EC; 494 if (auto EC = commitPublicsHashStream(*PS)) 495 return EC; 496 return Error::success(); 497 } 498