1 //===- DbiStreamBuilder.cpp - PDB Dbi Stream Creation -----------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // The data structures defined in this file are based on the reference 10 // implementation which is available at 11 // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h" 16 #include "llvm/DebugInfo/CodeView/RecordName.h" 17 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h" 18 #include "llvm/DebugInfo/CodeView/SymbolRecord.h" 19 #include "llvm/DebugInfo/CodeView/SymbolSerializer.h" 20 #include "llvm/DebugInfo/MSF/MSFBuilder.h" 21 #include "llvm/DebugInfo/MSF/MSFCommon.h" 22 #include "llvm/DebugInfo/MSF/MappedBlockStream.h" 23 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h" 24 #include "llvm/DebugInfo/PDB/Native/Hash.h" 25 #include "llvm/Support/BinaryItemStream.h" 26 #include "llvm/Support/BinaryStreamWriter.h" 27 #include "llvm/Support/Parallel.h" 28 #include "llvm/Support/xxhash.h" 29 #include <algorithm> 30 #include <vector> 31 32 using namespace llvm; 33 using namespace llvm::msf; 34 using namespace llvm::pdb; 35 using namespace llvm::codeview; 36 37 // Helper class for building the public and global PDB hash table buckets. 38 struct llvm::pdb::GSIHashStreamBuilder { 39 // Sum of the size of all public or global records. 40 uint32_t RecordByteSize = 0; 41 42 std::vector<PSHashRecord> HashRecords; 43 44 // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The 45 // reference implementation builds a hash table with IPHR_HASH buckets in it. 46 // The last bucket is used to link together free hash table cells in a linked 47 // list, but it is always empty in the compressed, on-disk format. However, 48 // the bitmap must have a bit for it. 49 std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap; 50 51 std::vector<support::ulittle32_t> HashBuckets; 52 53 uint32_t calculateSerializedLength() const; 54 Error commit(BinaryStreamWriter &Writer); 55 56 void finalizePublicBuckets(); 57 void finalizeGlobalBuckets(uint32_t RecordZeroOffset); 58 59 // Assign public and global symbol records into hash table buckets. 60 // Modifies the list of records to store the bucket index, but does not 61 // change the order. 62 void finalizeBuckets(uint32_t RecordZeroOffset, 63 MutableArrayRef<BulkPublic> Globals); 64 }; 65 66 // DenseMapInfo implementation for deduplicating symbol records. 67 struct llvm::pdb::SymbolDenseMapInfo { 68 static inline CVSymbol getEmptyKey() { 69 static CVSymbol Empty; 70 return Empty; 71 } 72 static inline CVSymbol getTombstoneKey() { 73 static CVSymbol Tombstone( 74 DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey()); 75 return Tombstone; 76 } 77 static unsigned getHashValue(const CVSymbol &Val) { 78 return xxHash64(Val.RecordData); 79 } 80 static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) { 81 return LHS.RecordData == RHS.RecordData; 82 } 83 }; 84 85 namespace { 86 LLVM_PACKED_START 87 struct PublicSym32Layout { 88 RecordPrefix Prefix; 89 PublicSym32Header Pub; 90 // char Name[]; 91 }; 92 LLVM_PACKED_END 93 } // namespace 94 95 // Calculate how much memory this public needs when serialized. 96 static uint32_t sizeOfPublic(const BulkPublic &Pub) { 97 uint32_t NameLen = Pub.NameLen; 98 NameLen = std::min(NameLen, 99 uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1)); 100 return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4); 101 } 102 103 static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) { 104 // Assume the caller has allocated sizeOfPublic bytes. 105 uint32_t NameLen = std::min( 106 Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1)); 107 size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4); 108 assert(Size == sizeOfPublic(Pub)); 109 auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem); 110 FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32); 111 FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2); 112 FixedMem->Pub.Flags = Pub.Flags; 113 FixedMem->Pub.Offset = Pub.Offset; 114 FixedMem->Pub.Segment = Pub.Segment; 115 char *NameMem = reinterpret_cast<char *>(FixedMem + 1); 116 memcpy(NameMem, Pub.Name, NameLen); 117 // Zero the null terminator and remaining bytes. 118 memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen); 119 return CVSymbol(makeArrayRef(reinterpret_cast<uint8_t *>(Mem), Size)); 120 } 121 122 uint32_t GSIHashStreamBuilder::calculateSerializedLength() const { 123 uint32_t Size = sizeof(GSIHashHeader); 124 Size += HashRecords.size() * sizeof(PSHashRecord); 125 Size += HashBitmap.size() * sizeof(uint32_t); 126 Size += HashBuckets.size() * sizeof(uint32_t); 127 return Size; 128 } 129 130 Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) { 131 GSIHashHeader Header; 132 Header.VerSignature = GSIHashHeader::HdrSignature; 133 Header.VerHdr = GSIHashHeader::HdrVersion; 134 Header.HrSize = HashRecords.size() * sizeof(PSHashRecord); 135 Header.NumBuckets = HashBitmap.size() * 4 + HashBuckets.size() * 4; 136 137 if (auto EC = Writer.writeObject(Header)) 138 return EC; 139 140 if (auto EC = Writer.writeArray(makeArrayRef(HashRecords))) 141 return EC; 142 if (auto EC = Writer.writeArray(makeArrayRef(HashBitmap))) 143 return EC; 144 if (auto EC = Writer.writeArray(makeArrayRef(HashBuckets))) 145 return EC; 146 return Error::success(); 147 } 148 149 static bool isAsciiString(StringRef S) { 150 return llvm::all_of(S, [](char C) { return unsigned(C) < 0x80; }); 151 } 152 153 // See `caseInsensitiveComparePchPchCchCch` in gsi.cpp 154 static int gsiRecordCmp(StringRef S1, StringRef S2) { 155 size_t LS = S1.size(); 156 size_t RS = S2.size(); 157 // Shorter strings always compare less than longer strings. 158 if (LS != RS) 159 return (LS > RS) - (LS < RS); 160 161 // If either string contains non ascii characters, memcmp them. 162 if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2))) 163 return memcmp(S1.data(), S2.data(), LS); 164 165 // Both strings are ascii, perform a case-insensitive comparison. 166 return S1.compare_insensitive(S2.data()); 167 } 168 169 void GSIStreamBuilder::finalizePublicBuckets() { 170 PSH->finalizeBuckets(0, Publics); 171 } 172 173 void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) { 174 // Build up a list of globals to be bucketed. Use the BulkPublic data 175 // structure for this purpose, even though these are global records, not 176 // public records. Most of the same fields are required: 177 // - Name 178 // - NameLen 179 // - SymOffset 180 // - BucketIdx 181 // The dead fields are Offset, Segment, and Flags. 182 std::vector<BulkPublic> Records; 183 Records.resize(Globals.size()); 184 uint32_t SymOffset = RecordZeroOffset; 185 for (size_t I = 0, E = Globals.size(); I < E; ++I) { 186 StringRef Name = getSymbolName(Globals[I]); 187 Records[I].Name = Name.data(); 188 Records[I].NameLen = Name.size(); 189 Records[I].SymOffset = SymOffset; 190 SymOffset += Globals[I].length(); 191 } 192 193 GSH->finalizeBuckets(RecordZeroOffset, Records); 194 } 195 196 void GSIHashStreamBuilder::finalizeBuckets( 197 uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) { 198 // Hash every name in parallel. 199 parallelForEachN(0, Records.size(), [&](size_t I) { 200 Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH); 201 }); 202 203 // Count up the size of each bucket. Then, use an exclusive prefix sum to 204 // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but 205 // we can't use it yet. 206 uint32_t BucketStarts[IPHR_HASH] = {0}; 207 for (const BulkPublic &P : Records) 208 ++BucketStarts[P.BucketIdx]; 209 uint32_t Sum = 0; 210 for (uint32_t &B : BucketStarts) { 211 uint32_t Size = B; 212 B = Sum; 213 Sum += Size; 214 } 215 216 // Place globals into the hash table in bucket order. When placing a global, 217 // update the bucket start. Every hash table slot should be filled. Always use 218 // a refcount of one for now. 219 HashRecords.resize(Records.size()); 220 uint32_t BucketCursors[IPHR_HASH]; 221 memcpy(BucketCursors, BucketStarts, sizeof(BucketCursors)); 222 for (int I = 0, E = Records.size(); I < E; ++I) { 223 uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++; 224 HashRecords[HashIdx].Off = I; 225 HashRecords[HashIdx].CRef = 1; 226 } 227 228 // Within the buckets, sort each bucket by memcmp of the symbol's name. It's 229 // important that we use the same sorting algorithm as is used by the 230 // reference implementation to ensure that the search for a record within a 231 // bucket can properly early-out when it detects the record won't be found. 232 // The algorithm used here corresponds to the function 233 // caseInsensitiveComparePchPchCchCch in the reference implementation. 234 parallelForEachN(0, IPHR_HASH, [&](size_t I) { 235 auto B = HashRecords.begin() + BucketStarts[I]; 236 auto E = HashRecords.begin() + BucketCursors[I]; 237 if (B == E) 238 return; 239 auto BucketCmp = [Records](const PSHashRecord &LHash, 240 const PSHashRecord &RHash) { 241 const BulkPublic &L = Records[uint32_t(LHash.Off)]; 242 const BulkPublic &R = Records[uint32_t(RHash.Off)]; 243 assert(L.BucketIdx == R.BucketIdx); 244 int Cmp = gsiRecordCmp(L.getName(), R.getName()); 245 if (Cmp != 0) 246 return Cmp < 0; 247 // This comparison is necessary to make the sorting stable in the presence 248 // of two static globals with the same name. The easiest way to observe 249 // this is with S_LDATA32 records. 250 return L.SymOffset < R.SymOffset; 251 }; 252 llvm::sort(B, E, BucketCmp); 253 254 // After we are done sorting, replace the global indices with the stream 255 // offsets of each global. Add one when writing symbol offsets to disk. 256 // See GSI1::fixSymRecs. 257 for (PSHashRecord &HRec : make_range(B, E)) 258 HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1; 259 }); 260 261 // For each non-empty bucket, push the bucket start offset into HashBuckets 262 // and set a bit in the hash bitmap. 263 for (uint32_t I = 0; I < HashBitmap.size(); ++I) { 264 uint32_t Word = 0; 265 for (uint32_t J = 0; J < 32; ++J) { 266 // Skip empty buckets. 267 uint32_t BucketIdx = I * 32 + J; 268 if (BucketIdx >= IPHR_HASH || 269 BucketStarts[BucketIdx] == BucketCursors[BucketIdx]) 270 continue; 271 Word |= (1U << J); 272 273 // Calculate what the offset of the first hash record in the chain would 274 // be if it were inflated to contain 32-bit pointers. On a 32-bit system, 275 // each record would be 12 bytes. See HROffsetCalc in gsi.h. 276 const int SizeOfHROffsetCalc = 12; 277 ulittle32_t ChainStartOff = 278 ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc); 279 HashBuckets.push_back(ChainStartOff); 280 } 281 HashBitmap[I] = Word; 282 } 283 } 284 285 GSIStreamBuilder::GSIStreamBuilder(msf::MSFBuilder &Msf) 286 : Msf(Msf), PSH(std::make_unique<GSIHashStreamBuilder>()), 287 GSH(std::make_unique<GSIHashStreamBuilder>()) {} 288 289 GSIStreamBuilder::~GSIStreamBuilder() {} 290 291 uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const { 292 uint32_t Size = 0; 293 Size += sizeof(PublicsStreamHeader); 294 Size += PSH->calculateSerializedLength(); 295 Size += Publics.size() * sizeof(uint32_t); // AddrMap 296 // FIXME: Add thunk map and section offsets for incremental linking. 297 298 return Size; 299 } 300 301 uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const { 302 return GSH->calculateSerializedLength(); 303 } 304 305 Error GSIStreamBuilder::finalizeMsfLayout() { 306 // First we write public symbol records, then we write global symbol records. 307 finalizePublicBuckets(); 308 finalizeGlobalBuckets(PSH->RecordByteSize); 309 310 Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize()); 311 if (!Idx) 312 return Idx.takeError(); 313 GlobalsStreamIndex = *Idx; 314 315 Idx = Msf.addStream(calculatePublicsHashStreamSize()); 316 if (!Idx) 317 return Idx.takeError(); 318 PublicsStreamIndex = *Idx; 319 320 uint32_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize; 321 322 Idx = Msf.addStream(RecordBytes); 323 if (!Idx) 324 return Idx.takeError(); 325 RecordStreamIndex = *Idx; 326 return Error::success(); 327 } 328 329 void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) { 330 assert(Publics.empty() && PSH->RecordByteSize == 0 && 331 "publics can only be added once"); 332 Publics = std::move(PublicsIn); 333 334 // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism. 335 parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) { 336 return L.getName() < R.getName(); 337 }); 338 339 // Assign offsets and calculate the length of the public symbol records. 340 uint32_t SymOffset = 0; 341 for (BulkPublic &Pub : Publics) { 342 Pub.SymOffset = SymOffset; 343 SymOffset += sizeOfPublic(Pub); 344 } 345 346 // Remember the length of the public stream records. 347 PSH->RecordByteSize = SymOffset; 348 } 349 350 void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) { 351 serializeAndAddGlobal(Sym); 352 } 353 354 void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) { 355 serializeAndAddGlobal(Sym); 356 } 357 358 void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) { 359 serializeAndAddGlobal(Sym); 360 } 361 362 template <typename T> 363 void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) { 364 T Copy(Symbol); 365 addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(), 366 CodeViewContainer::Pdb)); 367 } 368 369 void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) { 370 // Ignore duplicate typedefs and constants. 371 if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) { 372 auto Iter = GlobalsSeen.insert(Symbol); 373 if (!Iter.second) 374 return; 375 } 376 GSH->RecordByteSize += Symbol.length(); 377 Globals.push_back(Symbol); 378 } 379 380 // Serialize each public and write it. 381 static Error writePublics(BinaryStreamWriter &Writer, 382 ArrayRef<BulkPublic> Publics) { 383 std::vector<uint8_t> Storage; 384 for (const BulkPublic &Pub : Publics) { 385 Storage.resize(sizeOfPublic(Pub)); 386 serializePublic(Storage.data(), Pub); 387 if (Error E = Writer.writeBytes(Storage)) 388 return E; 389 } 390 return Error::success(); 391 } 392 393 static Error writeRecords(BinaryStreamWriter &Writer, 394 ArrayRef<CVSymbol> Records) { 395 BinaryItemStream<CVSymbol> ItemStream(support::endianness::little); 396 ItemStream.setItems(Records); 397 BinaryStreamRef RecordsRef(ItemStream); 398 return Writer.writeStreamRef(RecordsRef); 399 } 400 401 Error GSIStreamBuilder::commitSymbolRecordStream( 402 WritableBinaryStreamRef Stream) { 403 BinaryStreamWriter Writer(Stream); 404 405 // Write public symbol records first, followed by global symbol records. This 406 // must match the order that we assume in finalizeMsfLayout when computing 407 // PSHZero and GSHZero. 408 if (auto EC = writePublics(Writer, Publics)) 409 return EC; 410 if (auto EC = writeRecords(Writer, Globals)) 411 return EC; 412 413 return Error::success(); 414 } 415 416 static std::vector<support::ulittle32_t> 417 computeAddrMap(ArrayRef<BulkPublic> Publics) { 418 // Build a parallel vector of indices into the Publics vector, and sort it by 419 // address. 420 std::vector<ulittle32_t> PubAddrMap; 421 PubAddrMap.reserve(Publics.size()); 422 for (int I = 0, E = Publics.size(); I < E; ++I) 423 PubAddrMap.push_back(ulittle32_t(I)); 424 425 auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) { 426 const BulkPublic &L = Publics[LIdx]; 427 const BulkPublic &R = Publics[RIdx]; 428 if (L.Segment != R.Segment) 429 return L.Segment < R.Segment; 430 if (L.Offset != R.Offset) 431 return L.Offset < R.Offset; 432 // parallelSort is unstable, so we have to do name comparison to ensure 433 // that two names for the same location come out in a deterministic order. 434 return L.getName() < R.getName(); 435 }; 436 parallelSort(PubAddrMap, AddrCmp); 437 438 // Rewrite the public symbol indices into symbol offsets. 439 for (ulittle32_t &Entry : PubAddrMap) 440 Entry = Publics[Entry].SymOffset; 441 return PubAddrMap; 442 } 443 444 Error GSIStreamBuilder::commitPublicsHashStream( 445 WritableBinaryStreamRef Stream) { 446 BinaryStreamWriter Writer(Stream); 447 PublicsStreamHeader Header; 448 449 // FIXME: Fill these in. They are for incremental linking. 450 Header.SymHash = PSH->calculateSerializedLength(); 451 Header.AddrMap = Publics.size() * 4; 452 Header.NumThunks = 0; 453 Header.SizeOfThunk = 0; 454 Header.ISectThunkTable = 0; 455 memset(Header.Padding, 0, sizeof(Header.Padding)); 456 Header.OffThunkTable = 0; 457 Header.NumSections = 0; 458 if (auto EC = Writer.writeObject(Header)) 459 return EC; 460 461 if (auto EC = PSH->commit(Writer)) 462 return EC; 463 464 std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics); 465 assert(PubAddrMap.size() == Publics.size()); 466 if (auto EC = Writer.writeArray(makeArrayRef(PubAddrMap))) 467 return EC; 468 469 return Error::success(); 470 } 471 472 Error GSIStreamBuilder::commitGlobalsHashStream( 473 WritableBinaryStreamRef Stream) { 474 BinaryStreamWriter Writer(Stream); 475 return GSH->commit(Writer); 476 } 477 478 Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout, 479 WritableBinaryStreamRef Buffer) { 480 auto GS = WritableMappedBlockStream::createIndexedStream( 481 Layout, Buffer, getGlobalsStreamIndex(), Msf.getAllocator()); 482 auto PS = WritableMappedBlockStream::createIndexedStream( 483 Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator()); 484 auto PRS = WritableMappedBlockStream::createIndexedStream( 485 Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator()); 486 487 if (auto EC = commitSymbolRecordStream(*PRS)) 488 return EC; 489 if (auto EC = commitGlobalsHashStream(*GS)) 490 return EC; 491 if (auto EC = commitPublicsHashStream(*PS)) 492 return EC; 493 return Error::success(); 494 } 495