1 //===- BitstreamReader.h - Low-level bitstream reader interface -*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This header defines the BitstreamReader class. This class can be used to 10 // read an arbitrary bitstream, regardless of its contents. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_BITSTREAM_BITSTREAMREADER_H 15 #define LLVM_BITSTREAM_BITSTREAMREADER_H 16 17 #include "llvm/ADT/ArrayRef.h" 18 #include "llvm/ADT/SmallVector.h" 19 #include "llvm/Bitstream/BitCodes.h" 20 #include "llvm/Support/Endian.h" 21 #include "llvm/Support/Error.h" 22 #include "llvm/Support/MemoryBufferRef.h" 23 #include <algorithm> 24 #include <cassert> 25 #include <climits> 26 #include <cstddef> 27 #include <cstdint> 28 #include <memory> 29 #include <optional> 30 #include <string> 31 #include <utility> 32 #include <vector> 33 34 namespace llvm { 35 36 /// This class maintains the abbreviations read from a block info block. 37 class BitstreamBlockInfo { 38 public: 39 /// This contains information emitted to BLOCKINFO_BLOCK blocks. These 40 /// describe abbreviations that all blocks of the specified ID inherit. 41 struct BlockInfo { 42 unsigned BlockID = 0; 43 std::vector<std::shared_ptr<BitCodeAbbrev>> Abbrevs; 44 std::string Name; 45 std::vector<std::pair<unsigned, std::string>> RecordNames; 46 }; 47 48 private: 49 std::vector<BlockInfo> BlockInfoRecords; 50 51 public: 52 /// If there is block info for the specified ID, return it, otherwise return 53 /// null. 54 const BlockInfo *getBlockInfo(unsigned BlockID) const { 55 // Common case, the most recent entry matches BlockID. 56 if (!BlockInfoRecords.empty() && BlockInfoRecords.back().BlockID == BlockID) 57 return &BlockInfoRecords.back(); 58 59 for (const BlockInfo &BI : BlockInfoRecords) 60 if (BI.BlockID == BlockID) 61 return &BI; 62 return nullptr; 63 } 64 65 BlockInfo &getOrCreateBlockInfo(unsigned BlockID) { 66 if (const BlockInfo *BI = getBlockInfo(BlockID)) 67 return *const_cast<BlockInfo*>(BI); 68 69 // Otherwise, add a new record. 70 BlockInfoRecords.emplace_back(); 71 BlockInfoRecords.back().BlockID = BlockID; 72 return BlockInfoRecords.back(); 73 } 74 }; 75 76 /// This represents a position within a bitstream. There may be multiple 77 /// independent cursors reading within one bitstream, each maintaining their 78 /// own local state. 79 class SimpleBitstreamCursor { 80 ArrayRef<uint8_t> BitcodeBytes; 81 size_t NextChar = 0; 82 83 public: 84 /// This is the current data we have pulled from the stream but have not 85 /// returned to the client. This is specifically and intentionally defined to 86 /// follow the word size of the host machine for efficiency. We use word_t in 87 /// places that are aware of this to make it perfectly explicit what is going 88 /// on. 89 using word_t = size_t; 90 91 private: 92 word_t CurWord = 0; 93 94 /// This is the number of bits in CurWord that are valid. This is always from 95 /// [0...bits_of(size_t)-1] inclusive. 96 unsigned BitsInCurWord = 0; 97 98 public: 99 SimpleBitstreamCursor() = default; 100 explicit SimpleBitstreamCursor(ArrayRef<uint8_t> BitcodeBytes) 101 : BitcodeBytes(BitcodeBytes) {} 102 explicit SimpleBitstreamCursor(StringRef BitcodeBytes) 103 : BitcodeBytes(arrayRefFromStringRef(BitcodeBytes)) {} 104 explicit SimpleBitstreamCursor(MemoryBufferRef BitcodeBytes) 105 : SimpleBitstreamCursor(BitcodeBytes.getBuffer()) {} 106 107 bool canSkipToPos(size_t pos) const { 108 // pos can be skipped to if it is a valid address or one byte past the end. 109 return pos <= BitcodeBytes.size(); 110 } 111 112 bool AtEndOfStream() { 113 return BitsInCurWord == 0 && BitcodeBytes.size() <= NextChar; 114 } 115 116 /// Return the bit # of the bit we are reading. 117 uint64_t GetCurrentBitNo() const { 118 return uint64_t(NextChar)*CHAR_BIT - BitsInCurWord; 119 } 120 121 // Return the byte # of the current bit. 122 uint64_t getCurrentByteNo() const { return GetCurrentBitNo() / 8; } 123 124 ArrayRef<uint8_t> getBitcodeBytes() const { return BitcodeBytes; } 125 126 /// Reset the stream to the specified bit number. 127 Error JumpToBit(uint64_t BitNo) { 128 size_t ByteNo = size_t(BitNo/8) & ~(sizeof(word_t)-1); 129 unsigned WordBitNo = unsigned(BitNo & (sizeof(word_t)*8-1)); 130 assert(canSkipToPos(ByteNo) && "Invalid location"); 131 132 // Move the cursor to the right word. 133 NextChar = ByteNo; 134 BitsInCurWord = 0; 135 136 // Skip over any bits that are already consumed. 137 if (WordBitNo) { 138 if (Expected<word_t> Res = Read(WordBitNo)) 139 return Error::success(); 140 else 141 return Res.takeError(); 142 } 143 144 return Error::success(); 145 } 146 147 /// Get a pointer into the bitstream at the specified byte offset. 148 const uint8_t *getPointerToByte(uint64_t ByteNo, uint64_t NumBytes) { 149 return BitcodeBytes.data() + ByteNo; 150 } 151 152 /// Get a pointer into the bitstream at the specified bit offset. 153 /// 154 /// The bit offset must be on a byte boundary. 155 const uint8_t *getPointerToBit(uint64_t BitNo, uint64_t NumBytes) { 156 assert(!(BitNo % 8) && "Expected bit on byte boundary"); 157 return getPointerToByte(BitNo / 8, NumBytes); 158 } 159 160 Error fillCurWord() { 161 if (NextChar >= BitcodeBytes.size()) 162 return createStringError(std::errc::io_error, 163 "Unexpected end of file reading %u of %u bytes", 164 NextChar, BitcodeBytes.size()); 165 166 // Read the next word from the stream. 167 const uint8_t *NextCharPtr = BitcodeBytes.data() + NextChar; 168 unsigned BytesRead; 169 if (BitcodeBytes.size() >= NextChar + sizeof(word_t)) { 170 BytesRead = sizeof(word_t); 171 CurWord = 172 support::endian::read<word_t, llvm::endianness::little>(NextCharPtr); 173 } else { 174 // Short read. 175 BytesRead = BitcodeBytes.size() - NextChar; 176 CurWord = 0; 177 for (unsigned B = 0; B != BytesRead; ++B) 178 CurWord |= uint64_t(NextCharPtr[B]) << (B * 8); 179 } 180 NextChar += BytesRead; 181 BitsInCurWord = BytesRead * 8; 182 return Error::success(); 183 } 184 185 Expected<word_t> Read(unsigned NumBits) { 186 static const unsigned BitsInWord = sizeof(word_t) * 8; 187 188 assert(NumBits && NumBits <= BitsInWord && 189 "Cannot return zero or more than BitsInWord bits!"); 190 191 static const unsigned Mask = sizeof(word_t) > 4 ? 0x3f : 0x1f; 192 193 // If the field is fully contained by CurWord, return it quickly. 194 if (BitsInCurWord >= NumBits) { 195 word_t R = CurWord & (~word_t(0) >> (BitsInWord - NumBits)); 196 197 // Use a mask to avoid undefined behavior. 198 CurWord >>= (NumBits & Mask); 199 200 BitsInCurWord -= NumBits; 201 return R; 202 } 203 204 word_t R = BitsInCurWord ? CurWord : 0; 205 unsigned BitsLeft = NumBits - BitsInCurWord; 206 207 if (Error fillResult = fillCurWord()) 208 return std::move(fillResult); 209 210 // If we run out of data, abort. 211 if (BitsLeft > BitsInCurWord) 212 return createStringError(std::errc::io_error, 213 "Unexpected end of file reading %u of %u bits", 214 BitsInCurWord, BitsLeft); 215 216 word_t R2 = CurWord & (~word_t(0) >> (BitsInWord - BitsLeft)); 217 218 // Use a mask to avoid undefined behavior. 219 CurWord >>= (BitsLeft & Mask); 220 221 BitsInCurWord -= BitsLeft; 222 223 R |= R2 << (NumBits - BitsLeft); 224 225 return R; 226 } 227 228 Expected<uint32_t> ReadVBR(const unsigned NumBits) { 229 Expected<unsigned> MaybeRead = Read(NumBits); 230 if (!MaybeRead) 231 return MaybeRead; 232 uint32_t Piece = MaybeRead.get(); 233 234 assert(NumBits <= 32 && NumBits >= 1 && "Invalid NumBits value"); 235 const uint32_t MaskBitOrder = (NumBits - 1); 236 const uint32_t Mask = 1UL << MaskBitOrder; 237 238 if ((Piece & Mask) == 0) 239 return Piece; 240 241 uint32_t Result = 0; 242 unsigned NextBit = 0; 243 while (true) { 244 Result |= (Piece & (Mask - 1)) << NextBit; 245 246 if ((Piece & Mask) == 0) 247 return Result; 248 249 NextBit += NumBits-1; 250 if (NextBit >= 32) 251 return createStringError(std::errc::illegal_byte_sequence, 252 "Unterminated VBR"); 253 254 MaybeRead = Read(NumBits); 255 if (!MaybeRead) 256 return MaybeRead; 257 Piece = MaybeRead.get(); 258 } 259 } 260 261 // Read a VBR that may have a value up to 64-bits in size. The chunk size of 262 // the VBR must still be <= 32 bits though. 263 Expected<uint64_t> ReadVBR64(const unsigned NumBits) { 264 Expected<uint64_t> MaybeRead = Read(NumBits); 265 if (!MaybeRead) 266 return MaybeRead; 267 uint32_t Piece = MaybeRead.get(); 268 assert(NumBits <= 32 && NumBits >= 1 && "Invalid NumBits value"); 269 const uint32_t MaskBitOrder = (NumBits - 1); 270 const uint32_t Mask = 1UL << MaskBitOrder; 271 272 if ((Piece & Mask) == 0) 273 return uint64_t(Piece); 274 275 uint64_t Result = 0; 276 unsigned NextBit = 0; 277 while (true) { 278 Result |= uint64_t(Piece & (Mask - 1)) << NextBit; 279 280 if ((Piece & Mask) == 0) 281 return Result; 282 283 NextBit += NumBits-1; 284 if (NextBit >= 64) 285 return createStringError(std::errc::illegal_byte_sequence, 286 "Unterminated VBR"); 287 288 MaybeRead = Read(NumBits); 289 if (!MaybeRead) 290 return MaybeRead; 291 Piece = MaybeRead.get(); 292 } 293 } 294 295 void SkipToFourByteBoundary() { 296 // If word_t is 64-bits and if we've read less than 32 bits, just dump 297 // the bits we have up to the next 32-bit boundary. 298 if (sizeof(word_t) > 4 && 299 BitsInCurWord >= 32) { 300 CurWord >>= BitsInCurWord-32; 301 BitsInCurWord = 32; 302 return; 303 } 304 305 BitsInCurWord = 0; 306 } 307 308 /// Return the size of the stream in bytes. 309 size_t SizeInBytes() const { return BitcodeBytes.size(); } 310 311 /// Skip to the end of the file. 312 void skipToEnd() { NextChar = BitcodeBytes.size(); } 313 314 /// Check whether a reservation of Size elements is plausible. 315 bool isSizePlausible(size_t Size) const { 316 // Don't allow reserving more elements than the number of bits, assuming 317 // at least one bit is needed to encode an element. 318 return Size < BitcodeBytes.size() * 8; 319 } 320 }; 321 322 /// When advancing through a bitstream cursor, each advance can discover a few 323 /// different kinds of entries: 324 struct BitstreamEntry { 325 enum { 326 Error, // Malformed bitcode was found. 327 EndBlock, // We've reached the end of the current block, (or the end of the 328 // file, which is treated like a series of EndBlock records. 329 SubBlock, // This is the start of a new subblock of a specific ID. 330 Record // This is a record with a specific AbbrevID. 331 } Kind; 332 333 unsigned ID; 334 335 static BitstreamEntry getError() { 336 BitstreamEntry E; E.Kind = Error; return E; 337 } 338 339 static BitstreamEntry getEndBlock() { 340 BitstreamEntry E; E.Kind = EndBlock; return E; 341 } 342 343 static BitstreamEntry getSubBlock(unsigned ID) { 344 BitstreamEntry E; E.Kind = SubBlock; E.ID = ID; return E; 345 } 346 347 static BitstreamEntry getRecord(unsigned AbbrevID) { 348 BitstreamEntry E; E.Kind = Record; E.ID = AbbrevID; return E; 349 } 350 }; 351 352 /// This represents a position within a bitcode file, implemented on top of a 353 /// SimpleBitstreamCursor. 354 /// 355 /// Unlike iterators, BitstreamCursors are heavy-weight objects that should not 356 /// be passed by value. 357 class BitstreamCursor : SimpleBitstreamCursor { 358 // This is the declared size of code values used for the current block, in 359 // bits. 360 unsigned CurCodeSize = 2; 361 362 /// Abbrevs installed at in this block. 363 std::vector<std::shared_ptr<BitCodeAbbrev>> CurAbbrevs; 364 365 struct Block { 366 unsigned PrevCodeSize; 367 std::vector<std::shared_ptr<BitCodeAbbrev>> PrevAbbrevs; 368 369 explicit Block(unsigned PCS) : PrevCodeSize(PCS) {} 370 }; 371 372 /// This tracks the codesize of parent blocks. 373 SmallVector<Block, 8> BlockScope; 374 375 BitstreamBlockInfo *BlockInfo = nullptr; 376 377 public: 378 static const size_t MaxChunkSize = 32; 379 380 BitstreamCursor() = default; 381 explicit BitstreamCursor(ArrayRef<uint8_t> BitcodeBytes) 382 : SimpleBitstreamCursor(BitcodeBytes) {} 383 explicit BitstreamCursor(StringRef BitcodeBytes) 384 : SimpleBitstreamCursor(BitcodeBytes) {} 385 explicit BitstreamCursor(MemoryBufferRef BitcodeBytes) 386 : SimpleBitstreamCursor(BitcodeBytes) {} 387 388 using SimpleBitstreamCursor::AtEndOfStream; 389 using SimpleBitstreamCursor::canSkipToPos; 390 using SimpleBitstreamCursor::fillCurWord; 391 using SimpleBitstreamCursor::getBitcodeBytes; 392 using SimpleBitstreamCursor::GetCurrentBitNo; 393 using SimpleBitstreamCursor::getCurrentByteNo; 394 using SimpleBitstreamCursor::getPointerToByte; 395 using SimpleBitstreamCursor::JumpToBit; 396 using SimpleBitstreamCursor::Read; 397 using SimpleBitstreamCursor::ReadVBR; 398 using SimpleBitstreamCursor::ReadVBR64; 399 using SimpleBitstreamCursor::SizeInBytes; 400 using SimpleBitstreamCursor::skipToEnd; 401 402 /// Return the number of bits used to encode an abbrev #. 403 unsigned getAbbrevIDWidth() const { return CurCodeSize; } 404 405 /// Flags that modify the behavior of advance(). 406 enum { 407 /// If this flag is used, the advance() method does not automatically pop 408 /// the block scope when the end of a block is reached. 409 AF_DontPopBlockAtEnd = 1, 410 411 /// If this flag is used, abbrev entries are returned just like normal 412 /// records. 413 AF_DontAutoprocessAbbrevs = 2 414 }; 415 416 /// Advance the current bitstream, returning the next entry in the stream. 417 Expected<BitstreamEntry> advance(unsigned Flags = 0) { 418 while (true) { 419 if (AtEndOfStream()) 420 return BitstreamEntry::getError(); 421 422 Expected<unsigned> MaybeCode = ReadCode(); 423 if (!MaybeCode) 424 return MaybeCode.takeError(); 425 unsigned Code = MaybeCode.get(); 426 427 if (Code == bitc::END_BLOCK) { 428 // Pop the end of the block unless Flags tells us not to. 429 if (!(Flags & AF_DontPopBlockAtEnd) && ReadBlockEnd()) 430 return BitstreamEntry::getError(); 431 return BitstreamEntry::getEndBlock(); 432 } 433 434 if (Code == bitc::ENTER_SUBBLOCK) { 435 if (Expected<unsigned> MaybeSubBlock = ReadSubBlockID()) 436 return BitstreamEntry::getSubBlock(MaybeSubBlock.get()); 437 else 438 return MaybeSubBlock.takeError(); 439 } 440 441 if (Code == bitc::DEFINE_ABBREV && 442 !(Flags & AF_DontAutoprocessAbbrevs)) { 443 // We read and accumulate abbrev's, the client can't do anything with 444 // them anyway. 445 if (Error Err = ReadAbbrevRecord()) 446 return std::move(Err); 447 continue; 448 } 449 450 return BitstreamEntry::getRecord(Code); 451 } 452 } 453 454 /// This is a convenience function for clients that don't expect any 455 /// subblocks. This just skips over them automatically. 456 Expected<BitstreamEntry> advanceSkippingSubblocks(unsigned Flags = 0) { 457 while (true) { 458 // If we found a normal entry, return it. 459 Expected<BitstreamEntry> MaybeEntry = advance(Flags); 460 if (!MaybeEntry) 461 return MaybeEntry; 462 BitstreamEntry Entry = MaybeEntry.get(); 463 464 if (Entry.Kind != BitstreamEntry::SubBlock) 465 return Entry; 466 467 // If we found a sub-block, just skip over it and check the next entry. 468 if (Error Err = SkipBlock()) 469 return std::move(Err); 470 } 471 } 472 473 Expected<unsigned> ReadCode() { return Read(CurCodeSize); } 474 475 // Block header: 476 // [ENTER_SUBBLOCK, blockid, newcodelen, <align4bytes>, blocklen] 477 478 /// Having read the ENTER_SUBBLOCK code, read the BlockID for the block. 479 Expected<unsigned> ReadSubBlockID() { return ReadVBR(bitc::BlockIDWidth); } 480 481 /// Having read the ENTER_SUBBLOCK abbrevid and a BlockID, skip over the body 482 /// of this block. 483 Error SkipBlock() { 484 // Read and ignore the codelen value. 485 if (Expected<uint32_t> Res = ReadVBR(bitc::CodeLenWidth)) 486 ; // Since we are skipping this block, we don't care what code widths are 487 // used inside of it. 488 else 489 return Res.takeError(); 490 491 SkipToFourByteBoundary(); 492 Expected<unsigned> MaybeNum = Read(bitc::BlockSizeWidth); 493 if (!MaybeNum) 494 return MaybeNum.takeError(); 495 size_t NumFourBytes = MaybeNum.get(); 496 497 // Check that the block wasn't partially defined, and that the offset isn't 498 // bogus. 499 size_t SkipTo = GetCurrentBitNo() + NumFourBytes * 4 * 8; 500 if (AtEndOfStream()) 501 return createStringError(std::errc::illegal_byte_sequence, 502 "can't skip block: already at end of stream"); 503 if (!canSkipToPos(SkipTo / 8)) 504 return createStringError(std::errc::illegal_byte_sequence, 505 "can't skip to bit %zu from %" PRIu64, SkipTo, 506 GetCurrentBitNo()); 507 508 if (Error Res = JumpToBit(SkipTo)) 509 return Res; 510 511 return Error::success(); 512 } 513 514 /// Having read the ENTER_SUBBLOCK abbrevid, and enter the block. 515 Error EnterSubBlock(unsigned BlockID, unsigned *NumWordsP = nullptr); 516 517 bool ReadBlockEnd() { 518 if (BlockScope.empty()) return true; 519 520 // Block tail: 521 // [END_BLOCK, <align4bytes>] 522 SkipToFourByteBoundary(); 523 524 popBlockScope(); 525 return false; 526 } 527 528 private: 529 void popBlockScope() { 530 CurCodeSize = BlockScope.back().PrevCodeSize; 531 532 CurAbbrevs = std::move(BlockScope.back().PrevAbbrevs); 533 BlockScope.pop_back(); 534 } 535 536 //===--------------------------------------------------------------------===// 537 // Record Processing 538 //===--------------------------------------------------------------------===// 539 540 public: 541 /// Return the abbreviation for the specified AbbrevId. 542 Expected<const BitCodeAbbrev *> getAbbrev(unsigned AbbrevID) { 543 unsigned AbbrevNo = AbbrevID - bitc::FIRST_APPLICATION_ABBREV; 544 if (AbbrevNo >= CurAbbrevs.size()) 545 return createStringError( 546 std::errc::illegal_byte_sequence, "Invalid abbrev number"); 547 return CurAbbrevs[AbbrevNo].get(); 548 } 549 550 /// Read the current record and discard it, returning the code for the record. 551 Expected<unsigned> skipRecord(unsigned AbbrevID); 552 553 Expected<unsigned> readRecord(unsigned AbbrevID, 554 SmallVectorImpl<uint64_t> &Vals, 555 StringRef *Blob = nullptr); 556 557 //===--------------------------------------------------------------------===// 558 // Abbrev Processing 559 //===--------------------------------------------------------------------===// 560 Error ReadAbbrevRecord(); 561 562 /// Read and return a block info block from the bitstream. If an error was 563 /// encountered, return std::nullopt. 564 /// 565 /// \param ReadBlockInfoNames Whether to read block/record name information in 566 /// the BlockInfo block. Only llvm-bcanalyzer uses this. 567 Expected<std::optional<BitstreamBlockInfo>> 568 ReadBlockInfoBlock(bool ReadBlockInfoNames = false); 569 570 /// Set the block info to be used by this BitstreamCursor to interpret 571 /// abbreviated records. 572 void setBlockInfo(BitstreamBlockInfo *BI) { BlockInfo = BI; } 573 }; 574 575 } // end llvm namespace 576 577 #endif // LLVM_BITSTREAM_BITSTREAMREADER_H 578