1 //===- YAMLParser.cpp - Simple YAML parser --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a YAML parser. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/YAMLParser.h" 14 #include "llvm/ADT/AllocatorList.h" 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/STLExtras.h" 17 #include "llvm/ADT/SmallString.h" 18 #include "llvm/ADT/SmallVector.h" 19 #include "llvm/ADT/StringExtras.h" 20 #include "llvm/ADT/StringRef.h" 21 #include "llvm/ADT/Twine.h" 22 #include "llvm/Support/Compiler.h" 23 #include "llvm/Support/ErrorHandling.h" 24 #include "llvm/Support/MemoryBuffer.h" 25 #include "llvm/Support/SMLoc.h" 26 #include "llvm/Support/SourceMgr.h" 27 #include "llvm/Support/Unicode.h" 28 #include "llvm/Support/raw_ostream.h" 29 #include <cassert> 30 #include <cstddef> 31 #include <cstdint> 32 #include <map> 33 #include <memory> 34 #include <string> 35 #include <system_error> 36 #include <utility> 37 38 using namespace llvm; 39 using namespace yaml; 40 41 enum UnicodeEncodingForm { 42 UEF_UTF32_LE, ///< UTF-32 Little Endian 43 UEF_UTF32_BE, ///< UTF-32 Big Endian 44 UEF_UTF16_LE, ///< UTF-16 Little Endian 45 UEF_UTF16_BE, ///< UTF-16 Big Endian 46 UEF_UTF8, ///< UTF-8 or ascii. 47 UEF_Unknown ///< Not a valid Unicode encoding. 48 }; 49 50 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 51 /// it exists. Length is in {0, 2, 3, 4}. 52 using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>; 53 54 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 55 /// encoding form of \a Input. 56 /// 57 /// @param Input A string of length 0 or more. 58 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 59 /// and how long the byte order mark is if one exists. 60 static EncodingInfo getUnicodeEncoding(StringRef Input) { 61 if (Input.empty()) 62 return std::make_pair(UEF_Unknown, 0); 63 64 switch (uint8_t(Input[0])) { 65 case 0x00: 66 if (Input.size() >= 4) { 67 if ( Input[1] == 0 68 && uint8_t(Input[2]) == 0xFE 69 && uint8_t(Input[3]) == 0xFF) 70 return std::make_pair(UEF_UTF32_BE, 4); 71 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 72 return std::make_pair(UEF_UTF32_BE, 0); 73 } 74 75 if (Input.size() >= 2 && Input[1] != 0) 76 return std::make_pair(UEF_UTF16_BE, 0); 77 return std::make_pair(UEF_Unknown, 0); 78 case 0xFF: 79 if ( Input.size() >= 4 80 && uint8_t(Input[1]) == 0xFE 81 && Input[2] == 0 82 && Input[3] == 0) 83 return std::make_pair(UEF_UTF32_LE, 4); 84 85 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 86 return std::make_pair(UEF_UTF16_LE, 2); 87 return std::make_pair(UEF_Unknown, 0); 88 case 0xFE: 89 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 90 return std::make_pair(UEF_UTF16_BE, 2); 91 return std::make_pair(UEF_Unknown, 0); 92 case 0xEF: 93 if ( Input.size() >= 3 94 && uint8_t(Input[1]) == 0xBB 95 && uint8_t(Input[2]) == 0xBF) 96 return std::make_pair(UEF_UTF8, 3); 97 return std::make_pair(UEF_Unknown, 0); 98 } 99 100 // It could still be utf-32 or utf-16. 101 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 102 return std::make_pair(UEF_UTF32_LE, 0); 103 104 if (Input.size() >= 2 && Input[1] == 0) 105 return std::make_pair(UEF_UTF16_LE, 0); 106 107 return std::make_pair(UEF_UTF8, 0); 108 } 109 110 /// Pin the vtables to this file. 111 void Node::anchor() {} 112 void NullNode::anchor() {} 113 void ScalarNode::anchor() {} 114 void BlockScalarNode::anchor() {} 115 void KeyValueNode::anchor() {} 116 void MappingNode::anchor() {} 117 void SequenceNode::anchor() {} 118 void AliasNode::anchor() {} 119 120 namespace llvm { 121 namespace yaml { 122 123 /// Token - A single YAML token. 124 struct Token { 125 enum TokenKind { 126 TK_Error, // Uninitialized token. 127 TK_StreamStart, 128 TK_StreamEnd, 129 TK_VersionDirective, 130 TK_TagDirective, 131 TK_DocumentStart, 132 TK_DocumentEnd, 133 TK_BlockEntry, 134 TK_BlockEnd, 135 TK_BlockSequenceStart, 136 TK_BlockMappingStart, 137 TK_FlowEntry, 138 TK_FlowSequenceStart, 139 TK_FlowSequenceEnd, 140 TK_FlowMappingStart, 141 TK_FlowMappingEnd, 142 TK_Key, 143 TK_Value, 144 TK_Scalar, 145 TK_BlockScalar, 146 TK_Alias, 147 TK_Anchor, 148 TK_Tag 149 } Kind = TK_Error; 150 151 /// A string of length 0 or more whose begin() points to the logical location 152 /// of the token in the input. 153 StringRef Range; 154 155 /// The value of a block scalar node. 156 std::string Value; 157 158 Token() = default; 159 }; 160 161 } // end namespace yaml 162 } // end namespace llvm 163 164 using TokenQueueT = BumpPtrList<Token>; 165 166 namespace { 167 168 /// This struct is used to track simple keys. 169 /// 170 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 171 /// which could legally be the start of a simple key. When peekNext is called, 172 /// if the Token To be returned is referenced by a SimpleKey, we continue 173 /// tokenizing until that potential simple key has either been found to not be 174 /// a simple key (we moved on to the next line or went further than 1024 chars). 175 /// Or when we run into a Value, and then insert a Key token (and possibly 176 /// others) before the SimpleKey's Tok. 177 struct SimpleKey { 178 TokenQueueT::iterator Tok; 179 unsigned Column = 0; 180 unsigned Line = 0; 181 unsigned FlowLevel = 0; 182 bool IsRequired = false; 183 184 bool operator ==(const SimpleKey &Other) { 185 return Tok == Other.Tok; 186 } 187 }; 188 189 } // end anonymous namespace 190 191 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit 192 /// subsequence and the subsequence's length in code units (uint8_t). 193 /// A length of 0 represents an error. 194 using UTF8Decoded = std::pair<uint32_t, unsigned>; 195 196 static UTF8Decoded decodeUTF8(StringRef Range) { 197 StringRef::iterator Position= Range.begin(); 198 StringRef::iterator End = Range.end(); 199 // 1 byte: [0x00, 0x7f] 200 // Bit pattern: 0xxxxxxx 201 if (Position < End && (*Position & 0x80) == 0) { 202 return std::make_pair(*Position, 1); 203 } 204 // 2 bytes: [0x80, 0x7ff] 205 // Bit pattern: 110xxxxx 10xxxxxx 206 if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) && 207 ((*(Position + 1) & 0xC0) == 0x80)) { 208 uint32_t codepoint = ((*Position & 0x1F) << 6) | 209 (*(Position + 1) & 0x3F); 210 if (codepoint >= 0x80) 211 return std::make_pair(codepoint, 2); 212 } 213 // 3 bytes: [0x8000, 0xffff] 214 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 215 if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) && 216 ((*(Position + 1) & 0xC0) == 0x80) && 217 ((*(Position + 2) & 0xC0) == 0x80)) { 218 uint32_t codepoint = ((*Position & 0x0F) << 12) | 219 ((*(Position + 1) & 0x3F) << 6) | 220 (*(Position + 2) & 0x3F); 221 // Codepoints between 0xD800 and 0xDFFF are invalid, as 222 // they are high / low surrogate halves used by UTF-16. 223 if (codepoint >= 0x800 && 224 (codepoint < 0xD800 || codepoint > 0xDFFF)) 225 return std::make_pair(codepoint, 3); 226 } 227 // 4 bytes: [0x10000, 0x10FFFF] 228 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 229 if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) && 230 ((*(Position + 1) & 0xC0) == 0x80) && 231 ((*(Position + 2) & 0xC0) == 0x80) && 232 ((*(Position + 3) & 0xC0) == 0x80)) { 233 uint32_t codepoint = ((*Position & 0x07) << 18) | 234 ((*(Position + 1) & 0x3F) << 12) | 235 ((*(Position + 2) & 0x3F) << 6) | 236 (*(Position + 3) & 0x3F); 237 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 238 return std::make_pair(codepoint, 4); 239 } 240 return std::make_pair(0, 0); 241 } 242 243 namespace llvm { 244 namespace yaml { 245 246 /// Scans YAML tokens from a MemoryBuffer. 247 class Scanner { 248 public: 249 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, 250 std::error_code *EC = nullptr); 251 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, 252 std::error_code *EC = nullptr); 253 254 /// Parse the next token and return it without popping it. 255 Token &peekNext(); 256 257 /// Parse the next token and pop it from the queue. 258 Token getNext(); 259 260 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 261 ArrayRef<SMRange> Ranges = std::nullopt) { 262 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ std::nullopt, 263 ShowColors); 264 } 265 266 void setError(const Twine &Message, StringRef::iterator Position) { 267 if (Position >= End) 268 Position = End - 1; 269 270 // propagate the error if possible 271 if (EC) 272 *EC = make_error_code(std::errc::invalid_argument); 273 274 // Don't print out more errors after the first one we encounter. The rest 275 // are just the result of the first, and have no meaning. 276 if (!Failed) 277 printError(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, Message); 278 Failed = true; 279 } 280 281 /// Returns true if an error occurred while parsing. 282 bool failed() { 283 return Failed; 284 } 285 286 private: 287 void init(MemoryBufferRef Buffer); 288 289 StringRef currentInput() { 290 return StringRef(Current, End - Current); 291 } 292 293 /// Decode a UTF-8 minimal well-formed code unit subsequence starting 294 /// at \a Position. 295 /// 296 /// If the UTF-8 code units starting at Position do not form a well-formed 297 /// code unit subsequence, then the Unicode scalar value is 0, and the length 298 /// is 0. 299 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 300 return ::decodeUTF8(StringRef(Position, End - Position)); 301 } 302 303 // The following functions are based on the gramar rules in the YAML spec. The 304 // style of the function names it meant to closely match how they are written 305 // in the spec. The number within the [] is the number of the grammar rule in 306 // the spec. 307 // 308 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 309 // 310 // c- 311 // A production starting and ending with a special character. 312 // b- 313 // A production matching a single line break. 314 // nb- 315 // A production starting and ending with a non-break character. 316 // s- 317 // A production starting and ending with a white space character. 318 // ns- 319 // A production starting and ending with a non-space character. 320 // l- 321 // A production matching complete line(s). 322 323 /// Skip a single nb-char[27] starting at Position. 324 /// 325 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 326 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 327 /// 328 /// @returns The code unit after the nb-char, or Position if it's not an 329 /// nb-char. 330 StringRef::iterator skip_nb_char(StringRef::iterator Position); 331 332 /// Skip a single b-break[28] starting at Position. 333 /// 334 /// A b-break is 0xD 0xA | 0xD | 0xA 335 /// 336 /// @returns The code unit after the b-break, or Position if it's not a 337 /// b-break. 338 StringRef::iterator skip_b_break(StringRef::iterator Position); 339 340 /// Skip a single s-space[31] starting at Position. 341 /// 342 /// An s-space is 0x20 343 /// 344 /// @returns The code unit after the s-space, or Position if it's not a 345 /// s-space. 346 StringRef::iterator skip_s_space(StringRef::iterator Position); 347 348 /// Skip a single s-white[33] starting at Position. 349 /// 350 /// A s-white is 0x20 | 0x9 351 /// 352 /// @returns The code unit after the s-white, or Position if it's not a 353 /// s-white. 354 StringRef::iterator skip_s_white(StringRef::iterator Position); 355 356 /// Skip a single ns-char[34] starting at Position. 357 /// 358 /// A ns-char is nb-char - s-white 359 /// 360 /// @returns The code unit after the ns-char, or Position if it's not a 361 /// ns-char. 362 StringRef::iterator skip_ns_char(StringRef::iterator Position); 363 364 using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); 365 366 /// Skip minimal well-formed code unit subsequences until Func 367 /// returns its input. 368 /// 369 /// @returns The code unit after the last minimal well-formed code unit 370 /// subsequence that Func accepted. 371 StringRef::iterator skip_while( SkipWhileFunc Func 372 , StringRef::iterator Position); 373 374 /// Skip minimal well-formed code unit subsequences until Func returns its 375 /// input. 376 void advanceWhile(SkipWhileFunc Func); 377 378 /// Scan ns-uri-char[39]s starting at Cur. 379 /// 380 /// This updates Cur and Column while scanning. 381 void scan_ns_uri_char(); 382 383 /// Consume a minimal well-formed code unit subsequence starting at 384 /// \a Cur. Return false if it is not the same Unicode scalar value as 385 /// \a Expected. This updates \a Column. 386 bool consume(uint32_t Expected); 387 388 /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 389 void skip(uint32_t Distance); 390 391 /// Return true if the minimal well-formed code unit subsequence at 392 /// Pos is whitespace or a new line 393 bool isBlankOrBreak(StringRef::iterator Position); 394 395 /// Return true if the minimal well-formed code unit subsequence at 396 /// Pos is considered a "safe" character for plain scalars. 397 bool isPlainSafeNonBlank(StringRef::iterator Position); 398 399 /// Return true if the line is a line break, false otherwise. 400 bool isLineEmpty(StringRef Line); 401 402 /// Consume a single b-break[28] if it's present at the current position. 403 /// 404 /// Return false if the code unit at the current position isn't a line break. 405 bool consumeLineBreakIfPresent(); 406 407 /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 408 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 409 , unsigned AtColumn 410 , bool IsRequired); 411 412 /// Remove simple keys that can no longer be valid simple keys. 413 /// 414 /// Invalid simple keys are not on the current line or are further than 1024 415 /// columns back. 416 void removeStaleSimpleKeyCandidates(); 417 418 /// Remove all simple keys on FlowLevel \a Level. 419 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 420 421 /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 422 /// tokens if needed. 423 bool unrollIndent(int ToColumn); 424 425 /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 426 /// if needed. 427 bool rollIndent( int ToColumn 428 , Token::TokenKind Kind 429 , TokenQueueT::iterator InsertPoint); 430 431 /// Skip a single-line comment when the comment starts at the current 432 /// position of the scanner. 433 void skipComment(); 434 435 /// Skip whitespace and comments until the start of the next token. 436 void scanToNextToken(); 437 438 /// Must be the first token generated. 439 bool scanStreamStart(); 440 441 /// Generate tokens needed to close out the stream. 442 bool scanStreamEnd(); 443 444 /// Scan a %BLAH directive. 445 bool scanDirective(); 446 447 /// Scan a ... or ---. 448 bool scanDocumentIndicator(bool IsStart); 449 450 /// Scan a [ or { and generate the proper flow collection start token. 451 bool scanFlowCollectionStart(bool IsSequence); 452 453 /// Scan a ] or } and generate the proper flow collection end token. 454 bool scanFlowCollectionEnd(bool IsSequence); 455 456 /// Scan the , that separates entries in a flow collection. 457 bool scanFlowEntry(); 458 459 /// Scan the - that starts block sequence entries. 460 bool scanBlockEntry(); 461 462 /// Scan an explicit ? indicating a key. 463 bool scanKey(); 464 465 /// Scan an explicit : indicating a value. 466 bool scanValue(); 467 468 /// Scan a quoted scalar. 469 bool scanFlowScalar(bool IsDoubleQuoted); 470 471 /// Scan an unquoted scalar. 472 bool scanPlainScalar(); 473 474 /// Scan an Alias or Anchor starting with * or &. 475 bool scanAliasOrAnchor(bool IsAlias); 476 477 /// Scan a block scalar starting with | or >. 478 bool scanBlockScalar(bool IsLiteral); 479 480 /// Scan a block scalar style indicator and header. 481 /// 482 /// Note: This is distinct from scanBlockScalarHeader to mirror the fact that 483 /// YAML does not consider the style indicator to be a part of the header. 484 /// 485 /// Return false if an error occurred. 486 bool scanBlockScalarIndicators(char &StyleIndicator, char &ChompingIndicator, 487 unsigned &IndentIndicator, bool &IsDone); 488 489 /// Scan a style indicator in a block scalar header. 490 char scanBlockStyleIndicator(); 491 492 /// Scan a chomping indicator in a block scalar header. 493 char scanBlockChompingIndicator(); 494 495 /// Scan an indentation indicator in a block scalar header. 496 unsigned scanBlockIndentationIndicator(); 497 498 /// Scan a block scalar header. 499 /// 500 /// Return false if an error occurred. 501 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 502 bool &IsDone); 503 504 /// Look for the indentation level of a block scalar. 505 /// 506 /// Return false if an error occurred. 507 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 508 unsigned &LineBreaks, bool &IsDone); 509 510 /// Scan the indentation of a text line in a block scalar. 511 /// 512 /// Return false if an error occurred. 513 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 514 bool &IsDone); 515 516 /// Scan a tag of the form !stuff. 517 bool scanTag(); 518 519 /// Dispatch to the next scanning function based on \a *Cur. 520 bool fetchMoreTokens(); 521 522 /// The SourceMgr used for diagnostics and buffer management. 523 SourceMgr &SM; 524 525 /// The original input. 526 MemoryBufferRef InputBuffer; 527 528 /// The current position of the scanner. 529 StringRef::iterator Current; 530 531 /// The end of the input (one past the last character). 532 StringRef::iterator End; 533 534 /// Current YAML indentation level in spaces. 535 int Indent; 536 537 /// Current column number in Unicode code points. 538 unsigned Column; 539 540 /// Current line number. 541 unsigned Line; 542 543 /// How deep we are in flow style containers. 0 Means at block level. 544 unsigned FlowLevel; 545 546 /// Are we at the start of the stream? 547 bool IsStartOfStream; 548 549 /// Can the next token be the start of a simple key? 550 bool IsSimpleKeyAllowed; 551 552 /// Can the next token be a value indicator even if it does not have a 553 /// trailing space? 554 bool IsAdjacentValueAllowedInFlow; 555 556 /// True if an error has occurred. 557 bool Failed; 558 559 /// Should colors be used when printing out the diagnostic messages? 560 bool ShowColors; 561 562 /// Queue of tokens. This is required to queue up tokens while looking 563 /// for the end of a simple key. And for cases where a single character 564 /// can produce multiple tokens (e.g. BlockEnd). 565 TokenQueueT TokenQueue; 566 567 /// Indentation levels. 568 SmallVector<int, 4> Indents; 569 570 /// Potential simple keys. 571 SmallVector<SimpleKey, 4> SimpleKeys; 572 573 std::error_code *EC; 574 }; 575 576 } // end namespace yaml 577 } // end namespace llvm 578 579 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 580 static void encodeUTF8( uint32_t UnicodeScalarValue 581 , SmallVectorImpl<char> &Result) { 582 if (UnicodeScalarValue <= 0x7F) { 583 Result.push_back(UnicodeScalarValue & 0x7F); 584 } else if (UnicodeScalarValue <= 0x7FF) { 585 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 586 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 587 Result.push_back(FirstByte); 588 Result.push_back(SecondByte); 589 } else if (UnicodeScalarValue <= 0xFFFF) { 590 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 591 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 592 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 593 Result.push_back(FirstByte); 594 Result.push_back(SecondByte); 595 Result.push_back(ThirdByte); 596 } else if (UnicodeScalarValue <= 0x10FFFF) { 597 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 598 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 599 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 600 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 601 Result.push_back(FirstByte); 602 Result.push_back(SecondByte); 603 Result.push_back(ThirdByte); 604 Result.push_back(FourthByte); 605 } 606 } 607 608 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 609 SourceMgr SM; 610 Scanner scanner(Input, SM); 611 while (true) { 612 Token T = scanner.getNext(); 613 switch (T.Kind) { 614 case Token::TK_StreamStart: 615 OS << "Stream-Start: "; 616 break; 617 case Token::TK_StreamEnd: 618 OS << "Stream-End: "; 619 break; 620 case Token::TK_VersionDirective: 621 OS << "Version-Directive: "; 622 break; 623 case Token::TK_TagDirective: 624 OS << "Tag-Directive: "; 625 break; 626 case Token::TK_DocumentStart: 627 OS << "Document-Start: "; 628 break; 629 case Token::TK_DocumentEnd: 630 OS << "Document-End: "; 631 break; 632 case Token::TK_BlockEntry: 633 OS << "Block-Entry: "; 634 break; 635 case Token::TK_BlockEnd: 636 OS << "Block-End: "; 637 break; 638 case Token::TK_BlockSequenceStart: 639 OS << "Block-Sequence-Start: "; 640 break; 641 case Token::TK_BlockMappingStart: 642 OS << "Block-Mapping-Start: "; 643 break; 644 case Token::TK_FlowEntry: 645 OS << "Flow-Entry: "; 646 break; 647 case Token::TK_FlowSequenceStart: 648 OS << "Flow-Sequence-Start: "; 649 break; 650 case Token::TK_FlowSequenceEnd: 651 OS << "Flow-Sequence-End: "; 652 break; 653 case Token::TK_FlowMappingStart: 654 OS << "Flow-Mapping-Start: "; 655 break; 656 case Token::TK_FlowMappingEnd: 657 OS << "Flow-Mapping-End: "; 658 break; 659 case Token::TK_Key: 660 OS << "Key: "; 661 break; 662 case Token::TK_Value: 663 OS << "Value: "; 664 break; 665 case Token::TK_Scalar: 666 OS << "Scalar: "; 667 break; 668 case Token::TK_BlockScalar: 669 OS << "Block Scalar: "; 670 break; 671 case Token::TK_Alias: 672 OS << "Alias: "; 673 break; 674 case Token::TK_Anchor: 675 OS << "Anchor: "; 676 break; 677 case Token::TK_Tag: 678 OS << "Tag: "; 679 break; 680 case Token::TK_Error: 681 break; 682 } 683 OS << T.Range << "\n"; 684 if (T.Kind == Token::TK_StreamEnd) 685 break; 686 else if (T.Kind == Token::TK_Error) 687 return false; 688 } 689 return true; 690 } 691 692 bool yaml::scanTokens(StringRef Input) { 693 SourceMgr SM; 694 Scanner scanner(Input, SM); 695 while (true) { 696 Token T = scanner.getNext(); 697 if (T.Kind == Token::TK_StreamEnd) 698 break; 699 else if (T.Kind == Token::TK_Error) 700 return false; 701 } 702 return true; 703 } 704 705 std::string yaml::escape(StringRef Input, bool EscapePrintable) { 706 std::string EscapedInput; 707 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 708 if (*i == '\\') 709 EscapedInput += "\\\\"; 710 else if (*i == '"') 711 EscapedInput += "\\\""; 712 else if (*i == 0) 713 EscapedInput += "\\0"; 714 else if (*i == 0x07) 715 EscapedInput += "\\a"; 716 else if (*i == 0x08) 717 EscapedInput += "\\b"; 718 else if (*i == 0x09) 719 EscapedInput += "\\t"; 720 else if (*i == 0x0A) 721 EscapedInput += "\\n"; 722 else if (*i == 0x0B) 723 EscapedInput += "\\v"; 724 else if (*i == 0x0C) 725 EscapedInput += "\\f"; 726 else if (*i == 0x0D) 727 EscapedInput += "\\r"; 728 else if (*i == 0x1B) 729 EscapedInput += "\\e"; 730 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 731 std::string HexStr = utohexstr(*i); 732 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 733 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 734 UTF8Decoded UnicodeScalarValue 735 = decodeUTF8(StringRef(i, Input.end() - i)); 736 if (UnicodeScalarValue.second == 0) { 737 // Found invalid char. 738 SmallString<4> Val; 739 encodeUTF8(0xFFFD, Val); 740 llvm::append_range(EscapedInput, Val); 741 // FIXME: Error reporting. 742 return EscapedInput; 743 } 744 if (UnicodeScalarValue.first == 0x85) 745 EscapedInput += "\\N"; 746 else if (UnicodeScalarValue.first == 0xA0) 747 EscapedInput += "\\_"; 748 else if (UnicodeScalarValue.first == 0x2028) 749 EscapedInput += "\\L"; 750 else if (UnicodeScalarValue.first == 0x2029) 751 EscapedInput += "\\P"; 752 else if (!EscapePrintable && 753 sys::unicode::isPrintable(UnicodeScalarValue.first)) 754 EscapedInput += StringRef(i, UnicodeScalarValue.second); 755 else { 756 std::string HexStr = utohexstr(UnicodeScalarValue.first); 757 if (HexStr.size() <= 2) 758 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 759 else if (HexStr.size() <= 4) 760 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 761 else if (HexStr.size() <= 8) 762 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 763 } 764 i += UnicodeScalarValue.second - 1; 765 } else 766 EscapedInput.push_back(*i); 767 } 768 return EscapedInput; 769 } 770 771 std::optional<bool> yaml::parseBool(StringRef S) { 772 switch (S.size()) { 773 case 1: 774 switch (S.front()) { 775 case 'y': 776 case 'Y': 777 return true; 778 case 'n': 779 case 'N': 780 return false; 781 default: 782 return std::nullopt; 783 } 784 case 2: 785 switch (S.front()) { 786 case 'O': 787 if (S[1] == 'N') // ON 788 return true; 789 [[fallthrough]]; 790 case 'o': 791 if (S[1] == 'n') //[Oo]n 792 return true; 793 return std::nullopt; 794 case 'N': 795 if (S[1] == 'O') // NO 796 return false; 797 [[fallthrough]]; 798 case 'n': 799 if (S[1] == 'o') //[Nn]o 800 return false; 801 return std::nullopt; 802 default: 803 return std::nullopt; 804 } 805 case 3: 806 switch (S.front()) { 807 case 'O': 808 if (S.drop_front() == "FF") // OFF 809 return false; 810 [[fallthrough]]; 811 case 'o': 812 if (S.drop_front() == "ff") //[Oo]ff 813 return false; 814 return std::nullopt; 815 case 'Y': 816 if (S.drop_front() == "ES") // YES 817 return true; 818 [[fallthrough]]; 819 case 'y': 820 if (S.drop_front() == "es") //[Yy]es 821 return true; 822 return std::nullopt; 823 default: 824 return std::nullopt; 825 } 826 case 4: 827 switch (S.front()) { 828 case 'T': 829 if (S.drop_front() == "RUE") // TRUE 830 return true; 831 [[fallthrough]]; 832 case 't': 833 if (S.drop_front() == "rue") //[Tt]rue 834 return true; 835 return std::nullopt; 836 default: 837 return std::nullopt; 838 } 839 case 5: 840 switch (S.front()) { 841 case 'F': 842 if (S.drop_front() == "ALSE") // FALSE 843 return false; 844 [[fallthrough]]; 845 case 'f': 846 if (S.drop_front() == "alse") //[Ff]alse 847 return false; 848 return std::nullopt; 849 default: 850 return std::nullopt; 851 } 852 default: 853 return std::nullopt; 854 } 855 } 856 857 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, 858 std::error_code *EC) 859 : SM(sm), ShowColors(ShowColors), EC(EC) { 860 init(MemoryBufferRef(Input, "YAML")); 861 } 862 863 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, 864 std::error_code *EC) 865 : SM(SM_), ShowColors(ShowColors), EC(EC) { 866 init(Buffer); 867 } 868 869 void Scanner::init(MemoryBufferRef Buffer) { 870 InputBuffer = Buffer; 871 Current = InputBuffer.getBufferStart(); 872 End = InputBuffer.getBufferEnd(); 873 Indent = -1; 874 Column = 0; 875 Line = 0; 876 FlowLevel = 0; 877 IsStartOfStream = true; 878 IsSimpleKeyAllowed = true; 879 IsAdjacentValueAllowedInFlow = false; 880 Failed = false; 881 std::unique_ptr<MemoryBuffer> InputBufferOwner = 882 MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false); 883 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 884 } 885 886 Token &Scanner::peekNext() { 887 // If the current token is a possible simple key, keep parsing until we 888 // can confirm. 889 bool NeedMore = false; 890 while (true) { 891 if (TokenQueue.empty() || NeedMore) { 892 if (!fetchMoreTokens()) { 893 TokenQueue.clear(); 894 SimpleKeys.clear(); 895 TokenQueue.push_back(Token()); 896 return TokenQueue.front(); 897 } 898 } 899 assert(!TokenQueue.empty() && 900 "fetchMoreTokens lied about getting tokens!"); 901 902 removeStaleSimpleKeyCandidates(); 903 SimpleKey SK; 904 SK.Tok = TokenQueue.begin(); 905 if (!is_contained(SimpleKeys, SK)) 906 break; 907 else 908 NeedMore = true; 909 } 910 return TokenQueue.front(); 911 } 912 913 Token Scanner::getNext() { 914 Token Ret = peekNext(); 915 // TokenQueue can be empty if there was an error getting the next token. 916 if (!TokenQueue.empty()) 917 TokenQueue.pop_front(); 918 919 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 920 // quick deallocation of them all. 921 if (TokenQueue.empty()) 922 TokenQueue.resetAlloc(); 923 924 return Ret; 925 } 926 927 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 928 if (Position == End) 929 return Position; 930 // Check 7 bit c-printable - b-char. 931 if ( *Position == 0x09 932 || (*Position >= 0x20 && *Position <= 0x7E)) 933 return Position + 1; 934 935 // Check for valid UTF-8. 936 if (uint8_t(*Position) & 0x80) { 937 UTF8Decoded u8d = decodeUTF8(Position); 938 if ( u8d.second != 0 939 && u8d.first != 0xFEFF 940 && ( u8d.first == 0x85 941 || ( u8d.first >= 0xA0 942 && u8d.first <= 0xD7FF) 943 || ( u8d.first >= 0xE000 944 && u8d.first <= 0xFFFD) 945 || ( u8d.first >= 0x10000 946 && u8d.first <= 0x10FFFF))) 947 return Position + u8d.second; 948 } 949 return Position; 950 } 951 952 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 953 if (Position == End) 954 return Position; 955 if (*Position == 0x0D) { 956 if (Position + 1 != End && *(Position + 1) == 0x0A) 957 return Position + 2; 958 return Position + 1; 959 } 960 961 if (*Position == 0x0A) 962 return Position + 1; 963 return Position; 964 } 965 966 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 967 if (Position == End) 968 return Position; 969 if (*Position == ' ') 970 return Position + 1; 971 return Position; 972 } 973 974 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 975 if (Position == End) 976 return Position; 977 if (*Position == ' ' || *Position == '\t') 978 return Position + 1; 979 return Position; 980 } 981 982 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 983 if (Position == End) 984 return Position; 985 if (*Position == ' ' || *Position == '\t') 986 return Position; 987 return skip_nb_char(Position); 988 } 989 990 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 991 , StringRef::iterator Position) { 992 while (true) { 993 StringRef::iterator i = (this->*Func)(Position); 994 if (i == Position) 995 break; 996 Position = i; 997 } 998 return Position; 999 } 1000 1001 void Scanner::advanceWhile(SkipWhileFunc Func) { 1002 auto Final = skip_while(Func, Current); 1003 Column += Final - Current; 1004 Current = Final; 1005 } 1006 1007 static bool is_ns_hex_digit(const char C) { return isAlnum(C); } 1008 1009 static bool is_ns_word_char(const char C) { return C == '-' || isAlpha(C); } 1010 1011 void Scanner::scan_ns_uri_char() { 1012 while (true) { 1013 if (Current == End) 1014 break; 1015 if (( *Current == '%' 1016 && Current + 2 < End 1017 && is_ns_hex_digit(*(Current + 1)) 1018 && is_ns_hex_digit(*(Current + 2))) 1019 || is_ns_word_char(*Current) 1020 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 1021 != StringRef::npos) { 1022 ++Current; 1023 ++Column; 1024 } else 1025 break; 1026 } 1027 } 1028 1029 bool Scanner::consume(uint32_t Expected) { 1030 if (Expected >= 0x80) { 1031 setError("Cannot consume non-ascii characters", Current); 1032 return false; 1033 } 1034 if (Current == End) 1035 return false; 1036 if (uint8_t(*Current) >= 0x80) { 1037 setError("Cannot consume non-ascii characters", Current); 1038 return false; 1039 } 1040 if (uint8_t(*Current) == Expected) { 1041 ++Current; 1042 ++Column; 1043 return true; 1044 } 1045 return false; 1046 } 1047 1048 void Scanner::skip(uint32_t Distance) { 1049 Current += Distance; 1050 Column += Distance; 1051 assert(Current <= End && "Skipped past the end"); 1052 } 1053 1054 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 1055 if (Position == End) 1056 return false; 1057 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 1058 *Position == '\n'; 1059 } 1060 1061 bool Scanner::isPlainSafeNonBlank(StringRef::iterator Position) { 1062 if (Position == End || isBlankOrBreak(Position)) 1063 return false; 1064 if (FlowLevel && 1065 StringRef(Position, 1).find_first_of(",[]{}") != StringRef::npos) 1066 return false; 1067 return true; 1068 } 1069 1070 bool Scanner::isLineEmpty(StringRef Line) { 1071 for (const auto *Position = Line.begin(); Position != Line.end(); ++Position) 1072 if (!isBlankOrBreak(Position)) 1073 return false; 1074 return true; 1075 } 1076 1077 bool Scanner::consumeLineBreakIfPresent() { 1078 auto Next = skip_b_break(Current); 1079 if (Next == Current) 1080 return false; 1081 Column = 0; 1082 ++Line; 1083 Current = Next; 1084 return true; 1085 } 1086 1087 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 1088 , unsigned AtColumn 1089 , bool IsRequired) { 1090 if (IsSimpleKeyAllowed) { 1091 SimpleKey SK; 1092 SK.Tok = Tok; 1093 SK.Line = Line; 1094 SK.Column = AtColumn; 1095 SK.IsRequired = IsRequired; 1096 SK.FlowLevel = FlowLevel; 1097 SimpleKeys.push_back(SK); 1098 } 1099 } 1100 1101 void Scanner::removeStaleSimpleKeyCandidates() { 1102 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 1103 i != SimpleKeys.end();) { 1104 if (i->Line != Line || i->Column + 1024 < Column) { 1105 if (i->IsRequired) 1106 setError( "Could not find expected : for simple key" 1107 , i->Tok->Range.begin()); 1108 i = SimpleKeys.erase(i); 1109 } else 1110 ++i; 1111 } 1112 } 1113 1114 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 1115 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 1116 SimpleKeys.pop_back(); 1117 } 1118 1119 bool Scanner::unrollIndent(int ToColumn) { 1120 Token T; 1121 // Indentation is ignored in flow. 1122 if (FlowLevel != 0) 1123 return true; 1124 1125 while (Indent > ToColumn) { 1126 T.Kind = Token::TK_BlockEnd; 1127 T.Range = StringRef(Current, 1); 1128 TokenQueue.push_back(T); 1129 Indent = Indents.pop_back_val(); 1130 } 1131 1132 return true; 1133 } 1134 1135 bool Scanner::rollIndent( int ToColumn 1136 , Token::TokenKind Kind 1137 , TokenQueueT::iterator InsertPoint) { 1138 if (FlowLevel) 1139 return true; 1140 if (Indent < ToColumn) { 1141 Indents.push_back(Indent); 1142 Indent = ToColumn; 1143 1144 Token T; 1145 T.Kind = Kind; 1146 T.Range = StringRef(Current, 0); 1147 TokenQueue.insert(InsertPoint, T); 1148 } 1149 return true; 1150 } 1151 1152 void Scanner::skipComment() { 1153 if (Current == End || *Current != '#') 1154 return; 1155 while (true) { 1156 // This may skip more than one byte, thus Column is only incremented 1157 // for code points. 1158 StringRef::iterator I = skip_nb_char(Current); 1159 if (I == Current) 1160 break; 1161 Current = I; 1162 ++Column; 1163 } 1164 } 1165 1166 void Scanner::scanToNextToken() { 1167 while (true) { 1168 while (Current != End && (*Current == ' ' || *Current == '\t')) { 1169 skip(1); 1170 } 1171 1172 skipComment(); 1173 1174 // Skip EOL. 1175 StringRef::iterator i = skip_b_break(Current); 1176 if (i == Current) 1177 break; 1178 Current = i; 1179 ++Line; 1180 Column = 0; 1181 // New lines may start a simple key. 1182 if (!FlowLevel) 1183 IsSimpleKeyAllowed = true; 1184 } 1185 } 1186 1187 bool Scanner::scanStreamStart() { 1188 IsStartOfStream = false; 1189 1190 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1191 1192 Token T; 1193 T.Kind = Token::TK_StreamStart; 1194 T.Range = StringRef(Current, EI.second); 1195 TokenQueue.push_back(T); 1196 Current += EI.second; 1197 return true; 1198 } 1199 1200 bool Scanner::scanStreamEnd() { 1201 // Force an ending new line if one isn't present. 1202 if (Column != 0) { 1203 Column = 0; 1204 ++Line; 1205 } 1206 1207 unrollIndent(-1); 1208 SimpleKeys.clear(); 1209 IsSimpleKeyAllowed = false; 1210 IsAdjacentValueAllowedInFlow = false; 1211 1212 Token T; 1213 T.Kind = Token::TK_StreamEnd; 1214 T.Range = StringRef(Current, 0); 1215 TokenQueue.push_back(T); 1216 return true; 1217 } 1218 1219 bool Scanner::scanDirective() { 1220 // Reset the indentation level. 1221 unrollIndent(-1); 1222 SimpleKeys.clear(); 1223 IsSimpleKeyAllowed = false; 1224 IsAdjacentValueAllowedInFlow = false; 1225 1226 StringRef::iterator Start = Current; 1227 consume('%'); 1228 StringRef::iterator NameStart = Current; 1229 Current = skip_while(&Scanner::skip_ns_char, Current); 1230 StringRef Name(NameStart, Current - NameStart); 1231 Current = skip_while(&Scanner::skip_s_white, Current); 1232 1233 Token T; 1234 if (Name == "YAML") { 1235 Current = skip_while(&Scanner::skip_ns_char, Current); 1236 T.Kind = Token::TK_VersionDirective; 1237 T.Range = StringRef(Start, Current - Start); 1238 TokenQueue.push_back(T); 1239 return true; 1240 } else if(Name == "TAG") { 1241 Current = skip_while(&Scanner::skip_ns_char, Current); 1242 Current = skip_while(&Scanner::skip_s_white, Current); 1243 Current = skip_while(&Scanner::skip_ns_char, Current); 1244 T.Kind = Token::TK_TagDirective; 1245 T.Range = StringRef(Start, Current - Start); 1246 TokenQueue.push_back(T); 1247 return true; 1248 } 1249 return false; 1250 } 1251 1252 bool Scanner::scanDocumentIndicator(bool IsStart) { 1253 unrollIndent(-1); 1254 SimpleKeys.clear(); 1255 IsSimpleKeyAllowed = false; 1256 IsAdjacentValueAllowedInFlow = false; 1257 1258 Token T; 1259 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1260 T.Range = StringRef(Current, 3); 1261 skip(3); 1262 TokenQueue.push_back(T); 1263 return true; 1264 } 1265 1266 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1267 Token T; 1268 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1269 : Token::TK_FlowMappingStart; 1270 T.Range = StringRef(Current, 1); 1271 skip(1); 1272 TokenQueue.push_back(T); 1273 1274 // [ and { may begin a simple key. 1275 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1276 1277 // And may also be followed by a simple key. 1278 IsSimpleKeyAllowed = true; 1279 // Adjacent values are allowed in flows only after JSON-style keys. 1280 IsAdjacentValueAllowedInFlow = false; 1281 ++FlowLevel; 1282 return true; 1283 } 1284 1285 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1286 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1287 IsSimpleKeyAllowed = false; 1288 IsAdjacentValueAllowedInFlow = true; 1289 Token T; 1290 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1291 : Token::TK_FlowMappingEnd; 1292 T.Range = StringRef(Current, 1); 1293 skip(1); 1294 TokenQueue.push_back(T); 1295 if (FlowLevel) 1296 --FlowLevel; 1297 return true; 1298 } 1299 1300 bool Scanner::scanFlowEntry() { 1301 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1302 IsSimpleKeyAllowed = true; 1303 IsAdjacentValueAllowedInFlow = false; 1304 Token T; 1305 T.Kind = Token::TK_FlowEntry; 1306 T.Range = StringRef(Current, 1); 1307 skip(1); 1308 TokenQueue.push_back(T); 1309 return true; 1310 } 1311 1312 bool Scanner::scanBlockEntry() { 1313 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1314 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1315 IsSimpleKeyAllowed = true; 1316 IsAdjacentValueAllowedInFlow = false; 1317 Token T; 1318 T.Kind = Token::TK_BlockEntry; 1319 T.Range = StringRef(Current, 1); 1320 skip(1); 1321 TokenQueue.push_back(T); 1322 return true; 1323 } 1324 1325 bool Scanner::scanKey() { 1326 if (!FlowLevel) 1327 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1328 1329 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1330 IsSimpleKeyAllowed = !FlowLevel; 1331 IsAdjacentValueAllowedInFlow = false; 1332 1333 Token T; 1334 T.Kind = Token::TK_Key; 1335 T.Range = StringRef(Current, 1); 1336 skip(1); 1337 TokenQueue.push_back(T); 1338 return true; 1339 } 1340 1341 bool Scanner::scanValue() { 1342 // If the previous token could have been a simple key, insert the key token 1343 // into the token queue. 1344 if (!SimpleKeys.empty()) { 1345 SimpleKey SK = SimpleKeys.pop_back_val(); 1346 Token T; 1347 T.Kind = Token::TK_Key; 1348 T.Range = SK.Tok->Range; 1349 TokenQueueT::iterator i, e; 1350 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1351 if (i == SK.Tok) 1352 break; 1353 } 1354 if (i == e) { 1355 Failed = true; 1356 return false; 1357 } 1358 i = TokenQueue.insert(i, T); 1359 1360 // We may also need to add a Block-Mapping-Start token. 1361 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1362 1363 IsSimpleKeyAllowed = false; 1364 } else { 1365 if (!FlowLevel) 1366 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1367 IsSimpleKeyAllowed = !FlowLevel; 1368 } 1369 IsAdjacentValueAllowedInFlow = false; 1370 1371 Token T; 1372 T.Kind = Token::TK_Value; 1373 T.Range = StringRef(Current, 1); 1374 skip(1); 1375 TokenQueue.push_back(T); 1376 return true; 1377 } 1378 1379 // Forbidding inlining improves performance by roughly 20%. 1380 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1381 LLVM_ATTRIBUTE_NOINLINE static bool 1382 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1383 1384 // Returns whether a character at 'Position' was escaped with a leading '\'. 1385 // 'First' specifies the position of the first character in the string. 1386 static bool wasEscaped(StringRef::iterator First, 1387 StringRef::iterator Position) { 1388 assert(Position - 1 >= First); 1389 StringRef::iterator I = Position - 1; 1390 // We calculate the number of consecutive '\'s before the current position 1391 // by iterating backwards through our string. 1392 while (I >= First && *I == '\\') --I; 1393 // (Position - 1 - I) now contains the number of '\'s before the current 1394 // position. If it is odd, the character at 'Position' was escaped. 1395 return (Position - 1 - I) % 2 == 1; 1396 } 1397 1398 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1399 StringRef::iterator Start = Current; 1400 unsigned ColStart = Column; 1401 if (IsDoubleQuoted) { 1402 do { 1403 ++Current; 1404 while (Current != End && *Current != '"') 1405 ++Current; 1406 // Repeat until the previous character was not a '\' or was an escaped 1407 // backslash. 1408 } while ( Current != End 1409 && *(Current - 1) == '\\' 1410 && wasEscaped(Start + 1, Current)); 1411 } else { 1412 skip(1); 1413 while (Current != End) { 1414 // Skip a ' followed by another '. 1415 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1416 skip(2); 1417 continue; 1418 } else if (*Current == '\'') 1419 break; 1420 StringRef::iterator i = skip_nb_char(Current); 1421 if (i == Current) { 1422 i = skip_b_break(Current); 1423 if (i == Current) 1424 break; 1425 Current = i; 1426 Column = 0; 1427 ++Line; 1428 } else { 1429 if (i == End) 1430 break; 1431 Current = i; 1432 ++Column; 1433 } 1434 } 1435 } 1436 1437 if (Current == End) { 1438 setError("Expected quote at end of scalar", Current); 1439 return false; 1440 } 1441 1442 skip(1); // Skip ending quote. 1443 Token T; 1444 T.Kind = Token::TK_Scalar; 1445 T.Range = StringRef(Start, Current - Start); 1446 TokenQueue.push_back(T); 1447 1448 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1449 1450 IsSimpleKeyAllowed = false; 1451 IsAdjacentValueAllowedInFlow = true; 1452 1453 return true; 1454 } 1455 1456 bool Scanner::scanPlainScalar() { 1457 StringRef::iterator Start = Current; 1458 unsigned ColStart = Column; 1459 unsigned LeadingBlanks = 0; 1460 assert(Indent >= -1 && "Indent must be >= -1 !"); 1461 unsigned indent = static_cast<unsigned>(Indent + 1); 1462 while (Current != End) { 1463 if (*Current == '#') 1464 break; 1465 1466 while (Current != End && 1467 ((*Current != ':' && isPlainSafeNonBlank(Current)) || 1468 (*Current == ':' && isPlainSafeNonBlank(Current + 1)))) { 1469 StringRef::iterator i = skip_nb_char(Current); 1470 if (i == Current) 1471 break; 1472 Current = i; 1473 ++Column; 1474 } 1475 1476 // Are we at the end? 1477 if (!isBlankOrBreak(Current)) 1478 break; 1479 1480 // Eat blanks. 1481 StringRef::iterator Tmp = Current; 1482 while (isBlankOrBreak(Tmp)) { 1483 StringRef::iterator i = skip_s_white(Tmp); 1484 if (i != Tmp) { 1485 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1486 setError("Found invalid tab character in indentation", Tmp); 1487 return false; 1488 } 1489 Tmp = i; 1490 ++Column; 1491 } else { 1492 i = skip_b_break(Tmp); 1493 if (!LeadingBlanks) 1494 LeadingBlanks = 1; 1495 Tmp = i; 1496 Column = 0; 1497 ++Line; 1498 } 1499 } 1500 1501 if (!FlowLevel && Column < indent) 1502 break; 1503 1504 Current = Tmp; 1505 } 1506 if (Start == Current) { 1507 setError("Got empty plain scalar", Start); 1508 return false; 1509 } 1510 Token T; 1511 T.Kind = Token::TK_Scalar; 1512 T.Range = StringRef(Start, Current - Start); 1513 TokenQueue.push_back(T); 1514 1515 // Plain scalars can be simple keys. 1516 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1517 1518 IsSimpleKeyAllowed = false; 1519 IsAdjacentValueAllowedInFlow = false; 1520 1521 return true; 1522 } 1523 1524 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1525 StringRef::iterator Start = Current; 1526 unsigned ColStart = Column; 1527 skip(1); 1528 while (Current != End) { 1529 if ( *Current == '[' || *Current == ']' 1530 || *Current == '{' || *Current == '}' 1531 || *Current == ',' 1532 || *Current == ':') 1533 break; 1534 StringRef::iterator i = skip_ns_char(Current); 1535 if (i == Current) 1536 break; 1537 Current = i; 1538 ++Column; 1539 } 1540 1541 if (Start + 1 == Current) { 1542 setError("Got empty alias or anchor", Start); 1543 return false; 1544 } 1545 1546 Token T; 1547 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1548 T.Range = StringRef(Start, Current - Start); 1549 TokenQueue.push_back(T); 1550 1551 // Alias and anchors can be simple keys. 1552 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1553 1554 IsSimpleKeyAllowed = false; 1555 IsAdjacentValueAllowedInFlow = false; 1556 1557 return true; 1558 } 1559 1560 bool Scanner::scanBlockScalarIndicators(char &StyleIndicator, 1561 char &ChompingIndicator, 1562 unsigned &IndentIndicator, 1563 bool &IsDone) { 1564 StyleIndicator = scanBlockStyleIndicator(); 1565 if (!scanBlockScalarHeader(ChompingIndicator, IndentIndicator, IsDone)) 1566 return false; 1567 return true; 1568 } 1569 1570 char Scanner::scanBlockStyleIndicator() { 1571 char Indicator = ' '; 1572 if (Current != End && (*Current == '>' || *Current == '|')) { 1573 Indicator = *Current; 1574 skip(1); 1575 } 1576 return Indicator; 1577 } 1578 1579 char Scanner::scanBlockChompingIndicator() { 1580 char Indicator = ' '; 1581 if (Current != End && (*Current == '+' || *Current == '-')) { 1582 Indicator = *Current; 1583 skip(1); 1584 } 1585 return Indicator; 1586 } 1587 1588 /// Get the number of line breaks after chomping. 1589 /// 1590 /// Return the number of trailing line breaks to emit, depending on 1591 /// \p ChompingIndicator. 1592 static unsigned getChompedLineBreaks(char ChompingIndicator, 1593 unsigned LineBreaks, StringRef Str) { 1594 if (ChompingIndicator == '-') // Strip all line breaks. 1595 return 0; 1596 if (ChompingIndicator == '+') // Keep all line breaks. 1597 return LineBreaks; 1598 // Clip trailing lines. 1599 return Str.empty() ? 0 : 1; 1600 } 1601 1602 unsigned Scanner::scanBlockIndentationIndicator() { 1603 unsigned Indent = 0; 1604 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1605 Indent = unsigned(*Current - '0'); 1606 skip(1); 1607 } 1608 return Indent; 1609 } 1610 1611 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1612 unsigned &IndentIndicator, bool &IsDone) { 1613 auto Start = Current; 1614 1615 ChompingIndicator = scanBlockChompingIndicator(); 1616 IndentIndicator = scanBlockIndentationIndicator(); 1617 // Check for the chomping indicator once again. 1618 if (ChompingIndicator == ' ') 1619 ChompingIndicator = scanBlockChompingIndicator(); 1620 Current = skip_while(&Scanner::skip_s_white, Current); 1621 skipComment(); 1622 1623 if (Current == End) { // EOF, we have an empty scalar. 1624 Token T; 1625 T.Kind = Token::TK_BlockScalar; 1626 T.Range = StringRef(Start, Current - Start); 1627 TokenQueue.push_back(T); 1628 IsDone = true; 1629 return true; 1630 } 1631 1632 if (!consumeLineBreakIfPresent()) { 1633 setError("Expected a line break after block scalar header", Current); 1634 return false; 1635 } 1636 return true; 1637 } 1638 1639 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1640 unsigned BlockExitIndent, 1641 unsigned &LineBreaks, bool &IsDone) { 1642 unsigned MaxAllSpaceLineCharacters = 0; 1643 StringRef::iterator LongestAllSpaceLine; 1644 1645 while (true) { 1646 advanceWhile(&Scanner::skip_s_space); 1647 if (skip_nb_char(Current) != Current) { 1648 // This line isn't empty, so try and find the indentation. 1649 if (Column <= BlockExitIndent) { // End of the block literal. 1650 IsDone = true; 1651 return true; 1652 } 1653 // We found the block's indentation. 1654 BlockIndent = Column; 1655 if (MaxAllSpaceLineCharacters > BlockIndent) { 1656 setError( 1657 "Leading all-spaces line must be smaller than the block indent", 1658 LongestAllSpaceLine); 1659 return false; 1660 } 1661 return true; 1662 } 1663 if (skip_b_break(Current) != Current && 1664 Column > MaxAllSpaceLineCharacters) { 1665 // Record the longest all-space line in case it's longer than the 1666 // discovered block indent. 1667 MaxAllSpaceLineCharacters = Column; 1668 LongestAllSpaceLine = Current; 1669 } 1670 1671 // Check for EOF. 1672 if (Current == End) { 1673 IsDone = true; 1674 return true; 1675 } 1676 1677 if (!consumeLineBreakIfPresent()) { 1678 IsDone = true; 1679 return true; 1680 } 1681 ++LineBreaks; 1682 } 1683 return true; 1684 } 1685 1686 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1687 unsigned BlockExitIndent, bool &IsDone) { 1688 // Skip the indentation. 1689 while (Column < BlockIndent) { 1690 auto I = skip_s_space(Current); 1691 if (I == Current) 1692 break; 1693 Current = I; 1694 ++Column; 1695 } 1696 1697 if (skip_nb_char(Current) == Current) 1698 return true; 1699 1700 if (Column <= BlockExitIndent) { // End of the block literal. 1701 IsDone = true; 1702 return true; 1703 } 1704 1705 if (Column < BlockIndent) { 1706 if (Current != End && *Current == '#') { // Trailing comment. 1707 IsDone = true; 1708 return true; 1709 } 1710 setError("A text line is less indented than the block scalar", Current); 1711 return false; 1712 } 1713 return true; // A normal text line. 1714 } 1715 1716 bool Scanner::scanBlockScalar(bool IsLiteral) { 1717 assert(*Current == '|' || *Current == '>'); 1718 char StyleIndicator; 1719 char ChompingIndicator; 1720 unsigned BlockIndent; 1721 bool IsDone = false; 1722 if (!scanBlockScalarIndicators(StyleIndicator, ChompingIndicator, BlockIndent, 1723 IsDone)) 1724 return false; 1725 if (IsDone) 1726 return true; 1727 bool IsFolded = StyleIndicator == '>'; 1728 1729 const auto *Start = Current; 1730 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; 1731 unsigned LineBreaks = 0; 1732 if (BlockIndent == 0) { 1733 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, 1734 IsDone)) 1735 return false; 1736 } 1737 1738 // Scan the block's scalars body. 1739 SmallString<256> Str; 1740 while (!IsDone) { 1741 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) 1742 return false; 1743 if (IsDone) 1744 break; 1745 1746 // Parse the current line. 1747 auto LineStart = Current; 1748 advanceWhile(&Scanner::skip_nb_char); 1749 if (LineStart != Current) { 1750 if (LineBreaks && IsFolded && !Scanner::isLineEmpty(Str)) { 1751 // The folded style "folds" any single line break between content into a 1752 // single space, except when that content is "empty" (only contains 1753 // whitespace) in which case the line break is left as-is. 1754 if (LineBreaks == 1) { 1755 Str.append(LineBreaks, 1756 isLineEmpty(StringRef(LineStart, Current - LineStart)) 1757 ? '\n' 1758 : ' '); 1759 } 1760 // If we saw a single line break, we are completely replacing it and so 1761 // want `LineBreaks == 0`. Otherwise this decrement accounts for the 1762 // fact that the first line break is "trimmed", only being used to 1763 // signal a sequence of line breaks which should not be folded. 1764 LineBreaks--; 1765 } 1766 Str.append(LineBreaks, '\n'); 1767 Str.append(StringRef(LineStart, Current - LineStart)); 1768 LineBreaks = 0; 1769 } 1770 1771 // Check for EOF. 1772 if (Current == End) 1773 break; 1774 1775 if (!consumeLineBreakIfPresent()) 1776 break; 1777 ++LineBreaks; 1778 } 1779 1780 if (Current == End && !LineBreaks) 1781 // Ensure that there is at least one line break before the end of file. 1782 LineBreaks = 1; 1783 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1784 1785 // New lines may start a simple key. 1786 if (!FlowLevel) 1787 IsSimpleKeyAllowed = true; 1788 IsAdjacentValueAllowedInFlow = false; 1789 1790 Token T; 1791 T.Kind = Token::TK_BlockScalar; 1792 T.Range = StringRef(Start, Current - Start); 1793 T.Value = std::string(Str); 1794 TokenQueue.push_back(T); 1795 return true; 1796 } 1797 1798 bool Scanner::scanTag() { 1799 StringRef::iterator Start = Current; 1800 unsigned ColStart = Column; 1801 skip(1); // Eat !. 1802 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1803 else if (*Current == '<') { 1804 skip(1); 1805 scan_ns_uri_char(); 1806 if (!consume('>')) 1807 return false; 1808 } else { 1809 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1810 Current = skip_while(&Scanner::skip_ns_char, Current); 1811 } 1812 1813 Token T; 1814 T.Kind = Token::TK_Tag; 1815 T.Range = StringRef(Start, Current - Start); 1816 TokenQueue.push_back(T); 1817 1818 // Tags can be simple keys. 1819 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1820 1821 IsSimpleKeyAllowed = false; 1822 IsAdjacentValueAllowedInFlow = false; 1823 1824 return true; 1825 } 1826 1827 bool Scanner::fetchMoreTokens() { 1828 if (IsStartOfStream) 1829 return scanStreamStart(); 1830 1831 scanToNextToken(); 1832 1833 if (Current == End) 1834 return scanStreamEnd(); 1835 1836 removeStaleSimpleKeyCandidates(); 1837 1838 unrollIndent(Column); 1839 1840 if (Column == 0 && *Current == '%') 1841 return scanDirective(); 1842 1843 if (Column == 0 && Current + 4 <= End 1844 && *Current == '-' 1845 && *(Current + 1) == '-' 1846 && *(Current + 2) == '-' 1847 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1848 return scanDocumentIndicator(true); 1849 1850 if (Column == 0 && Current + 4 <= End 1851 && *Current == '.' 1852 && *(Current + 1) == '.' 1853 && *(Current + 2) == '.' 1854 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1855 return scanDocumentIndicator(false); 1856 1857 if (*Current == '[') 1858 return scanFlowCollectionStart(true); 1859 1860 if (*Current == '{') 1861 return scanFlowCollectionStart(false); 1862 1863 if (*Current == ']') 1864 return scanFlowCollectionEnd(true); 1865 1866 if (*Current == '}') 1867 return scanFlowCollectionEnd(false); 1868 1869 if (*Current == ',') 1870 return scanFlowEntry(); 1871 1872 if (*Current == '-' && (isBlankOrBreak(Current + 1) || Current + 1 == End)) 1873 return scanBlockEntry(); 1874 1875 if (*Current == '?' && (Current + 1 == End || isBlankOrBreak(Current + 1))) 1876 return scanKey(); 1877 1878 if (*Current == ':' && 1879 (!isPlainSafeNonBlank(Current + 1) || IsAdjacentValueAllowedInFlow)) 1880 return scanValue(); 1881 1882 if (*Current == '*') 1883 return scanAliasOrAnchor(true); 1884 1885 if (*Current == '&') 1886 return scanAliasOrAnchor(false); 1887 1888 if (*Current == '!') 1889 return scanTag(); 1890 1891 if (*Current == '|' && !FlowLevel) 1892 return scanBlockScalar(true); 1893 1894 if (*Current == '>' && !FlowLevel) 1895 return scanBlockScalar(false); 1896 1897 if (*Current == '\'') 1898 return scanFlowScalar(false); 1899 1900 if (*Current == '"') 1901 return scanFlowScalar(true); 1902 1903 // Get a plain scalar. 1904 StringRef FirstChar(Current, 1); 1905 if ((!isBlankOrBreak(Current) && 1906 FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") == StringRef::npos) || 1907 (FirstChar.find_first_of("?:-") != StringRef::npos && 1908 isPlainSafeNonBlank(Current + 1))) 1909 return scanPlainScalar(); 1910 1911 setError("Unrecognized character while tokenizing.", Current); 1912 return false; 1913 } 1914 1915 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, 1916 std::error_code *EC) 1917 : scanner(new Scanner(Input, SM, ShowColors, EC)) {} 1918 1919 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, 1920 std::error_code *EC) 1921 : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)) {} 1922 1923 Stream::~Stream() = default; 1924 1925 bool Stream::failed() { return scanner->failed(); } 1926 1927 void Stream::printError(Node *N, const Twine &Msg, SourceMgr::DiagKind Kind) { 1928 printError(N ? N->getSourceRange() : SMRange(), Msg, Kind); 1929 } 1930 1931 void Stream::printError(const SMRange &Range, const Twine &Msg, 1932 SourceMgr::DiagKind Kind) { 1933 scanner->printError(Range.Start, Kind, Msg, Range); 1934 } 1935 1936 document_iterator Stream::begin() { 1937 if (CurrentDoc) 1938 report_fatal_error("Can only iterate over the stream once"); 1939 1940 // Skip Stream-Start. 1941 scanner->getNext(); 1942 1943 CurrentDoc.reset(new Document(*this)); 1944 return document_iterator(CurrentDoc); 1945 } 1946 1947 document_iterator Stream::end() { 1948 return document_iterator(); 1949 } 1950 1951 void Stream::skip() { 1952 for (Document &Doc : *this) 1953 Doc.skip(); 1954 } 1955 1956 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1957 StringRef T) 1958 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1959 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1960 SourceRange = SMRange(Start, Start); 1961 } 1962 1963 std::string Node::getVerbatimTag() const { 1964 StringRef Raw = getRawTag(); 1965 if (!Raw.empty() && Raw != "!") { 1966 std::string Ret; 1967 if (Raw.find_last_of('!') == 0) { 1968 Ret = std::string(Doc->getTagMap().find("!")->second); 1969 Ret += Raw.substr(1); 1970 return Ret; 1971 } else if (Raw.starts_with("!!")) { 1972 Ret = std::string(Doc->getTagMap().find("!!")->second); 1973 Ret += Raw.substr(2); 1974 return Ret; 1975 } else { 1976 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1977 std::map<StringRef, StringRef>::const_iterator It = 1978 Doc->getTagMap().find(TagHandle); 1979 if (It != Doc->getTagMap().end()) 1980 Ret = std::string(It->second); 1981 else { 1982 Token T; 1983 T.Kind = Token::TK_Tag; 1984 T.Range = TagHandle; 1985 setError(Twine("Unknown tag handle ") + TagHandle, T); 1986 } 1987 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1988 return Ret; 1989 } 1990 } 1991 1992 switch (getType()) { 1993 case NK_Null: 1994 return "tag:yaml.org,2002:null"; 1995 case NK_Scalar: 1996 case NK_BlockScalar: 1997 // TODO: Tag resolution. 1998 return "tag:yaml.org,2002:str"; 1999 case NK_Mapping: 2000 return "tag:yaml.org,2002:map"; 2001 case NK_Sequence: 2002 return "tag:yaml.org,2002:seq"; 2003 } 2004 2005 return ""; 2006 } 2007 2008 Token &Node::peekNext() { 2009 return Doc->peekNext(); 2010 } 2011 2012 Token Node::getNext() { 2013 return Doc->getNext(); 2014 } 2015 2016 Node *Node::parseBlockNode() { 2017 return Doc->parseBlockNode(); 2018 } 2019 2020 BumpPtrAllocator &Node::getAllocator() { 2021 return Doc->NodeAllocator; 2022 } 2023 2024 void Node::setError(const Twine &Msg, Token &Tok) const { 2025 Doc->setError(Msg, Tok); 2026 } 2027 2028 bool Node::failed() const { 2029 return Doc->failed(); 2030 } 2031 2032 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 2033 if (Value[0] == '"') 2034 return getDoubleQuotedValue(Value, Storage); 2035 if (Value[0] == '\'') 2036 return getSingleQuotedValue(Value, Storage); 2037 return getPlainValue(Value, Storage); 2038 } 2039 2040 /// parseScalarValue - A common parsing routine for all flow scalar styles. 2041 /// It handles line break characters by itself, adds regular content characters 2042 /// to the result, and forwards escaped sequences to the provided routine for 2043 /// the style-specific processing. 2044 /// 2045 /// \param UnquotedValue - An input value without quotation marks. 2046 /// \param Storage - A storage for the result if the input value is multiline or 2047 /// contains escaped characters. 2048 /// \param LookupChars - A set of special characters to search in the input 2049 /// string. Should include line break characters and the escape character 2050 /// specific for the processing scalar style, if any. 2051 /// \param UnescapeCallback - This is called when the escape character is found 2052 /// in the input. 2053 /// \returns - The unfolded and unescaped value. 2054 static StringRef 2055 parseScalarValue(StringRef UnquotedValue, SmallVectorImpl<char> &Storage, 2056 StringRef LookupChars, 2057 std::function<StringRef(StringRef, SmallVectorImpl<char> &)> 2058 UnescapeCallback) { 2059 size_t I = UnquotedValue.find_first_of(LookupChars); 2060 if (I == StringRef::npos) 2061 return UnquotedValue; 2062 2063 Storage.clear(); 2064 Storage.reserve(UnquotedValue.size()); 2065 char LastNewLineAddedAs = '\0'; 2066 for (; I != StringRef::npos; I = UnquotedValue.find_first_of(LookupChars)) { 2067 if (UnquotedValue[I] != '\r' && UnquotedValue[I] != '\n') { 2068 llvm::append_range(Storage, UnquotedValue.take_front(I)); 2069 UnquotedValue = UnescapeCallback(UnquotedValue.drop_front(I), Storage); 2070 LastNewLineAddedAs = '\0'; 2071 continue; 2072 } 2073 if (size_t LastNonSWhite = UnquotedValue.find_last_not_of(" \t", I); 2074 LastNonSWhite != StringRef::npos) { 2075 llvm::append_range(Storage, UnquotedValue.take_front(LastNonSWhite + 1)); 2076 Storage.push_back(' '); 2077 LastNewLineAddedAs = ' '; 2078 } else { 2079 // Note: we can't just check if the last character in Storage is ' ', 2080 // '\n', or something else; that would give a wrong result for double 2081 // quoted values containing an escaped space character before a new-line 2082 // character. 2083 switch (LastNewLineAddedAs) { 2084 case ' ': 2085 assert(!Storage.empty() && Storage.back() == ' '); 2086 Storage.back() = '\n'; 2087 LastNewLineAddedAs = '\n'; 2088 break; 2089 case '\n': 2090 assert(!Storage.empty() && Storage.back() == '\n'); 2091 Storage.push_back('\n'); 2092 break; 2093 default: 2094 Storage.push_back(' '); 2095 LastNewLineAddedAs = ' '; 2096 break; 2097 } 2098 } 2099 // Handle Windows-style EOL 2100 if (UnquotedValue.substr(I, 2) == "\r\n") 2101 I++; 2102 UnquotedValue = UnquotedValue.drop_front(I + 1).ltrim(" \t"); 2103 } 2104 llvm::append_range(Storage, UnquotedValue); 2105 return StringRef(Storage.begin(), Storage.size()); 2106 } 2107 2108 StringRef 2109 ScalarNode::getDoubleQuotedValue(StringRef RawValue, 2110 SmallVectorImpl<char> &Storage) const { 2111 assert(RawValue.size() >= 2 && RawValue.front() == '"' && 2112 RawValue.back() == '"'); 2113 StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2); 2114 2115 auto UnescapeFunc = [this](StringRef UnquotedValue, 2116 SmallVectorImpl<char> &Storage) { 2117 assert(UnquotedValue.take_front(1) == "\\"); 2118 if (UnquotedValue.size() == 1) { 2119 Token T; 2120 T.Range = UnquotedValue; 2121 setError("Unrecognized escape code", T); 2122 Storage.clear(); 2123 return StringRef(); 2124 } 2125 UnquotedValue = UnquotedValue.drop_front(1); 2126 switch (UnquotedValue[0]) { 2127 default: { 2128 Token T; 2129 T.Range = UnquotedValue.take_front(1); 2130 setError("Unrecognized escape code", T); 2131 Storage.clear(); 2132 return StringRef(); 2133 } 2134 case '\r': 2135 // Shrink the Windows-style EOL. 2136 if (UnquotedValue.size() >= 2 && UnquotedValue[1] == '\n') 2137 UnquotedValue = UnquotedValue.drop_front(1); 2138 [[fallthrough]]; 2139 case '\n': 2140 return UnquotedValue.drop_front(1).ltrim(" \t"); 2141 case '0': 2142 Storage.push_back(0x00); 2143 break; 2144 case 'a': 2145 Storage.push_back(0x07); 2146 break; 2147 case 'b': 2148 Storage.push_back(0x08); 2149 break; 2150 case 't': 2151 case 0x09: 2152 Storage.push_back(0x09); 2153 break; 2154 case 'n': 2155 Storage.push_back(0x0A); 2156 break; 2157 case 'v': 2158 Storage.push_back(0x0B); 2159 break; 2160 case 'f': 2161 Storage.push_back(0x0C); 2162 break; 2163 case 'r': 2164 Storage.push_back(0x0D); 2165 break; 2166 case 'e': 2167 Storage.push_back(0x1B); 2168 break; 2169 case ' ': 2170 Storage.push_back(0x20); 2171 break; 2172 case '"': 2173 Storage.push_back(0x22); 2174 break; 2175 case '/': 2176 Storage.push_back(0x2F); 2177 break; 2178 case '\\': 2179 Storage.push_back(0x5C); 2180 break; 2181 case 'N': 2182 encodeUTF8(0x85, Storage); 2183 break; 2184 case '_': 2185 encodeUTF8(0xA0, Storage); 2186 break; 2187 case 'L': 2188 encodeUTF8(0x2028, Storage); 2189 break; 2190 case 'P': 2191 encodeUTF8(0x2029, Storage); 2192 break; 2193 case 'x': { 2194 if (UnquotedValue.size() < 3) 2195 // TODO: Report error. 2196 break; 2197 unsigned int UnicodeScalarValue; 2198 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 2199 // TODO: Report error. 2200 UnicodeScalarValue = 0xFFFD; 2201 encodeUTF8(UnicodeScalarValue, Storage); 2202 return UnquotedValue.drop_front(3); 2203 } 2204 case 'u': { 2205 if (UnquotedValue.size() < 5) 2206 // TODO: Report error. 2207 break; 2208 unsigned int UnicodeScalarValue; 2209 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 2210 // TODO: Report error. 2211 UnicodeScalarValue = 0xFFFD; 2212 encodeUTF8(UnicodeScalarValue, Storage); 2213 return UnquotedValue.drop_front(5); 2214 } 2215 case 'U': { 2216 if (UnquotedValue.size() < 9) 2217 // TODO: Report error. 2218 break; 2219 unsigned int UnicodeScalarValue; 2220 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2221 // TODO: Report error. 2222 UnicodeScalarValue = 0xFFFD; 2223 encodeUTF8(UnicodeScalarValue, Storage); 2224 return UnquotedValue.drop_front(9); 2225 } 2226 } 2227 return UnquotedValue.drop_front(1); 2228 }; 2229 2230 return parseScalarValue(UnquotedValue, Storage, "\\\r\n", UnescapeFunc); 2231 } 2232 2233 StringRef ScalarNode::getSingleQuotedValue(StringRef RawValue, 2234 SmallVectorImpl<char> &Storage) { 2235 assert(RawValue.size() >= 2 && RawValue.front() == '\'' && 2236 RawValue.back() == '\''); 2237 StringRef UnquotedValue = RawValue.substr(1, RawValue.size() - 2); 2238 2239 auto UnescapeFunc = [](StringRef UnquotedValue, 2240 SmallVectorImpl<char> &Storage) { 2241 assert(UnquotedValue.take_front(2) == "''"); 2242 Storage.push_back('\''); 2243 return UnquotedValue.drop_front(2); 2244 }; 2245 2246 return parseScalarValue(UnquotedValue, Storage, "'\r\n", UnescapeFunc); 2247 } 2248 2249 StringRef ScalarNode::getPlainValue(StringRef RawValue, 2250 SmallVectorImpl<char> &Storage) { 2251 // Trim trailing whitespace ('b-char' and 's-white'). 2252 // NOTE: Alternatively we could change the scanner to not include whitespace 2253 // here in the first place. 2254 RawValue = RawValue.rtrim("\r\n \t"); 2255 return parseScalarValue(RawValue, Storage, "\r\n", nullptr); 2256 } 2257 2258 Node *KeyValueNode::getKey() { 2259 if (Key) 2260 return Key; 2261 // Handle implicit null keys. 2262 { 2263 Token &t = peekNext(); 2264 if ( t.Kind == Token::TK_BlockEnd 2265 || t.Kind == Token::TK_Value 2266 || t.Kind == Token::TK_Error) { 2267 return Key = new (getAllocator()) NullNode(Doc); 2268 } 2269 if (t.Kind == Token::TK_Key) 2270 getNext(); // skip TK_Key. 2271 } 2272 2273 // Handle explicit null keys. 2274 Token &t = peekNext(); 2275 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2276 return Key = new (getAllocator()) NullNode(Doc); 2277 } 2278 2279 // We've got a normal key. 2280 return Key = parseBlockNode(); 2281 } 2282 2283 Node *KeyValueNode::getValue() { 2284 if (Value) 2285 return Value; 2286 2287 if (Node* Key = getKey()) 2288 Key->skip(); 2289 else { 2290 setError("Null key in Key Value.", peekNext()); 2291 return Value = new (getAllocator()) NullNode(Doc); 2292 } 2293 2294 if (failed()) 2295 return Value = new (getAllocator()) NullNode(Doc); 2296 2297 // Handle implicit null values. 2298 { 2299 Token &t = peekNext(); 2300 if ( t.Kind == Token::TK_BlockEnd 2301 || t.Kind == Token::TK_FlowMappingEnd 2302 || t.Kind == Token::TK_Key 2303 || t.Kind == Token::TK_FlowEntry 2304 || t.Kind == Token::TK_Error) { 2305 return Value = new (getAllocator()) NullNode(Doc); 2306 } 2307 2308 if (t.Kind != Token::TK_Value) { 2309 setError("Unexpected token in Key Value.", t); 2310 return Value = new (getAllocator()) NullNode(Doc); 2311 } 2312 getNext(); // skip TK_Value. 2313 } 2314 2315 // Handle explicit null values. 2316 Token &t = peekNext(); 2317 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 2318 return Value = new (getAllocator()) NullNode(Doc); 2319 } 2320 2321 // We got a normal value. 2322 return Value = parseBlockNode(); 2323 } 2324 2325 void MappingNode::increment() { 2326 if (failed()) { 2327 IsAtEnd = true; 2328 CurrentEntry = nullptr; 2329 return; 2330 } 2331 if (CurrentEntry) { 2332 CurrentEntry->skip(); 2333 if (Type == MT_Inline) { 2334 IsAtEnd = true; 2335 CurrentEntry = nullptr; 2336 return; 2337 } 2338 } 2339 Token T = peekNext(); 2340 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 2341 // KeyValueNode eats the TK_Key. That way it can detect null keys. 2342 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 2343 } else if (Type == MT_Block) { 2344 switch (T.Kind) { 2345 case Token::TK_BlockEnd: 2346 getNext(); 2347 IsAtEnd = true; 2348 CurrentEntry = nullptr; 2349 break; 2350 default: 2351 setError("Unexpected token. Expected Key or Block End", T); 2352 [[fallthrough]]; 2353 case Token::TK_Error: 2354 IsAtEnd = true; 2355 CurrentEntry = nullptr; 2356 } 2357 } else { 2358 switch (T.Kind) { 2359 case Token::TK_FlowEntry: 2360 // Eat the flow entry and recurse. 2361 getNext(); 2362 return increment(); 2363 case Token::TK_FlowMappingEnd: 2364 getNext(); 2365 [[fallthrough]]; 2366 case Token::TK_Error: 2367 // Set this to end iterator. 2368 IsAtEnd = true; 2369 CurrentEntry = nullptr; 2370 break; 2371 default: 2372 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 2373 "Mapping End." 2374 , T); 2375 IsAtEnd = true; 2376 CurrentEntry = nullptr; 2377 } 2378 } 2379 } 2380 2381 void SequenceNode::increment() { 2382 if (failed()) { 2383 IsAtEnd = true; 2384 CurrentEntry = nullptr; 2385 return; 2386 } 2387 if (CurrentEntry) 2388 CurrentEntry->skip(); 2389 Token T = peekNext(); 2390 if (SeqType == ST_Block) { 2391 switch (T.Kind) { 2392 case Token::TK_BlockEntry: 2393 getNext(); 2394 CurrentEntry = parseBlockNode(); 2395 if (!CurrentEntry) { // An error occurred. 2396 IsAtEnd = true; 2397 CurrentEntry = nullptr; 2398 } 2399 break; 2400 case Token::TK_BlockEnd: 2401 getNext(); 2402 IsAtEnd = true; 2403 CurrentEntry = nullptr; 2404 break; 2405 default: 2406 setError( "Unexpected token. Expected Block Entry or Block End." 2407 , T); 2408 [[fallthrough]]; 2409 case Token::TK_Error: 2410 IsAtEnd = true; 2411 CurrentEntry = nullptr; 2412 } 2413 } else if (SeqType == ST_Indentless) { 2414 switch (T.Kind) { 2415 case Token::TK_BlockEntry: 2416 getNext(); 2417 CurrentEntry = parseBlockNode(); 2418 if (!CurrentEntry) { // An error occurred. 2419 IsAtEnd = true; 2420 CurrentEntry = nullptr; 2421 } 2422 break; 2423 default: 2424 case Token::TK_Error: 2425 IsAtEnd = true; 2426 CurrentEntry = nullptr; 2427 } 2428 } else if (SeqType == ST_Flow) { 2429 switch (T.Kind) { 2430 case Token::TK_FlowEntry: 2431 // Eat the flow entry and recurse. 2432 getNext(); 2433 WasPreviousTokenFlowEntry = true; 2434 return increment(); 2435 case Token::TK_FlowSequenceEnd: 2436 getNext(); 2437 [[fallthrough]]; 2438 case Token::TK_Error: 2439 // Set this to end iterator. 2440 IsAtEnd = true; 2441 CurrentEntry = nullptr; 2442 break; 2443 case Token::TK_StreamEnd: 2444 case Token::TK_DocumentEnd: 2445 case Token::TK_DocumentStart: 2446 setError("Could not find closing ]!", T); 2447 // Set this to end iterator. 2448 IsAtEnd = true; 2449 CurrentEntry = nullptr; 2450 break; 2451 default: 2452 if (!WasPreviousTokenFlowEntry) { 2453 setError("Expected , between entries!", T); 2454 IsAtEnd = true; 2455 CurrentEntry = nullptr; 2456 break; 2457 } 2458 // Otherwise it must be a flow entry. 2459 CurrentEntry = parseBlockNode(); 2460 if (!CurrentEntry) { 2461 IsAtEnd = true; 2462 } 2463 WasPreviousTokenFlowEntry = false; 2464 break; 2465 } 2466 } 2467 } 2468 2469 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2470 // Tag maps starts with two default mappings. 2471 TagMap["!"] = "!"; 2472 TagMap["!!"] = "tag:yaml.org,2002:"; 2473 2474 if (parseDirectives()) 2475 expectToken(Token::TK_DocumentStart); 2476 Token &T = peekNext(); 2477 if (T.Kind == Token::TK_DocumentStart) 2478 getNext(); 2479 } 2480 2481 bool Document::skip() { 2482 if (stream.scanner->failed()) 2483 return false; 2484 if (!Root && !getRoot()) 2485 return false; 2486 Root->skip(); 2487 Token &T = peekNext(); 2488 if (T.Kind == Token::TK_StreamEnd) 2489 return false; 2490 if (T.Kind == Token::TK_DocumentEnd) { 2491 getNext(); 2492 return skip(); 2493 } 2494 return true; 2495 } 2496 2497 Token &Document::peekNext() { 2498 return stream.scanner->peekNext(); 2499 } 2500 2501 Token Document::getNext() { 2502 return stream.scanner->getNext(); 2503 } 2504 2505 void Document::setError(const Twine &Message, Token &Location) const { 2506 stream.scanner->setError(Message, Location.Range.begin()); 2507 } 2508 2509 bool Document::failed() const { 2510 return stream.scanner->failed(); 2511 } 2512 2513 Node *Document::parseBlockNode() { 2514 Token T = peekNext(); 2515 // Handle properties. 2516 Token AnchorInfo; 2517 Token TagInfo; 2518 parse_property: 2519 switch (T.Kind) { 2520 case Token::TK_Alias: 2521 getNext(); 2522 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2523 case Token::TK_Anchor: 2524 if (AnchorInfo.Kind == Token::TK_Anchor) { 2525 setError("Already encountered an anchor for this node!", T); 2526 return nullptr; 2527 } 2528 AnchorInfo = getNext(); // Consume TK_Anchor. 2529 T = peekNext(); 2530 goto parse_property; 2531 case Token::TK_Tag: 2532 if (TagInfo.Kind == Token::TK_Tag) { 2533 setError("Already encountered a tag for this node!", T); 2534 return nullptr; 2535 } 2536 TagInfo = getNext(); // Consume TK_Tag. 2537 T = peekNext(); 2538 goto parse_property; 2539 default: 2540 break; 2541 } 2542 2543 switch (T.Kind) { 2544 case Token::TK_BlockEntry: 2545 // We got an unindented BlockEntry sequence. This is not terminated with 2546 // a BlockEnd. 2547 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2548 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2549 , AnchorInfo.Range.substr(1) 2550 , TagInfo.Range 2551 , SequenceNode::ST_Indentless); 2552 case Token::TK_BlockSequenceStart: 2553 getNext(); 2554 return new (NodeAllocator) 2555 SequenceNode( stream.CurrentDoc 2556 , AnchorInfo.Range.substr(1) 2557 , TagInfo.Range 2558 , SequenceNode::ST_Block); 2559 case Token::TK_BlockMappingStart: 2560 getNext(); 2561 return new (NodeAllocator) 2562 MappingNode( stream.CurrentDoc 2563 , AnchorInfo.Range.substr(1) 2564 , TagInfo.Range 2565 , MappingNode::MT_Block); 2566 case Token::TK_FlowSequenceStart: 2567 getNext(); 2568 return new (NodeAllocator) 2569 SequenceNode( stream.CurrentDoc 2570 , AnchorInfo.Range.substr(1) 2571 , TagInfo.Range 2572 , SequenceNode::ST_Flow); 2573 case Token::TK_FlowMappingStart: 2574 getNext(); 2575 return new (NodeAllocator) 2576 MappingNode( stream.CurrentDoc 2577 , AnchorInfo.Range.substr(1) 2578 , TagInfo.Range 2579 , MappingNode::MT_Flow); 2580 case Token::TK_Scalar: 2581 getNext(); 2582 return new (NodeAllocator) 2583 ScalarNode( stream.CurrentDoc 2584 , AnchorInfo.Range.substr(1) 2585 , TagInfo.Range 2586 , T.Range); 2587 case Token::TK_BlockScalar: { 2588 getNext(); 2589 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2590 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2591 return new (NodeAllocator) 2592 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2593 TagInfo.Range, StrCopy, T.Range); 2594 } 2595 case Token::TK_Key: 2596 // Don't eat the TK_Key, KeyValueNode expects it. 2597 return new (NodeAllocator) 2598 MappingNode( stream.CurrentDoc 2599 , AnchorInfo.Range.substr(1) 2600 , TagInfo.Range 2601 , MappingNode::MT_Inline); 2602 case Token::TK_DocumentStart: 2603 case Token::TK_DocumentEnd: 2604 case Token::TK_StreamEnd: 2605 default: 2606 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2607 // !!null null. 2608 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2609 case Token::TK_FlowMappingEnd: 2610 case Token::TK_FlowSequenceEnd: 2611 case Token::TK_FlowEntry: { 2612 if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root))) 2613 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2614 2615 setError("Unexpected token", T); 2616 return nullptr; 2617 } 2618 case Token::TK_Error: 2619 return nullptr; 2620 } 2621 llvm_unreachable("Control flow shouldn't reach here."); 2622 return nullptr; 2623 } 2624 2625 bool Document::parseDirectives() { 2626 bool isDirective = false; 2627 while (true) { 2628 Token T = peekNext(); 2629 if (T.Kind == Token::TK_TagDirective) { 2630 parseTAGDirective(); 2631 isDirective = true; 2632 } else if (T.Kind == Token::TK_VersionDirective) { 2633 parseYAMLDirective(); 2634 isDirective = true; 2635 } else 2636 break; 2637 } 2638 return isDirective; 2639 } 2640 2641 void Document::parseYAMLDirective() { 2642 getNext(); // Eat %YAML <version> 2643 } 2644 2645 void Document::parseTAGDirective() { 2646 Token Tag = getNext(); // %TAG <handle> <prefix> 2647 StringRef T = Tag.Range; 2648 // Strip %TAG 2649 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2650 std::size_t HandleEnd = T.find_first_of(" \t"); 2651 StringRef TagHandle = T.substr(0, HandleEnd); 2652 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2653 TagMap[TagHandle] = TagPrefix; 2654 } 2655 2656 bool Document::expectToken(int TK) { 2657 Token T = getNext(); 2658 if (T.Kind != TK) { 2659 setError("Unexpected token", T); 2660 return false; 2661 } 2662 return true; 2663 } 2664