1 //===- YAMLParser.cpp - Simple YAML parser --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a YAML parser. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/YAMLParser.h" 14 #include "llvm/ADT/AllocatorList.h" 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/None.h" 17 #include "llvm/ADT/STLExtras.h" 18 #include "llvm/ADT/SmallString.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include "llvm/ADT/StringExtras.h" 21 #include "llvm/ADT/StringRef.h" 22 #include "llvm/ADT/Twine.h" 23 #include "llvm/Support/Compiler.h" 24 #include "llvm/Support/ErrorHandling.h" 25 #include "llvm/Support/MemoryBuffer.h" 26 #include "llvm/Support/SMLoc.h" 27 #include "llvm/Support/SourceMgr.h" 28 #include "llvm/Support/Unicode.h" 29 #include "llvm/Support/raw_ostream.h" 30 #include <algorithm> 31 #include <cassert> 32 #include <cstddef> 33 #include <cstdint> 34 #include <map> 35 #include <memory> 36 #include <string> 37 #include <system_error> 38 #include <utility> 39 40 using namespace llvm; 41 using namespace yaml; 42 43 enum UnicodeEncodingForm { 44 UEF_UTF32_LE, ///< UTF-32 Little Endian 45 UEF_UTF32_BE, ///< UTF-32 Big Endian 46 UEF_UTF16_LE, ///< UTF-16 Little Endian 47 UEF_UTF16_BE, ///< UTF-16 Big Endian 48 UEF_UTF8, ///< UTF-8 or ascii. 49 UEF_Unknown ///< Not a valid Unicode encoding. 50 }; 51 52 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 53 /// it exists. Length is in {0, 2, 3, 4}. 54 using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>; 55 56 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 57 /// encoding form of \a Input. 58 /// 59 /// @param Input A string of length 0 or more. 60 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 61 /// and how long the byte order mark is if one exists. 62 static EncodingInfo getUnicodeEncoding(StringRef Input) { 63 if (Input.empty()) 64 return std::make_pair(UEF_Unknown, 0); 65 66 switch (uint8_t(Input[0])) { 67 case 0x00: 68 if (Input.size() >= 4) { 69 if ( Input[1] == 0 70 && uint8_t(Input[2]) == 0xFE 71 && uint8_t(Input[3]) == 0xFF) 72 return std::make_pair(UEF_UTF32_BE, 4); 73 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 74 return std::make_pair(UEF_UTF32_BE, 0); 75 } 76 77 if (Input.size() >= 2 && Input[1] != 0) 78 return std::make_pair(UEF_UTF16_BE, 0); 79 return std::make_pair(UEF_Unknown, 0); 80 case 0xFF: 81 if ( Input.size() >= 4 82 && uint8_t(Input[1]) == 0xFE 83 && Input[2] == 0 84 && Input[3] == 0) 85 return std::make_pair(UEF_UTF32_LE, 4); 86 87 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 88 return std::make_pair(UEF_UTF16_LE, 2); 89 return std::make_pair(UEF_Unknown, 0); 90 case 0xFE: 91 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 92 return std::make_pair(UEF_UTF16_BE, 2); 93 return std::make_pair(UEF_Unknown, 0); 94 case 0xEF: 95 if ( Input.size() >= 3 96 && uint8_t(Input[1]) == 0xBB 97 && uint8_t(Input[2]) == 0xBF) 98 return std::make_pair(UEF_UTF8, 3); 99 return std::make_pair(UEF_Unknown, 0); 100 } 101 102 // It could still be utf-32 or utf-16. 103 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 104 return std::make_pair(UEF_UTF32_LE, 0); 105 106 if (Input.size() >= 2 && Input[1] == 0) 107 return std::make_pair(UEF_UTF16_LE, 0); 108 109 return std::make_pair(UEF_UTF8, 0); 110 } 111 112 /// Pin the vtables to this file. 113 void Node::anchor() {} 114 void NullNode::anchor() {} 115 void ScalarNode::anchor() {} 116 void BlockScalarNode::anchor() {} 117 void KeyValueNode::anchor() {} 118 void MappingNode::anchor() {} 119 void SequenceNode::anchor() {} 120 void AliasNode::anchor() {} 121 122 namespace llvm { 123 namespace yaml { 124 125 /// Token - A single YAML token. 126 struct Token { 127 enum TokenKind { 128 TK_Error, // Uninitialized token. 129 TK_StreamStart, 130 TK_StreamEnd, 131 TK_VersionDirective, 132 TK_TagDirective, 133 TK_DocumentStart, 134 TK_DocumentEnd, 135 TK_BlockEntry, 136 TK_BlockEnd, 137 TK_BlockSequenceStart, 138 TK_BlockMappingStart, 139 TK_FlowEntry, 140 TK_FlowSequenceStart, 141 TK_FlowSequenceEnd, 142 TK_FlowMappingStart, 143 TK_FlowMappingEnd, 144 TK_Key, 145 TK_Value, 146 TK_Scalar, 147 TK_BlockScalar, 148 TK_Alias, 149 TK_Anchor, 150 TK_Tag 151 } Kind = TK_Error; 152 153 /// A string of length 0 or more whose begin() points to the logical location 154 /// of the token in the input. 155 StringRef Range; 156 157 /// The value of a block scalar node. 158 std::string Value; 159 160 Token() = default; 161 }; 162 163 } // end namespace yaml 164 } // end namespace llvm 165 166 using TokenQueueT = BumpPtrList<Token>; 167 168 namespace { 169 170 /// This struct is used to track simple keys. 171 /// 172 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 173 /// which could legally be the start of a simple key. When peekNext is called, 174 /// if the Token To be returned is referenced by a SimpleKey, we continue 175 /// tokenizing until that potential simple key has either been found to not be 176 /// a simple key (we moved on to the next line or went further than 1024 chars). 177 /// Or when we run into a Value, and then insert a Key token (and possibly 178 /// others) before the SimpleKey's Tok. 179 struct SimpleKey { 180 TokenQueueT::iterator Tok; 181 unsigned Column = 0; 182 unsigned Line = 0; 183 unsigned FlowLevel = 0; 184 bool IsRequired = false; 185 186 bool operator ==(const SimpleKey &Other) { 187 return Tok == Other.Tok; 188 } 189 }; 190 191 } // end anonymous namespace 192 193 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit 194 /// subsequence and the subsequence's length in code units (uint8_t). 195 /// A length of 0 represents an error. 196 using UTF8Decoded = std::pair<uint32_t, unsigned>; 197 198 static UTF8Decoded decodeUTF8(StringRef Range) { 199 StringRef::iterator Position= Range.begin(); 200 StringRef::iterator End = Range.end(); 201 // 1 byte: [0x00, 0x7f] 202 // Bit pattern: 0xxxxxxx 203 if ((*Position & 0x80) == 0) { 204 return std::make_pair(*Position, 1); 205 } 206 // 2 bytes: [0x80, 0x7ff] 207 // Bit pattern: 110xxxxx 10xxxxxx 208 if (Position + 1 != End && 209 ((*Position & 0xE0) == 0xC0) && 210 ((*(Position + 1) & 0xC0) == 0x80)) { 211 uint32_t codepoint = ((*Position & 0x1F) << 6) | 212 (*(Position + 1) & 0x3F); 213 if (codepoint >= 0x80) 214 return std::make_pair(codepoint, 2); 215 } 216 // 3 bytes: [0x8000, 0xffff] 217 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 218 if (Position + 2 != End && 219 ((*Position & 0xF0) == 0xE0) && 220 ((*(Position + 1) & 0xC0) == 0x80) && 221 ((*(Position + 2) & 0xC0) == 0x80)) { 222 uint32_t codepoint = ((*Position & 0x0F) << 12) | 223 ((*(Position + 1) & 0x3F) << 6) | 224 (*(Position + 2) & 0x3F); 225 // Codepoints between 0xD800 and 0xDFFF are invalid, as 226 // they are high / low surrogate halves used by UTF-16. 227 if (codepoint >= 0x800 && 228 (codepoint < 0xD800 || codepoint > 0xDFFF)) 229 return std::make_pair(codepoint, 3); 230 } 231 // 4 bytes: [0x10000, 0x10FFFF] 232 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 233 if (Position + 3 != End && 234 ((*Position & 0xF8) == 0xF0) && 235 ((*(Position + 1) & 0xC0) == 0x80) && 236 ((*(Position + 2) & 0xC0) == 0x80) && 237 ((*(Position + 3) & 0xC0) == 0x80)) { 238 uint32_t codepoint = ((*Position & 0x07) << 18) | 239 ((*(Position + 1) & 0x3F) << 12) | 240 ((*(Position + 2) & 0x3F) << 6) | 241 (*(Position + 3) & 0x3F); 242 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 243 return std::make_pair(codepoint, 4); 244 } 245 return std::make_pair(0, 0); 246 } 247 248 namespace llvm { 249 namespace yaml { 250 251 /// Scans YAML tokens from a MemoryBuffer. 252 class Scanner { 253 public: 254 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, 255 std::error_code *EC = nullptr); 256 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, 257 std::error_code *EC = nullptr); 258 259 /// Parse the next token and return it without popping it. 260 Token &peekNext(); 261 262 /// Parse the next token and pop it from the queue. 263 Token getNext(); 264 265 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 266 ArrayRef<SMRange> Ranges = None) { 267 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 268 } 269 270 void setError(const Twine &Message, StringRef::iterator Position) { 271 if (Current >= End) 272 Current = End - 1; 273 274 // propagate the error if possible 275 if (EC) 276 *EC = make_error_code(std::errc::invalid_argument); 277 278 // Don't print out more errors after the first one we encounter. The rest 279 // are just the result of the first, and have no meaning. 280 if (!Failed) 281 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 282 Failed = true; 283 } 284 285 void setError(const Twine &Message) { 286 setError(Message, Current); 287 } 288 289 /// Returns true if an error occurred while parsing. 290 bool failed() { 291 return Failed; 292 } 293 294 private: 295 void init(MemoryBufferRef Buffer); 296 297 StringRef currentInput() { 298 return StringRef(Current, End - Current); 299 } 300 301 /// Decode a UTF-8 minimal well-formed code unit subsequence starting 302 /// at \a Position. 303 /// 304 /// If the UTF-8 code units starting at Position do not form a well-formed 305 /// code unit subsequence, then the Unicode scalar value is 0, and the length 306 /// is 0. 307 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 308 return ::decodeUTF8(StringRef(Position, End - Position)); 309 } 310 311 // The following functions are based on the gramar rules in the YAML spec. The 312 // style of the function names it meant to closely match how they are written 313 // in the spec. The number within the [] is the number of the grammar rule in 314 // the spec. 315 // 316 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 317 // 318 // c- 319 // A production starting and ending with a special character. 320 // b- 321 // A production matching a single line break. 322 // nb- 323 // A production starting and ending with a non-break character. 324 // s- 325 // A production starting and ending with a white space character. 326 // ns- 327 // A production starting and ending with a non-space character. 328 // l- 329 // A production matching complete line(s). 330 331 /// Skip a single nb-char[27] starting at Position. 332 /// 333 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 334 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 335 /// 336 /// @returns The code unit after the nb-char, or Position if it's not an 337 /// nb-char. 338 StringRef::iterator skip_nb_char(StringRef::iterator Position); 339 340 /// Skip a single b-break[28] starting at Position. 341 /// 342 /// A b-break is 0xD 0xA | 0xD | 0xA 343 /// 344 /// @returns The code unit after the b-break, or Position if it's not a 345 /// b-break. 346 StringRef::iterator skip_b_break(StringRef::iterator Position); 347 348 /// Skip a single s-space[31] starting at Position. 349 /// 350 /// An s-space is 0x20 351 /// 352 /// @returns The code unit after the s-space, or Position if it's not a 353 /// s-space. 354 StringRef::iterator skip_s_space(StringRef::iterator Position); 355 356 /// Skip a single s-white[33] starting at Position. 357 /// 358 /// A s-white is 0x20 | 0x9 359 /// 360 /// @returns The code unit after the s-white, or Position if it's not a 361 /// s-white. 362 StringRef::iterator skip_s_white(StringRef::iterator Position); 363 364 /// Skip a single ns-char[34] starting at Position. 365 /// 366 /// A ns-char is nb-char - s-white 367 /// 368 /// @returns The code unit after the ns-char, or Position if it's not a 369 /// ns-char. 370 StringRef::iterator skip_ns_char(StringRef::iterator Position); 371 372 using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); 373 374 /// Skip minimal well-formed code unit subsequences until Func 375 /// returns its input. 376 /// 377 /// @returns The code unit after the last minimal well-formed code unit 378 /// subsequence that Func accepted. 379 StringRef::iterator skip_while( SkipWhileFunc Func 380 , StringRef::iterator Position); 381 382 /// Skip minimal well-formed code unit subsequences until Func returns its 383 /// input. 384 void advanceWhile(SkipWhileFunc Func); 385 386 /// Scan ns-uri-char[39]s starting at Cur. 387 /// 388 /// This updates Cur and Column while scanning. 389 void scan_ns_uri_char(); 390 391 /// Consume a minimal well-formed code unit subsequence starting at 392 /// \a Cur. Return false if it is not the same Unicode scalar value as 393 /// \a Expected. This updates \a Column. 394 bool consume(uint32_t Expected); 395 396 /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 397 void skip(uint32_t Distance); 398 399 /// Return true if the minimal well-formed code unit subsequence at 400 /// Pos is whitespace or a new line 401 bool isBlankOrBreak(StringRef::iterator Position); 402 403 /// Consume a single b-break[28] if it's present at the current position. 404 /// 405 /// Return false if the code unit at the current position isn't a line break. 406 bool consumeLineBreakIfPresent(); 407 408 /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 409 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 410 , unsigned AtColumn 411 , bool IsRequired); 412 413 /// Remove simple keys that can no longer be valid simple keys. 414 /// 415 /// Invalid simple keys are not on the current line or are further than 1024 416 /// columns back. 417 void removeStaleSimpleKeyCandidates(); 418 419 /// Remove all simple keys on FlowLevel \a Level. 420 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 421 422 /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 423 /// tokens if needed. 424 bool unrollIndent(int ToColumn); 425 426 /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 427 /// if needed. 428 bool rollIndent( int ToColumn 429 , Token::TokenKind Kind 430 , TokenQueueT::iterator InsertPoint); 431 432 /// Skip a single-line comment when the comment starts at the current 433 /// position of the scanner. 434 void skipComment(); 435 436 /// Skip whitespace and comments until the start of the next token. 437 void scanToNextToken(); 438 439 /// Must be the first token generated. 440 bool scanStreamStart(); 441 442 /// Generate tokens needed to close out the stream. 443 bool scanStreamEnd(); 444 445 /// Scan a %BLAH directive. 446 bool scanDirective(); 447 448 /// Scan a ... or ---. 449 bool scanDocumentIndicator(bool IsStart); 450 451 /// Scan a [ or { and generate the proper flow collection start token. 452 bool scanFlowCollectionStart(bool IsSequence); 453 454 /// Scan a ] or } and generate the proper flow collection end token. 455 bool scanFlowCollectionEnd(bool IsSequence); 456 457 /// Scan the , that separates entries in a flow collection. 458 bool scanFlowEntry(); 459 460 /// Scan the - that starts block sequence entries. 461 bool scanBlockEntry(); 462 463 /// Scan an explicit ? indicating a key. 464 bool scanKey(); 465 466 /// Scan an explicit : indicating a value. 467 bool scanValue(); 468 469 /// Scan a quoted scalar. 470 bool scanFlowScalar(bool IsDoubleQuoted); 471 472 /// Scan an unquoted scalar. 473 bool scanPlainScalar(); 474 475 /// Scan an Alias or Anchor starting with * or &. 476 bool scanAliasOrAnchor(bool IsAlias); 477 478 /// Scan a block scalar starting with | or >. 479 bool scanBlockScalar(bool IsLiteral); 480 481 /// Scan a chomping indicator in a block scalar header. 482 char scanBlockChompingIndicator(); 483 484 /// Scan an indentation indicator in a block scalar header. 485 unsigned scanBlockIndentationIndicator(); 486 487 /// Scan a block scalar header. 488 /// 489 /// Return false if an error occurred. 490 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 491 bool &IsDone); 492 493 /// Look for the indentation level of a block scalar. 494 /// 495 /// Return false if an error occurred. 496 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 497 unsigned &LineBreaks, bool &IsDone); 498 499 /// Scan the indentation of a text line in a block scalar. 500 /// 501 /// Return false if an error occurred. 502 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 503 bool &IsDone); 504 505 /// Scan a tag of the form !stuff. 506 bool scanTag(); 507 508 /// Dispatch to the next scanning function based on \a *Cur. 509 bool fetchMoreTokens(); 510 511 /// The SourceMgr used for diagnostics and buffer management. 512 SourceMgr &SM; 513 514 /// The original input. 515 MemoryBufferRef InputBuffer; 516 517 /// The current position of the scanner. 518 StringRef::iterator Current; 519 520 /// The end of the input (one past the last character). 521 StringRef::iterator End; 522 523 /// Current YAML indentation level in spaces. 524 int Indent; 525 526 /// Current column number in Unicode code points. 527 unsigned Column; 528 529 /// Current line number. 530 unsigned Line; 531 532 /// How deep we are in flow style containers. 0 Means at block level. 533 unsigned FlowLevel; 534 535 /// Are we at the start of the stream? 536 bool IsStartOfStream; 537 538 /// Can the next token be the start of a simple key? 539 bool IsSimpleKeyAllowed; 540 541 /// True if an error has occurred. 542 bool Failed; 543 544 /// Should colors be used when printing out the diagnostic messages? 545 bool ShowColors; 546 547 /// Queue of tokens. This is required to queue up tokens while looking 548 /// for the end of a simple key. And for cases where a single character 549 /// can produce multiple tokens (e.g. BlockEnd). 550 TokenQueueT TokenQueue; 551 552 /// Indentation levels. 553 SmallVector<int, 4> Indents; 554 555 /// Potential simple keys. 556 SmallVector<SimpleKey, 4> SimpleKeys; 557 558 std::error_code *EC; 559 }; 560 561 } // end namespace yaml 562 } // end namespace llvm 563 564 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 565 static void encodeUTF8( uint32_t UnicodeScalarValue 566 , SmallVectorImpl<char> &Result) { 567 if (UnicodeScalarValue <= 0x7F) { 568 Result.push_back(UnicodeScalarValue & 0x7F); 569 } else if (UnicodeScalarValue <= 0x7FF) { 570 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 571 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 572 Result.push_back(FirstByte); 573 Result.push_back(SecondByte); 574 } else if (UnicodeScalarValue <= 0xFFFF) { 575 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 576 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 577 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 578 Result.push_back(FirstByte); 579 Result.push_back(SecondByte); 580 Result.push_back(ThirdByte); 581 } else if (UnicodeScalarValue <= 0x10FFFF) { 582 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 583 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 584 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 585 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 586 Result.push_back(FirstByte); 587 Result.push_back(SecondByte); 588 Result.push_back(ThirdByte); 589 Result.push_back(FourthByte); 590 } 591 } 592 593 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 594 SourceMgr SM; 595 Scanner scanner(Input, SM); 596 while (true) { 597 Token T = scanner.getNext(); 598 switch (T.Kind) { 599 case Token::TK_StreamStart: 600 OS << "Stream-Start: "; 601 break; 602 case Token::TK_StreamEnd: 603 OS << "Stream-End: "; 604 break; 605 case Token::TK_VersionDirective: 606 OS << "Version-Directive: "; 607 break; 608 case Token::TK_TagDirective: 609 OS << "Tag-Directive: "; 610 break; 611 case Token::TK_DocumentStart: 612 OS << "Document-Start: "; 613 break; 614 case Token::TK_DocumentEnd: 615 OS << "Document-End: "; 616 break; 617 case Token::TK_BlockEntry: 618 OS << "Block-Entry: "; 619 break; 620 case Token::TK_BlockEnd: 621 OS << "Block-End: "; 622 break; 623 case Token::TK_BlockSequenceStart: 624 OS << "Block-Sequence-Start: "; 625 break; 626 case Token::TK_BlockMappingStart: 627 OS << "Block-Mapping-Start: "; 628 break; 629 case Token::TK_FlowEntry: 630 OS << "Flow-Entry: "; 631 break; 632 case Token::TK_FlowSequenceStart: 633 OS << "Flow-Sequence-Start: "; 634 break; 635 case Token::TK_FlowSequenceEnd: 636 OS << "Flow-Sequence-End: "; 637 break; 638 case Token::TK_FlowMappingStart: 639 OS << "Flow-Mapping-Start: "; 640 break; 641 case Token::TK_FlowMappingEnd: 642 OS << "Flow-Mapping-End: "; 643 break; 644 case Token::TK_Key: 645 OS << "Key: "; 646 break; 647 case Token::TK_Value: 648 OS << "Value: "; 649 break; 650 case Token::TK_Scalar: 651 OS << "Scalar: "; 652 break; 653 case Token::TK_BlockScalar: 654 OS << "Block Scalar: "; 655 break; 656 case Token::TK_Alias: 657 OS << "Alias: "; 658 break; 659 case Token::TK_Anchor: 660 OS << "Anchor: "; 661 break; 662 case Token::TK_Tag: 663 OS << "Tag: "; 664 break; 665 case Token::TK_Error: 666 break; 667 } 668 OS << T.Range << "\n"; 669 if (T.Kind == Token::TK_StreamEnd) 670 break; 671 else if (T.Kind == Token::TK_Error) 672 return false; 673 } 674 return true; 675 } 676 677 bool yaml::scanTokens(StringRef Input) { 678 SourceMgr SM; 679 Scanner scanner(Input, SM); 680 while (true) { 681 Token T = scanner.getNext(); 682 if (T.Kind == Token::TK_StreamEnd) 683 break; 684 else if (T.Kind == Token::TK_Error) 685 return false; 686 } 687 return true; 688 } 689 690 std::string yaml::escape(StringRef Input, bool EscapePrintable) { 691 std::string EscapedInput; 692 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 693 if (*i == '\\') 694 EscapedInput += "\\\\"; 695 else if (*i == '"') 696 EscapedInput += "\\\""; 697 else if (*i == 0) 698 EscapedInput += "\\0"; 699 else if (*i == 0x07) 700 EscapedInput += "\\a"; 701 else if (*i == 0x08) 702 EscapedInput += "\\b"; 703 else if (*i == 0x09) 704 EscapedInput += "\\t"; 705 else if (*i == 0x0A) 706 EscapedInput += "\\n"; 707 else if (*i == 0x0B) 708 EscapedInput += "\\v"; 709 else if (*i == 0x0C) 710 EscapedInput += "\\f"; 711 else if (*i == 0x0D) 712 EscapedInput += "\\r"; 713 else if (*i == 0x1B) 714 EscapedInput += "\\e"; 715 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 716 std::string HexStr = utohexstr(*i); 717 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 718 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 719 UTF8Decoded UnicodeScalarValue 720 = decodeUTF8(StringRef(i, Input.end() - i)); 721 if (UnicodeScalarValue.second == 0) { 722 // Found invalid char. 723 SmallString<4> Val; 724 encodeUTF8(0xFFFD, Val); 725 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 726 // FIXME: Error reporting. 727 return EscapedInput; 728 } 729 if (UnicodeScalarValue.first == 0x85) 730 EscapedInput += "\\N"; 731 else if (UnicodeScalarValue.first == 0xA0) 732 EscapedInput += "\\_"; 733 else if (UnicodeScalarValue.first == 0x2028) 734 EscapedInput += "\\L"; 735 else if (UnicodeScalarValue.first == 0x2029) 736 EscapedInput += "\\P"; 737 else if (!EscapePrintable && 738 sys::unicode::isPrintable(UnicodeScalarValue.first)) 739 EscapedInput += StringRef(i, UnicodeScalarValue.second); 740 else { 741 std::string HexStr = utohexstr(UnicodeScalarValue.first); 742 if (HexStr.size() <= 2) 743 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 744 else if (HexStr.size() <= 4) 745 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 746 else if (HexStr.size() <= 8) 747 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 748 } 749 i += UnicodeScalarValue.second - 1; 750 } else 751 EscapedInput.push_back(*i); 752 } 753 return EscapedInput; 754 } 755 756 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, 757 std::error_code *EC) 758 : SM(sm), ShowColors(ShowColors), EC(EC) { 759 init(MemoryBufferRef(Input, "YAML")); 760 } 761 762 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, 763 std::error_code *EC) 764 : SM(SM_), ShowColors(ShowColors), EC(EC) { 765 init(Buffer); 766 } 767 768 void Scanner::init(MemoryBufferRef Buffer) { 769 InputBuffer = Buffer; 770 Current = InputBuffer.getBufferStart(); 771 End = InputBuffer.getBufferEnd(); 772 Indent = -1; 773 Column = 0; 774 Line = 0; 775 FlowLevel = 0; 776 IsStartOfStream = true; 777 IsSimpleKeyAllowed = true; 778 Failed = false; 779 std::unique_ptr<MemoryBuffer> InputBufferOwner = 780 MemoryBuffer::getMemBuffer(Buffer); 781 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 782 } 783 784 Token &Scanner::peekNext() { 785 // If the current token is a possible simple key, keep parsing until we 786 // can confirm. 787 bool NeedMore = false; 788 while (true) { 789 if (TokenQueue.empty() || NeedMore) { 790 if (!fetchMoreTokens()) { 791 TokenQueue.clear(); 792 SimpleKeys.clear(); 793 TokenQueue.push_back(Token()); 794 return TokenQueue.front(); 795 } 796 } 797 assert(!TokenQueue.empty() && 798 "fetchMoreTokens lied about getting tokens!"); 799 800 removeStaleSimpleKeyCandidates(); 801 SimpleKey SK; 802 SK.Tok = TokenQueue.begin(); 803 if (!is_contained(SimpleKeys, SK)) 804 break; 805 else 806 NeedMore = true; 807 } 808 return TokenQueue.front(); 809 } 810 811 Token Scanner::getNext() { 812 Token Ret = peekNext(); 813 // TokenQueue can be empty if there was an error getting the next token. 814 if (!TokenQueue.empty()) 815 TokenQueue.pop_front(); 816 817 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 818 // quick deallocation of them all. 819 if (TokenQueue.empty()) 820 TokenQueue.resetAlloc(); 821 822 return Ret; 823 } 824 825 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 826 if (Position == End) 827 return Position; 828 // Check 7 bit c-printable - b-char. 829 if ( *Position == 0x09 830 || (*Position >= 0x20 && *Position <= 0x7E)) 831 return Position + 1; 832 833 // Check for valid UTF-8. 834 if (uint8_t(*Position) & 0x80) { 835 UTF8Decoded u8d = decodeUTF8(Position); 836 if ( u8d.second != 0 837 && u8d.first != 0xFEFF 838 && ( u8d.first == 0x85 839 || ( u8d.first >= 0xA0 840 && u8d.first <= 0xD7FF) 841 || ( u8d.first >= 0xE000 842 && u8d.first <= 0xFFFD) 843 || ( u8d.first >= 0x10000 844 && u8d.first <= 0x10FFFF))) 845 return Position + u8d.second; 846 } 847 return Position; 848 } 849 850 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 851 if (Position == End) 852 return Position; 853 if (*Position == 0x0D) { 854 if (Position + 1 != End && *(Position + 1) == 0x0A) 855 return Position + 2; 856 return Position + 1; 857 } 858 859 if (*Position == 0x0A) 860 return Position + 1; 861 return Position; 862 } 863 864 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 865 if (Position == End) 866 return Position; 867 if (*Position == ' ') 868 return Position + 1; 869 return Position; 870 } 871 872 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 873 if (Position == End) 874 return Position; 875 if (*Position == ' ' || *Position == '\t') 876 return Position + 1; 877 return Position; 878 } 879 880 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 881 if (Position == End) 882 return Position; 883 if (*Position == ' ' || *Position == '\t') 884 return Position; 885 return skip_nb_char(Position); 886 } 887 888 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 889 , StringRef::iterator Position) { 890 while (true) { 891 StringRef::iterator i = (this->*Func)(Position); 892 if (i == Position) 893 break; 894 Position = i; 895 } 896 return Position; 897 } 898 899 void Scanner::advanceWhile(SkipWhileFunc Func) { 900 auto Final = skip_while(Func, Current); 901 Column += Final - Current; 902 Current = Final; 903 } 904 905 static bool is_ns_hex_digit(const char C) { 906 return (C >= '0' && C <= '9') 907 || (C >= 'a' && C <= 'z') 908 || (C >= 'A' && C <= 'Z'); 909 } 910 911 static bool is_ns_word_char(const char C) { 912 return C == '-' 913 || (C >= 'a' && C <= 'z') 914 || (C >= 'A' && C <= 'Z'); 915 } 916 917 void Scanner::scan_ns_uri_char() { 918 while (true) { 919 if (Current == End) 920 break; 921 if (( *Current == '%' 922 && Current + 2 < End 923 && is_ns_hex_digit(*(Current + 1)) 924 && is_ns_hex_digit(*(Current + 2))) 925 || is_ns_word_char(*Current) 926 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 927 != StringRef::npos) { 928 ++Current; 929 ++Column; 930 } else 931 break; 932 } 933 } 934 935 bool Scanner::consume(uint32_t Expected) { 936 if (Expected >= 0x80) { 937 setError("Cannot consume non-ascii characters"); 938 return false; 939 } 940 if (Current == End) 941 return false; 942 if (uint8_t(*Current) >= 0x80) { 943 setError("Cannot consume non-ascii characters"); 944 return false; 945 } 946 if (uint8_t(*Current) == Expected) { 947 ++Current; 948 ++Column; 949 return true; 950 } 951 return false; 952 } 953 954 void Scanner::skip(uint32_t Distance) { 955 Current += Distance; 956 Column += Distance; 957 assert(Current <= End && "Skipped past the end"); 958 } 959 960 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 961 if (Position == End) 962 return false; 963 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 964 *Position == '\n'; 965 } 966 967 bool Scanner::consumeLineBreakIfPresent() { 968 auto Next = skip_b_break(Current); 969 if (Next == Current) 970 return false; 971 Column = 0; 972 ++Line; 973 Current = Next; 974 return true; 975 } 976 977 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 978 , unsigned AtColumn 979 , bool IsRequired) { 980 if (IsSimpleKeyAllowed) { 981 SimpleKey SK; 982 SK.Tok = Tok; 983 SK.Line = Line; 984 SK.Column = AtColumn; 985 SK.IsRequired = IsRequired; 986 SK.FlowLevel = FlowLevel; 987 SimpleKeys.push_back(SK); 988 } 989 } 990 991 void Scanner::removeStaleSimpleKeyCandidates() { 992 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 993 i != SimpleKeys.end();) { 994 if (i->Line != Line || i->Column + 1024 < Column) { 995 if (i->IsRequired) 996 setError( "Could not find expected : for simple key" 997 , i->Tok->Range.begin()); 998 i = SimpleKeys.erase(i); 999 } else 1000 ++i; 1001 } 1002 } 1003 1004 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 1005 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 1006 SimpleKeys.pop_back(); 1007 } 1008 1009 bool Scanner::unrollIndent(int ToColumn) { 1010 Token T; 1011 // Indentation is ignored in flow. 1012 if (FlowLevel != 0) 1013 return true; 1014 1015 while (Indent > ToColumn) { 1016 T.Kind = Token::TK_BlockEnd; 1017 T.Range = StringRef(Current, 1); 1018 TokenQueue.push_back(T); 1019 Indent = Indents.pop_back_val(); 1020 } 1021 1022 return true; 1023 } 1024 1025 bool Scanner::rollIndent( int ToColumn 1026 , Token::TokenKind Kind 1027 , TokenQueueT::iterator InsertPoint) { 1028 if (FlowLevel) 1029 return true; 1030 if (Indent < ToColumn) { 1031 Indents.push_back(Indent); 1032 Indent = ToColumn; 1033 1034 Token T; 1035 T.Kind = Kind; 1036 T.Range = StringRef(Current, 0); 1037 TokenQueue.insert(InsertPoint, T); 1038 } 1039 return true; 1040 } 1041 1042 void Scanner::skipComment() { 1043 if (*Current != '#') 1044 return; 1045 while (true) { 1046 // This may skip more than one byte, thus Column is only incremented 1047 // for code points. 1048 StringRef::iterator I = skip_nb_char(Current); 1049 if (I == Current) 1050 break; 1051 Current = I; 1052 ++Column; 1053 } 1054 } 1055 1056 void Scanner::scanToNextToken() { 1057 while (true) { 1058 while (*Current == ' ' || *Current == '\t') { 1059 skip(1); 1060 } 1061 1062 skipComment(); 1063 1064 // Skip EOL. 1065 StringRef::iterator i = skip_b_break(Current); 1066 if (i == Current) 1067 break; 1068 Current = i; 1069 ++Line; 1070 Column = 0; 1071 // New lines may start a simple key. 1072 if (!FlowLevel) 1073 IsSimpleKeyAllowed = true; 1074 } 1075 } 1076 1077 bool Scanner::scanStreamStart() { 1078 IsStartOfStream = false; 1079 1080 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1081 1082 Token T; 1083 T.Kind = Token::TK_StreamStart; 1084 T.Range = StringRef(Current, EI.second); 1085 TokenQueue.push_back(T); 1086 Current += EI.second; 1087 return true; 1088 } 1089 1090 bool Scanner::scanStreamEnd() { 1091 // Force an ending new line if one isn't present. 1092 if (Column != 0) { 1093 Column = 0; 1094 ++Line; 1095 } 1096 1097 unrollIndent(-1); 1098 SimpleKeys.clear(); 1099 IsSimpleKeyAllowed = false; 1100 1101 Token T; 1102 T.Kind = Token::TK_StreamEnd; 1103 T.Range = StringRef(Current, 0); 1104 TokenQueue.push_back(T); 1105 return true; 1106 } 1107 1108 bool Scanner::scanDirective() { 1109 // Reset the indentation level. 1110 unrollIndent(-1); 1111 SimpleKeys.clear(); 1112 IsSimpleKeyAllowed = false; 1113 1114 StringRef::iterator Start = Current; 1115 consume('%'); 1116 StringRef::iterator NameStart = Current; 1117 Current = skip_while(&Scanner::skip_ns_char, Current); 1118 StringRef Name(NameStart, Current - NameStart); 1119 Current = skip_while(&Scanner::skip_s_white, Current); 1120 1121 Token T; 1122 if (Name == "YAML") { 1123 Current = skip_while(&Scanner::skip_ns_char, Current); 1124 T.Kind = Token::TK_VersionDirective; 1125 T.Range = StringRef(Start, Current - Start); 1126 TokenQueue.push_back(T); 1127 return true; 1128 } else if(Name == "TAG") { 1129 Current = skip_while(&Scanner::skip_ns_char, Current); 1130 Current = skip_while(&Scanner::skip_s_white, Current); 1131 Current = skip_while(&Scanner::skip_ns_char, Current); 1132 T.Kind = Token::TK_TagDirective; 1133 T.Range = StringRef(Start, Current - Start); 1134 TokenQueue.push_back(T); 1135 return true; 1136 } 1137 return false; 1138 } 1139 1140 bool Scanner::scanDocumentIndicator(bool IsStart) { 1141 unrollIndent(-1); 1142 SimpleKeys.clear(); 1143 IsSimpleKeyAllowed = false; 1144 1145 Token T; 1146 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1147 T.Range = StringRef(Current, 3); 1148 skip(3); 1149 TokenQueue.push_back(T); 1150 return true; 1151 } 1152 1153 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1154 Token T; 1155 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1156 : Token::TK_FlowMappingStart; 1157 T.Range = StringRef(Current, 1); 1158 skip(1); 1159 TokenQueue.push_back(T); 1160 1161 // [ and { may begin a simple key. 1162 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1163 1164 // And may also be followed by a simple key. 1165 IsSimpleKeyAllowed = true; 1166 ++FlowLevel; 1167 return true; 1168 } 1169 1170 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1171 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1172 IsSimpleKeyAllowed = false; 1173 Token T; 1174 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1175 : Token::TK_FlowMappingEnd; 1176 T.Range = StringRef(Current, 1); 1177 skip(1); 1178 TokenQueue.push_back(T); 1179 if (FlowLevel) 1180 --FlowLevel; 1181 return true; 1182 } 1183 1184 bool Scanner::scanFlowEntry() { 1185 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1186 IsSimpleKeyAllowed = true; 1187 Token T; 1188 T.Kind = Token::TK_FlowEntry; 1189 T.Range = StringRef(Current, 1); 1190 skip(1); 1191 TokenQueue.push_back(T); 1192 return true; 1193 } 1194 1195 bool Scanner::scanBlockEntry() { 1196 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1197 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1198 IsSimpleKeyAllowed = true; 1199 Token T; 1200 T.Kind = Token::TK_BlockEntry; 1201 T.Range = StringRef(Current, 1); 1202 skip(1); 1203 TokenQueue.push_back(T); 1204 return true; 1205 } 1206 1207 bool Scanner::scanKey() { 1208 if (!FlowLevel) 1209 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1210 1211 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1212 IsSimpleKeyAllowed = !FlowLevel; 1213 1214 Token T; 1215 T.Kind = Token::TK_Key; 1216 T.Range = StringRef(Current, 1); 1217 skip(1); 1218 TokenQueue.push_back(T); 1219 return true; 1220 } 1221 1222 bool Scanner::scanValue() { 1223 // If the previous token could have been a simple key, insert the key token 1224 // into the token queue. 1225 if (!SimpleKeys.empty()) { 1226 SimpleKey SK = SimpleKeys.pop_back_val(); 1227 Token T; 1228 T.Kind = Token::TK_Key; 1229 T.Range = SK.Tok->Range; 1230 TokenQueueT::iterator i, e; 1231 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1232 if (i == SK.Tok) 1233 break; 1234 } 1235 if (i == e) { 1236 Failed = true; 1237 return false; 1238 } 1239 i = TokenQueue.insert(i, T); 1240 1241 // We may also need to add a Block-Mapping-Start token. 1242 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1243 1244 IsSimpleKeyAllowed = false; 1245 } else { 1246 if (!FlowLevel) 1247 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1248 IsSimpleKeyAllowed = !FlowLevel; 1249 } 1250 1251 Token T; 1252 T.Kind = Token::TK_Value; 1253 T.Range = StringRef(Current, 1); 1254 skip(1); 1255 TokenQueue.push_back(T); 1256 return true; 1257 } 1258 1259 // Forbidding inlining improves performance by roughly 20%. 1260 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1261 LLVM_ATTRIBUTE_NOINLINE static bool 1262 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1263 1264 // Returns whether a character at 'Position' was escaped with a leading '\'. 1265 // 'First' specifies the position of the first character in the string. 1266 static bool wasEscaped(StringRef::iterator First, 1267 StringRef::iterator Position) { 1268 assert(Position - 1 >= First); 1269 StringRef::iterator I = Position - 1; 1270 // We calculate the number of consecutive '\'s before the current position 1271 // by iterating backwards through our string. 1272 while (I >= First && *I == '\\') --I; 1273 // (Position - 1 - I) now contains the number of '\'s before the current 1274 // position. If it is odd, the character at 'Position' was escaped. 1275 return (Position - 1 - I) % 2 == 1; 1276 } 1277 1278 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1279 StringRef::iterator Start = Current; 1280 unsigned ColStart = Column; 1281 if (IsDoubleQuoted) { 1282 do { 1283 ++Current; 1284 while (Current != End && *Current != '"') 1285 ++Current; 1286 // Repeat until the previous character was not a '\' or was an escaped 1287 // backslash. 1288 } while ( Current != End 1289 && *(Current - 1) == '\\' 1290 && wasEscaped(Start + 1, Current)); 1291 } else { 1292 skip(1); 1293 while (true) { 1294 // Skip a ' followed by another '. 1295 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1296 skip(2); 1297 continue; 1298 } else if (*Current == '\'') 1299 break; 1300 StringRef::iterator i = skip_nb_char(Current); 1301 if (i == Current) { 1302 i = skip_b_break(Current); 1303 if (i == Current) 1304 break; 1305 Current = i; 1306 Column = 0; 1307 ++Line; 1308 } else { 1309 if (i == End) 1310 break; 1311 Current = i; 1312 ++Column; 1313 } 1314 } 1315 } 1316 1317 if (Current == End) { 1318 setError("Expected quote at end of scalar", Current); 1319 return false; 1320 } 1321 1322 skip(1); // Skip ending quote. 1323 Token T; 1324 T.Kind = Token::TK_Scalar; 1325 T.Range = StringRef(Start, Current - Start); 1326 TokenQueue.push_back(T); 1327 1328 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1329 1330 IsSimpleKeyAllowed = false; 1331 1332 return true; 1333 } 1334 1335 bool Scanner::scanPlainScalar() { 1336 StringRef::iterator Start = Current; 1337 unsigned ColStart = Column; 1338 unsigned LeadingBlanks = 0; 1339 assert(Indent >= -1 && "Indent must be >= -1 !"); 1340 unsigned indent = static_cast<unsigned>(Indent + 1); 1341 while (true) { 1342 if (*Current == '#') 1343 break; 1344 1345 while (!isBlankOrBreak(Current)) { 1346 if ( FlowLevel && *Current == ':' 1347 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1348 setError("Found unexpected ':' while scanning a plain scalar", Current); 1349 return false; 1350 } 1351 1352 // Check for the end of the plain scalar. 1353 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1354 || ( FlowLevel 1355 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1356 != StringRef::npos))) 1357 break; 1358 1359 StringRef::iterator i = skip_nb_char(Current); 1360 if (i == Current) 1361 break; 1362 Current = i; 1363 ++Column; 1364 } 1365 1366 // Are we at the end? 1367 if (!isBlankOrBreak(Current)) 1368 break; 1369 1370 // Eat blanks. 1371 StringRef::iterator Tmp = Current; 1372 while (isBlankOrBreak(Tmp)) { 1373 StringRef::iterator i = skip_s_white(Tmp); 1374 if (i != Tmp) { 1375 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1376 setError("Found invalid tab character in indentation", Tmp); 1377 return false; 1378 } 1379 Tmp = i; 1380 ++Column; 1381 } else { 1382 i = skip_b_break(Tmp); 1383 if (!LeadingBlanks) 1384 LeadingBlanks = 1; 1385 Tmp = i; 1386 Column = 0; 1387 ++Line; 1388 } 1389 } 1390 1391 if (!FlowLevel && Column < indent) 1392 break; 1393 1394 Current = Tmp; 1395 } 1396 if (Start == Current) { 1397 setError("Got empty plain scalar", Start); 1398 return false; 1399 } 1400 Token T; 1401 T.Kind = Token::TK_Scalar; 1402 T.Range = StringRef(Start, Current - Start); 1403 TokenQueue.push_back(T); 1404 1405 // Plain scalars can be simple keys. 1406 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1407 1408 IsSimpleKeyAllowed = false; 1409 1410 return true; 1411 } 1412 1413 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1414 StringRef::iterator Start = Current; 1415 unsigned ColStart = Column; 1416 skip(1); 1417 while(true) { 1418 if ( *Current == '[' || *Current == ']' 1419 || *Current == '{' || *Current == '}' 1420 || *Current == ',' 1421 || *Current == ':') 1422 break; 1423 StringRef::iterator i = skip_ns_char(Current); 1424 if (i == Current) 1425 break; 1426 Current = i; 1427 ++Column; 1428 } 1429 1430 if (Start == Current) { 1431 setError("Got empty alias or anchor", Start); 1432 return false; 1433 } 1434 1435 Token T; 1436 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1437 T.Range = StringRef(Start, Current - Start); 1438 TokenQueue.push_back(T); 1439 1440 // Alias and anchors can be simple keys. 1441 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1442 1443 IsSimpleKeyAllowed = false; 1444 1445 return true; 1446 } 1447 1448 char Scanner::scanBlockChompingIndicator() { 1449 char Indicator = ' '; 1450 if (Current != End && (*Current == '+' || *Current == '-')) { 1451 Indicator = *Current; 1452 skip(1); 1453 } 1454 return Indicator; 1455 } 1456 1457 /// Get the number of line breaks after chomping. 1458 /// 1459 /// Return the number of trailing line breaks to emit, depending on 1460 /// \p ChompingIndicator. 1461 static unsigned getChompedLineBreaks(char ChompingIndicator, 1462 unsigned LineBreaks, StringRef Str) { 1463 if (ChompingIndicator == '-') // Strip all line breaks. 1464 return 0; 1465 if (ChompingIndicator == '+') // Keep all line breaks. 1466 return LineBreaks; 1467 // Clip trailing lines. 1468 return Str.empty() ? 0 : 1; 1469 } 1470 1471 unsigned Scanner::scanBlockIndentationIndicator() { 1472 unsigned Indent = 0; 1473 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1474 Indent = unsigned(*Current - '0'); 1475 skip(1); 1476 } 1477 return Indent; 1478 } 1479 1480 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1481 unsigned &IndentIndicator, bool &IsDone) { 1482 auto Start = Current; 1483 1484 ChompingIndicator = scanBlockChompingIndicator(); 1485 IndentIndicator = scanBlockIndentationIndicator(); 1486 // Check for the chomping indicator once again. 1487 if (ChompingIndicator == ' ') 1488 ChompingIndicator = scanBlockChompingIndicator(); 1489 Current = skip_while(&Scanner::skip_s_white, Current); 1490 skipComment(); 1491 1492 if (Current == End) { // EOF, we have an empty scalar. 1493 Token T; 1494 T.Kind = Token::TK_BlockScalar; 1495 T.Range = StringRef(Start, Current - Start); 1496 TokenQueue.push_back(T); 1497 IsDone = true; 1498 return true; 1499 } 1500 1501 if (!consumeLineBreakIfPresent()) { 1502 setError("Expected a line break after block scalar header", Current); 1503 return false; 1504 } 1505 return true; 1506 } 1507 1508 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1509 unsigned BlockExitIndent, 1510 unsigned &LineBreaks, bool &IsDone) { 1511 unsigned MaxAllSpaceLineCharacters = 0; 1512 StringRef::iterator LongestAllSpaceLine; 1513 1514 while (true) { 1515 advanceWhile(&Scanner::skip_s_space); 1516 if (skip_nb_char(Current) != Current) { 1517 // This line isn't empty, so try and find the indentation. 1518 if (Column <= BlockExitIndent) { // End of the block literal. 1519 IsDone = true; 1520 return true; 1521 } 1522 // We found the block's indentation. 1523 BlockIndent = Column; 1524 if (MaxAllSpaceLineCharacters > BlockIndent) { 1525 setError( 1526 "Leading all-spaces line must be smaller than the block indent", 1527 LongestAllSpaceLine); 1528 return false; 1529 } 1530 return true; 1531 } 1532 if (skip_b_break(Current) != Current && 1533 Column > MaxAllSpaceLineCharacters) { 1534 // Record the longest all-space line in case it's longer than the 1535 // discovered block indent. 1536 MaxAllSpaceLineCharacters = Column; 1537 LongestAllSpaceLine = Current; 1538 } 1539 1540 // Check for EOF. 1541 if (Current == End) { 1542 IsDone = true; 1543 return true; 1544 } 1545 1546 if (!consumeLineBreakIfPresent()) { 1547 IsDone = true; 1548 return true; 1549 } 1550 ++LineBreaks; 1551 } 1552 return true; 1553 } 1554 1555 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1556 unsigned BlockExitIndent, bool &IsDone) { 1557 // Skip the indentation. 1558 while (Column < BlockIndent) { 1559 auto I = skip_s_space(Current); 1560 if (I == Current) 1561 break; 1562 Current = I; 1563 ++Column; 1564 } 1565 1566 if (skip_nb_char(Current) == Current) 1567 return true; 1568 1569 if (Column <= BlockExitIndent) { // End of the block literal. 1570 IsDone = true; 1571 return true; 1572 } 1573 1574 if (Column < BlockIndent) { 1575 if (Current != End && *Current == '#') { // Trailing comment. 1576 IsDone = true; 1577 return true; 1578 } 1579 setError("A text line is less indented than the block scalar", Current); 1580 return false; 1581 } 1582 return true; // A normal text line. 1583 } 1584 1585 bool Scanner::scanBlockScalar(bool IsLiteral) { 1586 // Eat '|' or '>' 1587 assert(*Current == '|' || *Current == '>'); 1588 skip(1); 1589 1590 char ChompingIndicator; 1591 unsigned BlockIndent; 1592 bool IsDone = false; 1593 if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone)) 1594 return false; 1595 if (IsDone) 1596 return true; 1597 1598 auto Start = Current; 1599 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; 1600 unsigned LineBreaks = 0; 1601 if (BlockIndent == 0) { 1602 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, 1603 IsDone)) 1604 return false; 1605 } 1606 1607 // Scan the block's scalars body. 1608 SmallString<256> Str; 1609 while (!IsDone) { 1610 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) 1611 return false; 1612 if (IsDone) 1613 break; 1614 1615 // Parse the current line. 1616 auto LineStart = Current; 1617 advanceWhile(&Scanner::skip_nb_char); 1618 if (LineStart != Current) { 1619 Str.append(LineBreaks, '\n'); 1620 Str.append(StringRef(LineStart, Current - LineStart)); 1621 LineBreaks = 0; 1622 } 1623 1624 // Check for EOF. 1625 if (Current == End) 1626 break; 1627 1628 if (!consumeLineBreakIfPresent()) 1629 break; 1630 ++LineBreaks; 1631 } 1632 1633 if (Current == End && !LineBreaks) 1634 // Ensure that there is at least one line break before the end of file. 1635 LineBreaks = 1; 1636 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1637 1638 // New lines may start a simple key. 1639 if (!FlowLevel) 1640 IsSimpleKeyAllowed = true; 1641 1642 Token T; 1643 T.Kind = Token::TK_BlockScalar; 1644 T.Range = StringRef(Start, Current - Start); 1645 T.Value = Str.str().str(); 1646 TokenQueue.push_back(T); 1647 return true; 1648 } 1649 1650 bool Scanner::scanTag() { 1651 StringRef::iterator Start = Current; 1652 unsigned ColStart = Column; 1653 skip(1); // Eat !. 1654 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1655 else if (*Current == '<') { 1656 skip(1); 1657 scan_ns_uri_char(); 1658 if (!consume('>')) 1659 return false; 1660 } else { 1661 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1662 Current = skip_while(&Scanner::skip_ns_char, Current); 1663 } 1664 1665 Token T; 1666 T.Kind = Token::TK_Tag; 1667 T.Range = StringRef(Start, Current - Start); 1668 TokenQueue.push_back(T); 1669 1670 // Tags can be simple keys. 1671 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1672 1673 IsSimpleKeyAllowed = false; 1674 1675 return true; 1676 } 1677 1678 bool Scanner::fetchMoreTokens() { 1679 if (IsStartOfStream) 1680 return scanStreamStart(); 1681 1682 scanToNextToken(); 1683 1684 if (Current == End) 1685 return scanStreamEnd(); 1686 1687 removeStaleSimpleKeyCandidates(); 1688 1689 unrollIndent(Column); 1690 1691 if (Column == 0 && *Current == '%') 1692 return scanDirective(); 1693 1694 if (Column == 0 && Current + 4 <= End 1695 && *Current == '-' 1696 && *(Current + 1) == '-' 1697 && *(Current + 2) == '-' 1698 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1699 return scanDocumentIndicator(true); 1700 1701 if (Column == 0 && Current + 4 <= End 1702 && *Current == '.' 1703 && *(Current + 1) == '.' 1704 && *(Current + 2) == '.' 1705 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1706 return scanDocumentIndicator(false); 1707 1708 if (*Current == '[') 1709 return scanFlowCollectionStart(true); 1710 1711 if (*Current == '{') 1712 return scanFlowCollectionStart(false); 1713 1714 if (*Current == ']') 1715 return scanFlowCollectionEnd(true); 1716 1717 if (*Current == '}') 1718 return scanFlowCollectionEnd(false); 1719 1720 if (*Current == ',') 1721 return scanFlowEntry(); 1722 1723 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1724 return scanBlockEntry(); 1725 1726 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1727 return scanKey(); 1728 1729 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1730 return scanValue(); 1731 1732 if (*Current == '*') 1733 return scanAliasOrAnchor(true); 1734 1735 if (*Current == '&') 1736 return scanAliasOrAnchor(false); 1737 1738 if (*Current == '!') 1739 return scanTag(); 1740 1741 if (*Current == '|' && !FlowLevel) 1742 return scanBlockScalar(true); 1743 1744 if (*Current == '>' && !FlowLevel) 1745 return scanBlockScalar(false); 1746 1747 if (*Current == '\'') 1748 return scanFlowScalar(false); 1749 1750 if (*Current == '"') 1751 return scanFlowScalar(true); 1752 1753 // Get a plain scalar. 1754 StringRef FirstChar(Current, 1); 1755 if (!(isBlankOrBreak(Current) 1756 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1757 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1758 || (!FlowLevel && (*Current == '?' || *Current == ':') 1759 && isBlankOrBreak(Current + 1)) 1760 || (!FlowLevel && *Current == ':' 1761 && Current + 2 < End 1762 && *(Current + 1) == ':' 1763 && !isBlankOrBreak(Current + 2))) 1764 return scanPlainScalar(); 1765 1766 setError("Unrecognized character while tokenizing."); 1767 return false; 1768 } 1769 1770 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, 1771 std::error_code *EC) 1772 : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {} 1773 1774 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, 1775 std::error_code *EC) 1776 : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {} 1777 1778 Stream::~Stream() = default; 1779 1780 bool Stream::failed() { return scanner->failed(); } 1781 1782 void Stream::printError(Node *N, const Twine &Msg) { 1783 SMRange Range = N ? N->getSourceRange() : SMRange(); 1784 scanner->printError( Range.Start 1785 , SourceMgr::DK_Error 1786 , Msg 1787 , Range); 1788 } 1789 1790 document_iterator Stream::begin() { 1791 if (CurrentDoc) 1792 report_fatal_error("Can only iterate over the stream once"); 1793 1794 // Skip Stream-Start. 1795 scanner->getNext(); 1796 1797 CurrentDoc.reset(new Document(*this)); 1798 return document_iterator(CurrentDoc); 1799 } 1800 1801 document_iterator Stream::end() { 1802 return document_iterator(); 1803 } 1804 1805 void Stream::skip() { 1806 for (document_iterator i = begin(), e = end(); i != e; ++i) 1807 i->skip(); 1808 } 1809 1810 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1811 StringRef T) 1812 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1813 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1814 SourceRange = SMRange(Start, Start); 1815 } 1816 1817 std::string Node::getVerbatimTag() const { 1818 StringRef Raw = getRawTag(); 1819 if (!Raw.empty() && Raw != "!") { 1820 std::string Ret; 1821 if (Raw.find_last_of('!') == 0) { 1822 Ret = Doc->getTagMap().find("!")->second; 1823 Ret += Raw.substr(1); 1824 return Ret; 1825 } else if (Raw.startswith("!!")) { 1826 Ret = Doc->getTagMap().find("!!")->second; 1827 Ret += Raw.substr(2); 1828 return Ret; 1829 } else { 1830 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1831 std::map<StringRef, StringRef>::const_iterator It = 1832 Doc->getTagMap().find(TagHandle); 1833 if (It != Doc->getTagMap().end()) 1834 Ret = It->second; 1835 else { 1836 Token T; 1837 T.Kind = Token::TK_Tag; 1838 T.Range = TagHandle; 1839 setError(Twine("Unknown tag handle ") + TagHandle, T); 1840 } 1841 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1842 return Ret; 1843 } 1844 } 1845 1846 switch (getType()) { 1847 case NK_Null: 1848 return "tag:yaml.org,2002:null"; 1849 case NK_Scalar: 1850 case NK_BlockScalar: 1851 // TODO: Tag resolution. 1852 return "tag:yaml.org,2002:str"; 1853 case NK_Mapping: 1854 return "tag:yaml.org,2002:map"; 1855 case NK_Sequence: 1856 return "tag:yaml.org,2002:seq"; 1857 } 1858 1859 return ""; 1860 } 1861 1862 Token &Node::peekNext() { 1863 return Doc->peekNext(); 1864 } 1865 1866 Token Node::getNext() { 1867 return Doc->getNext(); 1868 } 1869 1870 Node *Node::parseBlockNode() { 1871 return Doc->parseBlockNode(); 1872 } 1873 1874 BumpPtrAllocator &Node::getAllocator() { 1875 return Doc->NodeAllocator; 1876 } 1877 1878 void Node::setError(const Twine &Msg, Token &Tok) const { 1879 Doc->setError(Msg, Tok); 1880 } 1881 1882 bool Node::failed() const { 1883 return Doc->failed(); 1884 } 1885 1886 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1887 // TODO: Handle newlines properly. We need to remove leading whitespace. 1888 if (Value[0] == '"') { // Double quoted. 1889 // Pull off the leading and trailing "s. 1890 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1891 // Search for characters that would require unescaping the value. 1892 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1893 if (i != StringRef::npos) 1894 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1895 return UnquotedValue; 1896 } else if (Value[0] == '\'') { // Single quoted. 1897 // Pull off the leading and trailing 's. 1898 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1899 StringRef::size_type i = UnquotedValue.find('\''); 1900 if (i != StringRef::npos) { 1901 // We're going to need Storage. 1902 Storage.clear(); 1903 Storage.reserve(UnquotedValue.size()); 1904 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1905 StringRef Valid(UnquotedValue.begin(), i); 1906 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1907 Storage.push_back('\''); 1908 UnquotedValue = UnquotedValue.substr(i + 2); 1909 } 1910 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1911 return StringRef(Storage.begin(), Storage.size()); 1912 } 1913 return UnquotedValue; 1914 } 1915 // Plain or block. 1916 return Value.rtrim(' '); 1917 } 1918 1919 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1920 , StringRef::size_type i 1921 , SmallVectorImpl<char> &Storage) 1922 const { 1923 // Use Storage to build proper value. 1924 Storage.clear(); 1925 Storage.reserve(UnquotedValue.size()); 1926 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1927 // Insert all previous chars into Storage. 1928 StringRef Valid(UnquotedValue.begin(), i); 1929 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1930 // Chop off inserted chars. 1931 UnquotedValue = UnquotedValue.substr(i); 1932 1933 assert(!UnquotedValue.empty() && "Can't be empty!"); 1934 1935 // Parse escape or line break. 1936 switch (UnquotedValue[0]) { 1937 case '\r': 1938 case '\n': 1939 Storage.push_back('\n'); 1940 if ( UnquotedValue.size() > 1 1941 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1942 UnquotedValue = UnquotedValue.substr(1); 1943 UnquotedValue = UnquotedValue.substr(1); 1944 break; 1945 default: 1946 if (UnquotedValue.size() == 1) { 1947 Token T; 1948 T.Range = StringRef(UnquotedValue.begin(), 1); 1949 setError("Unrecognized escape code", T); 1950 return ""; 1951 } 1952 UnquotedValue = UnquotedValue.substr(1); 1953 switch (UnquotedValue[0]) { 1954 default: { 1955 Token T; 1956 T.Range = StringRef(UnquotedValue.begin(), 1); 1957 setError("Unrecognized escape code", T); 1958 return ""; 1959 } 1960 case '\r': 1961 case '\n': 1962 // Remove the new line. 1963 if ( UnquotedValue.size() > 1 1964 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1965 UnquotedValue = UnquotedValue.substr(1); 1966 // If this was just a single byte newline, it will get skipped 1967 // below. 1968 break; 1969 case '0': 1970 Storage.push_back(0x00); 1971 break; 1972 case 'a': 1973 Storage.push_back(0x07); 1974 break; 1975 case 'b': 1976 Storage.push_back(0x08); 1977 break; 1978 case 't': 1979 case 0x09: 1980 Storage.push_back(0x09); 1981 break; 1982 case 'n': 1983 Storage.push_back(0x0A); 1984 break; 1985 case 'v': 1986 Storage.push_back(0x0B); 1987 break; 1988 case 'f': 1989 Storage.push_back(0x0C); 1990 break; 1991 case 'r': 1992 Storage.push_back(0x0D); 1993 break; 1994 case 'e': 1995 Storage.push_back(0x1B); 1996 break; 1997 case ' ': 1998 Storage.push_back(0x20); 1999 break; 2000 case '"': 2001 Storage.push_back(0x22); 2002 break; 2003 case '/': 2004 Storage.push_back(0x2F); 2005 break; 2006 case '\\': 2007 Storage.push_back(0x5C); 2008 break; 2009 case 'N': 2010 encodeUTF8(0x85, Storage); 2011 break; 2012 case '_': 2013 encodeUTF8(0xA0, Storage); 2014 break; 2015 case 'L': 2016 encodeUTF8(0x2028, Storage); 2017 break; 2018 case 'P': 2019 encodeUTF8(0x2029, Storage); 2020 break; 2021 case 'x': { 2022 if (UnquotedValue.size() < 3) 2023 // TODO: Report error. 2024 break; 2025 unsigned int UnicodeScalarValue; 2026 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 2027 // TODO: Report error. 2028 UnicodeScalarValue = 0xFFFD; 2029 encodeUTF8(UnicodeScalarValue, Storage); 2030 UnquotedValue = UnquotedValue.substr(2); 2031 break; 2032 } 2033 case 'u': { 2034 if (UnquotedValue.size() < 5) 2035 // TODO: Report error. 2036 break; 2037 unsigned int UnicodeScalarValue; 2038 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 2039 // TODO: Report error. 2040 UnicodeScalarValue = 0xFFFD; 2041 encodeUTF8(UnicodeScalarValue, Storage); 2042 UnquotedValue = UnquotedValue.substr(4); 2043 break; 2044 } 2045 case 'U': { 2046 if (UnquotedValue.size() < 9) 2047 // TODO: Report error. 2048 break; 2049 unsigned int UnicodeScalarValue; 2050 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2051 // TODO: Report error. 2052 UnicodeScalarValue = 0xFFFD; 2053 encodeUTF8(UnicodeScalarValue, Storage); 2054 UnquotedValue = UnquotedValue.substr(8); 2055 break; 2056 } 2057 } 2058 UnquotedValue = UnquotedValue.substr(1); 2059 } 2060 } 2061 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 2062 return StringRef(Storage.begin(), Storage.size()); 2063 } 2064 2065 Node *KeyValueNode::getKey() { 2066 if (Key) 2067 return Key; 2068 // Handle implicit null keys. 2069 { 2070 Token &t = peekNext(); 2071 if ( t.Kind == Token::TK_BlockEnd 2072 || t.Kind == Token::TK_Value 2073 || t.Kind == Token::TK_Error) { 2074 return Key = new (getAllocator()) NullNode(Doc); 2075 } 2076 if (t.Kind == Token::TK_Key) 2077 getNext(); // skip TK_Key. 2078 } 2079 2080 // Handle explicit null keys. 2081 Token &t = peekNext(); 2082 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2083 return Key = new (getAllocator()) NullNode(Doc); 2084 } 2085 2086 // We've got a normal key. 2087 return Key = parseBlockNode(); 2088 } 2089 2090 Node *KeyValueNode::getValue() { 2091 if (Value) 2092 return Value; 2093 2094 if (Node* Key = getKey()) 2095 Key->skip(); 2096 else { 2097 setError("Null key in Key Value.", peekNext()); 2098 return Value = new (getAllocator()) NullNode(Doc); 2099 } 2100 2101 if (failed()) 2102 return Value = new (getAllocator()) NullNode(Doc); 2103 2104 // Handle implicit null values. 2105 { 2106 Token &t = peekNext(); 2107 if ( t.Kind == Token::TK_BlockEnd 2108 || t.Kind == Token::TK_FlowMappingEnd 2109 || t.Kind == Token::TK_Key 2110 || t.Kind == Token::TK_FlowEntry 2111 || t.Kind == Token::TK_Error) { 2112 return Value = new (getAllocator()) NullNode(Doc); 2113 } 2114 2115 if (t.Kind != Token::TK_Value) { 2116 setError("Unexpected token in Key Value.", t); 2117 return Value = new (getAllocator()) NullNode(Doc); 2118 } 2119 getNext(); // skip TK_Value. 2120 } 2121 2122 // Handle explicit null values. 2123 Token &t = peekNext(); 2124 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 2125 return Value = new (getAllocator()) NullNode(Doc); 2126 } 2127 2128 // We got a normal value. 2129 return Value = parseBlockNode(); 2130 } 2131 2132 void MappingNode::increment() { 2133 if (failed()) { 2134 IsAtEnd = true; 2135 CurrentEntry = nullptr; 2136 return; 2137 } 2138 if (CurrentEntry) { 2139 CurrentEntry->skip(); 2140 if (Type == MT_Inline) { 2141 IsAtEnd = true; 2142 CurrentEntry = nullptr; 2143 return; 2144 } 2145 } 2146 Token T = peekNext(); 2147 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 2148 // KeyValueNode eats the TK_Key. That way it can detect null keys. 2149 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 2150 } else if (Type == MT_Block) { 2151 switch (T.Kind) { 2152 case Token::TK_BlockEnd: 2153 getNext(); 2154 IsAtEnd = true; 2155 CurrentEntry = nullptr; 2156 break; 2157 default: 2158 setError("Unexpected token. Expected Key or Block End", T); 2159 LLVM_FALLTHROUGH; 2160 case Token::TK_Error: 2161 IsAtEnd = true; 2162 CurrentEntry = nullptr; 2163 } 2164 } else { 2165 switch (T.Kind) { 2166 case Token::TK_FlowEntry: 2167 // Eat the flow entry and recurse. 2168 getNext(); 2169 return increment(); 2170 case Token::TK_FlowMappingEnd: 2171 getNext(); 2172 LLVM_FALLTHROUGH; 2173 case Token::TK_Error: 2174 // Set this to end iterator. 2175 IsAtEnd = true; 2176 CurrentEntry = nullptr; 2177 break; 2178 default: 2179 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 2180 "Mapping End." 2181 , T); 2182 IsAtEnd = true; 2183 CurrentEntry = nullptr; 2184 } 2185 } 2186 } 2187 2188 void SequenceNode::increment() { 2189 if (failed()) { 2190 IsAtEnd = true; 2191 CurrentEntry = nullptr; 2192 return; 2193 } 2194 if (CurrentEntry) 2195 CurrentEntry->skip(); 2196 Token T = peekNext(); 2197 if (SeqType == ST_Block) { 2198 switch (T.Kind) { 2199 case Token::TK_BlockEntry: 2200 getNext(); 2201 CurrentEntry = parseBlockNode(); 2202 if (!CurrentEntry) { // An error occurred. 2203 IsAtEnd = true; 2204 CurrentEntry = nullptr; 2205 } 2206 break; 2207 case Token::TK_BlockEnd: 2208 getNext(); 2209 IsAtEnd = true; 2210 CurrentEntry = nullptr; 2211 break; 2212 default: 2213 setError( "Unexpected token. Expected Block Entry or Block End." 2214 , T); 2215 LLVM_FALLTHROUGH; 2216 case Token::TK_Error: 2217 IsAtEnd = true; 2218 CurrentEntry = nullptr; 2219 } 2220 } else if (SeqType == ST_Indentless) { 2221 switch (T.Kind) { 2222 case Token::TK_BlockEntry: 2223 getNext(); 2224 CurrentEntry = parseBlockNode(); 2225 if (!CurrentEntry) { // An error occurred. 2226 IsAtEnd = true; 2227 CurrentEntry = nullptr; 2228 } 2229 break; 2230 default: 2231 case Token::TK_Error: 2232 IsAtEnd = true; 2233 CurrentEntry = nullptr; 2234 } 2235 } else if (SeqType == ST_Flow) { 2236 switch (T.Kind) { 2237 case Token::TK_FlowEntry: 2238 // Eat the flow entry and recurse. 2239 getNext(); 2240 WasPreviousTokenFlowEntry = true; 2241 return increment(); 2242 case Token::TK_FlowSequenceEnd: 2243 getNext(); 2244 LLVM_FALLTHROUGH; 2245 case Token::TK_Error: 2246 // Set this to end iterator. 2247 IsAtEnd = true; 2248 CurrentEntry = nullptr; 2249 break; 2250 case Token::TK_StreamEnd: 2251 case Token::TK_DocumentEnd: 2252 case Token::TK_DocumentStart: 2253 setError("Could not find closing ]!", T); 2254 // Set this to end iterator. 2255 IsAtEnd = true; 2256 CurrentEntry = nullptr; 2257 break; 2258 default: 2259 if (!WasPreviousTokenFlowEntry) { 2260 setError("Expected , between entries!", T); 2261 IsAtEnd = true; 2262 CurrentEntry = nullptr; 2263 break; 2264 } 2265 // Otherwise it must be a flow entry. 2266 CurrentEntry = parseBlockNode(); 2267 if (!CurrentEntry) { 2268 IsAtEnd = true; 2269 } 2270 WasPreviousTokenFlowEntry = false; 2271 break; 2272 } 2273 } 2274 } 2275 2276 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2277 // Tag maps starts with two default mappings. 2278 TagMap["!"] = "!"; 2279 TagMap["!!"] = "tag:yaml.org,2002:"; 2280 2281 if (parseDirectives()) 2282 expectToken(Token::TK_DocumentStart); 2283 Token &T = peekNext(); 2284 if (T.Kind == Token::TK_DocumentStart) 2285 getNext(); 2286 } 2287 2288 bool Document::skip() { 2289 if (stream.scanner->failed()) 2290 return false; 2291 if (!Root && !getRoot()) 2292 return false; 2293 Root->skip(); 2294 Token &T = peekNext(); 2295 if (T.Kind == Token::TK_StreamEnd) 2296 return false; 2297 if (T.Kind == Token::TK_DocumentEnd) { 2298 getNext(); 2299 return skip(); 2300 } 2301 return true; 2302 } 2303 2304 Token &Document::peekNext() { 2305 return stream.scanner->peekNext(); 2306 } 2307 2308 Token Document::getNext() { 2309 return stream.scanner->getNext(); 2310 } 2311 2312 void Document::setError(const Twine &Message, Token &Location) const { 2313 stream.scanner->setError(Message, Location.Range.begin()); 2314 } 2315 2316 bool Document::failed() const { 2317 return stream.scanner->failed(); 2318 } 2319 2320 Node *Document::parseBlockNode() { 2321 Token T = peekNext(); 2322 // Handle properties. 2323 Token AnchorInfo; 2324 Token TagInfo; 2325 parse_property: 2326 switch (T.Kind) { 2327 case Token::TK_Alias: 2328 getNext(); 2329 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2330 case Token::TK_Anchor: 2331 if (AnchorInfo.Kind == Token::TK_Anchor) { 2332 setError("Already encountered an anchor for this node!", T); 2333 return nullptr; 2334 } 2335 AnchorInfo = getNext(); // Consume TK_Anchor. 2336 T = peekNext(); 2337 goto parse_property; 2338 case Token::TK_Tag: 2339 if (TagInfo.Kind == Token::TK_Tag) { 2340 setError("Already encountered a tag for this node!", T); 2341 return nullptr; 2342 } 2343 TagInfo = getNext(); // Consume TK_Tag. 2344 T = peekNext(); 2345 goto parse_property; 2346 default: 2347 break; 2348 } 2349 2350 switch (T.Kind) { 2351 case Token::TK_BlockEntry: 2352 // We got an unindented BlockEntry sequence. This is not terminated with 2353 // a BlockEnd. 2354 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2355 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2356 , AnchorInfo.Range.substr(1) 2357 , TagInfo.Range 2358 , SequenceNode::ST_Indentless); 2359 case Token::TK_BlockSequenceStart: 2360 getNext(); 2361 return new (NodeAllocator) 2362 SequenceNode( stream.CurrentDoc 2363 , AnchorInfo.Range.substr(1) 2364 , TagInfo.Range 2365 , SequenceNode::ST_Block); 2366 case Token::TK_BlockMappingStart: 2367 getNext(); 2368 return new (NodeAllocator) 2369 MappingNode( stream.CurrentDoc 2370 , AnchorInfo.Range.substr(1) 2371 , TagInfo.Range 2372 , MappingNode::MT_Block); 2373 case Token::TK_FlowSequenceStart: 2374 getNext(); 2375 return new (NodeAllocator) 2376 SequenceNode( stream.CurrentDoc 2377 , AnchorInfo.Range.substr(1) 2378 , TagInfo.Range 2379 , SequenceNode::ST_Flow); 2380 case Token::TK_FlowMappingStart: 2381 getNext(); 2382 return new (NodeAllocator) 2383 MappingNode( stream.CurrentDoc 2384 , AnchorInfo.Range.substr(1) 2385 , TagInfo.Range 2386 , MappingNode::MT_Flow); 2387 case Token::TK_Scalar: 2388 getNext(); 2389 return new (NodeAllocator) 2390 ScalarNode( stream.CurrentDoc 2391 , AnchorInfo.Range.substr(1) 2392 , TagInfo.Range 2393 , T.Range); 2394 case Token::TK_BlockScalar: { 2395 getNext(); 2396 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2397 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2398 return new (NodeAllocator) 2399 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2400 TagInfo.Range, StrCopy, T.Range); 2401 } 2402 case Token::TK_Key: 2403 // Don't eat the TK_Key, KeyValueNode expects it. 2404 return new (NodeAllocator) 2405 MappingNode( stream.CurrentDoc 2406 , AnchorInfo.Range.substr(1) 2407 , TagInfo.Range 2408 , MappingNode::MT_Inline); 2409 case Token::TK_DocumentStart: 2410 case Token::TK_DocumentEnd: 2411 case Token::TK_StreamEnd: 2412 default: 2413 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2414 // !!null null. 2415 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2416 case Token::TK_FlowMappingEnd: 2417 case Token::TK_FlowSequenceEnd: 2418 case Token::TK_FlowEntry: { 2419 if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root))) 2420 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2421 2422 setError("Unexpected token", T); 2423 return nullptr; 2424 } 2425 case Token::TK_Error: 2426 return nullptr; 2427 } 2428 llvm_unreachable("Control flow shouldn't reach here."); 2429 return nullptr; 2430 } 2431 2432 bool Document::parseDirectives() { 2433 bool isDirective = false; 2434 while (true) { 2435 Token T = peekNext(); 2436 if (T.Kind == Token::TK_TagDirective) { 2437 parseTAGDirective(); 2438 isDirective = true; 2439 } else if (T.Kind == Token::TK_VersionDirective) { 2440 parseYAMLDirective(); 2441 isDirective = true; 2442 } else 2443 break; 2444 } 2445 return isDirective; 2446 } 2447 2448 void Document::parseYAMLDirective() { 2449 getNext(); // Eat %YAML <version> 2450 } 2451 2452 void Document::parseTAGDirective() { 2453 Token Tag = getNext(); // %TAG <handle> <prefix> 2454 StringRef T = Tag.Range; 2455 // Strip %TAG 2456 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2457 std::size_t HandleEnd = T.find_first_of(" \t"); 2458 StringRef TagHandle = T.substr(0, HandleEnd); 2459 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2460 TagMap[TagHandle] = TagPrefix; 2461 } 2462 2463 bool Document::expectToken(int TK) { 2464 Token T = getNext(); 2465 if (T.Kind != TK) { 2466 setError("Unexpected token", T); 2467 return false; 2468 } 2469 return true; 2470 } 2471