//===- YAMLParser.cpp - Simple YAML parser --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements a YAML parser.
//
//===----------------------------------------------------------------------===//

#include "llvm/Support/YAMLParser.h"
#include "llvm/ADT/AllocatorList.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/Unicode.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <map>
#include <memory>
#include <string>
#include <system_error>
#include <utility>

using namespace llvm;
using namespace yaml;

/// The Unicode encoding forms that getUnicodeEncoding() distinguishes when it
/// inspects the first bytes of the input.
enum UnicodeEncodingForm {
  UEF_UTF32_LE, ///< UTF-32 Little Endian
  UEF_UTF32_BE, ///< UTF-32 Big Endian
  UEF_UTF16_LE, ///< UTF-16 Little Endian
  UEF_UTF16_BE, ///< UTF-16 Big Endian
  UEF_UTF8,     ///< UTF-8 or ascii.
  UEF_Unknown   ///< Not a valid Unicode encoding.
};

/// EncodingInfo - Holds the encoding type and length of the byte order mark if
///                it exists. Length is in {0, 2, 3, 4}.
54 using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>; 55 56 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 57 /// encoding form of \a Input. 58 /// 59 /// @param Input A string of length 0 or more. 60 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 61 /// and how long the byte order mark is if one exists. 62 static EncodingInfo getUnicodeEncoding(StringRef Input) { 63 if (Input.empty()) 64 return std::make_pair(UEF_Unknown, 0); 65 66 switch (uint8_t(Input[0])) { 67 case 0x00: 68 if (Input.size() >= 4) { 69 if ( Input[1] == 0 70 && uint8_t(Input[2]) == 0xFE 71 && uint8_t(Input[3]) == 0xFF) 72 return std::make_pair(UEF_UTF32_BE, 4); 73 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 74 return std::make_pair(UEF_UTF32_BE, 0); 75 } 76 77 if (Input.size() >= 2 && Input[1] != 0) 78 return std::make_pair(UEF_UTF16_BE, 0); 79 return std::make_pair(UEF_Unknown, 0); 80 case 0xFF: 81 if ( Input.size() >= 4 82 && uint8_t(Input[1]) == 0xFE 83 && Input[2] == 0 84 && Input[3] == 0) 85 return std::make_pair(UEF_UTF32_LE, 4); 86 87 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 88 return std::make_pair(UEF_UTF16_LE, 2); 89 return std::make_pair(UEF_Unknown, 0); 90 case 0xFE: 91 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 92 return std::make_pair(UEF_UTF16_BE, 2); 93 return std::make_pair(UEF_Unknown, 0); 94 case 0xEF: 95 if ( Input.size() >= 3 96 && uint8_t(Input[1]) == 0xBB 97 && uint8_t(Input[2]) == 0xBF) 98 return std::make_pair(UEF_UTF8, 3); 99 return std::make_pair(UEF_Unknown, 0); 100 } 101 102 // It could still be utf-32 or utf-16. 103 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 104 return std::make_pair(UEF_UTF32_LE, 0); 105 106 if (Input.size() >= 2 && Input[1] == 0) 107 return std::make_pair(UEF_UTF16_LE, 0); 108 109 return std::make_pair(UEF_UTF8, 0); 110 } 111 112 /// Pin the vtables to this file. 
// Out-of-line definitions of the anchor() member of each node class.
void Node::anchor() {}
void NullNode::anchor() {}
void ScalarNode::anchor() {}
void BlockScalarNode::anchor() {}
void KeyValueNode::anchor() {}
void MappingNode::anchor() {}
void SequenceNode::anchor() {}
void AliasNode::anchor() {}

namespace llvm {
namespace yaml {

/// Token - A single YAML token.
struct Token {
  /// The kind of token. A default-constructed Token has kind TK_Error.
  enum TokenKind {
    TK_Error, // Uninitialized token.
    TK_StreamStart,
    TK_StreamEnd,
    TK_VersionDirective,
    TK_TagDirective,
    TK_DocumentStart,
    TK_DocumentEnd,
    TK_BlockEntry,
    TK_BlockEnd,
    TK_BlockSequenceStart,
    TK_BlockMappingStart,
    TK_FlowEntry,
    TK_FlowSequenceStart,
    TK_FlowSequenceEnd,
    TK_FlowMappingStart,
    TK_FlowMappingEnd,
    TK_Key,
    TK_Value,
    TK_Scalar,
    TK_BlockScalar,
    TK_Alias,
    TK_Anchor,
    TK_Tag
  } Kind = TK_Error;

  /// A string of length 0 or more whose begin() points to the logical location
  /// of the token in the input.
  StringRef Range;

  /// The value of a block scalar node.
  std::string Value;

  Token() = default;
};

} // end namespace yaml
} // end namespace llvm

/// The container type used for the scanner's queue of pending tokens.
using TokenQueueT = BumpPtrList<Token>;

namespace {

/// This struct is used to track simple keys.
///
/// Simple keys are handled by creating an entry in SimpleKeys for each Token
/// which could legally be the start of a simple key. When peekNext is called,
/// if the Token to be returned is referenced by a SimpleKey, we continue
/// tokenizing until that potential simple key has either been found to not be
/// a simple key (we moved on to the next line or went further than 1024
/// chars), or until we run into a Value, at which point a Key token (and
/// possibly others) is inserted before the SimpleKey's Tok.
179 struct SimpleKey { 180 TokenQueueT::iterator Tok; 181 unsigned Column = 0; 182 unsigned Line = 0; 183 unsigned FlowLevel = 0; 184 bool IsRequired = false; 185 186 bool operator ==(const SimpleKey &Other) { 187 return Tok == Other.Tok; 188 } 189 }; 190 191 } // end anonymous namespace 192 193 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit 194 /// subsequence and the subsequence's length in code units (uint8_t). 195 /// A length of 0 represents an error. 196 using UTF8Decoded = std::pair<uint32_t, unsigned>; 197 198 static UTF8Decoded decodeUTF8(StringRef Range) { 199 StringRef::iterator Position= Range.begin(); 200 StringRef::iterator End = Range.end(); 201 // 1 byte: [0x00, 0x7f] 202 // Bit pattern: 0xxxxxxx 203 if ((*Position & 0x80) == 0) { 204 return std::make_pair(*Position, 1); 205 } 206 // 2 bytes: [0x80, 0x7ff] 207 // Bit pattern: 110xxxxx 10xxxxxx 208 if (Position + 1 != End && 209 ((*Position & 0xE0) == 0xC0) && 210 ((*(Position + 1) & 0xC0) == 0x80)) { 211 uint32_t codepoint = ((*Position & 0x1F) << 6) | 212 (*(Position + 1) & 0x3F); 213 if (codepoint >= 0x80) 214 return std::make_pair(codepoint, 2); 215 } 216 // 3 bytes: [0x8000, 0xffff] 217 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 218 if (Position + 2 != End && 219 ((*Position & 0xF0) == 0xE0) && 220 ((*(Position + 1) & 0xC0) == 0x80) && 221 ((*(Position + 2) & 0xC0) == 0x80)) { 222 uint32_t codepoint = ((*Position & 0x0F) << 12) | 223 ((*(Position + 1) & 0x3F) << 6) | 224 (*(Position + 2) & 0x3F); 225 // Codepoints between 0xD800 and 0xDFFF are invalid, as 226 // they are high / low surrogate halves used by UTF-16. 
227 if (codepoint >= 0x800 && 228 (codepoint < 0xD800 || codepoint > 0xDFFF)) 229 return std::make_pair(codepoint, 3); 230 } 231 // 4 bytes: [0x10000, 0x10FFFF] 232 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 233 if (Position + 3 != End && 234 ((*Position & 0xF8) == 0xF0) && 235 ((*(Position + 1) & 0xC0) == 0x80) && 236 ((*(Position + 2) & 0xC0) == 0x80) && 237 ((*(Position + 3) & 0xC0) == 0x80)) { 238 uint32_t codepoint = ((*Position & 0x07) << 18) | 239 ((*(Position + 1) & 0x3F) << 12) | 240 ((*(Position + 2) & 0x3F) << 6) | 241 (*(Position + 3) & 0x3F); 242 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 243 return std::make_pair(codepoint, 4); 244 } 245 return std::make_pair(0, 0); 246 } 247 248 namespace llvm { 249 namespace yaml { 250 251 /// Scans YAML tokens from a MemoryBuffer. 252 class Scanner { 253 public: 254 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, 255 std::error_code *EC = nullptr); 256 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, 257 std::error_code *EC = nullptr); 258 259 /// Parse the next token and return it without popping it. 260 Token &peekNext(); 261 262 /// Parse the next token and pop it from the queue. 263 Token getNext(); 264 265 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 266 ArrayRef<SMRange> Ranges = None) { 267 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 268 } 269 270 void setError(const Twine &Message, StringRef::iterator Position) { 271 if (Position >= End) 272 Position = End - 1; 273 274 // propagate the error if possible 275 if (EC) 276 *EC = make_error_code(std::errc::invalid_argument); 277 278 // Don't print out more errors after the first one we encounter. The rest 279 // are just the result of the first, and have no meaning. 280 if (!Failed) 281 printError(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, Message); 282 Failed = true; 283 } 284 285 /// Returns true if an error occurred while parsing. 
286 bool failed() { 287 return Failed; 288 } 289 290 private: 291 void init(MemoryBufferRef Buffer); 292 293 StringRef currentInput() { 294 return StringRef(Current, End - Current); 295 } 296 297 /// Decode a UTF-8 minimal well-formed code unit subsequence starting 298 /// at \a Position. 299 /// 300 /// If the UTF-8 code units starting at Position do not form a well-formed 301 /// code unit subsequence, then the Unicode scalar value is 0, and the length 302 /// is 0. 303 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 304 return ::decodeUTF8(StringRef(Position, End - Position)); 305 } 306 307 // The following functions are based on the gramar rules in the YAML spec. The 308 // style of the function names it meant to closely match how they are written 309 // in the spec. The number within the [] is the number of the grammar rule in 310 // the spec. 311 // 312 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 313 // 314 // c- 315 // A production starting and ending with a special character. 316 // b- 317 // A production matching a single line break. 318 // nb- 319 // A production starting and ending with a non-break character. 320 // s- 321 // A production starting and ending with a white space character. 322 // ns- 323 // A production starting and ending with a non-space character. 324 // l- 325 // A production matching complete line(s). 326 327 /// Skip a single nb-char[27] starting at Position. 328 /// 329 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 330 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 331 /// 332 /// @returns The code unit after the nb-char, or Position if it's not an 333 /// nb-char. 334 StringRef::iterator skip_nb_char(StringRef::iterator Position); 335 336 /// Skip a single b-break[28] starting at Position. 337 /// 338 /// A b-break is 0xD 0xA | 0xD | 0xA 339 /// 340 /// @returns The code unit after the b-break, or Position if it's not a 341 /// b-break. 
342 StringRef::iterator skip_b_break(StringRef::iterator Position); 343 344 /// Skip a single s-space[31] starting at Position. 345 /// 346 /// An s-space is 0x20 347 /// 348 /// @returns The code unit after the s-space, or Position if it's not a 349 /// s-space. 350 StringRef::iterator skip_s_space(StringRef::iterator Position); 351 352 /// Skip a single s-white[33] starting at Position. 353 /// 354 /// A s-white is 0x20 | 0x9 355 /// 356 /// @returns The code unit after the s-white, or Position if it's not a 357 /// s-white. 358 StringRef::iterator skip_s_white(StringRef::iterator Position); 359 360 /// Skip a single ns-char[34] starting at Position. 361 /// 362 /// A ns-char is nb-char - s-white 363 /// 364 /// @returns The code unit after the ns-char, or Position if it's not a 365 /// ns-char. 366 StringRef::iterator skip_ns_char(StringRef::iterator Position); 367 368 using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); 369 370 /// Skip minimal well-formed code unit subsequences until Func 371 /// returns its input. 372 /// 373 /// @returns The code unit after the last minimal well-formed code unit 374 /// subsequence that Func accepted. 375 StringRef::iterator skip_while( SkipWhileFunc Func 376 , StringRef::iterator Position); 377 378 /// Skip minimal well-formed code unit subsequences until Func returns its 379 /// input. 380 void advanceWhile(SkipWhileFunc Func); 381 382 /// Scan ns-uri-char[39]s starting at Cur. 383 /// 384 /// This updates Cur and Column while scanning. 385 void scan_ns_uri_char(); 386 387 /// Consume a minimal well-formed code unit subsequence starting at 388 /// \a Cur. Return false if it is not the same Unicode scalar value as 389 /// \a Expected. This updates \a Column. 390 bool consume(uint32_t Expected); 391 392 /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 
393 void skip(uint32_t Distance); 394 395 /// Return true if the minimal well-formed code unit subsequence at 396 /// Pos is whitespace or a new line 397 bool isBlankOrBreak(StringRef::iterator Position); 398 399 /// Consume a single b-break[28] if it's present at the current position. 400 /// 401 /// Return false if the code unit at the current position isn't a line break. 402 bool consumeLineBreakIfPresent(); 403 404 /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 405 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 406 , unsigned AtColumn 407 , bool IsRequired); 408 409 /// Remove simple keys that can no longer be valid simple keys. 410 /// 411 /// Invalid simple keys are not on the current line or are further than 1024 412 /// columns back. 413 void removeStaleSimpleKeyCandidates(); 414 415 /// Remove all simple keys on FlowLevel \a Level. 416 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 417 418 /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 419 /// tokens if needed. 420 bool unrollIndent(int ToColumn); 421 422 /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 423 /// if needed. 424 bool rollIndent( int ToColumn 425 , Token::TokenKind Kind 426 , TokenQueueT::iterator InsertPoint); 427 428 /// Skip a single-line comment when the comment starts at the current 429 /// position of the scanner. 430 void skipComment(); 431 432 /// Skip whitespace and comments until the start of the next token. 433 void scanToNextToken(); 434 435 /// Must be the first token generated. 436 bool scanStreamStart(); 437 438 /// Generate tokens needed to close out the stream. 439 bool scanStreamEnd(); 440 441 /// Scan a %BLAH directive. 442 bool scanDirective(); 443 444 /// Scan a ... or ---. 445 bool scanDocumentIndicator(bool IsStart); 446 447 /// Scan a [ or { and generate the proper flow collection start token. 
448 bool scanFlowCollectionStart(bool IsSequence); 449 450 /// Scan a ] or } and generate the proper flow collection end token. 451 bool scanFlowCollectionEnd(bool IsSequence); 452 453 /// Scan the , that separates entries in a flow collection. 454 bool scanFlowEntry(); 455 456 /// Scan the - that starts block sequence entries. 457 bool scanBlockEntry(); 458 459 /// Scan an explicit ? indicating a key. 460 bool scanKey(); 461 462 /// Scan an explicit : indicating a value. 463 bool scanValue(); 464 465 /// Scan a quoted scalar. 466 bool scanFlowScalar(bool IsDoubleQuoted); 467 468 /// Scan an unquoted scalar. 469 bool scanPlainScalar(); 470 471 /// Scan an Alias or Anchor starting with * or &. 472 bool scanAliasOrAnchor(bool IsAlias); 473 474 /// Scan a block scalar starting with | or >. 475 bool scanBlockScalar(bool IsLiteral); 476 477 /// Scan a chomping indicator in a block scalar header. 478 char scanBlockChompingIndicator(); 479 480 /// Scan an indentation indicator in a block scalar header. 481 unsigned scanBlockIndentationIndicator(); 482 483 /// Scan a block scalar header. 484 /// 485 /// Return false if an error occurred. 486 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 487 bool &IsDone); 488 489 /// Look for the indentation level of a block scalar. 490 /// 491 /// Return false if an error occurred. 492 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 493 unsigned &LineBreaks, bool &IsDone); 494 495 /// Scan the indentation of a text line in a block scalar. 496 /// 497 /// Return false if an error occurred. 498 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 499 bool &IsDone); 500 501 /// Scan a tag of the form !stuff. 502 bool scanTag(); 503 504 /// Dispatch to the next scanning function based on \a *Cur. 505 bool fetchMoreTokens(); 506 507 /// The SourceMgr used for diagnostics and buffer management. 508 SourceMgr &SM; 509 510 /// The original input. 
511 MemoryBufferRef InputBuffer; 512 513 /// The current position of the scanner. 514 StringRef::iterator Current; 515 516 /// The end of the input (one past the last character). 517 StringRef::iterator End; 518 519 /// Current YAML indentation level in spaces. 520 int Indent; 521 522 /// Current column number in Unicode code points. 523 unsigned Column; 524 525 /// Current line number. 526 unsigned Line; 527 528 /// How deep we are in flow style containers. 0 Means at block level. 529 unsigned FlowLevel; 530 531 /// Are we at the start of the stream? 532 bool IsStartOfStream; 533 534 /// Can the next token be the start of a simple key? 535 bool IsSimpleKeyAllowed; 536 537 /// True if an error has occurred. 538 bool Failed; 539 540 /// Should colors be used when printing out the diagnostic messages? 541 bool ShowColors; 542 543 /// Queue of tokens. This is required to queue up tokens while looking 544 /// for the end of a simple key. And for cases where a single character 545 /// can produce multiple tokens (e.g. BlockEnd). 546 TokenQueueT TokenQueue; 547 548 /// Indentation levels. 549 SmallVector<int, 4> Indents; 550 551 /// Potential simple keys. 552 SmallVector<SimpleKey, 4> SimpleKeys; 553 554 std::error_code *EC; 555 }; 556 557 } // end namespace yaml 558 } // end namespace llvm 559 560 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 
561 static void encodeUTF8( uint32_t UnicodeScalarValue 562 , SmallVectorImpl<char> &Result) { 563 if (UnicodeScalarValue <= 0x7F) { 564 Result.push_back(UnicodeScalarValue & 0x7F); 565 } else if (UnicodeScalarValue <= 0x7FF) { 566 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 567 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 568 Result.push_back(FirstByte); 569 Result.push_back(SecondByte); 570 } else if (UnicodeScalarValue <= 0xFFFF) { 571 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 572 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 573 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 574 Result.push_back(FirstByte); 575 Result.push_back(SecondByte); 576 Result.push_back(ThirdByte); 577 } else if (UnicodeScalarValue <= 0x10FFFF) { 578 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 579 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 580 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 581 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 582 Result.push_back(FirstByte); 583 Result.push_back(SecondByte); 584 Result.push_back(ThirdByte); 585 Result.push_back(FourthByte); 586 } 587 } 588 589 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 590 SourceMgr SM; 591 Scanner scanner(Input, SM); 592 while (true) { 593 Token T = scanner.getNext(); 594 switch (T.Kind) { 595 case Token::TK_StreamStart: 596 OS << "Stream-Start: "; 597 break; 598 case Token::TK_StreamEnd: 599 OS << "Stream-End: "; 600 break; 601 case Token::TK_VersionDirective: 602 OS << "Version-Directive: "; 603 break; 604 case Token::TK_TagDirective: 605 OS << "Tag-Directive: "; 606 break; 607 case Token::TK_DocumentStart: 608 OS << "Document-Start: "; 609 break; 610 case Token::TK_DocumentEnd: 611 OS << "Document-End: "; 612 break; 613 case Token::TK_BlockEntry: 614 OS << "Block-Entry: "; 615 break; 616 case Token::TK_BlockEnd: 617 OS << "Block-End: "; 618 break; 
619 case Token::TK_BlockSequenceStart: 620 OS << "Block-Sequence-Start: "; 621 break; 622 case Token::TK_BlockMappingStart: 623 OS << "Block-Mapping-Start: "; 624 break; 625 case Token::TK_FlowEntry: 626 OS << "Flow-Entry: "; 627 break; 628 case Token::TK_FlowSequenceStart: 629 OS << "Flow-Sequence-Start: "; 630 break; 631 case Token::TK_FlowSequenceEnd: 632 OS << "Flow-Sequence-End: "; 633 break; 634 case Token::TK_FlowMappingStart: 635 OS << "Flow-Mapping-Start: "; 636 break; 637 case Token::TK_FlowMappingEnd: 638 OS << "Flow-Mapping-End: "; 639 break; 640 case Token::TK_Key: 641 OS << "Key: "; 642 break; 643 case Token::TK_Value: 644 OS << "Value: "; 645 break; 646 case Token::TK_Scalar: 647 OS << "Scalar: "; 648 break; 649 case Token::TK_BlockScalar: 650 OS << "Block Scalar: "; 651 break; 652 case Token::TK_Alias: 653 OS << "Alias: "; 654 break; 655 case Token::TK_Anchor: 656 OS << "Anchor: "; 657 break; 658 case Token::TK_Tag: 659 OS << "Tag: "; 660 break; 661 case Token::TK_Error: 662 break; 663 } 664 OS << T.Range << "\n"; 665 if (T.Kind == Token::TK_StreamEnd) 666 break; 667 else if (T.Kind == Token::TK_Error) 668 return false; 669 } 670 return true; 671 } 672 673 bool yaml::scanTokens(StringRef Input) { 674 SourceMgr SM; 675 Scanner scanner(Input, SM); 676 while (true) { 677 Token T = scanner.getNext(); 678 if (T.Kind == Token::TK_StreamEnd) 679 break; 680 else if (T.Kind == Token::TK_Error) 681 return false; 682 } 683 return true; 684 } 685 686 std::string yaml::escape(StringRef Input, bool EscapePrintable) { 687 std::string EscapedInput; 688 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 689 if (*i == '\\') 690 EscapedInput += "\\\\"; 691 else if (*i == '"') 692 EscapedInput += "\\\""; 693 else if (*i == 0) 694 EscapedInput += "\\0"; 695 else if (*i == 0x07) 696 EscapedInput += "\\a"; 697 else if (*i == 0x08) 698 EscapedInput += "\\b"; 699 else if (*i == 0x09) 700 EscapedInput += "\\t"; 701 else if (*i == 0x0A) 702 
EscapedInput += "\\n"; 703 else if (*i == 0x0B) 704 EscapedInput += "\\v"; 705 else if (*i == 0x0C) 706 EscapedInput += "\\f"; 707 else if (*i == 0x0D) 708 EscapedInput += "\\r"; 709 else if (*i == 0x1B) 710 EscapedInput += "\\e"; 711 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 712 std::string HexStr = utohexstr(*i); 713 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 714 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 715 UTF8Decoded UnicodeScalarValue 716 = decodeUTF8(StringRef(i, Input.end() - i)); 717 if (UnicodeScalarValue.second == 0) { 718 // Found invalid char. 719 SmallString<4> Val; 720 encodeUTF8(0xFFFD, Val); 721 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 722 // FIXME: Error reporting. 723 return EscapedInput; 724 } 725 if (UnicodeScalarValue.first == 0x85) 726 EscapedInput += "\\N"; 727 else if (UnicodeScalarValue.first == 0xA0) 728 EscapedInput += "\\_"; 729 else if (UnicodeScalarValue.first == 0x2028) 730 EscapedInput += "\\L"; 731 else if (UnicodeScalarValue.first == 0x2029) 732 EscapedInput += "\\P"; 733 else if (!EscapePrintable && 734 sys::unicode::isPrintable(UnicodeScalarValue.first)) 735 EscapedInput += StringRef(i, UnicodeScalarValue.second); 736 else { 737 std::string HexStr = utohexstr(UnicodeScalarValue.first); 738 if (HexStr.size() <= 2) 739 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 740 else if (HexStr.size() <= 4) 741 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 742 else if (HexStr.size() <= 8) 743 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 744 } 745 i += UnicodeScalarValue.second - 1; 746 } else 747 EscapedInput.push_back(*i); 748 } 749 return EscapedInput; 750 } 751 752 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, 753 std::error_code *EC) 754 : SM(sm), ShowColors(ShowColors), EC(EC) { 755 init(MemoryBufferRef(Input, "YAML")); 756 } 757 758 
Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, 759 std::error_code *EC) 760 : SM(SM_), ShowColors(ShowColors), EC(EC) { 761 init(Buffer); 762 } 763 764 void Scanner::init(MemoryBufferRef Buffer) { 765 InputBuffer = Buffer; 766 Current = InputBuffer.getBufferStart(); 767 End = InputBuffer.getBufferEnd(); 768 Indent = -1; 769 Column = 0; 770 Line = 0; 771 FlowLevel = 0; 772 IsStartOfStream = true; 773 IsSimpleKeyAllowed = true; 774 Failed = false; 775 std::unique_ptr<MemoryBuffer> InputBufferOwner = 776 MemoryBuffer::getMemBuffer(Buffer); 777 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 778 } 779 780 Token &Scanner::peekNext() { 781 // If the current token is a possible simple key, keep parsing until we 782 // can confirm. 783 bool NeedMore = false; 784 while (true) { 785 if (TokenQueue.empty() || NeedMore) { 786 if (!fetchMoreTokens()) { 787 TokenQueue.clear(); 788 SimpleKeys.clear(); 789 TokenQueue.push_back(Token()); 790 return TokenQueue.front(); 791 } 792 } 793 assert(!TokenQueue.empty() && 794 "fetchMoreTokens lied about getting tokens!"); 795 796 removeStaleSimpleKeyCandidates(); 797 SimpleKey SK; 798 SK.Tok = TokenQueue.begin(); 799 if (!is_contained(SimpleKeys, SK)) 800 break; 801 else 802 NeedMore = true; 803 } 804 return TokenQueue.front(); 805 } 806 807 Token Scanner::getNext() { 808 Token Ret = peekNext(); 809 // TokenQueue can be empty if there was an error getting the next token. 810 if (!TokenQueue.empty()) 811 TokenQueue.pop_front(); 812 813 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 814 // quick deallocation of them all. 815 if (TokenQueue.empty()) 816 TokenQueue.resetAlloc(); 817 818 return Ret; 819 } 820 821 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 822 if (Position == End) 823 return Position; 824 // Check 7 bit c-printable - b-char. 
825 if ( *Position == 0x09 826 || (*Position >= 0x20 && *Position <= 0x7E)) 827 return Position + 1; 828 829 // Check for valid UTF-8. 830 if (uint8_t(*Position) & 0x80) { 831 UTF8Decoded u8d = decodeUTF8(Position); 832 if ( u8d.second != 0 833 && u8d.first != 0xFEFF 834 && ( u8d.first == 0x85 835 || ( u8d.first >= 0xA0 836 && u8d.first <= 0xD7FF) 837 || ( u8d.first >= 0xE000 838 && u8d.first <= 0xFFFD) 839 || ( u8d.first >= 0x10000 840 && u8d.first <= 0x10FFFF))) 841 return Position + u8d.second; 842 } 843 return Position; 844 } 845 846 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 847 if (Position == End) 848 return Position; 849 if (*Position == 0x0D) { 850 if (Position + 1 != End && *(Position + 1) == 0x0A) 851 return Position + 2; 852 return Position + 1; 853 } 854 855 if (*Position == 0x0A) 856 return Position + 1; 857 return Position; 858 } 859 860 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 861 if (Position == End) 862 return Position; 863 if (*Position == ' ') 864 return Position + 1; 865 return Position; 866 } 867 868 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 869 if (Position == End) 870 return Position; 871 if (*Position == ' ' || *Position == '\t') 872 return Position + 1; 873 return Position; 874 } 875 876 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 877 if (Position == End) 878 return Position; 879 if (*Position == ' ' || *Position == '\t') 880 return Position; 881 return skip_nb_char(Position); 882 } 883 884 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 885 , StringRef::iterator Position) { 886 while (true) { 887 StringRef::iterator i = (this->*Func)(Position); 888 if (i == Position) 889 break; 890 Position = i; 891 } 892 return Position; 893 } 894 895 void Scanner::advanceWhile(SkipWhileFunc Func) { 896 auto Final = skip_while(Func, Current); 897 Column += Final - Current; 898 Current = Final; 899 } 900 901 static bool 
is_ns_hex_digit(const char C) { 902 return (C >= '0' && C <= '9') 903 || (C >= 'a' && C <= 'z') 904 || (C >= 'A' && C <= 'Z'); 905 } 906 907 static bool is_ns_word_char(const char C) { 908 return C == '-' 909 || (C >= 'a' && C <= 'z') 910 || (C >= 'A' && C <= 'Z'); 911 } 912 913 void Scanner::scan_ns_uri_char() { 914 while (true) { 915 if (Current == End) 916 break; 917 if (( *Current == '%' 918 && Current + 2 < End 919 && is_ns_hex_digit(*(Current + 1)) 920 && is_ns_hex_digit(*(Current + 2))) 921 || is_ns_word_char(*Current) 922 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 923 != StringRef::npos) { 924 ++Current; 925 ++Column; 926 } else 927 break; 928 } 929 } 930 931 bool Scanner::consume(uint32_t Expected) { 932 if (Expected >= 0x80) { 933 setError("Cannot consume non-ascii characters", Current); 934 return false; 935 } 936 if (Current == End) 937 return false; 938 if (uint8_t(*Current) >= 0x80) { 939 setError("Cannot consume non-ascii characters", Current); 940 return false; 941 } 942 if (uint8_t(*Current) == Expected) { 943 ++Current; 944 ++Column; 945 return true; 946 } 947 return false; 948 } 949 950 void Scanner::skip(uint32_t Distance) { 951 Current += Distance; 952 Column += Distance; 953 assert(Current <= End && "Skipped past the end"); 954 } 955 956 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 957 if (Position == End) 958 return false; 959 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 960 *Position == '\n'; 961 } 962 963 bool Scanner::consumeLineBreakIfPresent() { 964 auto Next = skip_b_break(Current); 965 if (Next == Current) 966 return false; 967 Column = 0; 968 ++Line; 969 Current = Next; 970 return true; 971 } 972 973 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 974 , unsigned AtColumn 975 , bool IsRequired) { 976 if (IsSimpleKeyAllowed) { 977 SimpleKey SK; 978 SK.Tok = Tok; 979 SK.Line = Line; 980 SK.Column = AtColumn; 981 SK.IsRequired = IsRequired; 982 SK.FlowLevel = 
FlowLevel; 983 SimpleKeys.push_back(SK); 984 } 985 } 986 987 void Scanner::removeStaleSimpleKeyCandidates() { 988 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 989 i != SimpleKeys.end();) { 990 if (i->Line != Line || i->Column + 1024 < Column) { 991 if (i->IsRequired) 992 setError( "Could not find expected : for simple key" 993 , i->Tok->Range.begin()); 994 i = SimpleKeys.erase(i); 995 } else 996 ++i; 997 } 998 } 999 1000 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 1001 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 1002 SimpleKeys.pop_back(); 1003 } 1004 1005 bool Scanner::unrollIndent(int ToColumn) { 1006 Token T; 1007 // Indentation is ignored in flow. 1008 if (FlowLevel != 0) 1009 return true; 1010 1011 while (Indent > ToColumn) { 1012 T.Kind = Token::TK_BlockEnd; 1013 T.Range = StringRef(Current, 1); 1014 TokenQueue.push_back(T); 1015 Indent = Indents.pop_back_val(); 1016 } 1017 1018 return true; 1019 } 1020 1021 bool Scanner::rollIndent( int ToColumn 1022 , Token::TokenKind Kind 1023 , TokenQueueT::iterator InsertPoint) { 1024 if (FlowLevel) 1025 return true; 1026 if (Indent < ToColumn) { 1027 Indents.push_back(Indent); 1028 Indent = ToColumn; 1029 1030 Token T; 1031 T.Kind = Kind; 1032 T.Range = StringRef(Current, 0); 1033 TokenQueue.insert(InsertPoint, T); 1034 } 1035 return true; 1036 } 1037 1038 void Scanner::skipComment() { 1039 if (*Current != '#') 1040 return; 1041 while (true) { 1042 // This may skip more than one byte, thus Column is only incremented 1043 // for code points. 1044 StringRef::iterator I = skip_nb_char(Current); 1045 if (I == Current) 1046 break; 1047 Current = I; 1048 ++Column; 1049 } 1050 } 1051 1052 void Scanner::scanToNextToken() { 1053 while (true) { 1054 while (*Current == ' ' || *Current == '\t') { 1055 skip(1); 1056 } 1057 1058 skipComment(); 1059 1060 // Skip EOL. 
1061 StringRef::iterator i = skip_b_break(Current); 1062 if (i == Current) 1063 break; 1064 Current = i; 1065 ++Line; 1066 Column = 0; 1067 // New lines may start a simple key. 1068 if (!FlowLevel) 1069 IsSimpleKeyAllowed = true; 1070 } 1071 } 1072 1073 bool Scanner::scanStreamStart() { 1074 IsStartOfStream = false; 1075 1076 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1077 1078 Token T; 1079 T.Kind = Token::TK_StreamStart; 1080 T.Range = StringRef(Current, EI.second); 1081 TokenQueue.push_back(T); 1082 Current += EI.second; 1083 return true; 1084 } 1085 1086 bool Scanner::scanStreamEnd() { 1087 // Force an ending new line if one isn't present. 1088 if (Column != 0) { 1089 Column = 0; 1090 ++Line; 1091 } 1092 1093 unrollIndent(-1); 1094 SimpleKeys.clear(); 1095 IsSimpleKeyAllowed = false; 1096 1097 Token T; 1098 T.Kind = Token::TK_StreamEnd; 1099 T.Range = StringRef(Current, 0); 1100 TokenQueue.push_back(T); 1101 return true; 1102 } 1103 1104 bool Scanner::scanDirective() { 1105 // Reset the indentation level. 
1106 unrollIndent(-1); 1107 SimpleKeys.clear(); 1108 IsSimpleKeyAllowed = false; 1109 1110 StringRef::iterator Start = Current; 1111 consume('%'); 1112 StringRef::iterator NameStart = Current; 1113 Current = skip_while(&Scanner::skip_ns_char, Current); 1114 StringRef Name(NameStart, Current - NameStart); 1115 Current = skip_while(&Scanner::skip_s_white, Current); 1116 1117 Token T; 1118 if (Name == "YAML") { 1119 Current = skip_while(&Scanner::skip_ns_char, Current); 1120 T.Kind = Token::TK_VersionDirective; 1121 T.Range = StringRef(Start, Current - Start); 1122 TokenQueue.push_back(T); 1123 return true; 1124 } else if(Name == "TAG") { 1125 Current = skip_while(&Scanner::skip_ns_char, Current); 1126 Current = skip_while(&Scanner::skip_s_white, Current); 1127 Current = skip_while(&Scanner::skip_ns_char, Current); 1128 T.Kind = Token::TK_TagDirective; 1129 T.Range = StringRef(Start, Current - Start); 1130 TokenQueue.push_back(T); 1131 return true; 1132 } 1133 return false; 1134 } 1135 1136 bool Scanner::scanDocumentIndicator(bool IsStart) { 1137 unrollIndent(-1); 1138 SimpleKeys.clear(); 1139 IsSimpleKeyAllowed = false; 1140 1141 Token T; 1142 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1143 T.Range = StringRef(Current, 3); 1144 skip(3); 1145 TokenQueue.push_back(T); 1146 return true; 1147 } 1148 1149 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1150 Token T; 1151 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1152 : Token::TK_FlowMappingStart; 1153 T.Range = StringRef(Current, 1); 1154 skip(1); 1155 TokenQueue.push_back(T); 1156 1157 // [ and { may begin a simple key. 1158 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1159 1160 // And may also be followed by a simple key. 
1161 IsSimpleKeyAllowed = true; 1162 ++FlowLevel; 1163 return true; 1164 } 1165 1166 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1167 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1168 IsSimpleKeyAllowed = false; 1169 Token T; 1170 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1171 : Token::TK_FlowMappingEnd; 1172 T.Range = StringRef(Current, 1); 1173 skip(1); 1174 TokenQueue.push_back(T); 1175 if (FlowLevel) 1176 --FlowLevel; 1177 return true; 1178 } 1179 1180 bool Scanner::scanFlowEntry() { 1181 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1182 IsSimpleKeyAllowed = true; 1183 Token T; 1184 T.Kind = Token::TK_FlowEntry; 1185 T.Range = StringRef(Current, 1); 1186 skip(1); 1187 TokenQueue.push_back(T); 1188 return true; 1189 } 1190 1191 bool Scanner::scanBlockEntry() { 1192 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1193 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1194 IsSimpleKeyAllowed = true; 1195 Token T; 1196 T.Kind = Token::TK_BlockEntry; 1197 T.Range = StringRef(Current, 1); 1198 skip(1); 1199 TokenQueue.push_back(T); 1200 return true; 1201 } 1202 1203 bool Scanner::scanKey() { 1204 if (!FlowLevel) 1205 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1206 1207 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1208 IsSimpleKeyAllowed = !FlowLevel; 1209 1210 Token T; 1211 T.Kind = Token::TK_Key; 1212 T.Range = StringRef(Current, 1); 1213 skip(1); 1214 TokenQueue.push_back(T); 1215 return true; 1216 } 1217 1218 bool Scanner::scanValue() { 1219 // If the previous token could have been a simple key, insert the key token 1220 // into the token queue. 
1221 if (!SimpleKeys.empty()) { 1222 SimpleKey SK = SimpleKeys.pop_back_val(); 1223 Token T; 1224 T.Kind = Token::TK_Key; 1225 T.Range = SK.Tok->Range; 1226 TokenQueueT::iterator i, e; 1227 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1228 if (i == SK.Tok) 1229 break; 1230 } 1231 if (i == e) { 1232 Failed = true; 1233 return false; 1234 } 1235 i = TokenQueue.insert(i, T); 1236 1237 // We may also need to add a Block-Mapping-Start token. 1238 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1239 1240 IsSimpleKeyAllowed = false; 1241 } else { 1242 if (!FlowLevel) 1243 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1244 IsSimpleKeyAllowed = !FlowLevel; 1245 } 1246 1247 Token T; 1248 T.Kind = Token::TK_Value; 1249 T.Range = StringRef(Current, 1); 1250 skip(1); 1251 TokenQueue.push_back(T); 1252 return true; 1253 } 1254 1255 // Forbidding inlining improves performance by roughly 20%. 1256 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1257 LLVM_ATTRIBUTE_NOINLINE static bool 1258 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1259 1260 // Returns whether a character at 'Position' was escaped with a leading '\'. 1261 // 'First' specifies the position of the first character in the string. 1262 static bool wasEscaped(StringRef::iterator First, 1263 StringRef::iterator Position) { 1264 assert(Position - 1 >= First); 1265 StringRef::iterator I = Position - 1; 1266 // We calculate the number of consecutive '\'s before the current position 1267 // by iterating backwards through our string. 1268 while (I >= First && *I == '\\') --I; 1269 // (Position - 1 - I) now contains the number of '\'s before the current 1270 // position. If it is odd, the character at 'Position' was escaped. 
1271 return (Position - 1 - I) % 2 == 1; 1272 } 1273 1274 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1275 StringRef::iterator Start = Current; 1276 unsigned ColStart = Column; 1277 if (IsDoubleQuoted) { 1278 do { 1279 ++Current; 1280 while (Current != End && *Current != '"') 1281 ++Current; 1282 // Repeat until the previous character was not a '\' or was an escaped 1283 // backslash. 1284 } while ( Current != End 1285 && *(Current - 1) == '\\' 1286 && wasEscaped(Start + 1, Current)); 1287 } else { 1288 skip(1); 1289 while (true) { 1290 // Skip a ' followed by another '. 1291 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1292 skip(2); 1293 continue; 1294 } else if (*Current == '\'') 1295 break; 1296 StringRef::iterator i = skip_nb_char(Current); 1297 if (i == Current) { 1298 i = skip_b_break(Current); 1299 if (i == Current) 1300 break; 1301 Current = i; 1302 Column = 0; 1303 ++Line; 1304 } else { 1305 if (i == End) 1306 break; 1307 Current = i; 1308 ++Column; 1309 } 1310 } 1311 } 1312 1313 if (Current == End) { 1314 setError("Expected quote at end of scalar", Current); 1315 return false; 1316 } 1317 1318 skip(1); // Skip ending quote. 
1319 Token T; 1320 T.Kind = Token::TK_Scalar; 1321 T.Range = StringRef(Start, Current - Start); 1322 TokenQueue.push_back(T); 1323 1324 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1325 1326 IsSimpleKeyAllowed = false; 1327 1328 return true; 1329 } 1330 1331 bool Scanner::scanPlainScalar() { 1332 StringRef::iterator Start = Current; 1333 unsigned ColStart = Column; 1334 unsigned LeadingBlanks = 0; 1335 assert(Indent >= -1 && "Indent must be >= -1 !"); 1336 unsigned indent = static_cast<unsigned>(Indent + 1); 1337 while (true) { 1338 if (*Current == '#') 1339 break; 1340 1341 while (!isBlankOrBreak(Current)) { 1342 if ( FlowLevel && *Current == ':' 1343 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1344 setError("Found unexpected ':' while scanning a plain scalar", Current); 1345 return false; 1346 } 1347 1348 // Check for the end of the plain scalar. 1349 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1350 || ( FlowLevel 1351 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1352 != StringRef::npos))) 1353 break; 1354 1355 StringRef::iterator i = skip_nb_char(Current); 1356 if (i == Current) 1357 break; 1358 Current = i; 1359 ++Column; 1360 } 1361 1362 // Are we at the end? 1363 if (!isBlankOrBreak(Current)) 1364 break; 1365 1366 // Eat blanks. 
1367 StringRef::iterator Tmp = Current; 1368 while (isBlankOrBreak(Tmp)) { 1369 StringRef::iterator i = skip_s_white(Tmp); 1370 if (i != Tmp) { 1371 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1372 setError("Found invalid tab character in indentation", Tmp); 1373 return false; 1374 } 1375 Tmp = i; 1376 ++Column; 1377 } else { 1378 i = skip_b_break(Tmp); 1379 if (!LeadingBlanks) 1380 LeadingBlanks = 1; 1381 Tmp = i; 1382 Column = 0; 1383 ++Line; 1384 } 1385 } 1386 1387 if (!FlowLevel && Column < indent) 1388 break; 1389 1390 Current = Tmp; 1391 } 1392 if (Start == Current) { 1393 setError("Got empty plain scalar", Start); 1394 return false; 1395 } 1396 Token T; 1397 T.Kind = Token::TK_Scalar; 1398 T.Range = StringRef(Start, Current - Start); 1399 TokenQueue.push_back(T); 1400 1401 // Plain scalars can be simple keys. 1402 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1403 1404 IsSimpleKeyAllowed = false; 1405 1406 return true; 1407 } 1408 1409 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1410 StringRef::iterator Start = Current; 1411 unsigned ColStart = Column; 1412 skip(1); 1413 while(true) { 1414 if ( *Current == '[' || *Current == ']' 1415 || *Current == '{' || *Current == '}' 1416 || *Current == ',' 1417 || *Current == ':') 1418 break; 1419 StringRef::iterator i = skip_ns_char(Current); 1420 if (i == Current) 1421 break; 1422 Current = i; 1423 ++Column; 1424 } 1425 1426 if (Start == Current) { 1427 setError("Got empty alias or anchor", Start); 1428 return false; 1429 } 1430 1431 Token T; 1432 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1433 T.Range = StringRef(Start, Current - Start); 1434 TokenQueue.push_back(T); 1435 1436 // Alias and anchors can be simple keys. 
1437 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1438 1439 IsSimpleKeyAllowed = false; 1440 1441 return true; 1442 } 1443 1444 char Scanner::scanBlockChompingIndicator() { 1445 char Indicator = ' '; 1446 if (Current != End && (*Current == '+' || *Current == '-')) { 1447 Indicator = *Current; 1448 skip(1); 1449 } 1450 return Indicator; 1451 } 1452 1453 /// Get the number of line breaks after chomping. 1454 /// 1455 /// Return the number of trailing line breaks to emit, depending on 1456 /// \p ChompingIndicator. 1457 static unsigned getChompedLineBreaks(char ChompingIndicator, 1458 unsigned LineBreaks, StringRef Str) { 1459 if (ChompingIndicator == '-') // Strip all line breaks. 1460 return 0; 1461 if (ChompingIndicator == '+') // Keep all line breaks. 1462 return LineBreaks; 1463 // Clip trailing lines. 1464 return Str.empty() ? 0 : 1; 1465 } 1466 1467 unsigned Scanner::scanBlockIndentationIndicator() { 1468 unsigned Indent = 0; 1469 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1470 Indent = unsigned(*Current - '0'); 1471 skip(1); 1472 } 1473 return Indent; 1474 } 1475 1476 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1477 unsigned &IndentIndicator, bool &IsDone) { 1478 auto Start = Current; 1479 1480 ChompingIndicator = scanBlockChompingIndicator(); 1481 IndentIndicator = scanBlockIndentationIndicator(); 1482 // Check for the chomping indicator once again. 1483 if (ChompingIndicator == ' ') 1484 ChompingIndicator = scanBlockChompingIndicator(); 1485 Current = skip_while(&Scanner::skip_s_white, Current); 1486 skipComment(); 1487 1488 if (Current == End) { // EOF, we have an empty scalar. 
1489 Token T; 1490 T.Kind = Token::TK_BlockScalar; 1491 T.Range = StringRef(Start, Current - Start); 1492 TokenQueue.push_back(T); 1493 IsDone = true; 1494 return true; 1495 } 1496 1497 if (!consumeLineBreakIfPresent()) { 1498 setError("Expected a line break after block scalar header", Current); 1499 return false; 1500 } 1501 return true; 1502 } 1503 1504 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1505 unsigned BlockExitIndent, 1506 unsigned &LineBreaks, bool &IsDone) { 1507 unsigned MaxAllSpaceLineCharacters = 0; 1508 StringRef::iterator LongestAllSpaceLine; 1509 1510 while (true) { 1511 advanceWhile(&Scanner::skip_s_space); 1512 if (skip_nb_char(Current) != Current) { 1513 // This line isn't empty, so try and find the indentation. 1514 if (Column <= BlockExitIndent) { // End of the block literal. 1515 IsDone = true; 1516 return true; 1517 } 1518 // We found the block's indentation. 1519 BlockIndent = Column; 1520 if (MaxAllSpaceLineCharacters > BlockIndent) { 1521 setError( 1522 "Leading all-spaces line must be smaller than the block indent", 1523 LongestAllSpaceLine); 1524 return false; 1525 } 1526 return true; 1527 } 1528 if (skip_b_break(Current) != Current && 1529 Column > MaxAllSpaceLineCharacters) { 1530 // Record the longest all-space line in case it's longer than the 1531 // discovered block indent. 1532 MaxAllSpaceLineCharacters = Column; 1533 LongestAllSpaceLine = Current; 1534 } 1535 1536 // Check for EOF. 1537 if (Current == End) { 1538 IsDone = true; 1539 return true; 1540 } 1541 1542 if (!consumeLineBreakIfPresent()) { 1543 IsDone = true; 1544 return true; 1545 } 1546 ++LineBreaks; 1547 } 1548 return true; 1549 } 1550 1551 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1552 unsigned BlockExitIndent, bool &IsDone) { 1553 // Skip the indentation. 
1554 while (Column < BlockIndent) { 1555 auto I = skip_s_space(Current); 1556 if (I == Current) 1557 break; 1558 Current = I; 1559 ++Column; 1560 } 1561 1562 if (skip_nb_char(Current) == Current) 1563 return true; 1564 1565 if (Column <= BlockExitIndent) { // End of the block literal. 1566 IsDone = true; 1567 return true; 1568 } 1569 1570 if (Column < BlockIndent) { 1571 if (Current != End && *Current == '#') { // Trailing comment. 1572 IsDone = true; 1573 return true; 1574 } 1575 setError("A text line is less indented than the block scalar", Current); 1576 return false; 1577 } 1578 return true; // A normal text line. 1579 } 1580 1581 bool Scanner::scanBlockScalar(bool IsLiteral) { 1582 // Eat '|' or '>' 1583 assert(*Current == '|' || *Current == '>'); 1584 skip(1); 1585 1586 char ChompingIndicator; 1587 unsigned BlockIndent; 1588 bool IsDone = false; 1589 if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone)) 1590 return false; 1591 if (IsDone) 1592 return true; 1593 1594 auto Start = Current; 1595 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; 1596 unsigned LineBreaks = 0; 1597 if (BlockIndent == 0) { 1598 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, 1599 IsDone)) 1600 return false; 1601 } 1602 1603 // Scan the block's scalars body. 1604 SmallString<256> Str; 1605 while (!IsDone) { 1606 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) 1607 return false; 1608 if (IsDone) 1609 break; 1610 1611 // Parse the current line. 1612 auto LineStart = Current; 1613 advanceWhile(&Scanner::skip_nb_char); 1614 if (LineStart != Current) { 1615 Str.append(LineBreaks, '\n'); 1616 Str.append(StringRef(LineStart, Current - LineStart)); 1617 LineBreaks = 0; 1618 } 1619 1620 // Check for EOF. 
1621 if (Current == End) 1622 break; 1623 1624 if (!consumeLineBreakIfPresent()) 1625 break; 1626 ++LineBreaks; 1627 } 1628 1629 if (Current == End && !LineBreaks) 1630 // Ensure that there is at least one line break before the end of file. 1631 LineBreaks = 1; 1632 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1633 1634 // New lines may start a simple key. 1635 if (!FlowLevel) 1636 IsSimpleKeyAllowed = true; 1637 1638 Token T; 1639 T.Kind = Token::TK_BlockScalar; 1640 T.Range = StringRef(Start, Current - Start); 1641 T.Value = std::string(Str); 1642 TokenQueue.push_back(T); 1643 return true; 1644 } 1645 1646 bool Scanner::scanTag() { 1647 StringRef::iterator Start = Current; 1648 unsigned ColStart = Column; 1649 skip(1); // Eat !. 1650 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1651 else if (*Current == '<') { 1652 skip(1); 1653 scan_ns_uri_char(); 1654 if (!consume('>')) 1655 return false; 1656 } else { 1657 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1658 Current = skip_while(&Scanner::skip_ns_char, Current); 1659 } 1660 1661 Token T; 1662 T.Kind = Token::TK_Tag; 1663 T.Range = StringRef(Start, Current - Start); 1664 TokenQueue.push_back(T); 1665 1666 // Tags can be simple keys. 
1667 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1668 1669 IsSimpleKeyAllowed = false; 1670 1671 return true; 1672 } 1673 1674 bool Scanner::fetchMoreTokens() { 1675 if (IsStartOfStream) 1676 return scanStreamStart(); 1677 1678 scanToNextToken(); 1679 1680 if (Current == End) 1681 return scanStreamEnd(); 1682 1683 removeStaleSimpleKeyCandidates(); 1684 1685 unrollIndent(Column); 1686 1687 if (Column == 0 && *Current == '%') 1688 return scanDirective(); 1689 1690 if (Column == 0 && Current + 4 <= End 1691 && *Current == '-' 1692 && *(Current + 1) == '-' 1693 && *(Current + 2) == '-' 1694 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1695 return scanDocumentIndicator(true); 1696 1697 if (Column == 0 && Current + 4 <= End 1698 && *Current == '.' 1699 && *(Current + 1) == '.' 1700 && *(Current + 2) == '.' 1701 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1702 return scanDocumentIndicator(false); 1703 1704 if (*Current == '[') 1705 return scanFlowCollectionStart(true); 1706 1707 if (*Current == '{') 1708 return scanFlowCollectionStart(false); 1709 1710 if (*Current == ']') 1711 return scanFlowCollectionEnd(true); 1712 1713 if (*Current == '}') 1714 return scanFlowCollectionEnd(false); 1715 1716 if (*Current == ',') 1717 return scanFlowEntry(); 1718 1719 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1720 return scanBlockEntry(); 1721 1722 if (*Current == '?' 
&& (FlowLevel || isBlankOrBreak(Current + 1))) 1723 return scanKey(); 1724 1725 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1726 return scanValue(); 1727 1728 if (*Current == '*') 1729 return scanAliasOrAnchor(true); 1730 1731 if (*Current == '&') 1732 return scanAliasOrAnchor(false); 1733 1734 if (*Current == '!') 1735 return scanTag(); 1736 1737 if (*Current == '|' && !FlowLevel) 1738 return scanBlockScalar(true); 1739 1740 if (*Current == '>' && !FlowLevel) 1741 return scanBlockScalar(false); 1742 1743 if (*Current == '\'') 1744 return scanFlowScalar(false); 1745 1746 if (*Current == '"') 1747 return scanFlowScalar(true); 1748 1749 // Get a plain scalar. 1750 StringRef FirstChar(Current, 1); 1751 if (!(isBlankOrBreak(Current) 1752 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1753 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1754 || (!FlowLevel && (*Current == '?' || *Current == ':') 1755 && isBlankOrBreak(Current + 1)) 1756 || (!FlowLevel && *Current == ':' 1757 && Current + 2 < End 1758 && *(Current + 1) == ':' 1759 && !isBlankOrBreak(Current + 2))) 1760 return scanPlainScalar(); 1761 1762 setError("Unrecognized character while tokenizing.", Current); 1763 return false; 1764 } 1765 1766 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, 1767 std::error_code *EC) 1768 : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {} 1769 1770 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, 1771 std::error_code *EC) 1772 : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {} 1773 1774 Stream::~Stream() = default; 1775 1776 bool Stream::failed() { return scanner->failed(); } 1777 1778 void Stream::printError(Node *N, const Twine &Msg) { 1779 SMRange Range = N ? 
N->getSourceRange() : SMRange(); 1780 scanner->printError( Range.Start 1781 , SourceMgr::DK_Error 1782 , Msg 1783 , Range); 1784 } 1785 1786 document_iterator Stream::begin() { 1787 if (CurrentDoc) 1788 report_fatal_error("Can only iterate over the stream once"); 1789 1790 // Skip Stream-Start. 1791 scanner->getNext(); 1792 1793 CurrentDoc.reset(new Document(*this)); 1794 return document_iterator(CurrentDoc); 1795 } 1796 1797 document_iterator Stream::end() { 1798 return document_iterator(); 1799 } 1800 1801 void Stream::skip() { 1802 for (document_iterator i = begin(), e = end(); i != e; ++i) 1803 i->skip(); 1804 } 1805 1806 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1807 StringRef T) 1808 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1809 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1810 SourceRange = SMRange(Start, Start); 1811 } 1812 1813 std::string Node::getVerbatimTag() const { 1814 StringRef Raw = getRawTag(); 1815 if (!Raw.empty() && Raw != "!") { 1816 std::string Ret; 1817 if (Raw.find_last_of('!') == 0) { 1818 Ret = std::string(Doc->getTagMap().find("!")->second); 1819 Ret += Raw.substr(1); 1820 return Ret; 1821 } else if (Raw.startswith("!!")) { 1822 Ret = std::string(Doc->getTagMap().find("!!")->second); 1823 Ret += Raw.substr(2); 1824 return Ret; 1825 } else { 1826 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1827 std::map<StringRef, StringRef>::const_iterator It = 1828 Doc->getTagMap().find(TagHandle); 1829 if (It != Doc->getTagMap().end()) 1830 Ret = std::string(It->second); 1831 else { 1832 Token T; 1833 T.Kind = Token::TK_Tag; 1834 T.Range = TagHandle; 1835 setError(Twine("Unknown tag handle ") + TagHandle, T); 1836 } 1837 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1838 return Ret; 1839 } 1840 } 1841 1842 switch (getType()) { 1843 case NK_Null: 1844 return "tag:yaml.org,2002:null"; 1845 case NK_Scalar: 1846 case NK_BlockScalar: 1847 // TODO: Tag resolution. 
1848 return "tag:yaml.org,2002:str"; 1849 case NK_Mapping: 1850 return "tag:yaml.org,2002:map"; 1851 case NK_Sequence: 1852 return "tag:yaml.org,2002:seq"; 1853 } 1854 1855 return ""; 1856 } 1857 1858 Token &Node::peekNext() { 1859 return Doc->peekNext(); 1860 } 1861 1862 Token Node::getNext() { 1863 return Doc->getNext(); 1864 } 1865 1866 Node *Node::parseBlockNode() { 1867 return Doc->parseBlockNode(); 1868 } 1869 1870 BumpPtrAllocator &Node::getAllocator() { 1871 return Doc->NodeAllocator; 1872 } 1873 1874 void Node::setError(const Twine &Msg, Token &Tok) const { 1875 Doc->setError(Msg, Tok); 1876 } 1877 1878 bool Node::failed() const { 1879 return Doc->failed(); 1880 } 1881 1882 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1883 // TODO: Handle newlines properly. We need to remove leading whitespace. 1884 if (Value[0] == '"') { // Double quoted. 1885 // Pull off the leading and trailing "s. 1886 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1887 // Search for characters that would require unescaping the value. 1888 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1889 if (i != StringRef::npos) 1890 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1891 return UnquotedValue; 1892 } else if (Value[0] == '\'') { // Single quoted. 1893 // Pull off the leading and trailing 's. 1894 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1895 StringRef::size_type i = UnquotedValue.find('\''); 1896 if (i != StringRef::npos) { 1897 // We're going to need Storage. 
1898 Storage.clear(); 1899 Storage.reserve(UnquotedValue.size()); 1900 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1901 StringRef Valid(UnquotedValue.begin(), i); 1902 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1903 Storage.push_back('\''); 1904 UnquotedValue = UnquotedValue.substr(i + 2); 1905 } 1906 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1907 return StringRef(Storage.begin(), Storage.size()); 1908 } 1909 return UnquotedValue; 1910 } 1911 // Plain or block. 1912 return Value.rtrim(' '); 1913 } 1914 1915 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1916 , StringRef::size_type i 1917 , SmallVectorImpl<char> &Storage) 1918 const { 1919 // Use Storage to build proper value. 1920 Storage.clear(); 1921 Storage.reserve(UnquotedValue.size()); 1922 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1923 // Insert all previous chars into Storage. 1924 StringRef Valid(UnquotedValue.begin(), i); 1925 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1926 // Chop off inserted chars. 1927 UnquotedValue = UnquotedValue.substr(i); 1928 1929 assert(!UnquotedValue.empty() && "Can't be empty!"); 1930 1931 // Parse escape or line break. 
1932 switch (UnquotedValue[0]) { 1933 case '\r': 1934 case '\n': 1935 Storage.push_back('\n'); 1936 if ( UnquotedValue.size() > 1 1937 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1938 UnquotedValue = UnquotedValue.substr(1); 1939 UnquotedValue = UnquotedValue.substr(1); 1940 break; 1941 default: 1942 if (UnquotedValue.size() == 1) { 1943 Token T; 1944 T.Range = StringRef(UnquotedValue.begin(), 1); 1945 setError("Unrecognized escape code", T); 1946 return ""; 1947 } 1948 UnquotedValue = UnquotedValue.substr(1); 1949 switch (UnquotedValue[0]) { 1950 default: { 1951 Token T; 1952 T.Range = StringRef(UnquotedValue.begin(), 1); 1953 setError("Unrecognized escape code", T); 1954 return ""; 1955 } 1956 case '\r': 1957 case '\n': 1958 // Remove the new line. 1959 if ( UnquotedValue.size() > 1 1960 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1961 UnquotedValue = UnquotedValue.substr(1); 1962 // If this was just a single byte newline, it will get skipped 1963 // below. 
1964 break; 1965 case '0': 1966 Storage.push_back(0x00); 1967 break; 1968 case 'a': 1969 Storage.push_back(0x07); 1970 break; 1971 case 'b': 1972 Storage.push_back(0x08); 1973 break; 1974 case 't': 1975 case 0x09: 1976 Storage.push_back(0x09); 1977 break; 1978 case 'n': 1979 Storage.push_back(0x0A); 1980 break; 1981 case 'v': 1982 Storage.push_back(0x0B); 1983 break; 1984 case 'f': 1985 Storage.push_back(0x0C); 1986 break; 1987 case 'r': 1988 Storage.push_back(0x0D); 1989 break; 1990 case 'e': 1991 Storage.push_back(0x1B); 1992 break; 1993 case ' ': 1994 Storage.push_back(0x20); 1995 break; 1996 case '"': 1997 Storage.push_back(0x22); 1998 break; 1999 case '/': 2000 Storage.push_back(0x2F); 2001 break; 2002 case '\\': 2003 Storage.push_back(0x5C); 2004 break; 2005 case 'N': 2006 encodeUTF8(0x85, Storage); 2007 break; 2008 case '_': 2009 encodeUTF8(0xA0, Storage); 2010 break; 2011 case 'L': 2012 encodeUTF8(0x2028, Storage); 2013 break; 2014 case 'P': 2015 encodeUTF8(0x2029, Storage); 2016 break; 2017 case 'x': { 2018 if (UnquotedValue.size() < 3) 2019 // TODO: Report error. 2020 break; 2021 unsigned int UnicodeScalarValue; 2022 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 2023 // TODO: Report error. 2024 UnicodeScalarValue = 0xFFFD; 2025 encodeUTF8(UnicodeScalarValue, Storage); 2026 UnquotedValue = UnquotedValue.substr(2); 2027 break; 2028 } 2029 case 'u': { 2030 if (UnquotedValue.size() < 5) 2031 // TODO: Report error. 2032 break; 2033 unsigned int UnicodeScalarValue; 2034 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 2035 // TODO: Report error. 2036 UnicodeScalarValue = 0xFFFD; 2037 encodeUTF8(UnicodeScalarValue, Storage); 2038 UnquotedValue = UnquotedValue.substr(4); 2039 break; 2040 } 2041 case 'U': { 2042 if (UnquotedValue.size() < 9) 2043 // TODO: Report error. 2044 break; 2045 unsigned int UnicodeScalarValue; 2046 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2047 // TODO: Report error. 
2048 UnicodeScalarValue = 0xFFFD; 2049 encodeUTF8(UnicodeScalarValue, Storage); 2050 UnquotedValue = UnquotedValue.substr(8); 2051 break; 2052 } 2053 } 2054 UnquotedValue = UnquotedValue.substr(1); 2055 } 2056 } 2057 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 2058 return StringRef(Storage.begin(), Storage.size()); 2059 } 2060 2061 Node *KeyValueNode::getKey() { 2062 if (Key) 2063 return Key; 2064 // Handle implicit null keys. 2065 { 2066 Token &t = peekNext(); 2067 if ( t.Kind == Token::TK_BlockEnd 2068 || t.Kind == Token::TK_Value 2069 || t.Kind == Token::TK_Error) { 2070 return Key = new (getAllocator()) NullNode(Doc); 2071 } 2072 if (t.Kind == Token::TK_Key) 2073 getNext(); // skip TK_Key. 2074 } 2075 2076 // Handle explicit null keys. 2077 Token &t = peekNext(); 2078 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2079 return Key = new (getAllocator()) NullNode(Doc); 2080 } 2081 2082 // We've got a normal key. 2083 return Key = parseBlockNode(); 2084 } 2085 2086 Node *KeyValueNode::getValue() { 2087 if (Value) 2088 return Value; 2089 2090 if (Node* Key = getKey()) 2091 Key->skip(); 2092 else { 2093 setError("Null key in Key Value.", peekNext()); 2094 return Value = new (getAllocator()) NullNode(Doc); 2095 } 2096 2097 if (failed()) 2098 return Value = new (getAllocator()) NullNode(Doc); 2099 2100 // Handle implicit null values. 2101 { 2102 Token &t = peekNext(); 2103 if ( t.Kind == Token::TK_BlockEnd 2104 || t.Kind == Token::TK_FlowMappingEnd 2105 || t.Kind == Token::TK_Key 2106 || t.Kind == Token::TK_FlowEntry 2107 || t.Kind == Token::TK_Error) { 2108 return Value = new (getAllocator()) NullNode(Doc); 2109 } 2110 2111 if (t.Kind != Token::TK_Value) { 2112 setError("Unexpected token in Key Value.", t); 2113 return Value = new (getAllocator()) NullNode(Doc); 2114 } 2115 getNext(); // skip TK_Value. 2116 } 2117 2118 // Handle explicit null values. 
2119 Token &t = peekNext(); 2120 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 2121 return Value = new (getAllocator()) NullNode(Doc); 2122 } 2123 2124 // We got a normal value. 2125 return Value = parseBlockNode(); 2126 } 2127 2128 void MappingNode::increment() { 2129 if (failed()) { 2130 IsAtEnd = true; 2131 CurrentEntry = nullptr; 2132 return; 2133 } 2134 if (CurrentEntry) { 2135 CurrentEntry->skip(); 2136 if (Type == MT_Inline) { 2137 IsAtEnd = true; 2138 CurrentEntry = nullptr; 2139 return; 2140 } 2141 } 2142 Token T = peekNext(); 2143 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 2144 // KeyValueNode eats the TK_Key. That way it can detect null keys. 2145 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 2146 } else if (Type == MT_Block) { 2147 switch (T.Kind) { 2148 case Token::TK_BlockEnd: 2149 getNext(); 2150 IsAtEnd = true; 2151 CurrentEntry = nullptr; 2152 break; 2153 default: 2154 setError("Unexpected token. Expected Key or Block End", T); 2155 LLVM_FALLTHROUGH; 2156 case Token::TK_Error: 2157 IsAtEnd = true; 2158 CurrentEntry = nullptr; 2159 } 2160 } else { 2161 switch (T.Kind) { 2162 case Token::TK_FlowEntry: 2163 // Eat the flow entry and recurse. 2164 getNext(); 2165 return increment(); 2166 case Token::TK_FlowMappingEnd: 2167 getNext(); 2168 LLVM_FALLTHROUGH; 2169 case Token::TK_Error: 2170 // Set this to end iterator. 2171 IsAtEnd = true; 2172 CurrentEntry = nullptr; 2173 break; 2174 default: 2175 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 2176 "Mapping End." 
2177 , T); 2178 IsAtEnd = true; 2179 CurrentEntry = nullptr; 2180 } 2181 } 2182 } 2183 2184 void SequenceNode::increment() { 2185 if (failed()) { 2186 IsAtEnd = true; 2187 CurrentEntry = nullptr; 2188 return; 2189 } 2190 if (CurrentEntry) 2191 CurrentEntry->skip(); 2192 Token T = peekNext(); 2193 if (SeqType == ST_Block) { 2194 switch (T.Kind) { 2195 case Token::TK_BlockEntry: 2196 getNext(); 2197 CurrentEntry = parseBlockNode(); 2198 if (!CurrentEntry) { // An error occurred. 2199 IsAtEnd = true; 2200 CurrentEntry = nullptr; 2201 } 2202 break; 2203 case Token::TK_BlockEnd: 2204 getNext(); 2205 IsAtEnd = true; 2206 CurrentEntry = nullptr; 2207 break; 2208 default: 2209 setError( "Unexpected token. Expected Block Entry or Block End." 2210 , T); 2211 LLVM_FALLTHROUGH; 2212 case Token::TK_Error: 2213 IsAtEnd = true; 2214 CurrentEntry = nullptr; 2215 } 2216 } else if (SeqType == ST_Indentless) { 2217 switch (T.Kind) { 2218 case Token::TK_BlockEntry: 2219 getNext(); 2220 CurrentEntry = parseBlockNode(); 2221 if (!CurrentEntry) { // An error occurred. 2222 IsAtEnd = true; 2223 CurrentEntry = nullptr; 2224 } 2225 break; 2226 default: 2227 case Token::TK_Error: 2228 IsAtEnd = true; 2229 CurrentEntry = nullptr; 2230 } 2231 } else if (SeqType == ST_Flow) { 2232 switch (T.Kind) { 2233 case Token::TK_FlowEntry: 2234 // Eat the flow entry and recurse. 2235 getNext(); 2236 WasPreviousTokenFlowEntry = true; 2237 return increment(); 2238 case Token::TK_FlowSequenceEnd: 2239 getNext(); 2240 LLVM_FALLTHROUGH; 2241 case Token::TK_Error: 2242 // Set this to end iterator. 2243 IsAtEnd = true; 2244 CurrentEntry = nullptr; 2245 break; 2246 case Token::TK_StreamEnd: 2247 case Token::TK_DocumentEnd: 2248 case Token::TK_DocumentStart: 2249 setError("Could not find closing ]!", T); 2250 // Set this to end iterator. 
2251 IsAtEnd = true; 2252 CurrentEntry = nullptr; 2253 break; 2254 default: 2255 if (!WasPreviousTokenFlowEntry) { 2256 setError("Expected , between entries!", T); 2257 IsAtEnd = true; 2258 CurrentEntry = nullptr; 2259 break; 2260 } 2261 // Otherwise it must be a flow entry. 2262 CurrentEntry = parseBlockNode(); 2263 if (!CurrentEntry) { 2264 IsAtEnd = true; 2265 } 2266 WasPreviousTokenFlowEntry = false; 2267 break; 2268 } 2269 } 2270 } 2271 2272 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2273 // Tag maps starts with two default mappings. 2274 TagMap["!"] = "!"; 2275 TagMap["!!"] = "tag:yaml.org,2002:"; 2276 2277 if (parseDirectives()) 2278 expectToken(Token::TK_DocumentStart); 2279 Token &T = peekNext(); 2280 if (T.Kind == Token::TK_DocumentStart) 2281 getNext(); 2282 } 2283 2284 bool Document::skip() { 2285 if (stream.scanner->failed()) 2286 return false; 2287 if (!Root && !getRoot()) 2288 return false; 2289 Root->skip(); 2290 Token &T = peekNext(); 2291 if (T.Kind == Token::TK_StreamEnd) 2292 return false; 2293 if (T.Kind == Token::TK_DocumentEnd) { 2294 getNext(); 2295 return skip(); 2296 } 2297 return true; 2298 } 2299 2300 Token &Document::peekNext() { 2301 return stream.scanner->peekNext(); 2302 } 2303 2304 Token Document::getNext() { 2305 return stream.scanner->getNext(); 2306 } 2307 2308 void Document::setError(const Twine &Message, Token &Location) const { 2309 stream.scanner->setError(Message, Location.Range.begin()); 2310 } 2311 2312 bool Document::failed() const { 2313 return stream.scanner->failed(); 2314 } 2315 2316 Node *Document::parseBlockNode() { 2317 Token T = peekNext(); 2318 // Handle properties. 
2319 Token AnchorInfo; 2320 Token TagInfo; 2321 parse_property: 2322 switch (T.Kind) { 2323 case Token::TK_Alias: 2324 getNext(); 2325 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2326 case Token::TK_Anchor: 2327 if (AnchorInfo.Kind == Token::TK_Anchor) { 2328 setError("Already encountered an anchor for this node!", T); 2329 return nullptr; 2330 } 2331 AnchorInfo = getNext(); // Consume TK_Anchor. 2332 T = peekNext(); 2333 goto parse_property; 2334 case Token::TK_Tag: 2335 if (TagInfo.Kind == Token::TK_Tag) { 2336 setError("Already encountered a tag for this node!", T); 2337 return nullptr; 2338 } 2339 TagInfo = getNext(); // Consume TK_Tag. 2340 T = peekNext(); 2341 goto parse_property; 2342 default: 2343 break; 2344 } 2345 2346 switch (T.Kind) { 2347 case Token::TK_BlockEntry: 2348 // We got an unindented BlockEntry sequence. This is not terminated with 2349 // a BlockEnd. 2350 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2351 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2352 , AnchorInfo.Range.substr(1) 2353 , TagInfo.Range 2354 , SequenceNode::ST_Indentless); 2355 case Token::TK_BlockSequenceStart: 2356 getNext(); 2357 return new (NodeAllocator) 2358 SequenceNode( stream.CurrentDoc 2359 , AnchorInfo.Range.substr(1) 2360 , TagInfo.Range 2361 , SequenceNode::ST_Block); 2362 case Token::TK_BlockMappingStart: 2363 getNext(); 2364 return new (NodeAllocator) 2365 MappingNode( stream.CurrentDoc 2366 , AnchorInfo.Range.substr(1) 2367 , TagInfo.Range 2368 , MappingNode::MT_Block); 2369 case Token::TK_FlowSequenceStart: 2370 getNext(); 2371 return new (NodeAllocator) 2372 SequenceNode( stream.CurrentDoc 2373 , AnchorInfo.Range.substr(1) 2374 , TagInfo.Range 2375 , SequenceNode::ST_Flow); 2376 case Token::TK_FlowMappingStart: 2377 getNext(); 2378 return new (NodeAllocator) 2379 MappingNode( stream.CurrentDoc 2380 , AnchorInfo.Range.substr(1) 2381 , TagInfo.Range 2382 , MappingNode::MT_Flow); 2383 case 
Token::TK_Scalar: 2384 getNext(); 2385 return new (NodeAllocator) 2386 ScalarNode( stream.CurrentDoc 2387 , AnchorInfo.Range.substr(1) 2388 , TagInfo.Range 2389 , T.Range); 2390 case Token::TK_BlockScalar: { 2391 getNext(); 2392 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2393 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2394 return new (NodeAllocator) 2395 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2396 TagInfo.Range, StrCopy, T.Range); 2397 } 2398 case Token::TK_Key: 2399 // Don't eat the TK_Key, KeyValueNode expects it. 2400 return new (NodeAllocator) 2401 MappingNode( stream.CurrentDoc 2402 , AnchorInfo.Range.substr(1) 2403 , TagInfo.Range 2404 , MappingNode::MT_Inline); 2405 case Token::TK_DocumentStart: 2406 case Token::TK_DocumentEnd: 2407 case Token::TK_StreamEnd: 2408 default: 2409 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2410 // !!null null. 2411 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2412 case Token::TK_FlowMappingEnd: 2413 case Token::TK_FlowSequenceEnd: 2414 case Token::TK_FlowEntry: { 2415 if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root))) 2416 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2417 2418 setError("Unexpected token", T); 2419 return nullptr; 2420 } 2421 case Token::TK_Error: 2422 return nullptr; 2423 } 2424 llvm_unreachable("Control flow shouldn't reach here."); 2425 return nullptr; 2426 } 2427 2428 bool Document::parseDirectives() { 2429 bool isDirective = false; 2430 while (true) { 2431 Token T = peekNext(); 2432 if (T.Kind == Token::TK_TagDirective) { 2433 parseTAGDirective(); 2434 isDirective = true; 2435 } else if (T.Kind == Token::TK_VersionDirective) { 2436 parseYAMLDirective(); 2437 isDirective = true; 2438 } else 2439 break; 2440 } 2441 return isDirective; 2442 } 2443 2444 void Document::parseYAMLDirective() { 2445 getNext(); // Eat %YAML <version> 2446 } 2447 2448 void 
Document::parseTAGDirective() { 2449 Token Tag = getNext(); // %TAG <handle> <prefix> 2450 StringRef T = Tag.Range; 2451 // Strip %TAG 2452 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2453 std::size_t HandleEnd = T.find_first_of(" \t"); 2454 StringRef TagHandle = T.substr(0, HandleEnd); 2455 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2456 TagMap[TagHandle] = TagPrefix; 2457 } 2458 2459 bool Document::expectToken(int TK) { 2460 Token T = getNext(); 2461 if (T.Kind != TK) { 2462 setError("Unexpected token", T); 2463 return false; 2464 } 2465 return true; 2466 } 2467