1 //===- YAMLParser.cpp - Simple YAML parser --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a YAML parser. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/YAMLParser.h" 14 #include "llvm/ADT/AllocatorList.h" 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/None.h" 17 #include "llvm/ADT/STLExtras.h" 18 #include "llvm/ADT/SmallString.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include "llvm/ADT/StringExtras.h" 21 #include "llvm/ADT/StringRef.h" 22 #include "llvm/ADT/Twine.h" 23 #include "llvm/Support/Compiler.h" 24 #include "llvm/Support/ErrorHandling.h" 25 #include "llvm/Support/MemoryBuffer.h" 26 #include "llvm/Support/SMLoc.h" 27 #include "llvm/Support/SourceMgr.h" 28 #include "llvm/Support/Unicode.h" 29 #include "llvm/Support/raw_ostream.h" 30 #include <cassert> 31 #include <cstddef> 32 #include <cstdint> 33 #include <map> 34 #include <memory> 35 #include <string> 36 #include <system_error> 37 #include <utility> 38 39 using namespace llvm; 40 using namespace yaml; 41 42 enum UnicodeEncodingForm { 43 UEF_UTF32_LE, ///< UTF-32 Little Endian 44 UEF_UTF32_BE, ///< UTF-32 Big Endian 45 UEF_UTF16_LE, ///< UTF-16 Little Endian 46 UEF_UTF16_BE, ///< UTF-16 Big Endian 47 UEF_UTF8, ///< UTF-8 or ascii. 48 UEF_Unknown ///< Not a valid Unicode encoding. 49 }; 50 51 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 52 /// it exists. Length is in {0, 2, 3, 4}. 53 using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>; 54 55 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 56 /// encoding form of \a Input. 57 /// 58 /// @param Input A string of length 0 or more. 59 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 60 /// and how long the byte order mark is if one exists. 61 static EncodingInfo getUnicodeEncoding(StringRef Input) { 62 if (Input.empty()) 63 return std::make_pair(UEF_Unknown, 0); 64 65 switch (uint8_t(Input[0])) { 66 case 0x00: 67 if (Input.size() >= 4) { 68 if ( Input[1] == 0 69 && uint8_t(Input[2]) == 0xFE 70 && uint8_t(Input[3]) == 0xFF) 71 return std::make_pair(UEF_UTF32_BE, 4); 72 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 73 return std::make_pair(UEF_UTF32_BE, 0); 74 } 75 76 if (Input.size() >= 2 && Input[1] != 0) 77 return std::make_pair(UEF_UTF16_BE, 0); 78 return std::make_pair(UEF_Unknown, 0); 79 case 0xFF: 80 if ( Input.size() >= 4 81 && uint8_t(Input[1]) == 0xFE 82 && Input[2] == 0 83 && Input[3] == 0) 84 return std::make_pair(UEF_UTF32_LE, 4); 85 86 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 87 return std::make_pair(UEF_UTF16_LE, 2); 88 return std::make_pair(UEF_Unknown, 0); 89 case 0xFE: 90 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 91 return std::make_pair(UEF_UTF16_BE, 2); 92 return std::make_pair(UEF_Unknown, 0); 93 case 0xEF: 94 if ( Input.size() >= 3 95 && uint8_t(Input[1]) == 0xBB 96 && uint8_t(Input[2]) == 0xBF) 97 return std::make_pair(UEF_UTF8, 3); 98 return std::make_pair(UEF_Unknown, 0); 99 } 100 101 // It could still be utf-32 or utf-16. 102 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 103 return std::make_pair(UEF_UTF32_LE, 0); 104 105 if (Input.size() >= 2 && Input[1] == 0) 106 return std::make_pair(UEF_UTF16_LE, 0); 107 108 return std::make_pair(UEF_UTF8, 0); 109 } 110 111 /// Pin the vtables to this file. 112 void Node::anchor() {} 113 void NullNode::anchor() {} 114 void ScalarNode::anchor() {} 115 void BlockScalarNode::anchor() {} 116 void KeyValueNode::anchor() {} 117 void MappingNode::anchor() {} 118 void SequenceNode::anchor() {} 119 void AliasNode::anchor() {} 120 121 namespace llvm { 122 namespace yaml { 123 124 /// Token - A single YAML token. 125 struct Token { 126 enum TokenKind { 127 TK_Error, // Uninitialized token. 128 TK_StreamStart, 129 TK_StreamEnd, 130 TK_VersionDirective, 131 TK_TagDirective, 132 TK_DocumentStart, 133 TK_DocumentEnd, 134 TK_BlockEntry, 135 TK_BlockEnd, 136 TK_BlockSequenceStart, 137 TK_BlockMappingStart, 138 TK_FlowEntry, 139 TK_FlowSequenceStart, 140 TK_FlowSequenceEnd, 141 TK_FlowMappingStart, 142 TK_FlowMappingEnd, 143 TK_Key, 144 TK_Value, 145 TK_Scalar, 146 TK_BlockScalar, 147 TK_Alias, 148 TK_Anchor, 149 TK_Tag 150 } Kind = TK_Error; 151 152 /// A string of length 0 or more whose begin() points to the logical location 153 /// of the token in the input. 154 StringRef Range; 155 156 /// The value of a block scalar node. 157 std::string Value; 158 159 Token() = default; 160 }; 161 162 } // end namespace yaml 163 } // end namespace llvm 164 165 using TokenQueueT = BumpPtrList<Token>; 166 167 namespace { 168 169 /// This struct is used to track simple keys. 170 /// 171 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 172 /// which could legally be the start of a simple key. When peekNext is called, 173 /// if the Token To be returned is referenced by a SimpleKey, we continue 174 /// tokenizing until that potential simple key has either been found to not be 175 /// a simple key (we moved on to the next line or went further than 1024 chars). 176 /// Or when we run into a Value, and then insert a Key token (and possibly 177 /// others) before the SimpleKey's Tok. 178 struct SimpleKey { 179 TokenQueueT::iterator Tok; 180 unsigned Column = 0; 181 unsigned Line = 0; 182 unsigned FlowLevel = 0; 183 bool IsRequired = false; 184 185 bool operator ==(const SimpleKey &Other) { 186 return Tok == Other.Tok; 187 } 188 }; 189 190 } // end anonymous namespace 191 192 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit 193 /// subsequence and the subsequence's length in code units (uint8_t). 194 /// A length of 0 represents an error. 195 using UTF8Decoded = std::pair<uint32_t, unsigned>; 196 197 static UTF8Decoded decodeUTF8(StringRef Range) { 198 StringRef::iterator Position= Range.begin(); 199 StringRef::iterator End = Range.end(); 200 // 1 byte: [0x00, 0x7f] 201 // Bit pattern: 0xxxxxxx 202 if (Position < End && (*Position & 0x80) == 0) { 203 return std::make_pair(*Position, 1); 204 } 205 // 2 bytes: [0x80, 0x7ff] 206 // Bit pattern: 110xxxxx 10xxxxxx 207 if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) && 208 ((*(Position + 1) & 0xC0) == 0x80)) { 209 uint32_t codepoint = ((*Position & 0x1F) << 6) | 210 (*(Position + 1) & 0x3F); 211 if (codepoint >= 0x80) 212 return std::make_pair(codepoint, 2); 213 } 214 // 3 bytes: [0x8000, 0xffff] 215 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 216 if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) && 217 ((*(Position + 1) & 0xC0) == 0x80) && 218 ((*(Position + 2) & 0xC0) == 0x80)) { 219 uint32_t codepoint = ((*Position & 0x0F) << 12) | 220 ((*(Position + 1) & 0x3F) << 6) | 221 (*(Position + 2) & 0x3F); 222 // Codepoints between 0xD800 and 0xDFFF are invalid, as 223 // they are high / low surrogate halves used by UTF-16. 224 if (codepoint >= 0x800 && 225 (codepoint < 0xD800 || codepoint > 0xDFFF)) 226 return std::make_pair(codepoint, 3); 227 } 228 // 4 bytes: [0x10000, 0x10FFFF] 229 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 230 if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) && 231 ((*(Position + 1) & 0xC0) == 0x80) && 232 ((*(Position + 2) & 0xC0) == 0x80) && 233 ((*(Position + 3) & 0xC0) == 0x80)) { 234 uint32_t codepoint = ((*Position & 0x07) << 18) | 235 ((*(Position + 1) & 0x3F) << 12) | 236 ((*(Position + 2) & 0x3F) << 6) | 237 (*(Position + 3) & 0x3F); 238 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 239 return std::make_pair(codepoint, 4); 240 } 241 return std::make_pair(0, 0); 242 } 243 244 namespace llvm { 245 namespace yaml { 246 247 /// Scans YAML tokens from a MemoryBuffer. 248 class Scanner { 249 public: 250 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, 251 std::error_code *EC = nullptr); 252 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, 253 std::error_code *EC = nullptr); 254 255 /// Parse the next token and return it without popping it. 256 Token &peekNext(); 257 258 /// Parse the next token and pop it from the queue. 259 Token getNext(); 260 261 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 262 ArrayRef<SMRange> Ranges = None) { 263 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 264 } 265 266 void setError(const Twine &Message, StringRef::iterator Position) { 267 if (Position >= End) 268 Position = End - 1; 269 270 // propagate the error if possible 271 if (EC) 272 *EC = make_error_code(std::errc::invalid_argument); 273 274 // Don't print out more errors after the first one we encounter. The rest 275 // are just the result of the first, and have no meaning. 276 if (!Failed) 277 printError(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, Message); 278 Failed = true; 279 } 280 281 /// Returns true if an error occurred while parsing. 282 bool failed() { 283 return Failed; 284 } 285 286 private: 287 void init(MemoryBufferRef Buffer); 288 289 StringRef currentInput() { 290 return StringRef(Current, End - Current); 291 } 292 293 /// Decode a UTF-8 minimal well-formed code unit subsequence starting 294 /// at \a Position. 295 /// 296 /// If the UTF-8 code units starting at Position do not form a well-formed 297 /// code unit subsequence, then the Unicode scalar value is 0, and the length 298 /// is 0. 299 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 300 return ::decodeUTF8(StringRef(Position, End - Position)); 301 } 302 303 // The following functions are based on the gramar rules in the YAML spec. The 304 // style of the function names it meant to closely match how they are written 305 // in the spec. The number within the [] is the number of the grammar rule in 306 // the spec. 307 // 308 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 309 // 310 // c- 311 // A production starting and ending with a special character. 312 // b- 313 // A production matching a single line break. 314 // nb- 315 // A production starting and ending with a non-break character. 316 // s- 317 // A production starting and ending with a white space character. 318 // ns- 319 // A production starting and ending with a non-space character. 320 // l- 321 // A production matching complete line(s). 322 323 /// Skip a single nb-char[27] starting at Position. 324 /// 325 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 326 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 327 /// 328 /// @returns The code unit after the nb-char, or Position if it's not an 329 /// nb-char. 330 StringRef::iterator skip_nb_char(StringRef::iterator Position); 331 332 /// Skip a single b-break[28] starting at Position. 333 /// 334 /// A b-break is 0xD 0xA | 0xD | 0xA 335 /// 336 /// @returns The code unit after the b-break, or Position if it's not a 337 /// b-break. 338 StringRef::iterator skip_b_break(StringRef::iterator Position); 339 340 /// Skip a single s-space[31] starting at Position. 341 /// 342 /// An s-space is 0x20 343 /// 344 /// @returns The code unit after the s-space, or Position if it's not a 345 /// s-space. 346 StringRef::iterator skip_s_space(StringRef::iterator Position); 347 348 /// Skip a single s-white[33] starting at Position. 349 /// 350 /// A s-white is 0x20 | 0x9 351 /// 352 /// @returns The code unit after the s-white, or Position if it's not a 353 /// s-white. 354 StringRef::iterator skip_s_white(StringRef::iterator Position); 355 356 /// Skip a single ns-char[34] starting at Position. 357 /// 358 /// A ns-char is nb-char - s-white 359 /// 360 /// @returns The code unit after the ns-char, or Position if it's not a 361 /// ns-char. 362 StringRef::iterator skip_ns_char(StringRef::iterator Position); 363 364 using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); 365 366 /// Skip minimal well-formed code unit subsequences until Func 367 /// returns its input. 368 /// 369 /// @returns The code unit after the last minimal well-formed code unit 370 /// subsequence that Func accepted. 371 StringRef::iterator skip_while( SkipWhileFunc Func 372 , StringRef::iterator Position); 373 374 /// Skip minimal well-formed code unit subsequences until Func returns its 375 /// input. 376 void advanceWhile(SkipWhileFunc Func); 377 378 /// Scan ns-uri-char[39]s starting at Cur. 379 /// 380 /// This updates Cur and Column while scanning. 381 void scan_ns_uri_char(); 382 383 /// Consume a minimal well-formed code unit subsequence starting at 384 /// \a Cur. Return false if it is not the same Unicode scalar value as 385 /// \a Expected. This updates \a Column. 386 bool consume(uint32_t Expected); 387 388 /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 389 void skip(uint32_t Distance); 390 391 /// Return true if the minimal well-formed code unit subsequence at 392 /// Pos is whitespace or a new line 393 bool isBlankOrBreak(StringRef::iterator Position); 394 395 /// Consume a single b-break[28] if it's present at the current position. 396 /// 397 /// Return false if the code unit at the current position isn't a line break. 398 bool consumeLineBreakIfPresent(); 399 400 /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 401 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 402 , unsigned AtColumn 403 , bool IsRequired); 404 405 /// Remove simple keys that can no longer be valid simple keys. 406 /// 407 /// Invalid simple keys are not on the current line or are further than 1024 408 /// columns back. 409 void removeStaleSimpleKeyCandidates(); 410 411 /// Remove all simple keys on FlowLevel \a Level. 412 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 413 414 /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 415 /// tokens if needed. 416 bool unrollIndent(int ToColumn); 417 418 /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 419 /// if needed. 420 bool rollIndent( int ToColumn 421 , Token::TokenKind Kind 422 , TokenQueueT::iterator InsertPoint); 423 424 /// Skip a single-line comment when the comment starts at the current 425 /// position of the scanner. 426 void skipComment(); 427 428 /// Skip whitespace and comments until the start of the next token. 429 void scanToNextToken(); 430 431 /// Must be the first token generated. 432 bool scanStreamStart(); 433 434 /// Generate tokens needed to close out the stream. 435 bool scanStreamEnd(); 436 437 /// Scan a %BLAH directive. 438 bool scanDirective(); 439 440 /// Scan a ... or ---. 441 bool scanDocumentIndicator(bool IsStart); 442 443 /// Scan a [ or { and generate the proper flow collection start token. 444 bool scanFlowCollectionStart(bool IsSequence); 445 446 /// Scan a ] or } and generate the proper flow collection end token. 447 bool scanFlowCollectionEnd(bool IsSequence); 448 449 /// Scan the , that separates entries in a flow collection. 450 bool scanFlowEntry(); 451 452 /// Scan the - that starts block sequence entries. 453 bool scanBlockEntry(); 454 455 /// Scan an explicit ? indicating a key. 456 bool scanKey(); 457 458 /// Scan an explicit : indicating a value. 459 bool scanValue(); 460 461 /// Scan a quoted scalar. 462 bool scanFlowScalar(bool IsDoubleQuoted); 463 464 /// Scan an unquoted scalar. 465 bool scanPlainScalar(); 466 467 /// Scan an Alias or Anchor starting with * or &. 468 bool scanAliasOrAnchor(bool IsAlias); 469 470 /// Scan a block scalar starting with | or >. 471 bool scanBlockScalar(bool IsLiteral); 472 473 /// Scan a chomping indicator in a block scalar header. 474 char scanBlockChompingIndicator(); 475 476 /// Scan an indentation indicator in a block scalar header. 477 unsigned scanBlockIndentationIndicator(); 478 479 /// Scan a block scalar header. 480 /// 481 /// Return false if an error occurred. 482 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 483 bool &IsDone); 484 485 /// Look for the indentation level of a block scalar. 486 /// 487 /// Return false if an error occurred. 488 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 489 unsigned &LineBreaks, bool &IsDone); 490 491 /// Scan the indentation of a text line in a block scalar. 492 /// 493 /// Return false if an error occurred. 494 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 495 bool &IsDone); 496 497 /// Scan a tag of the form !stuff. 498 bool scanTag(); 499 500 /// Dispatch to the next scanning function based on \a *Cur. 501 bool fetchMoreTokens(); 502 503 /// The SourceMgr used for diagnostics and buffer management. 504 SourceMgr &SM; 505 506 /// The original input. 507 MemoryBufferRef InputBuffer; 508 509 /// The current position of the scanner. 510 StringRef::iterator Current; 511 512 /// The end of the input (one past the last character). 513 StringRef::iterator End; 514 515 /// Current YAML indentation level in spaces. 516 int Indent; 517 518 /// Current column number in Unicode code points. 519 unsigned Column; 520 521 /// Current line number. 522 unsigned Line; 523 524 /// How deep we are in flow style containers. 0 Means at block level. 525 unsigned FlowLevel; 526 527 /// Are we at the start of the stream? 528 bool IsStartOfStream; 529 530 /// Can the next token be the start of a simple key? 531 bool IsSimpleKeyAllowed; 532 533 /// True if an error has occurred. 534 bool Failed; 535 536 /// Should colors be used when printing out the diagnostic messages? 537 bool ShowColors; 538 539 /// Queue of tokens. This is required to queue up tokens while looking 540 /// for the end of a simple key. And for cases where a single character 541 /// can produce multiple tokens (e.g. BlockEnd). 542 TokenQueueT TokenQueue; 543 544 /// Indentation levels. 545 SmallVector<int, 4> Indents; 546 547 /// Potential simple keys. 548 SmallVector<SimpleKey, 4> SimpleKeys; 549 550 std::error_code *EC; 551 }; 552 553 } // end namespace yaml 554 } // end namespace llvm 555 556 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 557 static void encodeUTF8( uint32_t UnicodeScalarValue 558 , SmallVectorImpl<char> &Result) { 559 if (UnicodeScalarValue <= 0x7F) { 560 Result.push_back(UnicodeScalarValue & 0x7F); 561 } else if (UnicodeScalarValue <= 0x7FF) { 562 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 563 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 564 Result.push_back(FirstByte); 565 Result.push_back(SecondByte); 566 } else if (UnicodeScalarValue <= 0xFFFF) { 567 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 568 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 569 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 570 Result.push_back(FirstByte); 571 Result.push_back(SecondByte); 572 Result.push_back(ThirdByte); 573 } else if (UnicodeScalarValue <= 0x10FFFF) { 574 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 575 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 576 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 577 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 578 Result.push_back(FirstByte); 579 Result.push_back(SecondByte); 580 Result.push_back(ThirdByte); 581 Result.push_back(FourthByte); 582 } 583 } 584 585 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 586 SourceMgr SM; 587 Scanner scanner(Input, SM); 588 while (true) { 589 Token T = scanner.getNext(); 590 switch (T.Kind) { 591 case Token::TK_StreamStart: 592 OS << "Stream-Start: "; 593 break; 594 case Token::TK_StreamEnd: 595 OS << "Stream-End: "; 596 break; 597 case Token::TK_VersionDirective: 598 OS << "Version-Directive: "; 599 break; 600 case Token::TK_TagDirective: 601 OS << "Tag-Directive: "; 602 break; 603 case Token::TK_DocumentStart: 604 OS << "Document-Start: "; 605 break; 606 case Token::TK_DocumentEnd: 607 OS << "Document-End: "; 608 break; 609 case Token::TK_BlockEntry: 610 OS << "Block-Entry: "; 611 break; 612 case Token::TK_BlockEnd: 613 OS << "Block-End: "; 614 break; 615 case Token::TK_BlockSequenceStart: 616 OS << "Block-Sequence-Start: "; 617 break; 618 case Token::TK_BlockMappingStart: 619 OS << "Block-Mapping-Start: "; 620 break; 621 case Token::TK_FlowEntry: 622 OS << "Flow-Entry: "; 623 break; 624 case Token::TK_FlowSequenceStart: 625 OS << "Flow-Sequence-Start: "; 626 break; 627 case Token::TK_FlowSequenceEnd: 628 OS << "Flow-Sequence-End: "; 629 break; 630 case Token::TK_FlowMappingStart: 631 OS << "Flow-Mapping-Start: "; 632 break; 633 case Token::TK_FlowMappingEnd: 634 OS << "Flow-Mapping-End: "; 635 break; 636 case Token::TK_Key: 637 OS << "Key: "; 638 break; 639 case Token::TK_Value: 640 OS << "Value: "; 641 break; 642 case Token::TK_Scalar: 643 OS << "Scalar: "; 644 break; 645 case Token::TK_BlockScalar: 646 OS << "Block Scalar: "; 647 break; 648 case Token::TK_Alias: 649 OS << "Alias: "; 650 break; 651 case Token::TK_Anchor: 652 OS << "Anchor: "; 653 break; 654 case Token::TK_Tag: 655 OS << "Tag: "; 656 break; 657 case Token::TK_Error: 658 break; 659 } 660 OS << T.Range << "\n"; 661 if (T.Kind == Token::TK_StreamEnd) 662 break; 663 else if (T.Kind == Token::TK_Error) 664 return false; 665 } 666 return true; 667 } 668 669 bool yaml::scanTokens(StringRef Input) { 670 SourceMgr SM; 671 Scanner scanner(Input, SM); 672 while (true) { 673 Token T = scanner.getNext(); 674 if (T.Kind == Token::TK_StreamEnd) 675 break; 676 else if (T.Kind == Token::TK_Error) 677 return false; 678 } 679 return true; 680 } 681 682 std::string yaml::escape(StringRef Input, bool EscapePrintable) { 683 std::string EscapedInput; 684 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 685 if (*i == '\\') 686 EscapedInput += "\\\\"; 687 else if (*i == '"') 688 EscapedInput += "\\\""; 689 else if (*i == 0) 690 EscapedInput += "\\0"; 691 else if (*i == 0x07) 692 EscapedInput += "\\a"; 693 else if (*i == 0x08) 694 EscapedInput += "\\b"; 695 else if (*i == 0x09) 696 EscapedInput += "\\t"; 697 else if (*i == 0x0A) 698 EscapedInput += "\\n"; 699 else if (*i == 0x0B) 700 EscapedInput += "\\v"; 701 else if (*i == 0x0C) 702 EscapedInput += "\\f"; 703 else if (*i == 0x0D) 704 EscapedInput += "\\r"; 705 else if (*i == 0x1B) 706 EscapedInput += "\\e"; 707 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 708 std::string HexStr = utohexstr(*i); 709 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 710 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 711 UTF8Decoded UnicodeScalarValue 712 = decodeUTF8(StringRef(i, Input.end() - i)); 713 if (UnicodeScalarValue.second == 0) { 714 // Found invalid char. 715 SmallString<4> Val; 716 encodeUTF8(0xFFFD, Val); 717 llvm::append_range(EscapedInput, Val); 718 // FIXME: Error reporting. 719 return EscapedInput; 720 } 721 if (UnicodeScalarValue.first == 0x85) 722 EscapedInput += "\\N"; 723 else if (UnicodeScalarValue.first == 0xA0) 724 EscapedInput += "\\_"; 725 else if (UnicodeScalarValue.first == 0x2028) 726 EscapedInput += "\\L"; 727 else if (UnicodeScalarValue.first == 0x2029) 728 EscapedInput += "\\P"; 729 else if (!EscapePrintable && 730 sys::unicode::isPrintable(UnicodeScalarValue.first)) 731 EscapedInput += StringRef(i, UnicodeScalarValue.second); 732 else { 733 std::string HexStr = utohexstr(UnicodeScalarValue.first); 734 if (HexStr.size() <= 2) 735 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 736 else if (HexStr.size() <= 4) 737 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 738 else if (HexStr.size() <= 8) 739 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 740 } 741 i += UnicodeScalarValue.second - 1; 742 } else 743 EscapedInput.push_back(*i); 744 } 745 return EscapedInput; 746 } 747 748 llvm::Optional<bool> yaml::parseBool(StringRef S) { 749 switch (S.size()) { 750 case 1: 751 switch (S.front()) { 752 case 'y': 753 case 'Y': 754 return true; 755 case 'n': 756 case 'N': 757 return false; 758 default: 759 return None; 760 } 761 case 2: 762 switch (S.front()) { 763 case 'O': 764 if (S[1] == 'N') // ON 765 return true; 766 LLVM_FALLTHROUGH; 767 case 'o': 768 if (S[1] == 'n') //[Oo]n 769 return true; 770 return None; 771 case 'N': 772 if (S[1] == 'O') // NO 773 return false; 774 LLVM_FALLTHROUGH; 775 case 'n': 776 if (S[1] == 'o') //[Nn]o 777 return false; 778 return None; 779 default: 780 return None; 781 } 782 case 3: 783 switch (S.front()) { 784 case 'O': 785 if (S.drop_front() == "FF") // OFF 786 return false; 787 LLVM_FALLTHROUGH; 788 case 'o': 789 if (S.drop_front() == "ff") //[Oo]ff 790 return false; 791 return None; 792 case 'Y': 793 if (S.drop_front() == "ES") // YES 794 return true; 795 LLVM_FALLTHROUGH; 796 case 'y': 797 if (S.drop_front() == "es") //[Yy]es 798 return true; 799 return None; 800 default: 801 return None; 802 } 803 case 4: 804 switch (S.front()) { 805 case 'T': 806 if (S.drop_front() == "RUE") // TRUE 807 return true; 808 LLVM_FALLTHROUGH; 809 case 't': 810 if (S.drop_front() == "rue") //[Tt]rue 811 return true; 812 return None; 813 default: 814 return None; 815 } 816 case 5: 817 switch (S.front()) { 818 case 'F': 819 if (S.drop_front() == "ALSE") // FALSE 820 return false; 821 LLVM_FALLTHROUGH; 822 case 'f': 823 if (S.drop_front() == "alse") //[Ff]alse 824 return false; 825 return None; 826 default: 827 return None; 828 } 829 default: 830 return None; 831 } 832 } 833 834 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, 835 std::error_code *EC) 836 : SM(sm), ShowColors(ShowColors), EC(EC) { 837 init(MemoryBufferRef(Input, "YAML")); 838 } 839 840 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, 841 std::error_code *EC) 842 : SM(SM_), ShowColors(ShowColors), EC(EC) { 843 init(Buffer); 844 } 845 846 void Scanner::init(MemoryBufferRef Buffer) { 847 InputBuffer = Buffer; 848 Current = InputBuffer.getBufferStart(); 849 End = InputBuffer.getBufferEnd(); 850 Indent = -1; 851 Column = 0; 852 Line = 0; 853 FlowLevel = 0; 854 IsStartOfStream = true; 855 IsSimpleKeyAllowed = true; 856 Failed = false; 857 std::unique_ptr<MemoryBuffer> InputBufferOwner = 858 MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false); 859 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 860 } 861 862 Token &Scanner::peekNext() { 863 // If the current token is a possible simple key, keep parsing until we 864 // can confirm. 865 bool NeedMore = false; 866 while (true) { 867 if (TokenQueue.empty() || NeedMore) { 868 if (!fetchMoreTokens()) { 869 TokenQueue.clear(); 870 SimpleKeys.clear(); 871 TokenQueue.push_back(Token()); 872 return TokenQueue.front(); 873 } 874 } 875 assert(!TokenQueue.empty() && 876 "fetchMoreTokens lied about getting tokens!"); 877 878 removeStaleSimpleKeyCandidates(); 879 SimpleKey SK; 880 SK.Tok = TokenQueue.begin(); 881 if (!is_contained(SimpleKeys, SK)) 882 break; 883 else 884 NeedMore = true; 885 } 886 return TokenQueue.front(); 887 } 888 889 Token Scanner::getNext() { 890 Token Ret = peekNext(); 891 // TokenQueue can be empty if there was an error getting the next token. 892 if (!TokenQueue.empty()) 893 TokenQueue.pop_front(); 894 895 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 896 // quick deallocation of them all. 897 if (TokenQueue.empty()) 898 TokenQueue.resetAlloc(); 899 900 return Ret; 901 } 902 903 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 904 if (Position == End) 905 return Position; 906 // Check 7 bit c-printable - b-char. 907 if ( *Position == 0x09 908 || (*Position >= 0x20 && *Position <= 0x7E)) 909 return Position + 1; 910 911 // Check for valid UTF-8. 912 if (uint8_t(*Position) & 0x80) { 913 UTF8Decoded u8d = decodeUTF8(Position); 914 if ( u8d.second != 0 915 && u8d.first != 0xFEFF 916 && ( u8d.first == 0x85 917 || ( u8d.first >= 0xA0 918 && u8d.first <= 0xD7FF) 919 || ( u8d.first >= 0xE000 920 && u8d.first <= 0xFFFD) 921 || ( u8d.first >= 0x10000 922 && u8d.first <= 0x10FFFF))) 923 return Position + u8d.second; 924 } 925 return Position; 926 } 927 928 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 929 if (Position == End) 930 return Position; 931 if (*Position == 0x0D) { 932 if (Position + 1 != End && *(Position + 1) == 0x0A) 933 return Position + 2; 934 return Position + 1; 935 } 936 937 if (*Position == 0x0A) 938 return Position + 1; 939 return Position; 940 } 941 942 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 943 if (Position == End) 944 return Position; 945 if (*Position == ' ') 946 return Position + 1; 947 return Position; 948 } 949 950 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 951 if (Position == End) 952 return Position; 953 if (*Position == ' ' || *Position == '\t') 954 return Position + 1; 955 return Position; 956 } 957 958 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 959 if (Position == End) 960 return Position; 961 if (*Position == ' ' || *Position == '\t') 962 return Position; 963 return skip_nb_char(Position); 964 } 965 966 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 967 , StringRef::iterator Position) { 968 while (true) { 969 StringRef::iterator i = (this->*Func)(Position); 970 if (i == Position) 971 break; 972 Position = i; 973 } 974 return Position; 975 } 976 977 void Scanner::advanceWhile(SkipWhileFunc Func) { 978 auto Final = skip_while(Func, Current); 979 Column += Final - Current; 980 Current = Final; 981 } 982 983 static bool is_ns_hex_digit(const char C) { return isAlnum(C); } 984 985 static bool is_ns_word_char(const char C) { return C == '-' || isAlpha(C); } 986 987 void Scanner::scan_ns_uri_char() { 988 while (true) { 989 if (Current == End) 990 break; 991 if (( *Current == '%' 992 && Current + 2 < End 993 && is_ns_hex_digit(*(Current + 1)) 994 && is_ns_hex_digit(*(Current + 2))) 995 || is_ns_word_char(*Current) 996 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 997 != StringRef::npos) { 998 ++Current; 999 ++Column; 1000 } else 1001 break; 1002 } 1003 } 1004 1005 bool Scanner::consume(uint32_t Expected) { 1006 if (Expected >= 0x80) { 1007 setError("Cannot consume non-ascii characters", Current); 1008 return false; 1009 } 1010 if (Current == End) 1011 return false; 1012 if (uint8_t(*Current) >= 0x80) { 1013 setError("Cannot consume non-ascii characters", Current); 1014 return false; 1015 } 1016 if (uint8_t(*Current) == Expected) { 1017 ++Current; 1018 ++Column; 1019 return true; 1020 } 1021 return false; 1022 } 1023 1024 void Scanner::skip(uint32_t Distance) { 1025 Current += Distance; 1026 Column += Distance; 1027 assert(Current <= End && "Skipped past the end"); 1028 } 1029 1030 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 1031 if (Position == End) 1032 return false; 1033 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 1034 *Position == '\n'; 1035 } 1036 1037 bool Scanner::consumeLineBreakIfPresent() { 1038 auto Next = skip_b_break(Current); 1039 if (Next == Current) 1040 return false; 1041 Column = 0; 1042 ++Line; 1043 Current = Next; 1044 return true; 1045 } 1046 1047 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 1048 , unsigned AtColumn 1049 , bool IsRequired) { 1050 if (IsSimpleKeyAllowed) { 1051 SimpleKey SK; 1052 SK.Tok = Tok; 1053 SK.Line = Line; 1054 SK.Column = AtColumn; 1055 SK.IsRequired = IsRequired; 1056 SK.FlowLevel = FlowLevel; 1057 SimpleKeys.push_back(SK); 1058 } 1059 } 1060 1061 void Scanner::removeStaleSimpleKeyCandidates() { 1062 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 1063 i != SimpleKeys.end();) { 1064 if (i->Line != Line || i->Column + 1024 < Column) { 1065 if (i->IsRequired) 1066 setError( "Could not find expected : for simple key" 1067 , i->Tok->Range.begin()); 1068 i = SimpleKeys.erase(i); 1069 } else 1070 ++i; 1071 } 1072 } 1073 1074 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 1075 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 1076 SimpleKeys.pop_back(); 1077 } 1078 1079 bool Scanner::unrollIndent(int ToColumn) { 1080 Token T; 1081 // Indentation is ignored in flow. 1082 if (FlowLevel != 0) 1083 return true; 1084 1085 while (Indent > ToColumn) { 1086 T.Kind = Token::TK_BlockEnd; 1087 T.Range = StringRef(Current, 1); 1088 TokenQueue.push_back(T); 1089 Indent = Indents.pop_back_val(); 1090 } 1091 1092 return true; 1093 } 1094 1095 bool Scanner::rollIndent( int ToColumn 1096 , Token::TokenKind Kind 1097 , TokenQueueT::iterator InsertPoint) { 1098 if (FlowLevel) 1099 return true; 1100 if (Indent < ToColumn) { 1101 Indents.push_back(Indent); 1102 Indent = ToColumn; 1103 1104 Token T; 1105 T.Kind = Kind; 1106 T.Range = StringRef(Current, 0); 1107 TokenQueue.insert(InsertPoint, T); 1108 } 1109 return true; 1110 } 1111 1112 void Scanner::skipComment() { 1113 if (Current == End || *Current != '#') 1114 return; 1115 while (true) { 1116 // This may skip more than one byte, thus Column is only incremented 1117 // for code points. 1118 StringRef::iterator I = skip_nb_char(Current); 1119 if (I == Current) 1120 break; 1121 Current = I; 1122 ++Column; 1123 } 1124 } 1125 1126 void Scanner::scanToNextToken() { 1127 while (true) { 1128 while (Current != End && (*Current == ' ' || *Current == '\t')) { 1129 skip(1); 1130 } 1131 1132 skipComment(); 1133 1134 // Skip EOL. 1135 StringRef::iterator i = skip_b_break(Current); 1136 if (i == Current) 1137 break; 1138 Current = i; 1139 ++Line; 1140 Column = 0; 1141 // New lines may start a simple key. 1142 if (!FlowLevel) 1143 IsSimpleKeyAllowed = true; 1144 } 1145 } 1146 1147 bool Scanner::scanStreamStart() { 1148 IsStartOfStream = false; 1149 1150 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1151 1152 Token T; 1153 T.Kind = Token::TK_StreamStart; 1154 T.Range = StringRef(Current, EI.second); 1155 TokenQueue.push_back(T); 1156 Current += EI.second; 1157 return true; 1158 } 1159 1160 bool Scanner::scanStreamEnd() { 1161 // Force an ending new line if one isn't present. 1162 if (Column != 0) { 1163 Column = 0; 1164 ++Line; 1165 } 1166 1167 unrollIndent(-1); 1168 SimpleKeys.clear(); 1169 IsSimpleKeyAllowed = false; 1170 1171 Token T; 1172 T.Kind = Token::TK_StreamEnd; 1173 T.Range = StringRef(Current, 0); 1174 TokenQueue.push_back(T); 1175 return true; 1176 } 1177 1178 bool Scanner::scanDirective() { 1179 // Reset the indentation level. 1180 unrollIndent(-1); 1181 SimpleKeys.clear(); 1182 IsSimpleKeyAllowed = false; 1183 1184 StringRef::iterator Start = Current; 1185 consume('%'); 1186 StringRef::iterator NameStart = Current; 1187 Current = skip_while(&Scanner::skip_ns_char, Current); 1188 StringRef Name(NameStart, Current - NameStart); 1189 Current = skip_while(&Scanner::skip_s_white, Current); 1190 1191 Token T; 1192 if (Name == "YAML") { 1193 Current = skip_while(&Scanner::skip_ns_char, Current); 1194 T.Kind = Token::TK_VersionDirective; 1195 T.Range = StringRef(Start, Current - Start); 1196 TokenQueue.push_back(T); 1197 return true; 1198 } else if(Name == "TAG") { 1199 Current = skip_while(&Scanner::skip_ns_char, Current); 1200 Current = skip_while(&Scanner::skip_s_white, Current); 1201 Current = skip_while(&Scanner::skip_ns_char, Current); 1202 T.Kind = Token::TK_TagDirective; 1203 T.Range = StringRef(Start, Current - Start); 1204 TokenQueue.push_back(T); 1205 return true; 1206 } 1207 return false; 1208 } 1209 1210 bool Scanner::scanDocumentIndicator(bool IsStart) { 1211 unrollIndent(-1); 1212 SimpleKeys.clear(); 1213 IsSimpleKeyAllowed = false; 1214 1215 Token T; 1216 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1217 T.Range = StringRef(Current, 3); 1218 skip(3); 1219 TokenQueue.push_back(T); 1220 return true; 1221 } 1222 1223 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1224 Token T; 1225 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1226 : Token::TK_FlowMappingStart; 1227 T.Range = StringRef(Current, 1); 1228 skip(1); 1229 TokenQueue.push_back(T); 1230 1231 // [ and { may begin a simple key. 1232 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1233 1234 // And may also be followed by a simple key. 1235 IsSimpleKeyAllowed = true; 1236 ++FlowLevel; 1237 return true; 1238 } 1239 1240 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1241 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1242 IsSimpleKeyAllowed = false; 1243 Token T; 1244 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1245 : Token::TK_FlowMappingEnd; 1246 T.Range = StringRef(Current, 1); 1247 skip(1); 1248 TokenQueue.push_back(T); 1249 if (FlowLevel) 1250 --FlowLevel; 1251 return true; 1252 } 1253 1254 bool Scanner::scanFlowEntry() { 1255 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1256 IsSimpleKeyAllowed = true; 1257 Token T; 1258 T.Kind = Token::TK_FlowEntry; 1259 T.Range = StringRef(Current, 1); 1260 skip(1); 1261 TokenQueue.push_back(T); 1262 return true; 1263 } 1264 1265 bool Scanner::scanBlockEntry() { 1266 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1267 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1268 IsSimpleKeyAllowed = true; 1269 Token T; 1270 T.Kind = Token::TK_BlockEntry; 1271 T.Range = StringRef(Current, 1); 1272 skip(1); 1273 TokenQueue.push_back(T); 1274 return true; 1275 } 1276 1277 bool Scanner::scanKey() { 1278 if (!FlowLevel) 1279 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1280 1281 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1282 IsSimpleKeyAllowed = !FlowLevel; 1283 1284 Token T; 1285 T.Kind = Token::TK_Key; 1286 T.Range = StringRef(Current, 1); 1287 skip(1); 1288 TokenQueue.push_back(T); 1289 return true; 1290 } 1291 1292 bool Scanner::scanValue() { 1293 // If the previous token could have been a simple key, insert the key token 1294 // into the token queue. 1295 if (!SimpleKeys.empty()) { 1296 SimpleKey SK = SimpleKeys.pop_back_val(); 1297 Token T; 1298 T.Kind = Token::TK_Key; 1299 T.Range = SK.Tok->Range; 1300 TokenQueueT::iterator i, e; 1301 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1302 if (i == SK.Tok) 1303 break; 1304 } 1305 if (i == e) { 1306 Failed = true; 1307 return false; 1308 } 1309 i = TokenQueue.insert(i, T); 1310 1311 // We may also need to add a Block-Mapping-Start token. 1312 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1313 1314 IsSimpleKeyAllowed = false; 1315 } else { 1316 if (!FlowLevel) 1317 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1318 IsSimpleKeyAllowed = !FlowLevel; 1319 } 1320 1321 Token T; 1322 T.Kind = Token::TK_Value; 1323 T.Range = StringRef(Current, 1); 1324 skip(1); 1325 TokenQueue.push_back(T); 1326 return true; 1327 } 1328 1329 // Forbidding inlining improves performance by roughly 20%. 1330 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1331 LLVM_ATTRIBUTE_NOINLINE static bool 1332 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1333 1334 // Returns whether a character at 'Position' was escaped with a leading '\'. 1335 // 'First' specifies the position of the first character in the string. 1336 static bool wasEscaped(StringRef::iterator First, 1337 StringRef::iterator Position) { 1338 assert(Position - 1 >= First); 1339 StringRef::iterator I = Position - 1; 1340 // We calculate the number of consecutive '\'s before the current position 1341 // by iterating backwards through our string. 1342 while (I >= First && *I == '\\') --I; 1343 // (Position - 1 - I) now contains the number of '\'s before the current 1344 // position. If it is odd, the character at 'Position' was escaped. 1345 return (Position - 1 - I) % 2 == 1; 1346 } 1347 1348 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1349 StringRef::iterator Start = Current; 1350 unsigned ColStart = Column; 1351 if (IsDoubleQuoted) { 1352 do { 1353 ++Current; 1354 while (Current != End && *Current != '"') 1355 ++Current; 1356 // Repeat until the previous character was not a '\' or was an escaped 1357 // backslash. 1358 } while ( Current != End 1359 && *(Current - 1) == '\\' 1360 && wasEscaped(Start + 1, Current)); 1361 } else { 1362 skip(1); 1363 while (Current != End) { 1364 // Skip a ' followed by another '. 1365 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1366 skip(2); 1367 continue; 1368 } else if (*Current == '\'') 1369 break; 1370 StringRef::iterator i = skip_nb_char(Current); 1371 if (i == Current) { 1372 i = skip_b_break(Current); 1373 if (i == Current) 1374 break; 1375 Current = i; 1376 Column = 0; 1377 ++Line; 1378 } else { 1379 if (i == End) 1380 break; 1381 Current = i; 1382 ++Column; 1383 } 1384 } 1385 } 1386 1387 if (Current == End) { 1388 setError("Expected quote at end of scalar", Current); 1389 return false; 1390 } 1391 1392 skip(1); // Skip ending quote. 1393 Token T; 1394 T.Kind = Token::TK_Scalar; 1395 T.Range = StringRef(Start, Current - Start); 1396 TokenQueue.push_back(T); 1397 1398 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1399 1400 IsSimpleKeyAllowed = false; 1401 1402 return true; 1403 } 1404 1405 bool Scanner::scanPlainScalar() { 1406 StringRef::iterator Start = Current; 1407 unsigned ColStart = Column; 1408 unsigned LeadingBlanks = 0; 1409 assert(Indent >= -1 && "Indent must be >= -1 !"); 1410 unsigned indent = static_cast<unsigned>(Indent + 1); 1411 while (Current != End) { 1412 if (*Current == '#') 1413 break; 1414 1415 while (Current != End && !isBlankOrBreak(Current)) { 1416 if (FlowLevel && *Current == ':' && 1417 (Current + 1 == End || 1418 !(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) { 1419 setError("Found unexpected ':' while scanning a plain scalar", Current); 1420 return false; 1421 } 1422 1423 // Check for the end of the plain scalar. 1424 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1425 || ( FlowLevel 1426 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1427 != StringRef::npos))) 1428 break; 1429 1430 StringRef::iterator i = skip_nb_char(Current); 1431 if (i == Current) 1432 break; 1433 Current = i; 1434 ++Column; 1435 } 1436 1437 // Are we at the end? 1438 if (!isBlankOrBreak(Current)) 1439 break; 1440 1441 // Eat blanks. 1442 StringRef::iterator Tmp = Current; 1443 while (isBlankOrBreak(Tmp)) { 1444 StringRef::iterator i = skip_s_white(Tmp); 1445 if (i != Tmp) { 1446 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1447 setError("Found invalid tab character in indentation", Tmp); 1448 return false; 1449 } 1450 Tmp = i; 1451 ++Column; 1452 } else { 1453 i = skip_b_break(Tmp); 1454 if (!LeadingBlanks) 1455 LeadingBlanks = 1; 1456 Tmp = i; 1457 Column = 0; 1458 ++Line; 1459 } 1460 } 1461 1462 if (!FlowLevel && Column < indent) 1463 break; 1464 1465 Current = Tmp; 1466 } 1467 if (Start == Current) { 1468 setError("Got empty plain scalar", Start); 1469 return false; 1470 } 1471 Token T; 1472 T.Kind = Token::TK_Scalar; 1473 T.Range = StringRef(Start, Current - Start); 1474 TokenQueue.push_back(T); 1475 1476 // Plain scalars can be simple keys. 1477 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1478 1479 IsSimpleKeyAllowed = false; 1480 1481 return true; 1482 } 1483 1484 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1485 StringRef::iterator Start = Current; 1486 unsigned ColStart = Column; 1487 skip(1); 1488 while (Current != End) { 1489 if ( *Current == '[' || *Current == ']' 1490 || *Current == '{' || *Current == '}' 1491 || *Current == ',' 1492 || *Current == ':') 1493 break; 1494 StringRef::iterator i = skip_ns_char(Current); 1495 if (i == Current) 1496 break; 1497 Current = i; 1498 ++Column; 1499 } 1500 1501 if (Start + 1 == Current) { 1502 setError("Got empty alias or anchor", Start); 1503 return false; 1504 } 1505 1506 Token T; 1507 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1508 T.Range = StringRef(Start, Current - Start); 1509 TokenQueue.push_back(T); 1510 1511 // Alias and anchors can be simple keys. 1512 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1513 1514 IsSimpleKeyAllowed = false; 1515 1516 return true; 1517 } 1518 1519 char Scanner::scanBlockChompingIndicator() { 1520 char Indicator = ' '; 1521 if (Current != End && (*Current == '+' || *Current == '-')) { 1522 Indicator = *Current; 1523 skip(1); 1524 } 1525 return Indicator; 1526 } 1527 1528 /// Get the number of line breaks after chomping. 1529 /// 1530 /// Return the number of trailing line breaks to emit, depending on 1531 /// \p ChompingIndicator. 1532 static unsigned getChompedLineBreaks(char ChompingIndicator, 1533 unsigned LineBreaks, StringRef Str) { 1534 if (ChompingIndicator == '-') // Strip all line breaks. 1535 return 0; 1536 if (ChompingIndicator == '+') // Keep all line breaks. 1537 return LineBreaks; 1538 // Clip trailing lines. 1539 return Str.empty() ? 0 : 1; 1540 } 1541 1542 unsigned Scanner::scanBlockIndentationIndicator() { 1543 unsigned Indent = 0; 1544 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1545 Indent = unsigned(*Current - '0'); 1546 skip(1); 1547 } 1548 return Indent; 1549 } 1550 1551 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1552 unsigned &IndentIndicator, bool &IsDone) { 1553 auto Start = Current; 1554 1555 ChompingIndicator = scanBlockChompingIndicator(); 1556 IndentIndicator = scanBlockIndentationIndicator(); 1557 // Check for the chomping indicator once again. 1558 if (ChompingIndicator == ' ') 1559 ChompingIndicator = scanBlockChompingIndicator(); 1560 Current = skip_while(&Scanner::skip_s_white, Current); 1561 skipComment(); 1562 1563 if (Current == End) { // EOF, we have an empty scalar. 1564 Token T; 1565 T.Kind = Token::TK_BlockScalar; 1566 T.Range = StringRef(Start, Current - Start); 1567 TokenQueue.push_back(T); 1568 IsDone = true; 1569 return true; 1570 } 1571 1572 if (!consumeLineBreakIfPresent()) { 1573 setError("Expected a line break after block scalar header", Current); 1574 return false; 1575 } 1576 return true; 1577 } 1578 1579 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1580 unsigned BlockExitIndent, 1581 unsigned &LineBreaks, bool &IsDone) { 1582 unsigned MaxAllSpaceLineCharacters = 0; 1583 StringRef::iterator LongestAllSpaceLine; 1584 1585 while (true) { 1586 advanceWhile(&Scanner::skip_s_space); 1587 if (skip_nb_char(Current) != Current) { 1588 // This line isn't empty, so try and find the indentation. 1589 if (Column <= BlockExitIndent) { // End of the block literal. 1590 IsDone = true; 1591 return true; 1592 } 1593 // We found the block's indentation. 1594 BlockIndent = Column; 1595 if (MaxAllSpaceLineCharacters > BlockIndent) { 1596 setError( 1597 "Leading all-spaces line must be smaller than the block indent", 1598 LongestAllSpaceLine); 1599 return false; 1600 } 1601 return true; 1602 } 1603 if (skip_b_break(Current) != Current && 1604 Column > MaxAllSpaceLineCharacters) { 1605 // Record the longest all-space line in case it's longer than the 1606 // discovered block indent. 1607 MaxAllSpaceLineCharacters = Column; 1608 LongestAllSpaceLine = Current; 1609 } 1610 1611 // Check for EOF. 1612 if (Current == End) { 1613 IsDone = true; 1614 return true; 1615 } 1616 1617 if (!consumeLineBreakIfPresent()) { 1618 IsDone = true; 1619 return true; 1620 } 1621 ++LineBreaks; 1622 } 1623 return true; 1624 } 1625 1626 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1627 unsigned BlockExitIndent, bool &IsDone) { 1628 // Skip the indentation. 1629 while (Column < BlockIndent) { 1630 auto I = skip_s_space(Current); 1631 if (I == Current) 1632 break; 1633 Current = I; 1634 ++Column; 1635 } 1636 1637 if (skip_nb_char(Current) == Current) 1638 return true; 1639 1640 if (Column <= BlockExitIndent) { // End of the block literal. 1641 IsDone = true; 1642 return true; 1643 } 1644 1645 if (Column < BlockIndent) { 1646 if (Current != End && *Current == '#') { // Trailing comment. 1647 IsDone = true; 1648 return true; 1649 } 1650 setError("A text line is less indented than the block scalar", Current); 1651 return false; 1652 } 1653 return true; // A normal text line. 1654 } 1655 1656 bool Scanner::scanBlockScalar(bool IsLiteral) { 1657 // Eat '|' or '>' 1658 assert(*Current == '|' || *Current == '>'); 1659 skip(1); 1660 1661 char ChompingIndicator; 1662 unsigned BlockIndent; 1663 bool IsDone = false; 1664 if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone)) 1665 return false; 1666 if (IsDone) 1667 return true; 1668 1669 auto Start = Current; 1670 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; 1671 unsigned LineBreaks = 0; 1672 if (BlockIndent == 0) { 1673 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, 1674 IsDone)) 1675 return false; 1676 } 1677 1678 // Scan the block's scalars body. 1679 SmallString<256> Str; 1680 while (!IsDone) { 1681 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) 1682 return false; 1683 if (IsDone) 1684 break; 1685 1686 // Parse the current line. 1687 auto LineStart = Current; 1688 advanceWhile(&Scanner::skip_nb_char); 1689 if (LineStart != Current) { 1690 Str.append(LineBreaks, '\n'); 1691 Str.append(StringRef(LineStart, Current - LineStart)); 1692 LineBreaks = 0; 1693 } 1694 1695 // Check for EOF. 1696 if (Current == End) 1697 break; 1698 1699 if (!consumeLineBreakIfPresent()) 1700 break; 1701 ++LineBreaks; 1702 } 1703 1704 if (Current == End && !LineBreaks) 1705 // Ensure that there is at least one line break before the end of file. 1706 LineBreaks = 1; 1707 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1708 1709 // New lines may start a simple key. 1710 if (!FlowLevel) 1711 IsSimpleKeyAllowed = true; 1712 1713 Token T; 1714 T.Kind = Token::TK_BlockScalar; 1715 T.Range = StringRef(Start, Current - Start); 1716 T.Value = std::string(Str); 1717 TokenQueue.push_back(T); 1718 return true; 1719 } 1720 1721 bool Scanner::scanTag() { 1722 StringRef::iterator Start = Current; 1723 unsigned ColStart = Column; 1724 skip(1); // Eat !. 1725 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1726 else if (*Current == '<') { 1727 skip(1); 1728 scan_ns_uri_char(); 1729 if (!consume('>')) 1730 return false; 1731 } else { 1732 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1733 Current = skip_while(&Scanner::skip_ns_char, Current); 1734 } 1735 1736 Token T; 1737 T.Kind = Token::TK_Tag; 1738 T.Range = StringRef(Start, Current - Start); 1739 TokenQueue.push_back(T); 1740 1741 // Tags can be simple keys. 1742 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1743 1744 IsSimpleKeyAllowed = false; 1745 1746 return true; 1747 } 1748 1749 bool Scanner::fetchMoreTokens() { 1750 if (IsStartOfStream) 1751 return scanStreamStart(); 1752 1753 scanToNextToken(); 1754 1755 if (Current == End) 1756 return scanStreamEnd(); 1757 1758 removeStaleSimpleKeyCandidates(); 1759 1760 unrollIndent(Column); 1761 1762 if (Column == 0 && *Current == '%') 1763 return scanDirective(); 1764 1765 if (Column == 0 && Current + 4 <= End 1766 && *Current == '-' 1767 && *(Current + 1) == '-' 1768 && *(Current + 2) == '-' 1769 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1770 return scanDocumentIndicator(true); 1771 1772 if (Column == 0 && Current + 4 <= End 1773 && *Current == '.' 1774 && *(Current + 1) == '.' 1775 && *(Current + 2) == '.' 1776 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1777 return scanDocumentIndicator(false); 1778 1779 if (*Current == '[') 1780 return scanFlowCollectionStart(true); 1781 1782 if (*Current == '{') 1783 return scanFlowCollectionStart(false); 1784 1785 if (*Current == ']') 1786 return scanFlowCollectionEnd(true); 1787 1788 if (*Current == '}') 1789 return scanFlowCollectionEnd(false); 1790 1791 if (*Current == ',') 1792 return scanFlowEntry(); 1793 1794 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1795 return scanBlockEntry(); 1796 1797 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1798 return scanKey(); 1799 1800 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1801 return scanValue(); 1802 1803 if (*Current == '*') 1804 return scanAliasOrAnchor(true); 1805 1806 if (*Current == '&') 1807 return scanAliasOrAnchor(false); 1808 1809 if (*Current == '!') 1810 return scanTag(); 1811 1812 if (*Current == '|' && !FlowLevel) 1813 return scanBlockScalar(true); 1814 1815 if (*Current == '>' && !FlowLevel) 1816 return scanBlockScalar(false); 1817 1818 if (*Current == '\'') 1819 return scanFlowScalar(false); 1820 1821 if (*Current == '"') 1822 return scanFlowScalar(true); 1823 1824 // Get a plain scalar. 1825 StringRef FirstChar(Current, 1); 1826 if (!(isBlankOrBreak(Current) 1827 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1828 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1829 || (!FlowLevel && (*Current == '?' || *Current == ':') 1830 && isBlankOrBreak(Current + 1)) 1831 || (!FlowLevel && *Current == ':' 1832 && Current + 2 < End 1833 && *(Current + 1) == ':' 1834 && !isBlankOrBreak(Current + 2))) 1835 return scanPlainScalar(); 1836 1837 setError("Unrecognized character while tokenizing.", Current); 1838 return false; 1839 } 1840 1841 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, 1842 std::error_code *EC) 1843 : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {} 1844 1845 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, 1846 std::error_code *EC) 1847 : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {} 1848 1849 Stream::~Stream() = default; 1850 1851 bool Stream::failed() { return scanner->failed(); } 1852 1853 void Stream::printError(Node *N, const Twine &Msg, SourceMgr::DiagKind Kind) { 1854 printError(N ? N->getSourceRange() : SMRange(), Msg, Kind); 1855 } 1856 1857 void Stream::printError(const SMRange &Range, const Twine &Msg, 1858 SourceMgr::DiagKind Kind) { 1859 scanner->printError(Range.Start, Kind, Msg, Range); 1860 } 1861 1862 document_iterator Stream::begin() { 1863 if (CurrentDoc) 1864 report_fatal_error("Can only iterate over the stream once"); 1865 1866 // Skip Stream-Start. 1867 scanner->getNext(); 1868 1869 CurrentDoc.reset(new Document(*this)); 1870 return document_iterator(CurrentDoc); 1871 } 1872 1873 document_iterator Stream::end() { 1874 return document_iterator(); 1875 } 1876 1877 void Stream::skip() { 1878 for (Document &Doc : *this) 1879 Doc.skip(); 1880 } 1881 1882 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1883 StringRef T) 1884 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1885 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1886 SourceRange = SMRange(Start, Start); 1887 } 1888 1889 std::string Node::getVerbatimTag() const { 1890 StringRef Raw = getRawTag(); 1891 if (!Raw.empty() && Raw != "!") { 1892 std::string Ret; 1893 if (Raw.find_last_of('!') == 0) { 1894 Ret = std::string(Doc->getTagMap().find("!")->second); 1895 Ret += Raw.substr(1); 1896 return Ret; 1897 } else if (Raw.startswith("!!")) { 1898 Ret = std::string(Doc->getTagMap().find("!!")->second); 1899 Ret += Raw.substr(2); 1900 return Ret; 1901 } else { 1902 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1903 std::map<StringRef, StringRef>::const_iterator It = 1904 Doc->getTagMap().find(TagHandle); 1905 if (It != Doc->getTagMap().end()) 1906 Ret = std::string(It->second); 1907 else { 1908 Token T; 1909 T.Kind = Token::TK_Tag; 1910 T.Range = TagHandle; 1911 setError(Twine("Unknown tag handle ") + TagHandle, T); 1912 } 1913 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1914 return Ret; 1915 } 1916 } 1917 1918 switch (getType()) { 1919 case NK_Null: 1920 return "tag:yaml.org,2002:null"; 1921 case NK_Scalar: 1922 case NK_BlockScalar: 1923 // TODO: Tag resolution. 1924 return "tag:yaml.org,2002:str"; 1925 case NK_Mapping: 1926 return "tag:yaml.org,2002:map"; 1927 case NK_Sequence: 1928 return "tag:yaml.org,2002:seq"; 1929 } 1930 1931 return ""; 1932 } 1933 1934 Token &Node::peekNext() { 1935 return Doc->peekNext(); 1936 } 1937 1938 Token Node::getNext() { 1939 return Doc->getNext(); 1940 } 1941 1942 Node *Node::parseBlockNode() { 1943 return Doc->parseBlockNode(); 1944 } 1945 1946 BumpPtrAllocator &Node::getAllocator() { 1947 return Doc->NodeAllocator; 1948 } 1949 1950 void Node::setError(const Twine &Msg, Token &Tok) const { 1951 Doc->setError(Msg, Tok); 1952 } 1953 1954 bool Node::failed() const { 1955 return Doc->failed(); 1956 } 1957 1958 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1959 // TODO: Handle newlines properly. We need to remove leading whitespace. 1960 if (Value[0] == '"') { // Double quoted. 1961 // Pull off the leading and trailing "s. 1962 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1963 // Search for characters that would require unescaping the value. 1964 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1965 if (i != StringRef::npos) 1966 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1967 return UnquotedValue; 1968 } else if (Value[0] == '\'') { // Single quoted. 1969 // Pull off the leading and trailing 's. 1970 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1971 StringRef::size_type i = UnquotedValue.find('\''); 1972 if (i != StringRef::npos) { 1973 // We're going to need Storage. 1974 Storage.clear(); 1975 Storage.reserve(UnquotedValue.size()); 1976 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1977 StringRef Valid(UnquotedValue.begin(), i); 1978 llvm::append_range(Storage, Valid); 1979 Storage.push_back('\''); 1980 UnquotedValue = UnquotedValue.substr(i + 2); 1981 } 1982 llvm::append_range(Storage, UnquotedValue); 1983 return StringRef(Storage.begin(), Storage.size()); 1984 } 1985 return UnquotedValue; 1986 } 1987 // Plain or block. 1988 return Value.rtrim(' '); 1989 } 1990 1991 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1992 , StringRef::size_type i 1993 , SmallVectorImpl<char> &Storage) 1994 const { 1995 // Use Storage to build proper value. 1996 Storage.clear(); 1997 Storage.reserve(UnquotedValue.size()); 1998 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1999 // Insert all previous chars into Storage. 2000 StringRef Valid(UnquotedValue.begin(), i); 2001 llvm::append_range(Storage, Valid); 2002 // Chop off inserted chars. 2003 UnquotedValue = UnquotedValue.substr(i); 2004 2005 assert(!UnquotedValue.empty() && "Can't be empty!"); 2006 2007 // Parse escape or line break. 2008 switch (UnquotedValue[0]) { 2009 case '\r': 2010 case '\n': 2011 Storage.push_back('\n'); 2012 if ( UnquotedValue.size() > 1 2013 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 2014 UnquotedValue = UnquotedValue.substr(1); 2015 UnquotedValue = UnquotedValue.substr(1); 2016 break; 2017 default: 2018 if (UnquotedValue.size() == 1) { 2019 Token T; 2020 T.Range = StringRef(UnquotedValue.begin(), 1); 2021 setError("Unrecognized escape code", T); 2022 return ""; 2023 } 2024 UnquotedValue = UnquotedValue.substr(1); 2025 switch (UnquotedValue[0]) { 2026 default: { 2027 Token T; 2028 T.Range = StringRef(UnquotedValue.begin(), 1); 2029 setError("Unrecognized escape code", T); 2030 return ""; 2031 } 2032 case '\r': 2033 case '\n': 2034 // Remove the new line. 2035 if ( UnquotedValue.size() > 1 2036 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 2037 UnquotedValue = UnquotedValue.substr(1); 2038 // If this was just a single byte newline, it will get skipped 2039 // below. 2040 break; 2041 case '0': 2042 Storage.push_back(0x00); 2043 break; 2044 case 'a': 2045 Storage.push_back(0x07); 2046 break; 2047 case 'b': 2048 Storage.push_back(0x08); 2049 break; 2050 case 't': 2051 case 0x09: 2052 Storage.push_back(0x09); 2053 break; 2054 case 'n': 2055 Storage.push_back(0x0A); 2056 break; 2057 case 'v': 2058 Storage.push_back(0x0B); 2059 break; 2060 case 'f': 2061 Storage.push_back(0x0C); 2062 break; 2063 case 'r': 2064 Storage.push_back(0x0D); 2065 break; 2066 case 'e': 2067 Storage.push_back(0x1B); 2068 break; 2069 case ' ': 2070 Storage.push_back(0x20); 2071 break; 2072 case '"': 2073 Storage.push_back(0x22); 2074 break; 2075 case '/': 2076 Storage.push_back(0x2F); 2077 break; 2078 case '\\': 2079 Storage.push_back(0x5C); 2080 break; 2081 case 'N': 2082 encodeUTF8(0x85, Storage); 2083 break; 2084 case '_': 2085 encodeUTF8(0xA0, Storage); 2086 break; 2087 case 'L': 2088 encodeUTF8(0x2028, Storage); 2089 break; 2090 case 'P': 2091 encodeUTF8(0x2029, Storage); 2092 break; 2093 case 'x': { 2094 if (UnquotedValue.size() < 3) 2095 // TODO: Report error. 2096 break; 2097 unsigned int UnicodeScalarValue; 2098 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 2099 // TODO: Report error. 2100 UnicodeScalarValue = 0xFFFD; 2101 encodeUTF8(UnicodeScalarValue, Storage); 2102 UnquotedValue = UnquotedValue.substr(2); 2103 break; 2104 } 2105 case 'u': { 2106 if (UnquotedValue.size() < 5) 2107 // TODO: Report error. 2108 break; 2109 unsigned int UnicodeScalarValue; 2110 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 2111 // TODO: Report error. 2112 UnicodeScalarValue = 0xFFFD; 2113 encodeUTF8(UnicodeScalarValue, Storage); 2114 UnquotedValue = UnquotedValue.substr(4); 2115 break; 2116 } 2117 case 'U': { 2118 if (UnquotedValue.size() < 9) 2119 // TODO: Report error. 2120 break; 2121 unsigned int UnicodeScalarValue; 2122 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2123 // TODO: Report error. 2124 UnicodeScalarValue = 0xFFFD; 2125 encodeUTF8(UnicodeScalarValue, Storage); 2126 UnquotedValue = UnquotedValue.substr(8); 2127 break; 2128 } 2129 } 2130 UnquotedValue = UnquotedValue.substr(1); 2131 } 2132 } 2133 llvm::append_range(Storage, UnquotedValue); 2134 return StringRef(Storage.begin(), Storage.size()); 2135 } 2136 2137 Node *KeyValueNode::getKey() { 2138 if (Key) 2139 return Key; 2140 // Handle implicit null keys. 2141 { 2142 Token &t = peekNext(); 2143 if ( t.Kind == Token::TK_BlockEnd 2144 || t.Kind == Token::TK_Value 2145 || t.Kind == Token::TK_Error) { 2146 return Key = new (getAllocator()) NullNode(Doc); 2147 } 2148 if (t.Kind == Token::TK_Key) 2149 getNext(); // skip TK_Key. 2150 } 2151 2152 // Handle explicit null keys. 2153 Token &t = peekNext(); 2154 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2155 return Key = new (getAllocator()) NullNode(Doc); 2156 } 2157 2158 // We've got a normal key. 2159 return Key = parseBlockNode(); 2160 } 2161 2162 Node *KeyValueNode::getValue() { 2163 if (Value) 2164 return Value; 2165 2166 if (Node* Key = getKey()) 2167 Key->skip(); 2168 else { 2169 setError("Null key in Key Value.", peekNext()); 2170 return Value = new (getAllocator()) NullNode(Doc); 2171 } 2172 2173 if (failed()) 2174 return Value = new (getAllocator()) NullNode(Doc); 2175 2176 // Handle implicit null values. 2177 { 2178 Token &t = peekNext(); 2179 if ( t.Kind == Token::TK_BlockEnd 2180 || t.Kind == Token::TK_FlowMappingEnd 2181 || t.Kind == Token::TK_Key 2182 || t.Kind == Token::TK_FlowEntry 2183 || t.Kind == Token::TK_Error) { 2184 return Value = new (getAllocator()) NullNode(Doc); 2185 } 2186 2187 if (t.Kind != Token::TK_Value) { 2188 setError("Unexpected token in Key Value.", t); 2189 return Value = new (getAllocator()) NullNode(Doc); 2190 } 2191 getNext(); // skip TK_Value. 2192 } 2193 2194 // Handle explicit null values. 2195 Token &t = peekNext(); 2196 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 2197 return Value = new (getAllocator()) NullNode(Doc); 2198 } 2199 2200 // We got a normal value. 2201 return Value = parseBlockNode(); 2202 } 2203 2204 void MappingNode::increment() { 2205 if (failed()) { 2206 IsAtEnd = true; 2207 CurrentEntry = nullptr; 2208 return; 2209 } 2210 if (CurrentEntry) { 2211 CurrentEntry->skip(); 2212 if (Type == MT_Inline) { 2213 IsAtEnd = true; 2214 CurrentEntry = nullptr; 2215 return; 2216 } 2217 } 2218 Token T = peekNext(); 2219 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 2220 // KeyValueNode eats the TK_Key. That way it can detect null keys. 2221 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 2222 } else if (Type == MT_Block) { 2223 switch (T.Kind) { 2224 case Token::TK_BlockEnd: 2225 getNext(); 2226 IsAtEnd = true; 2227 CurrentEntry = nullptr; 2228 break; 2229 default: 2230 setError("Unexpected token. Expected Key or Block End", T); 2231 LLVM_FALLTHROUGH; 2232 case Token::TK_Error: 2233 IsAtEnd = true; 2234 CurrentEntry = nullptr; 2235 } 2236 } else { 2237 switch (T.Kind) { 2238 case Token::TK_FlowEntry: 2239 // Eat the flow entry and recurse. 2240 getNext(); 2241 return increment(); 2242 case Token::TK_FlowMappingEnd: 2243 getNext(); 2244 LLVM_FALLTHROUGH; 2245 case Token::TK_Error: 2246 // Set this to end iterator. 2247 IsAtEnd = true; 2248 CurrentEntry = nullptr; 2249 break; 2250 default: 2251 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 2252 "Mapping End." 2253 , T); 2254 IsAtEnd = true; 2255 CurrentEntry = nullptr; 2256 } 2257 } 2258 } 2259 2260 void SequenceNode::increment() { 2261 if (failed()) { 2262 IsAtEnd = true; 2263 CurrentEntry = nullptr; 2264 return; 2265 } 2266 if (CurrentEntry) 2267 CurrentEntry->skip(); 2268 Token T = peekNext(); 2269 if (SeqType == ST_Block) { 2270 switch (T.Kind) { 2271 case Token::TK_BlockEntry: 2272 getNext(); 2273 CurrentEntry = parseBlockNode(); 2274 if (!CurrentEntry) { // An error occurred. 2275 IsAtEnd = true; 2276 CurrentEntry = nullptr; 2277 } 2278 break; 2279 case Token::TK_BlockEnd: 2280 getNext(); 2281 IsAtEnd = true; 2282 CurrentEntry = nullptr; 2283 break; 2284 default: 2285 setError( "Unexpected token. Expected Block Entry or Block End." 2286 , T); 2287 LLVM_FALLTHROUGH; 2288 case Token::TK_Error: 2289 IsAtEnd = true; 2290 CurrentEntry = nullptr; 2291 } 2292 } else if (SeqType == ST_Indentless) { 2293 switch (T.Kind) { 2294 case Token::TK_BlockEntry: 2295 getNext(); 2296 CurrentEntry = parseBlockNode(); 2297 if (!CurrentEntry) { // An error occurred. 2298 IsAtEnd = true; 2299 CurrentEntry = nullptr; 2300 } 2301 break; 2302 default: 2303 case Token::TK_Error: 2304 IsAtEnd = true; 2305 CurrentEntry = nullptr; 2306 } 2307 } else if (SeqType == ST_Flow) { 2308 switch (T.Kind) { 2309 case Token::TK_FlowEntry: 2310 // Eat the flow entry and recurse. 2311 getNext(); 2312 WasPreviousTokenFlowEntry = true; 2313 return increment(); 2314 case Token::TK_FlowSequenceEnd: 2315 getNext(); 2316 LLVM_FALLTHROUGH; 2317 case Token::TK_Error: 2318 // Set this to end iterator. 2319 IsAtEnd = true; 2320 CurrentEntry = nullptr; 2321 break; 2322 case Token::TK_StreamEnd: 2323 case Token::TK_DocumentEnd: 2324 case Token::TK_DocumentStart: 2325 setError("Could not find closing ]!", T); 2326 // Set this to end iterator. 2327 IsAtEnd = true; 2328 CurrentEntry = nullptr; 2329 break; 2330 default: 2331 if (!WasPreviousTokenFlowEntry) { 2332 setError("Expected , between entries!", T); 2333 IsAtEnd = true; 2334 CurrentEntry = nullptr; 2335 break; 2336 } 2337 // Otherwise it must be a flow entry. 2338 CurrentEntry = parseBlockNode(); 2339 if (!CurrentEntry) { 2340 IsAtEnd = true; 2341 } 2342 WasPreviousTokenFlowEntry = false; 2343 break; 2344 } 2345 } 2346 } 2347 2348 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2349 // Tag maps starts with two default mappings. 2350 TagMap["!"] = "!"; 2351 TagMap["!!"] = "tag:yaml.org,2002:"; 2352 2353 if (parseDirectives()) 2354 expectToken(Token::TK_DocumentStart); 2355 Token &T = peekNext(); 2356 if (T.Kind == Token::TK_DocumentStart) 2357 getNext(); 2358 } 2359 2360 bool Document::skip() { 2361 if (stream.scanner->failed()) 2362 return false; 2363 if (!Root && !getRoot()) 2364 return false; 2365 Root->skip(); 2366 Token &T = peekNext(); 2367 if (T.Kind == Token::TK_StreamEnd) 2368 return false; 2369 if (T.Kind == Token::TK_DocumentEnd) { 2370 getNext(); 2371 return skip(); 2372 } 2373 return true; 2374 } 2375 2376 Token &Document::peekNext() { 2377 return stream.scanner->peekNext(); 2378 } 2379 2380 Token Document::getNext() { 2381 return stream.scanner->getNext(); 2382 } 2383 2384 void Document::setError(const Twine &Message, Token &Location) const { 2385 stream.scanner->setError(Message, Location.Range.begin()); 2386 } 2387 2388 bool Document::failed() const { 2389 return stream.scanner->failed(); 2390 } 2391 2392 Node *Document::parseBlockNode() { 2393 Token T = peekNext(); 2394 // Handle properties. 2395 Token AnchorInfo; 2396 Token TagInfo; 2397 parse_property: 2398 switch (T.Kind) { 2399 case Token::TK_Alias: 2400 getNext(); 2401 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2402 case Token::TK_Anchor: 2403 if (AnchorInfo.Kind == Token::TK_Anchor) { 2404 setError("Already encountered an anchor for this node!", T); 2405 return nullptr; 2406 } 2407 AnchorInfo = getNext(); // Consume TK_Anchor. 2408 T = peekNext(); 2409 goto parse_property; 2410 case Token::TK_Tag: 2411 if (TagInfo.Kind == Token::TK_Tag) { 2412 setError("Already encountered a tag for this node!", T); 2413 return nullptr; 2414 } 2415 TagInfo = getNext(); // Consume TK_Tag. 2416 T = peekNext(); 2417 goto parse_property; 2418 default: 2419 break; 2420 } 2421 2422 switch (T.Kind) { 2423 case Token::TK_BlockEntry: 2424 // We got an unindented BlockEntry sequence. This is not terminated with 2425 // a BlockEnd. 2426 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2427 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2428 , AnchorInfo.Range.substr(1) 2429 , TagInfo.Range 2430 , SequenceNode::ST_Indentless); 2431 case Token::TK_BlockSequenceStart: 2432 getNext(); 2433 return new (NodeAllocator) 2434 SequenceNode( stream.CurrentDoc 2435 , AnchorInfo.Range.substr(1) 2436 , TagInfo.Range 2437 , SequenceNode::ST_Block); 2438 case Token::TK_BlockMappingStart: 2439 getNext(); 2440 return new (NodeAllocator) 2441 MappingNode( stream.CurrentDoc 2442 , AnchorInfo.Range.substr(1) 2443 , TagInfo.Range 2444 , MappingNode::MT_Block); 2445 case Token::TK_FlowSequenceStart: 2446 getNext(); 2447 return new (NodeAllocator) 2448 SequenceNode( stream.CurrentDoc 2449 , AnchorInfo.Range.substr(1) 2450 , TagInfo.Range 2451 , SequenceNode::ST_Flow); 2452 case Token::TK_FlowMappingStart: 2453 getNext(); 2454 return new (NodeAllocator) 2455 MappingNode( stream.CurrentDoc 2456 , AnchorInfo.Range.substr(1) 2457 , TagInfo.Range 2458 , MappingNode::MT_Flow); 2459 case Token::TK_Scalar: 2460 getNext(); 2461 return new (NodeAllocator) 2462 ScalarNode( stream.CurrentDoc 2463 , AnchorInfo.Range.substr(1) 2464 , TagInfo.Range 2465 , T.Range); 2466 case Token::TK_BlockScalar: { 2467 getNext(); 2468 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2469 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2470 return new (NodeAllocator) 2471 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2472 TagInfo.Range, StrCopy, T.Range); 2473 } 2474 case Token::TK_Key: 2475 // Don't eat the TK_Key, KeyValueNode expects it. 2476 return new (NodeAllocator) 2477 MappingNode( stream.CurrentDoc 2478 , AnchorInfo.Range.substr(1) 2479 , TagInfo.Range 2480 , MappingNode::MT_Inline); 2481 case Token::TK_DocumentStart: 2482 case Token::TK_DocumentEnd: 2483 case Token::TK_StreamEnd: 2484 default: 2485 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2486 // !!null null. 2487 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2488 case Token::TK_FlowMappingEnd: 2489 case Token::TK_FlowSequenceEnd: 2490 case Token::TK_FlowEntry: { 2491 if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root))) 2492 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2493 2494 setError("Unexpected token", T); 2495 return nullptr; 2496 } 2497 case Token::TK_Error: 2498 return nullptr; 2499 } 2500 llvm_unreachable("Control flow shouldn't reach here."); 2501 return nullptr; 2502 } 2503 2504 bool Document::parseDirectives() { 2505 bool isDirective = false; 2506 while (true) { 2507 Token T = peekNext(); 2508 if (T.Kind == Token::TK_TagDirective) { 2509 parseTAGDirective(); 2510 isDirective = true; 2511 } else if (T.Kind == Token::TK_VersionDirective) { 2512 parseYAMLDirective(); 2513 isDirective = true; 2514 } else 2515 break; 2516 } 2517 return isDirective; 2518 } 2519 2520 void Document::parseYAMLDirective() { 2521 getNext(); // Eat %YAML <version> 2522 } 2523 2524 void Document::parseTAGDirective() { 2525 Token Tag = getNext(); // %TAG <handle> <prefix> 2526 StringRef T = Tag.Range; 2527 // Strip %TAG 2528 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2529 std::size_t HandleEnd = T.find_first_of(" \t"); 2530 StringRef TagHandle = T.substr(0, HandleEnd); 2531 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2532 TagMap[TagHandle] = TagPrefix; 2533 } 2534 2535 bool Document::expectToken(int TK) { 2536 Token T = getNext(); 2537 if (T.Kind != TK) { 2538 setError("Unexpected token", T); 2539 return false; 2540 } 2541 return true; 2542 } 2543