1 //===- YAMLParser.cpp - Simple YAML parser --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements a YAML parser. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "llvm/Support/YAMLParser.h" 14 #include "llvm/ADT/AllocatorList.h" 15 #include "llvm/ADT/ArrayRef.h" 16 #include "llvm/ADT/None.h" 17 #include "llvm/ADT/STLExtras.h" 18 #include "llvm/ADT/SmallString.h" 19 #include "llvm/ADT/SmallVector.h" 20 #include "llvm/ADT/StringExtras.h" 21 #include "llvm/ADT/StringRef.h" 22 #include "llvm/ADT/Twine.h" 23 #include "llvm/Support/Compiler.h" 24 #include "llvm/Support/ErrorHandling.h" 25 #include "llvm/Support/MemoryBuffer.h" 26 #include "llvm/Support/SMLoc.h" 27 #include "llvm/Support/SourceMgr.h" 28 #include "llvm/Support/Unicode.h" 29 #include "llvm/Support/raw_ostream.h" 30 #include <cassert> 31 #include <cstddef> 32 #include <cstdint> 33 #include <map> 34 #include <memory> 35 #include <string> 36 #include <system_error> 37 #include <utility> 38 39 using namespace llvm; 40 using namespace yaml; 41 42 enum UnicodeEncodingForm { 43 UEF_UTF32_LE, ///< UTF-32 Little Endian 44 UEF_UTF32_BE, ///< UTF-32 Big Endian 45 UEF_UTF16_LE, ///< UTF-16 Little Endian 46 UEF_UTF16_BE, ///< UTF-16 Big Endian 47 UEF_UTF8, ///< UTF-8 or ascii. 48 UEF_Unknown ///< Not a valid Unicode encoding. 49 }; 50 51 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 52 /// it exists. Length is in {0, 2, 3, 4}. 53 using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>; 54 55 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 56 /// encoding form of \a Input. 57 /// 58 /// @param Input A string of length 0 or more. 59 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 60 /// and how long the byte order mark is if one exists. 61 static EncodingInfo getUnicodeEncoding(StringRef Input) { 62 if (Input.empty()) 63 return std::make_pair(UEF_Unknown, 0); 64 65 switch (uint8_t(Input[0])) { 66 case 0x00: 67 if (Input.size() >= 4) { 68 if ( Input[1] == 0 69 && uint8_t(Input[2]) == 0xFE 70 && uint8_t(Input[3]) == 0xFF) 71 return std::make_pair(UEF_UTF32_BE, 4); 72 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 73 return std::make_pair(UEF_UTF32_BE, 0); 74 } 75 76 if (Input.size() >= 2 && Input[1] != 0) 77 return std::make_pair(UEF_UTF16_BE, 0); 78 return std::make_pair(UEF_Unknown, 0); 79 case 0xFF: 80 if ( Input.size() >= 4 81 && uint8_t(Input[1]) == 0xFE 82 && Input[2] == 0 83 && Input[3] == 0) 84 return std::make_pair(UEF_UTF32_LE, 4); 85 86 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 87 return std::make_pair(UEF_UTF16_LE, 2); 88 return std::make_pair(UEF_Unknown, 0); 89 case 0xFE: 90 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 91 return std::make_pair(UEF_UTF16_BE, 2); 92 return std::make_pair(UEF_Unknown, 0); 93 case 0xEF: 94 if ( Input.size() >= 3 95 && uint8_t(Input[1]) == 0xBB 96 && uint8_t(Input[2]) == 0xBF) 97 return std::make_pair(UEF_UTF8, 3); 98 return std::make_pair(UEF_Unknown, 0); 99 } 100 101 // It could still be utf-32 or utf-16. 102 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 103 return std::make_pair(UEF_UTF32_LE, 0); 104 105 if (Input.size() >= 2 && Input[1] == 0) 106 return std::make_pair(UEF_UTF16_LE, 0); 107 108 return std::make_pair(UEF_UTF8, 0); 109 } 110 111 /// Pin the vtables to this file. 112 void Node::anchor() {} 113 void NullNode::anchor() {} 114 void ScalarNode::anchor() {} 115 void BlockScalarNode::anchor() {} 116 void KeyValueNode::anchor() {} 117 void MappingNode::anchor() {} 118 void SequenceNode::anchor() {} 119 void AliasNode::anchor() {} 120 121 namespace llvm { 122 namespace yaml { 123 124 /// Token - A single YAML token. 125 struct Token { 126 enum TokenKind { 127 TK_Error, // Uninitialized token. 128 TK_StreamStart, 129 TK_StreamEnd, 130 TK_VersionDirective, 131 TK_TagDirective, 132 TK_DocumentStart, 133 TK_DocumentEnd, 134 TK_BlockEntry, 135 TK_BlockEnd, 136 TK_BlockSequenceStart, 137 TK_BlockMappingStart, 138 TK_FlowEntry, 139 TK_FlowSequenceStart, 140 TK_FlowSequenceEnd, 141 TK_FlowMappingStart, 142 TK_FlowMappingEnd, 143 TK_Key, 144 TK_Value, 145 TK_Scalar, 146 TK_BlockScalar, 147 TK_Alias, 148 TK_Anchor, 149 TK_Tag 150 } Kind = TK_Error; 151 152 /// A string of length 0 or more whose begin() points to the logical location 153 /// of the token in the input. 154 StringRef Range; 155 156 /// The value of a block scalar node. 157 std::string Value; 158 159 Token() = default; 160 }; 161 162 } // end namespace yaml 163 } // end namespace llvm 164 165 using TokenQueueT = BumpPtrList<Token>; 166 167 namespace { 168 169 /// This struct is used to track simple keys. 170 /// 171 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 172 /// which could legally be the start of a simple key. When peekNext is called, 173 /// if the Token To be returned is referenced by a SimpleKey, we continue 174 /// tokenizing until that potential simple key has either been found to not be 175 /// a simple key (we moved on to the next line or went further than 1024 chars). 176 /// Or when we run into a Value, and then insert a Key token (and possibly 177 /// others) before the SimpleKey's Tok. 178 struct SimpleKey { 179 TokenQueueT::iterator Tok; 180 unsigned Column = 0; 181 unsigned Line = 0; 182 unsigned FlowLevel = 0; 183 bool IsRequired = false; 184 185 bool operator ==(const SimpleKey &Other) { 186 return Tok == Other.Tok; 187 } 188 }; 189 190 } // end anonymous namespace 191 192 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit 193 /// subsequence and the subsequence's length in code units (uint8_t). 194 /// A length of 0 represents an error. 195 using UTF8Decoded = std::pair<uint32_t, unsigned>; 196 197 static UTF8Decoded decodeUTF8(StringRef Range) { 198 StringRef::iterator Position= Range.begin(); 199 StringRef::iterator End = Range.end(); 200 // 1 byte: [0x00, 0x7f] 201 // Bit pattern: 0xxxxxxx 202 if (Position < End && (*Position & 0x80) == 0) { 203 return std::make_pair(*Position, 1); 204 } 205 // 2 bytes: [0x80, 0x7ff] 206 // Bit pattern: 110xxxxx 10xxxxxx 207 if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) && 208 ((*(Position + 1) & 0xC0) == 0x80)) { 209 uint32_t codepoint = ((*Position & 0x1F) << 6) | 210 (*(Position + 1) & 0x3F); 211 if (codepoint >= 0x80) 212 return std::make_pair(codepoint, 2); 213 } 214 // 3 bytes: [0x8000, 0xffff] 215 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 216 if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) && 217 ((*(Position + 1) & 0xC0) == 0x80) && 218 ((*(Position + 2) & 0xC0) == 0x80)) { 219 uint32_t codepoint = ((*Position & 0x0F) << 12) | 220 ((*(Position + 1) & 0x3F) << 6) | 221 (*(Position + 2) & 0x3F); 222 // Codepoints between 0xD800 and 0xDFFF are invalid, as 223 // they are high / low surrogate halves used by UTF-16. 224 if (codepoint >= 0x800 && 225 (codepoint < 0xD800 || codepoint > 0xDFFF)) 226 return std::make_pair(codepoint, 3); 227 } 228 // 4 bytes: [0x10000, 0x10FFFF] 229 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 230 if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) && 231 ((*(Position + 1) & 0xC0) == 0x80) && 232 ((*(Position + 2) & 0xC0) == 0x80) && 233 ((*(Position + 3) & 0xC0) == 0x80)) { 234 uint32_t codepoint = ((*Position & 0x07) << 18) | 235 ((*(Position + 1) & 0x3F) << 12) | 236 ((*(Position + 2) & 0x3F) << 6) | 237 (*(Position + 3) & 0x3F); 238 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 239 return std::make_pair(codepoint, 4); 240 } 241 return std::make_pair(0, 0); 242 } 243 244 namespace llvm { 245 namespace yaml { 246 247 /// Scans YAML tokens from a MemoryBuffer. 248 class Scanner { 249 public: 250 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true, 251 std::error_code *EC = nullptr); 252 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true, 253 std::error_code *EC = nullptr); 254 255 /// Parse the next token and return it without popping it. 256 Token &peekNext(); 257 258 /// Parse the next token and pop it from the queue. 259 Token getNext(); 260 261 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 262 ArrayRef<SMRange> Ranges = None) { 263 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 264 } 265 266 void setError(const Twine &Message, StringRef::iterator Position) { 267 if (Position >= End) 268 Position = End - 1; 269 270 // propagate the error if possible 271 if (EC) 272 *EC = make_error_code(std::errc::invalid_argument); 273 274 // Don't print out more errors after the first one we encounter. The rest 275 // are just the result of the first, and have no meaning. 276 if (!Failed) 277 printError(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, Message); 278 Failed = true; 279 } 280 281 /// Returns true if an error occurred while parsing. 282 bool failed() { 283 return Failed; 284 } 285 286 private: 287 void init(MemoryBufferRef Buffer); 288 289 StringRef currentInput() { 290 return StringRef(Current, End - Current); 291 } 292 293 /// Decode a UTF-8 minimal well-formed code unit subsequence starting 294 /// at \a Position. 295 /// 296 /// If the UTF-8 code units starting at Position do not form a well-formed 297 /// code unit subsequence, then the Unicode scalar value is 0, and the length 298 /// is 0. 299 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 300 return ::decodeUTF8(StringRef(Position, End - Position)); 301 } 302 303 // The following functions are based on the gramar rules in the YAML spec. The 304 // style of the function names it meant to closely match how they are written 305 // in the spec. The number within the [] is the number of the grammar rule in 306 // the spec. 307 // 308 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 309 // 310 // c- 311 // A production starting and ending with a special character. 312 // b- 313 // A production matching a single line break. 314 // nb- 315 // A production starting and ending with a non-break character. 316 // s- 317 // A production starting and ending with a white space character. 318 // ns- 319 // A production starting and ending with a non-space character. 320 // l- 321 // A production matching complete line(s). 322 323 /// Skip a single nb-char[27] starting at Position. 324 /// 325 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 326 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 327 /// 328 /// @returns The code unit after the nb-char, or Position if it's not an 329 /// nb-char. 330 StringRef::iterator skip_nb_char(StringRef::iterator Position); 331 332 /// Skip a single b-break[28] starting at Position. 333 /// 334 /// A b-break is 0xD 0xA | 0xD | 0xA 335 /// 336 /// @returns The code unit after the b-break, or Position if it's not a 337 /// b-break. 338 StringRef::iterator skip_b_break(StringRef::iterator Position); 339 340 /// Skip a single s-space[31] starting at Position. 341 /// 342 /// An s-space is 0x20 343 /// 344 /// @returns The code unit after the s-space, or Position if it's not a 345 /// s-space. 346 StringRef::iterator skip_s_space(StringRef::iterator Position); 347 348 /// Skip a single s-white[33] starting at Position. 349 /// 350 /// A s-white is 0x20 | 0x9 351 /// 352 /// @returns The code unit after the s-white, or Position if it's not a 353 /// s-white. 354 StringRef::iterator skip_s_white(StringRef::iterator Position); 355 356 /// Skip a single ns-char[34] starting at Position. 357 /// 358 /// A ns-char is nb-char - s-white 359 /// 360 /// @returns The code unit after the ns-char, or Position if it's not a 361 /// ns-char. 362 StringRef::iterator skip_ns_char(StringRef::iterator Position); 363 364 using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator); 365 366 /// Skip minimal well-formed code unit subsequences until Func 367 /// returns its input. 368 /// 369 /// @returns The code unit after the last minimal well-formed code unit 370 /// subsequence that Func accepted. 371 StringRef::iterator skip_while( SkipWhileFunc Func 372 , StringRef::iterator Position); 373 374 /// Skip minimal well-formed code unit subsequences until Func returns its 375 /// input. 376 void advanceWhile(SkipWhileFunc Func); 377 378 /// Scan ns-uri-char[39]s starting at Cur. 379 /// 380 /// This updates Cur and Column while scanning. 381 void scan_ns_uri_char(); 382 383 /// Consume a minimal well-formed code unit subsequence starting at 384 /// \a Cur. Return false if it is not the same Unicode scalar value as 385 /// \a Expected. This updates \a Column. 386 bool consume(uint32_t Expected); 387 388 /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 389 void skip(uint32_t Distance); 390 391 /// Return true if the minimal well-formed code unit subsequence at 392 /// Pos is whitespace or a new line 393 bool isBlankOrBreak(StringRef::iterator Position); 394 395 /// Return true if the line is a line break, false otherwise. 396 bool isLineEmpty(StringRef Line); 397 398 /// Consume a single b-break[28] if it's present at the current position. 399 /// 400 /// Return false if the code unit at the current position isn't a line break. 401 bool consumeLineBreakIfPresent(); 402 403 /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 404 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 405 , unsigned AtColumn 406 , bool IsRequired); 407 408 /// Remove simple keys that can no longer be valid simple keys. 409 /// 410 /// Invalid simple keys are not on the current line or are further than 1024 411 /// columns back. 412 void removeStaleSimpleKeyCandidates(); 413 414 /// Remove all simple keys on FlowLevel \a Level. 415 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 416 417 /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 418 /// tokens if needed. 419 bool unrollIndent(int ToColumn); 420 421 /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 422 /// if needed. 423 bool rollIndent( int ToColumn 424 , Token::TokenKind Kind 425 , TokenQueueT::iterator InsertPoint); 426 427 /// Skip a single-line comment when the comment starts at the current 428 /// position of the scanner. 429 void skipComment(); 430 431 /// Skip whitespace and comments until the start of the next token. 432 void scanToNextToken(); 433 434 /// Must be the first token generated. 435 bool scanStreamStart(); 436 437 /// Generate tokens needed to close out the stream. 438 bool scanStreamEnd(); 439 440 /// Scan a %BLAH directive. 441 bool scanDirective(); 442 443 /// Scan a ... or ---. 444 bool scanDocumentIndicator(bool IsStart); 445 446 /// Scan a [ or { and generate the proper flow collection start token. 447 bool scanFlowCollectionStart(bool IsSequence); 448 449 /// Scan a ] or } and generate the proper flow collection end token. 450 bool scanFlowCollectionEnd(bool IsSequence); 451 452 /// Scan the , that separates entries in a flow collection. 453 bool scanFlowEntry(); 454 455 /// Scan the - that starts block sequence entries. 456 bool scanBlockEntry(); 457 458 /// Scan an explicit ? indicating a key. 459 bool scanKey(); 460 461 /// Scan an explicit : indicating a value. 462 bool scanValue(); 463 464 /// Scan a quoted scalar. 465 bool scanFlowScalar(bool IsDoubleQuoted); 466 467 /// Scan an unquoted scalar. 468 bool scanPlainScalar(); 469 470 /// Scan an Alias or Anchor starting with * or &. 471 bool scanAliasOrAnchor(bool IsAlias); 472 473 /// Scan a block scalar starting with | or >. 474 bool scanBlockScalar(bool IsLiteral); 475 476 /// Scan a block scalar style indicator and header. 477 /// 478 /// Note: This is distinct from scanBlockScalarHeader to mirror the fact that 479 /// YAML does not consider the style indicator to be a part of the header. 480 /// 481 /// Return false if an error occurred. 482 bool scanBlockScalarIndicators(char &StyleIndicator, char &ChompingIndicator, 483 unsigned &IndentIndicator, bool &IsDone); 484 485 /// Scan a style indicator in a block scalar header. 486 char scanBlockStyleIndicator(); 487 488 /// Scan a chomping indicator in a block scalar header. 489 char scanBlockChompingIndicator(); 490 491 /// Scan an indentation indicator in a block scalar header. 492 unsigned scanBlockIndentationIndicator(); 493 494 /// Scan a block scalar header. 495 /// 496 /// Return false if an error occurred. 497 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 498 bool &IsDone); 499 500 /// Look for the indentation level of a block scalar. 501 /// 502 /// Return false if an error occurred. 503 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 504 unsigned &LineBreaks, bool &IsDone); 505 506 /// Scan the indentation of a text line in a block scalar. 507 /// 508 /// Return false if an error occurred. 509 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 510 bool &IsDone); 511 512 /// Scan a tag of the form !stuff. 513 bool scanTag(); 514 515 /// Dispatch to the next scanning function based on \a *Cur. 516 bool fetchMoreTokens(); 517 518 /// The SourceMgr used for diagnostics and buffer management. 519 SourceMgr &SM; 520 521 /// The original input. 522 MemoryBufferRef InputBuffer; 523 524 /// The current position of the scanner. 525 StringRef::iterator Current; 526 527 /// The end of the input (one past the last character). 528 StringRef::iterator End; 529 530 /// Current YAML indentation level in spaces. 531 int Indent; 532 533 /// Current column number in Unicode code points. 534 unsigned Column; 535 536 /// Current line number. 537 unsigned Line; 538 539 /// How deep we are in flow style containers. 0 Means at block level. 540 unsigned FlowLevel; 541 542 /// Are we at the start of the stream? 543 bool IsStartOfStream; 544 545 /// Can the next token be the start of a simple key? 546 bool IsSimpleKeyAllowed; 547 548 /// True if an error has occurred. 549 bool Failed; 550 551 /// Should colors be used when printing out the diagnostic messages? 552 bool ShowColors; 553 554 /// Queue of tokens. This is required to queue up tokens while looking 555 /// for the end of a simple key. And for cases where a single character 556 /// can produce multiple tokens (e.g. BlockEnd). 557 TokenQueueT TokenQueue; 558 559 /// Indentation levels. 560 SmallVector<int, 4> Indents; 561 562 /// Potential simple keys. 563 SmallVector<SimpleKey, 4> SimpleKeys; 564 565 std::error_code *EC; 566 }; 567 568 } // end namespace yaml 569 } // end namespace llvm 570 571 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 572 static void encodeUTF8( uint32_t UnicodeScalarValue 573 , SmallVectorImpl<char> &Result) { 574 if (UnicodeScalarValue <= 0x7F) { 575 Result.push_back(UnicodeScalarValue & 0x7F); 576 } else if (UnicodeScalarValue <= 0x7FF) { 577 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 578 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 579 Result.push_back(FirstByte); 580 Result.push_back(SecondByte); 581 } else if (UnicodeScalarValue <= 0xFFFF) { 582 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 583 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 584 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 585 Result.push_back(FirstByte); 586 Result.push_back(SecondByte); 587 Result.push_back(ThirdByte); 588 } else if (UnicodeScalarValue <= 0x10FFFF) { 589 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 590 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 591 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 592 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 593 Result.push_back(FirstByte); 594 Result.push_back(SecondByte); 595 Result.push_back(ThirdByte); 596 Result.push_back(FourthByte); 597 } 598 } 599 600 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 601 SourceMgr SM; 602 Scanner scanner(Input, SM); 603 while (true) { 604 Token T = scanner.getNext(); 605 switch (T.Kind) { 606 case Token::TK_StreamStart: 607 OS << "Stream-Start: "; 608 break; 609 case Token::TK_StreamEnd: 610 OS << "Stream-End: "; 611 break; 612 case Token::TK_VersionDirective: 613 OS << "Version-Directive: "; 614 break; 615 case Token::TK_TagDirective: 616 OS << "Tag-Directive: "; 617 break; 618 case Token::TK_DocumentStart: 619 OS << "Document-Start: "; 620 break; 621 case Token::TK_DocumentEnd: 622 OS << "Document-End: "; 623 break; 624 case Token::TK_BlockEntry: 625 OS << "Block-Entry: "; 626 break; 627 case Token::TK_BlockEnd: 628 OS << "Block-End: "; 629 break; 630 case Token::TK_BlockSequenceStart: 631 OS << "Block-Sequence-Start: "; 632 break; 633 case Token::TK_BlockMappingStart: 634 OS << "Block-Mapping-Start: "; 635 break; 636 case Token::TK_FlowEntry: 637 OS << "Flow-Entry: "; 638 break; 639 case Token::TK_FlowSequenceStart: 640 OS << "Flow-Sequence-Start: "; 641 break; 642 case Token::TK_FlowSequenceEnd: 643 OS << "Flow-Sequence-End: "; 644 break; 645 case Token::TK_FlowMappingStart: 646 OS << "Flow-Mapping-Start: "; 647 break; 648 case Token::TK_FlowMappingEnd: 649 OS << "Flow-Mapping-End: "; 650 break; 651 case Token::TK_Key: 652 OS << "Key: "; 653 break; 654 case Token::TK_Value: 655 OS << "Value: "; 656 break; 657 case Token::TK_Scalar: 658 OS << "Scalar: "; 659 break; 660 case Token::TK_BlockScalar: 661 OS << "Block Scalar: "; 662 break; 663 case Token::TK_Alias: 664 OS << "Alias: "; 665 break; 666 case Token::TK_Anchor: 667 OS << "Anchor: "; 668 break; 669 case Token::TK_Tag: 670 OS << "Tag: "; 671 break; 672 case Token::TK_Error: 673 break; 674 } 675 OS << T.Range << "\n"; 676 if (T.Kind == Token::TK_StreamEnd) 677 break; 678 else if (T.Kind == Token::TK_Error) 679 return false; 680 } 681 return true; 682 } 683 684 bool yaml::scanTokens(StringRef Input) { 685 SourceMgr SM; 686 Scanner scanner(Input, SM); 687 while (true) { 688 Token T = scanner.getNext(); 689 if (T.Kind == Token::TK_StreamEnd) 690 break; 691 else if (T.Kind == Token::TK_Error) 692 return false; 693 } 694 return true; 695 } 696 697 std::string yaml::escape(StringRef Input, bool EscapePrintable) { 698 std::string EscapedInput; 699 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 700 if (*i == '\\') 701 EscapedInput += "\\\\"; 702 else if (*i == '"') 703 EscapedInput += "\\\""; 704 else if (*i == 0) 705 EscapedInput += "\\0"; 706 else if (*i == 0x07) 707 EscapedInput += "\\a"; 708 else if (*i == 0x08) 709 EscapedInput += "\\b"; 710 else if (*i == 0x09) 711 EscapedInput += "\\t"; 712 else if (*i == 0x0A) 713 EscapedInput += "\\n"; 714 else if (*i == 0x0B) 715 EscapedInput += "\\v"; 716 else if (*i == 0x0C) 717 EscapedInput += "\\f"; 718 else if (*i == 0x0D) 719 EscapedInput += "\\r"; 720 else if (*i == 0x1B) 721 EscapedInput += "\\e"; 722 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 723 std::string HexStr = utohexstr(*i); 724 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 725 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 726 UTF8Decoded UnicodeScalarValue 727 = decodeUTF8(StringRef(i, Input.end() - i)); 728 if (UnicodeScalarValue.second == 0) { 729 // Found invalid char. 730 SmallString<4> Val; 731 encodeUTF8(0xFFFD, Val); 732 llvm::append_range(EscapedInput, Val); 733 // FIXME: Error reporting. 734 return EscapedInput; 735 } 736 if (UnicodeScalarValue.first == 0x85) 737 EscapedInput += "\\N"; 738 else if (UnicodeScalarValue.first == 0xA0) 739 EscapedInput += "\\_"; 740 else if (UnicodeScalarValue.first == 0x2028) 741 EscapedInput += "\\L"; 742 else if (UnicodeScalarValue.first == 0x2029) 743 EscapedInput += "\\P"; 744 else if (!EscapePrintable && 745 sys::unicode::isPrintable(UnicodeScalarValue.first)) 746 EscapedInput += StringRef(i, UnicodeScalarValue.second); 747 else { 748 std::string HexStr = utohexstr(UnicodeScalarValue.first); 749 if (HexStr.size() <= 2) 750 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 751 else if (HexStr.size() <= 4) 752 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 753 else if (HexStr.size() <= 8) 754 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 755 } 756 i += UnicodeScalarValue.second - 1; 757 } else 758 EscapedInput.push_back(*i); 759 } 760 return EscapedInput; 761 } 762 763 llvm::Optional<bool> yaml::parseBool(StringRef S) { 764 switch (S.size()) { 765 case 1: 766 switch (S.front()) { 767 case 'y': 768 case 'Y': 769 return true; 770 case 'n': 771 case 'N': 772 return false; 773 default: 774 return None; 775 } 776 case 2: 777 switch (S.front()) { 778 case 'O': 779 if (S[1] == 'N') // ON 780 return true; 781 LLVM_FALLTHROUGH; 782 case 'o': 783 if (S[1] == 'n') //[Oo]n 784 return true; 785 return None; 786 case 'N': 787 if (S[1] == 'O') // NO 788 return false; 789 LLVM_FALLTHROUGH; 790 case 'n': 791 if (S[1] == 'o') //[Nn]o 792 return false; 793 return None; 794 default: 795 return None; 796 } 797 case 3: 798 switch (S.front()) { 799 case 'O': 800 if (S.drop_front() == "FF") // OFF 801 return false; 802 LLVM_FALLTHROUGH; 803 case 'o': 804 if (S.drop_front() == "ff") //[Oo]ff 805 return false; 806 return None; 807 case 'Y': 808 if (S.drop_front() == "ES") // YES 809 return true; 810 LLVM_FALLTHROUGH; 811 case 'y': 812 if (S.drop_front() == "es") //[Yy]es 813 return true; 814 return None; 815 default: 816 return None; 817 } 818 case 4: 819 switch (S.front()) { 820 case 'T': 821 if (S.drop_front() == "RUE") // TRUE 822 return true; 823 LLVM_FALLTHROUGH; 824 case 't': 825 if (S.drop_front() == "rue") //[Tt]rue 826 return true; 827 return None; 828 default: 829 return None; 830 } 831 case 5: 832 switch (S.front()) { 833 case 'F': 834 if (S.drop_front() == "ALSE") // FALSE 835 return false; 836 LLVM_FALLTHROUGH; 837 case 'f': 838 if (S.drop_front() == "alse") //[Ff]alse 839 return false; 840 return None; 841 default: 842 return None; 843 } 844 default: 845 return None; 846 } 847 } 848 849 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors, 850 std::error_code *EC) 851 : SM(sm), ShowColors(ShowColors), EC(EC) { 852 init(MemoryBufferRef(Input, "YAML")); 853 } 854 855 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors, 856 std::error_code *EC) 857 : SM(SM_), ShowColors(ShowColors), EC(EC) { 858 init(Buffer); 859 } 860 861 void Scanner::init(MemoryBufferRef Buffer) { 862 InputBuffer = Buffer; 863 Current = InputBuffer.getBufferStart(); 864 End = InputBuffer.getBufferEnd(); 865 Indent = -1; 866 Column = 0; 867 Line = 0; 868 FlowLevel = 0; 869 IsStartOfStream = true; 870 IsSimpleKeyAllowed = true; 871 Failed = false; 872 std::unique_ptr<MemoryBuffer> InputBufferOwner = 873 MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false); 874 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 875 } 876 877 Token &Scanner::peekNext() { 878 // If the current token is a possible simple key, keep parsing until we 879 // can confirm. 880 bool NeedMore = false; 881 while (true) { 882 if (TokenQueue.empty() || NeedMore) { 883 if (!fetchMoreTokens()) { 884 TokenQueue.clear(); 885 SimpleKeys.clear(); 886 TokenQueue.push_back(Token()); 887 return TokenQueue.front(); 888 } 889 } 890 assert(!TokenQueue.empty() && 891 "fetchMoreTokens lied about getting tokens!"); 892 893 removeStaleSimpleKeyCandidates(); 894 SimpleKey SK; 895 SK.Tok = TokenQueue.begin(); 896 if (!is_contained(SimpleKeys, SK)) 897 break; 898 else 899 NeedMore = true; 900 } 901 return TokenQueue.front(); 902 } 903 904 Token Scanner::getNext() { 905 Token Ret = peekNext(); 906 // TokenQueue can be empty if there was an error getting the next token. 907 if (!TokenQueue.empty()) 908 TokenQueue.pop_front(); 909 910 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 911 // quick deallocation of them all. 912 if (TokenQueue.empty()) 913 TokenQueue.resetAlloc(); 914 915 return Ret; 916 } 917 918 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 919 if (Position == End) 920 return Position; 921 // Check 7 bit c-printable - b-char. 922 if ( *Position == 0x09 923 || (*Position >= 0x20 && *Position <= 0x7E)) 924 return Position + 1; 925 926 // Check for valid UTF-8. 927 if (uint8_t(*Position) & 0x80) { 928 UTF8Decoded u8d = decodeUTF8(Position); 929 if ( u8d.second != 0 930 && u8d.first != 0xFEFF 931 && ( u8d.first == 0x85 932 || ( u8d.first >= 0xA0 933 && u8d.first <= 0xD7FF) 934 || ( u8d.first >= 0xE000 935 && u8d.first <= 0xFFFD) 936 || ( u8d.first >= 0x10000 937 && u8d.first <= 0x10FFFF))) 938 return Position + u8d.second; 939 } 940 return Position; 941 } 942 943 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 944 if (Position == End) 945 return Position; 946 if (*Position == 0x0D) { 947 if (Position + 1 != End && *(Position + 1) == 0x0A) 948 return Position + 2; 949 return Position + 1; 950 } 951 952 if (*Position == 0x0A) 953 return Position + 1; 954 return Position; 955 } 956 957 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 958 if (Position == End) 959 return Position; 960 if (*Position == ' ') 961 return Position + 1; 962 return Position; 963 } 964 965 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 966 if (Position == End) 967 return Position; 968 if (*Position == ' ' || *Position == '\t') 969 return Position + 1; 970 return Position; 971 } 972 973 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 974 if (Position == End) 975 return Position; 976 if (*Position == ' ' || *Position == '\t') 977 return Position; 978 return skip_nb_char(Position); 979 } 980 981 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 982 , StringRef::iterator Position) { 983 while (true) { 984 StringRef::iterator i = (this->*Func)(Position); 985 if (i == Position) 986 break; 987 Position = i; 988 } 989 return Position; 990 } 991 992 void Scanner::advanceWhile(SkipWhileFunc Func) { 993 auto Final = skip_while(Func, Current); 994 Column += Final - Current; 995 Current = Final; 996 } 997 998 static bool is_ns_hex_digit(const char C) { return isAlnum(C); } 999 1000 static bool is_ns_word_char(const char C) { return C == '-' || isAlpha(C); } 1001 1002 void Scanner::scan_ns_uri_char() { 1003 while (true) { 1004 if (Current == End) 1005 break; 1006 if (( *Current == '%' 1007 && Current + 2 < End 1008 && is_ns_hex_digit(*(Current + 1)) 1009 && is_ns_hex_digit(*(Current + 2))) 1010 || is_ns_word_char(*Current) 1011 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 1012 != StringRef::npos) { 1013 ++Current; 1014 ++Column; 1015 } else 1016 break; 1017 } 1018 } 1019 1020 bool Scanner::consume(uint32_t Expected) { 1021 if (Expected >= 0x80) { 1022 setError("Cannot consume non-ascii characters", Current); 1023 return false; 1024 } 1025 if (Current == End) 1026 return false; 1027 if (uint8_t(*Current) >= 0x80) { 1028 setError("Cannot consume non-ascii characters", Current); 1029 return false; 1030 } 1031 if (uint8_t(*Current) == Expected) { 1032 ++Current; 1033 ++Column; 1034 return true; 1035 } 1036 return false; 1037 } 1038 1039 void Scanner::skip(uint32_t Distance) { 1040 Current += Distance; 1041 Column += Distance; 1042 assert(Current <= End && "Skipped past the end"); 1043 } 1044 1045 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 1046 if (Position == End) 1047 return false; 1048 return *Position == ' ' || *Position == '\t' || *Position == '\r' || 1049 *Position == '\n'; 1050 } 1051 1052 bool Scanner::isLineEmpty(StringRef Line) { 1053 for (const auto *Position = Line.begin(); Position != Line.end(); ++Position) 1054 if (!isBlankOrBreak(Position)) 1055 return false; 1056 return true; 1057 } 1058 1059 bool Scanner::consumeLineBreakIfPresent() { 1060 auto Next = skip_b_break(Current); 1061 if (Next == Current) 1062 return false; 1063 Column = 0; 1064 ++Line; 1065 Current = Next; 1066 return true; 1067 } 1068 1069 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 1070 , unsigned AtColumn 1071 , bool IsRequired) { 1072 if (IsSimpleKeyAllowed) { 1073 SimpleKey SK; 1074 SK.Tok = Tok; 1075 SK.Line = Line; 1076 SK.Column = AtColumn; 1077 SK.IsRequired = IsRequired; 1078 SK.FlowLevel = FlowLevel; 1079 SimpleKeys.push_back(SK); 1080 } 1081 } 1082 1083 void Scanner::removeStaleSimpleKeyCandidates() { 1084 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 1085 i != SimpleKeys.end();) { 1086 if (i->Line != Line || i->Column + 1024 < Column) { 1087 if (i->IsRequired) 1088 setError( "Could not find expected : for simple key" 1089 , i->Tok->Range.begin()); 1090 i = SimpleKeys.erase(i); 1091 } else 1092 ++i; 1093 } 1094 } 1095 1096 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 1097 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 1098 SimpleKeys.pop_back(); 1099 } 1100 1101 bool Scanner::unrollIndent(int ToColumn) { 1102 Token T; 1103 // Indentation is ignored in flow. 1104 if (FlowLevel != 0) 1105 return true; 1106 1107 while (Indent > ToColumn) { 1108 T.Kind = Token::TK_BlockEnd; 1109 T.Range = StringRef(Current, 1); 1110 TokenQueue.push_back(T); 1111 Indent = Indents.pop_back_val(); 1112 } 1113 1114 return true; 1115 } 1116 1117 bool Scanner::rollIndent( int ToColumn 1118 , Token::TokenKind Kind 1119 , TokenQueueT::iterator InsertPoint) { 1120 if (FlowLevel) 1121 return true; 1122 if (Indent < ToColumn) { 1123 Indents.push_back(Indent); 1124 Indent = ToColumn; 1125 1126 Token T; 1127 T.Kind = Kind; 1128 T.Range = StringRef(Current, 0); 1129 TokenQueue.insert(InsertPoint, T); 1130 } 1131 return true; 1132 } 1133 1134 void Scanner::skipComment() { 1135 if (Current == End || *Current != '#') 1136 return; 1137 while (true) { 1138 // This may skip more than one byte, thus Column is only incremented 1139 // for code points. 1140 StringRef::iterator I = skip_nb_char(Current); 1141 if (I == Current) 1142 break; 1143 Current = I; 1144 ++Column; 1145 } 1146 } 1147 1148 void Scanner::scanToNextToken() { 1149 while (true) { 1150 while (Current != End && (*Current == ' ' || *Current == '\t')) { 1151 skip(1); 1152 } 1153 1154 skipComment(); 1155 1156 // Skip EOL. 1157 StringRef::iterator i = skip_b_break(Current); 1158 if (i == Current) 1159 break; 1160 Current = i; 1161 ++Line; 1162 Column = 0; 1163 // New lines may start a simple key. 1164 if (!FlowLevel) 1165 IsSimpleKeyAllowed = true; 1166 } 1167 } 1168 1169 bool Scanner::scanStreamStart() { 1170 IsStartOfStream = false; 1171 1172 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1173 1174 Token T; 1175 T.Kind = Token::TK_StreamStart; 1176 T.Range = StringRef(Current, EI.second); 1177 TokenQueue.push_back(T); 1178 Current += EI.second; 1179 return true; 1180 } 1181 1182 bool Scanner::scanStreamEnd() { 1183 // Force an ending new line if one isn't present. 1184 if (Column != 0) { 1185 Column = 0; 1186 ++Line; 1187 } 1188 1189 unrollIndent(-1); 1190 SimpleKeys.clear(); 1191 IsSimpleKeyAllowed = false; 1192 1193 Token T; 1194 T.Kind = Token::TK_StreamEnd; 1195 T.Range = StringRef(Current, 0); 1196 TokenQueue.push_back(T); 1197 return true; 1198 } 1199 1200 bool Scanner::scanDirective() { 1201 // Reset the indentation level. 1202 unrollIndent(-1); 1203 SimpleKeys.clear(); 1204 IsSimpleKeyAllowed = false; 1205 1206 StringRef::iterator Start = Current; 1207 consume('%'); 1208 StringRef::iterator NameStart = Current; 1209 Current = skip_while(&Scanner::skip_ns_char, Current); 1210 StringRef Name(NameStart, Current - NameStart); 1211 Current = skip_while(&Scanner::skip_s_white, Current); 1212 1213 Token T; 1214 if (Name == "YAML") { 1215 Current = skip_while(&Scanner::skip_ns_char, Current); 1216 T.Kind = Token::TK_VersionDirective; 1217 T.Range = StringRef(Start, Current - Start); 1218 TokenQueue.push_back(T); 1219 return true; 1220 } else if(Name == "TAG") { 1221 Current = skip_while(&Scanner::skip_ns_char, Current); 1222 Current = skip_while(&Scanner::skip_s_white, Current); 1223 Current = skip_while(&Scanner::skip_ns_char, Current); 1224 T.Kind = Token::TK_TagDirective; 1225 T.Range = StringRef(Start, Current - Start); 1226 TokenQueue.push_back(T); 1227 return true; 1228 } 1229 return false; 1230 } 1231 1232 bool Scanner::scanDocumentIndicator(bool IsStart) { 1233 unrollIndent(-1); 1234 SimpleKeys.clear(); 1235 IsSimpleKeyAllowed = false; 1236 1237 Token T; 1238 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1239 T.Range = StringRef(Current, 3); 1240 skip(3); 1241 TokenQueue.push_back(T); 1242 return true; 1243 } 1244 1245 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1246 Token T; 1247 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1248 : Token::TK_FlowMappingStart; 1249 T.Range = StringRef(Current, 1); 1250 skip(1); 1251 TokenQueue.push_back(T); 1252 1253 // [ and { may begin a simple key. 1254 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1255 1256 // And may also be followed by a simple key. 1257 IsSimpleKeyAllowed = true; 1258 ++FlowLevel; 1259 return true; 1260 } 1261 1262 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1263 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1264 IsSimpleKeyAllowed = false; 1265 Token T; 1266 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1267 : Token::TK_FlowMappingEnd; 1268 T.Range = StringRef(Current, 1); 1269 skip(1); 1270 TokenQueue.push_back(T); 1271 if (FlowLevel) 1272 --FlowLevel; 1273 return true; 1274 } 1275 1276 bool Scanner::scanFlowEntry() { 1277 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1278 IsSimpleKeyAllowed = true; 1279 Token T; 1280 T.Kind = Token::TK_FlowEntry; 1281 T.Range = StringRef(Current, 1); 1282 skip(1); 1283 TokenQueue.push_back(T); 1284 return true; 1285 } 1286 1287 bool Scanner::scanBlockEntry() { 1288 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1289 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1290 IsSimpleKeyAllowed = true; 1291 Token T; 1292 T.Kind = Token::TK_BlockEntry; 1293 T.Range = StringRef(Current, 1); 1294 skip(1); 1295 TokenQueue.push_back(T); 1296 return true; 1297 } 1298 1299 bool Scanner::scanKey() { 1300 if (!FlowLevel) 1301 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1302 1303 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1304 IsSimpleKeyAllowed = !FlowLevel; 1305 1306 Token T; 1307 T.Kind = Token::TK_Key; 1308 T.Range = StringRef(Current, 1); 1309 skip(1); 1310 TokenQueue.push_back(T); 1311 return true; 1312 } 1313 1314 bool Scanner::scanValue() { 1315 // If the previous token could have been a simple key, insert the key token 1316 // into the token queue. 1317 if (!SimpleKeys.empty()) { 1318 SimpleKey SK = SimpleKeys.pop_back_val(); 1319 Token T; 1320 T.Kind = Token::TK_Key; 1321 T.Range = SK.Tok->Range; 1322 TokenQueueT::iterator i, e; 1323 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1324 if (i == SK.Tok) 1325 break; 1326 } 1327 if (i == e) { 1328 Failed = true; 1329 return false; 1330 } 1331 i = TokenQueue.insert(i, T); 1332 1333 // We may also need to add a Block-Mapping-Start token. 1334 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1335 1336 IsSimpleKeyAllowed = false; 1337 } else { 1338 if (!FlowLevel) 1339 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1340 IsSimpleKeyAllowed = !FlowLevel; 1341 } 1342 1343 Token T; 1344 T.Kind = Token::TK_Value; 1345 T.Range = StringRef(Current, 1); 1346 skip(1); 1347 TokenQueue.push_back(T); 1348 return true; 1349 } 1350 1351 // Forbidding inlining improves performance by roughly 20%. 1352 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1353 LLVM_ATTRIBUTE_NOINLINE static bool 1354 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1355 1356 // Returns whether a character at 'Position' was escaped with a leading '\'. 1357 // 'First' specifies the position of the first character in the string. 1358 static bool wasEscaped(StringRef::iterator First, 1359 StringRef::iterator Position) { 1360 assert(Position - 1 >= First); 1361 StringRef::iterator I = Position - 1; 1362 // We calculate the number of consecutive '\'s before the current position 1363 // by iterating backwards through our string. 1364 while (I >= First && *I == '\\') --I; 1365 // (Position - 1 - I) now contains the number of '\'s before the current 1366 // position. If it is odd, the character at 'Position' was escaped. 1367 return (Position - 1 - I) % 2 == 1; 1368 } 1369 1370 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1371 StringRef::iterator Start = Current; 1372 unsigned ColStart = Column; 1373 if (IsDoubleQuoted) { 1374 do { 1375 ++Current; 1376 while (Current != End && *Current != '"') 1377 ++Current; 1378 // Repeat until the previous character was not a '\' or was an escaped 1379 // backslash. 1380 } while ( Current != End 1381 && *(Current - 1) == '\\' 1382 && wasEscaped(Start + 1, Current)); 1383 } else { 1384 skip(1); 1385 while (Current != End) { 1386 // Skip a ' followed by another '. 1387 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1388 skip(2); 1389 continue; 1390 } else if (*Current == '\'') 1391 break; 1392 StringRef::iterator i = skip_nb_char(Current); 1393 if (i == Current) { 1394 i = skip_b_break(Current); 1395 if (i == Current) 1396 break; 1397 Current = i; 1398 Column = 0; 1399 ++Line; 1400 } else { 1401 if (i == End) 1402 break; 1403 Current = i; 1404 ++Column; 1405 } 1406 } 1407 } 1408 1409 if (Current == End) { 1410 setError("Expected quote at end of scalar", Current); 1411 return false; 1412 } 1413 1414 skip(1); // Skip ending quote. 1415 Token T; 1416 T.Kind = Token::TK_Scalar; 1417 T.Range = StringRef(Start, Current - Start); 1418 TokenQueue.push_back(T); 1419 1420 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1421 1422 IsSimpleKeyAllowed = false; 1423 1424 return true; 1425 } 1426 1427 bool Scanner::scanPlainScalar() { 1428 StringRef::iterator Start = Current; 1429 unsigned ColStart = Column; 1430 unsigned LeadingBlanks = 0; 1431 assert(Indent >= -1 && "Indent must be >= -1 !"); 1432 unsigned indent = static_cast<unsigned>(Indent + 1); 1433 while (Current != End) { 1434 if (*Current == '#') 1435 break; 1436 1437 while (Current != End && !isBlankOrBreak(Current)) { 1438 if (FlowLevel && *Current == ':' && 1439 (Current + 1 == End || 1440 !(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) { 1441 setError("Found unexpected ':' while scanning a plain scalar", Current); 1442 return false; 1443 } 1444 1445 // Check for the end of the plain scalar. 1446 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1447 || ( FlowLevel 1448 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1449 != StringRef::npos))) 1450 break; 1451 1452 StringRef::iterator i = skip_nb_char(Current); 1453 if (i == Current) 1454 break; 1455 Current = i; 1456 ++Column; 1457 } 1458 1459 // Are we at the end? 1460 if (!isBlankOrBreak(Current)) 1461 break; 1462 1463 // Eat blanks. 1464 StringRef::iterator Tmp = Current; 1465 while (isBlankOrBreak(Tmp)) { 1466 StringRef::iterator i = skip_s_white(Tmp); 1467 if (i != Tmp) { 1468 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1469 setError("Found invalid tab character in indentation", Tmp); 1470 return false; 1471 } 1472 Tmp = i; 1473 ++Column; 1474 } else { 1475 i = skip_b_break(Tmp); 1476 if (!LeadingBlanks) 1477 LeadingBlanks = 1; 1478 Tmp = i; 1479 Column = 0; 1480 ++Line; 1481 } 1482 } 1483 1484 if (!FlowLevel && Column < indent) 1485 break; 1486 1487 Current = Tmp; 1488 } 1489 if (Start == Current) { 1490 setError("Got empty plain scalar", Start); 1491 return false; 1492 } 1493 Token T; 1494 T.Kind = Token::TK_Scalar; 1495 T.Range = StringRef(Start, Current - Start); 1496 TokenQueue.push_back(T); 1497 1498 // Plain scalars can be simple keys. 1499 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1500 1501 IsSimpleKeyAllowed = false; 1502 1503 return true; 1504 } 1505 1506 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1507 StringRef::iterator Start = Current; 1508 unsigned ColStart = Column; 1509 skip(1); 1510 while (Current != End) { 1511 if ( *Current == '[' || *Current == ']' 1512 || *Current == '{' || *Current == '}' 1513 || *Current == ',' 1514 || *Current == ':') 1515 break; 1516 StringRef::iterator i = skip_ns_char(Current); 1517 if (i == Current) 1518 break; 1519 Current = i; 1520 ++Column; 1521 } 1522 1523 if (Start + 1 == Current) { 1524 setError("Got empty alias or anchor", Start); 1525 return false; 1526 } 1527 1528 Token T; 1529 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1530 T.Range = StringRef(Start, Current - Start); 1531 TokenQueue.push_back(T); 1532 1533 // Alias and anchors can be simple keys. 1534 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1535 1536 IsSimpleKeyAllowed = false; 1537 1538 return true; 1539 } 1540 1541 bool Scanner::scanBlockScalarIndicators(char &StyleIndicator, 1542 char &ChompingIndicator, 1543 unsigned &IndentIndicator, 1544 bool &IsDone) { 1545 StyleIndicator = scanBlockStyleIndicator(); 1546 if (!scanBlockScalarHeader(ChompingIndicator, IndentIndicator, IsDone)) 1547 return false; 1548 return true; 1549 } 1550 1551 char Scanner::scanBlockStyleIndicator() { 1552 char Indicator = ' '; 1553 if (Current != End && (*Current == '>' || *Current == '|')) { 1554 Indicator = *Current; 1555 skip(1); 1556 } 1557 return Indicator; 1558 } 1559 1560 char Scanner::scanBlockChompingIndicator() { 1561 char Indicator = ' '; 1562 if (Current != End && (*Current == '+' || *Current == '-')) { 1563 Indicator = *Current; 1564 skip(1); 1565 } 1566 return Indicator; 1567 } 1568 1569 /// Get the number of line breaks after chomping. 1570 /// 1571 /// Return the number of trailing line breaks to emit, depending on 1572 /// \p ChompingIndicator. 1573 static unsigned getChompedLineBreaks(char ChompingIndicator, 1574 unsigned LineBreaks, StringRef Str) { 1575 if (ChompingIndicator == '-') // Strip all line breaks. 1576 return 0; 1577 if (ChompingIndicator == '+') // Keep all line breaks. 1578 return LineBreaks; 1579 // Clip trailing lines. 1580 return Str.empty() ? 0 : 1; 1581 } 1582 1583 unsigned Scanner::scanBlockIndentationIndicator() { 1584 unsigned Indent = 0; 1585 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1586 Indent = unsigned(*Current - '0'); 1587 skip(1); 1588 } 1589 return Indent; 1590 } 1591 1592 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1593 unsigned &IndentIndicator, bool &IsDone) { 1594 auto Start = Current; 1595 1596 ChompingIndicator = scanBlockChompingIndicator(); 1597 IndentIndicator = scanBlockIndentationIndicator(); 1598 // Check for the chomping indicator once again. 1599 if (ChompingIndicator == ' ') 1600 ChompingIndicator = scanBlockChompingIndicator(); 1601 Current = skip_while(&Scanner::skip_s_white, Current); 1602 skipComment(); 1603 1604 if (Current == End) { // EOF, we have an empty scalar. 1605 Token T; 1606 T.Kind = Token::TK_BlockScalar; 1607 T.Range = StringRef(Start, Current - Start); 1608 TokenQueue.push_back(T); 1609 IsDone = true; 1610 return true; 1611 } 1612 1613 if (!consumeLineBreakIfPresent()) { 1614 setError("Expected a line break after block scalar header", Current); 1615 return false; 1616 } 1617 return true; 1618 } 1619 1620 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1621 unsigned BlockExitIndent, 1622 unsigned &LineBreaks, bool &IsDone) { 1623 unsigned MaxAllSpaceLineCharacters = 0; 1624 StringRef::iterator LongestAllSpaceLine; 1625 1626 while (true) { 1627 advanceWhile(&Scanner::skip_s_space); 1628 if (skip_nb_char(Current) != Current) { 1629 // This line isn't empty, so try and find the indentation. 1630 if (Column <= BlockExitIndent) { // End of the block literal. 1631 IsDone = true; 1632 return true; 1633 } 1634 // We found the block's indentation. 1635 BlockIndent = Column; 1636 if (MaxAllSpaceLineCharacters > BlockIndent) { 1637 setError( 1638 "Leading all-spaces line must be smaller than the block indent", 1639 LongestAllSpaceLine); 1640 return false; 1641 } 1642 return true; 1643 } 1644 if (skip_b_break(Current) != Current && 1645 Column > MaxAllSpaceLineCharacters) { 1646 // Record the longest all-space line in case it's longer than the 1647 // discovered block indent. 1648 MaxAllSpaceLineCharacters = Column; 1649 LongestAllSpaceLine = Current; 1650 } 1651 1652 // Check for EOF. 1653 if (Current == End) { 1654 IsDone = true; 1655 return true; 1656 } 1657 1658 if (!consumeLineBreakIfPresent()) { 1659 IsDone = true; 1660 return true; 1661 } 1662 ++LineBreaks; 1663 } 1664 return true; 1665 } 1666 1667 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1668 unsigned BlockExitIndent, bool &IsDone) { 1669 // Skip the indentation. 1670 while (Column < BlockIndent) { 1671 auto I = skip_s_space(Current); 1672 if (I == Current) 1673 break; 1674 Current = I; 1675 ++Column; 1676 } 1677 1678 if (skip_nb_char(Current) == Current) 1679 return true; 1680 1681 if (Column <= BlockExitIndent) { // End of the block literal. 1682 IsDone = true; 1683 return true; 1684 } 1685 1686 if (Column < BlockIndent) { 1687 if (Current != End && *Current == '#') { // Trailing comment. 1688 IsDone = true; 1689 return true; 1690 } 1691 setError("A text line is less indented than the block scalar", Current); 1692 return false; 1693 } 1694 return true; // A normal text line. 1695 } 1696 1697 bool Scanner::scanBlockScalar(bool IsLiteral) { 1698 assert(*Current == '|' || *Current == '>'); 1699 char StyleIndicator; 1700 char ChompingIndicator; 1701 unsigned BlockIndent; 1702 bool IsDone = false; 1703 if (!scanBlockScalarIndicators(StyleIndicator, ChompingIndicator, BlockIndent, 1704 IsDone)) 1705 return false; 1706 if (IsDone) 1707 return true; 1708 bool IsFolded = StyleIndicator == '>'; 1709 1710 const auto *Start = Current; 1711 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; 1712 unsigned LineBreaks = 0; 1713 if (BlockIndent == 0) { 1714 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, 1715 IsDone)) 1716 return false; 1717 } 1718 1719 // Scan the block's scalars body. 1720 SmallString<256> Str; 1721 while (!IsDone) { 1722 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) 1723 return false; 1724 if (IsDone) 1725 break; 1726 1727 // Parse the current line. 1728 auto LineStart = Current; 1729 advanceWhile(&Scanner::skip_nb_char); 1730 if (LineStart != Current) { 1731 if (LineBreaks && IsFolded && !Scanner::isLineEmpty(Str)) { 1732 // The folded style "folds" any single line break between content into a 1733 // single space, except when that content is "empty" (only contains 1734 // whitespace) in which case the line break is left as-is. 1735 if (LineBreaks == 1) { 1736 Str.append(LineBreaks, 1737 isLineEmpty(StringRef(LineStart, Current - LineStart)) 1738 ? '\n' 1739 : ' '); 1740 } 1741 // If we saw a single line break, we are completely replacing it and so 1742 // want `LineBreaks == 0`. Otherwise this decrement accounts for the 1743 // fact that the first line break is "trimmed", only being used to 1744 // signal a sequence of line breaks which should not be folded. 1745 LineBreaks--; 1746 } 1747 Str.append(LineBreaks, '\n'); 1748 Str.append(StringRef(LineStart, Current - LineStart)); 1749 LineBreaks = 0; 1750 } 1751 1752 // Check for EOF. 1753 if (Current == End) 1754 break; 1755 1756 if (!consumeLineBreakIfPresent()) 1757 break; 1758 ++LineBreaks; 1759 } 1760 1761 if (Current == End && !LineBreaks) 1762 // Ensure that there is at least one line break before the end of file. 1763 LineBreaks = 1; 1764 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1765 1766 // New lines may start a simple key. 1767 if (!FlowLevel) 1768 IsSimpleKeyAllowed = true; 1769 1770 Token T; 1771 T.Kind = Token::TK_BlockScalar; 1772 T.Range = StringRef(Start, Current - Start); 1773 T.Value = std::string(Str); 1774 TokenQueue.push_back(T); 1775 return true; 1776 } 1777 1778 bool Scanner::scanTag() { 1779 StringRef::iterator Start = Current; 1780 unsigned ColStart = Column; 1781 skip(1); // Eat !. 1782 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1783 else if (*Current == '<') { 1784 skip(1); 1785 scan_ns_uri_char(); 1786 if (!consume('>')) 1787 return false; 1788 } else { 1789 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1790 Current = skip_while(&Scanner::skip_ns_char, Current); 1791 } 1792 1793 Token T; 1794 T.Kind = Token::TK_Tag; 1795 T.Range = StringRef(Start, Current - Start); 1796 TokenQueue.push_back(T); 1797 1798 // Tags can be simple keys. 1799 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1800 1801 IsSimpleKeyAllowed = false; 1802 1803 return true; 1804 } 1805 1806 bool Scanner::fetchMoreTokens() { 1807 if (IsStartOfStream) 1808 return scanStreamStart(); 1809 1810 scanToNextToken(); 1811 1812 if (Current == End) 1813 return scanStreamEnd(); 1814 1815 removeStaleSimpleKeyCandidates(); 1816 1817 unrollIndent(Column); 1818 1819 if (Column == 0 && *Current == '%') 1820 return scanDirective(); 1821 1822 if (Column == 0 && Current + 4 <= End 1823 && *Current == '-' 1824 && *(Current + 1) == '-' 1825 && *(Current + 2) == '-' 1826 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1827 return scanDocumentIndicator(true); 1828 1829 if (Column == 0 && Current + 4 <= End 1830 && *Current == '.' 1831 && *(Current + 1) == '.' 1832 && *(Current + 2) == '.' 1833 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1834 return scanDocumentIndicator(false); 1835 1836 if (*Current == '[') 1837 return scanFlowCollectionStart(true); 1838 1839 if (*Current == '{') 1840 return scanFlowCollectionStart(false); 1841 1842 if (*Current == ']') 1843 return scanFlowCollectionEnd(true); 1844 1845 if (*Current == '}') 1846 return scanFlowCollectionEnd(false); 1847 1848 if (*Current == ',') 1849 return scanFlowEntry(); 1850 1851 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1852 return scanBlockEntry(); 1853 1854 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1855 return scanKey(); 1856 1857 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1858 return scanValue(); 1859 1860 if (*Current == '*') 1861 return scanAliasOrAnchor(true); 1862 1863 if (*Current == '&') 1864 return scanAliasOrAnchor(false); 1865 1866 if (*Current == '!') 1867 return scanTag(); 1868 1869 if (*Current == '|' && !FlowLevel) 1870 return scanBlockScalar(true); 1871 1872 if (*Current == '>' && !FlowLevel) 1873 return scanBlockScalar(false); 1874 1875 if (*Current == '\'') 1876 return scanFlowScalar(false); 1877 1878 if (*Current == '"') 1879 return scanFlowScalar(true); 1880 1881 // Get a plain scalar. 1882 StringRef FirstChar(Current, 1); 1883 if (!(isBlankOrBreak(Current) 1884 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1885 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1886 || (!FlowLevel && (*Current == '?' || *Current == ':') 1887 && isBlankOrBreak(Current + 1)) 1888 || (!FlowLevel && *Current == ':' 1889 && Current + 2 < End 1890 && *(Current + 1) == ':' 1891 && !isBlankOrBreak(Current + 2))) 1892 return scanPlainScalar(); 1893 1894 setError("Unrecognized character while tokenizing.", Current); 1895 return false; 1896 } 1897 1898 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors, 1899 std::error_code *EC) 1900 : scanner(new Scanner(Input, SM, ShowColors, EC)) {} 1901 1902 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors, 1903 std::error_code *EC) 1904 : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)) {} 1905 1906 Stream::~Stream() = default; 1907 1908 bool Stream::failed() { return scanner->failed(); } 1909 1910 void Stream::printError(Node *N, const Twine &Msg, SourceMgr::DiagKind Kind) { 1911 printError(N ? N->getSourceRange() : SMRange(), Msg, Kind); 1912 } 1913 1914 void Stream::printError(const SMRange &Range, const Twine &Msg, 1915 SourceMgr::DiagKind Kind) { 1916 scanner->printError(Range.Start, Kind, Msg, Range); 1917 } 1918 1919 document_iterator Stream::begin() { 1920 if (CurrentDoc) 1921 report_fatal_error("Can only iterate over the stream once"); 1922 1923 // Skip Stream-Start. 1924 scanner->getNext(); 1925 1926 CurrentDoc.reset(new Document(*this)); 1927 return document_iterator(CurrentDoc); 1928 } 1929 1930 document_iterator Stream::end() { 1931 return document_iterator(); 1932 } 1933 1934 void Stream::skip() { 1935 for (Document &Doc : *this) 1936 Doc.skip(); 1937 } 1938 1939 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1940 StringRef T) 1941 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1942 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1943 SourceRange = SMRange(Start, Start); 1944 } 1945 1946 std::string Node::getVerbatimTag() const { 1947 StringRef Raw = getRawTag(); 1948 if (!Raw.empty() && Raw != "!") { 1949 std::string Ret; 1950 if (Raw.find_last_of('!') == 0) { 1951 Ret = std::string(Doc->getTagMap().find("!")->second); 1952 Ret += Raw.substr(1); 1953 return Ret; 1954 } else if (Raw.startswith("!!")) { 1955 Ret = std::string(Doc->getTagMap().find("!!")->second); 1956 Ret += Raw.substr(2); 1957 return Ret; 1958 } else { 1959 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1960 std::map<StringRef, StringRef>::const_iterator It = 1961 Doc->getTagMap().find(TagHandle); 1962 if (It != Doc->getTagMap().end()) 1963 Ret = std::string(It->second); 1964 else { 1965 Token T; 1966 T.Kind = Token::TK_Tag; 1967 T.Range = TagHandle; 1968 setError(Twine("Unknown tag handle ") + TagHandle, T); 1969 } 1970 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1971 return Ret; 1972 } 1973 } 1974 1975 switch (getType()) { 1976 case NK_Null: 1977 return "tag:yaml.org,2002:null"; 1978 case NK_Scalar: 1979 case NK_BlockScalar: 1980 // TODO: Tag resolution. 1981 return "tag:yaml.org,2002:str"; 1982 case NK_Mapping: 1983 return "tag:yaml.org,2002:map"; 1984 case NK_Sequence: 1985 return "tag:yaml.org,2002:seq"; 1986 } 1987 1988 return ""; 1989 } 1990 1991 Token &Node::peekNext() { 1992 return Doc->peekNext(); 1993 } 1994 1995 Token Node::getNext() { 1996 return Doc->getNext(); 1997 } 1998 1999 Node *Node::parseBlockNode() { 2000 return Doc->parseBlockNode(); 2001 } 2002 2003 BumpPtrAllocator &Node::getAllocator() { 2004 return Doc->NodeAllocator; 2005 } 2006 2007 void Node::setError(const Twine &Msg, Token &Tok) const { 2008 Doc->setError(Msg, Tok); 2009 } 2010 2011 bool Node::failed() const { 2012 return Doc->failed(); 2013 } 2014 2015 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 2016 // TODO: Handle newlines properly. We need to remove leading whitespace. 2017 if (Value[0] == '"') { // Double quoted. 2018 // Pull off the leading and trailing "s. 2019 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 2020 // Search for characters that would require unescaping the value. 2021 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 2022 if (i != StringRef::npos) 2023 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 2024 return UnquotedValue; 2025 } else if (Value[0] == '\'') { // Single quoted. 2026 // Pull off the leading and trailing 's. 2027 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 2028 StringRef::size_type i = UnquotedValue.find('\''); 2029 if (i != StringRef::npos) { 2030 // We're going to need Storage. 2031 Storage.clear(); 2032 Storage.reserve(UnquotedValue.size()); 2033 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 2034 StringRef Valid(UnquotedValue.begin(), i); 2035 llvm::append_range(Storage, Valid); 2036 Storage.push_back('\''); 2037 UnquotedValue = UnquotedValue.substr(i + 2); 2038 } 2039 llvm::append_range(Storage, UnquotedValue); 2040 return StringRef(Storage.begin(), Storage.size()); 2041 } 2042 return UnquotedValue; 2043 } 2044 // Plain or block. 2045 return Value.rtrim(' '); 2046 } 2047 2048 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 2049 , StringRef::size_type i 2050 , SmallVectorImpl<char> &Storage) 2051 const { 2052 // Use Storage to build proper value. 2053 Storage.clear(); 2054 Storage.reserve(UnquotedValue.size()); 2055 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 2056 // Insert all previous chars into Storage. 2057 StringRef Valid(UnquotedValue.begin(), i); 2058 llvm::append_range(Storage, Valid); 2059 // Chop off inserted chars. 2060 UnquotedValue = UnquotedValue.substr(i); 2061 2062 assert(!UnquotedValue.empty() && "Can't be empty!"); 2063 2064 // Parse escape or line break. 2065 switch (UnquotedValue[0]) { 2066 case '\r': 2067 case '\n': 2068 Storage.push_back('\n'); 2069 if ( UnquotedValue.size() > 1 2070 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 2071 UnquotedValue = UnquotedValue.substr(1); 2072 UnquotedValue = UnquotedValue.substr(1); 2073 break; 2074 default: 2075 if (UnquotedValue.size() == 1) { 2076 Token T; 2077 T.Range = StringRef(UnquotedValue.begin(), 1); 2078 setError("Unrecognized escape code", T); 2079 return ""; 2080 } 2081 UnquotedValue = UnquotedValue.substr(1); 2082 switch (UnquotedValue[0]) { 2083 default: { 2084 Token T; 2085 T.Range = StringRef(UnquotedValue.begin(), 1); 2086 setError("Unrecognized escape code", T); 2087 return ""; 2088 } 2089 case '\r': 2090 case '\n': 2091 // Remove the new line. 2092 if ( UnquotedValue.size() > 1 2093 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 2094 UnquotedValue = UnquotedValue.substr(1); 2095 // If this was just a single byte newline, it will get skipped 2096 // below. 2097 break; 2098 case '0': 2099 Storage.push_back(0x00); 2100 break; 2101 case 'a': 2102 Storage.push_back(0x07); 2103 break; 2104 case 'b': 2105 Storage.push_back(0x08); 2106 break; 2107 case 't': 2108 case 0x09: 2109 Storage.push_back(0x09); 2110 break; 2111 case 'n': 2112 Storage.push_back(0x0A); 2113 break; 2114 case 'v': 2115 Storage.push_back(0x0B); 2116 break; 2117 case 'f': 2118 Storage.push_back(0x0C); 2119 break; 2120 case 'r': 2121 Storage.push_back(0x0D); 2122 break; 2123 case 'e': 2124 Storage.push_back(0x1B); 2125 break; 2126 case ' ': 2127 Storage.push_back(0x20); 2128 break; 2129 case '"': 2130 Storage.push_back(0x22); 2131 break; 2132 case '/': 2133 Storage.push_back(0x2F); 2134 break; 2135 case '\\': 2136 Storage.push_back(0x5C); 2137 break; 2138 case 'N': 2139 encodeUTF8(0x85, Storage); 2140 break; 2141 case '_': 2142 encodeUTF8(0xA0, Storage); 2143 break; 2144 case 'L': 2145 encodeUTF8(0x2028, Storage); 2146 break; 2147 case 'P': 2148 encodeUTF8(0x2029, Storage); 2149 break; 2150 case 'x': { 2151 if (UnquotedValue.size() < 3) 2152 // TODO: Report error. 2153 break; 2154 unsigned int UnicodeScalarValue; 2155 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 2156 // TODO: Report error. 2157 UnicodeScalarValue = 0xFFFD; 2158 encodeUTF8(UnicodeScalarValue, Storage); 2159 UnquotedValue = UnquotedValue.substr(2); 2160 break; 2161 } 2162 case 'u': { 2163 if (UnquotedValue.size() < 5) 2164 // TODO: Report error. 2165 break; 2166 unsigned int UnicodeScalarValue; 2167 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 2168 // TODO: Report error. 2169 UnicodeScalarValue = 0xFFFD; 2170 encodeUTF8(UnicodeScalarValue, Storage); 2171 UnquotedValue = UnquotedValue.substr(4); 2172 break; 2173 } 2174 case 'U': { 2175 if (UnquotedValue.size() < 9) 2176 // TODO: Report error. 2177 break; 2178 unsigned int UnicodeScalarValue; 2179 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2180 // TODO: Report error. 2181 UnicodeScalarValue = 0xFFFD; 2182 encodeUTF8(UnicodeScalarValue, Storage); 2183 UnquotedValue = UnquotedValue.substr(8); 2184 break; 2185 } 2186 } 2187 UnquotedValue = UnquotedValue.substr(1); 2188 } 2189 } 2190 llvm::append_range(Storage, UnquotedValue); 2191 return StringRef(Storage.begin(), Storage.size()); 2192 } 2193 2194 Node *KeyValueNode::getKey() { 2195 if (Key) 2196 return Key; 2197 // Handle implicit null keys. 2198 { 2199 Token &t = peekNext(); 2200 if ( t.Kind == Token::TK_BlockEnd 2201 || t.Kind == Token::TK_Value 2202 || t.Kind == Token::TK_Error) { 2203 return Key = new (getAllocator()) NullNode(Doc); 2204 } 2205 if (t.Kind == Token::TK_Key) 2206 getNext(); // skip TK_Key. 2207 } 2208 2209 // Handle explicit null keys. 2210 Token &t = peekNext(); 2211 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2212 return Key = new (getAllocator()) NullNode(Doc); 2213 } 2214 2215 // We've got a normal key. 2216 return Key = parseBlockNode(); 2217 } 2218 2219 Node *KeyValueNode::getValue() { 2220 if (Value) 2221 return Value; 2222 2223 if (Node* Key = getKey()) 2224 Key->skip(); 2225 else { 2226 setError("Null key in Key Value.", peekNext()); 2227 return Value = new (getAllocator()) NullNode(Doc); 2228 } 2229 2230 if (failed()) 2231 return Value = new (getAllocator()) NullNode(Doc); 2232 2233 // Handle implicit null values. 2234 { 2235 Token &t = peekNext(); 2236 if ( t.Kind == Token::TK_BlockEnd 2237 || t.Kind == Token::TK_FlowMappingEnd 2238 || t.Kind == Token::TK_Key 2239 || t.Kind == Token::TK_FlowEntry 2240 || t.Kind == Token::TK_Error) { 2241 return Value = new (getAllocator()) NullNode(Doc); 2242 } 2243 2244 if (t.Kind != Token::TK_Value) { 2245 setError("Unexpected token in Key Value.", t); 2246 return Value = new (getAllocator()) NullNode(Doc); 2247 } 2248 getNext(); // skip TK_Value. 2249 } 2250 2251 // Handle explicit null values. 2252 Token &t = peekNext(); 2253 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 2254 return Value = new (getAllocator()) NullNode(Doc); 2255 } 2256 2257 // We got a normal value. 2258 return Value = parseBlockNode(); 2259 } 2260 2261 void MappingNode::increment() { 2262 if (failed()) { 2263 IsAtEnd = true; 2264 CurrentEntry = nullptr; 2265 return; 2266 } 2267 if (CurrentEntry) { 2268 CurrentEntry->skip(); 2269 if (Type == MT_Inline) { 2270 IsAtEnd = true; 2271 CurrentEntry = nullptr; 2272 return; 2273 } 2274 } 2275 Token T = peekNext(); 2276 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 2277 // KeyValueNode eats the TK_Key. That way it can detect null keys. 2278 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 2279 } else if (Type == MT_Block) { 2280 switch (T.Kind) { 2281 case Token::TK_BlockEnd: 2282 getNext(); 2283 IsAtEnd = true; 2284 CurrentEntry = nullptr; 2285 break; 2286 default: 2287 setError("Unexpected token. Expected Key or Block End", T); 2288 LLVM_FALLTHROUGH; 2289 case Token::TK_Error: 2290 IsAtEnd = true; 2291 CurrentEntry = nullptr; 2292 } 2293 } else { 2294 switch (T.Kind) { 2295 case Token::TK_FlowEntry: 2296 // Eat the flow entry and recurse. 2297 getNext(); 2298 return increment(); 2299 case Token::TK_FlowMappingEnd: 2300 getNext(); 2301 LLVM_FALLTHROUGH; 2302 case Token::TK_Error: 2303 // Set this to end iterator. 2304 IsAtEnd = true; 2305 CurrentEntry = nullptr; 2306 break; 2307 default: 2308 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 2309 "Mapping End." 2310 , T); 2311 IsAtEnd = true; 2312 CurrentEntry = nullptr; 2313 } 2314 } 2315 } 2316 2317 void SequenceNode::increment() { 2318 if (failed()) { 2319 IsAtEnd = true; 2320 CurrentEntry = nullptr; 2321 return; 2322 } 2323 if (CurrentEntry) 2324 CurrentEntry->skip(); 2325 Token T = peekNext(); 2326 if (SeqType == ST_Block) { 2327 switch (T.Kind) { 2328 case Token::TK_BlockEntry: 2329 getNext(); 2330 CurrentEntry = parseBlockNode(); 2331 if (!CurrentEntry) { // An error occurred. 2332 IsAtEnd = true; 2333 CurrentEntry = nullptr; 2334 } 2335 break; 2336 case Token::TK_BlockEnd: 2337 getNext(); 2338 IsAtEnd = true; 2339 CurrentEntry = nullptr; 2340 break; 2341 default: 2342 setError( "Unexpected token. Expected Block Entry or Block End." 2343 , T); 2344 LLVM_FALLTHROUGH; 2345 case Token::TK_Error: 2346 IsAtEnd = true; 2347 CurrentEntry = nullptr; 2348 } 2349 } else if (SeqType == ST_Indentless) { 2350 switch (T.Kind) { 2351 case Token::TK_BlockEntry: 2352 getNext(); 2353 CurrentEntry = parseBlockNode(); 2354 if (!CurrentEntry) { // An error occurred. 2355 IsAtEnd = true; 2356 CurrentEntry = nullptr; 2357 } 2358 break; 2359 default: 2360 case Token::TK_Error: 2361 IsAtEnd = true; 2362 CurrentEntry = nullptr; 2363 } 2364 } else if (SeqType == ST_Flow) { 2365 switch (T.Kind) { 2366 case Token::TK_FlowEntry: 2367 // Eat the flow entry and recurse. 2368 getNext(); 2369 WasPreviousTokenFlowEntry = true; 2370 return increment(); 2371 case Token::TK_FlowSequenceEnd: 2372 getNext(); 2373 LLVM_FALLTHROUGH; 2374 case Token::TK_Error: 2375 // Set this to end iterator. 2376 IsAtEnd = true; 2377 CurrentEntry = nullptr; 2378 break; 2379 case Token::TK_StreamEnd: 2380 case Token::TK_DocumentEnd: 2381 case Token::TK_DocumentStart: 2382 setError("Could not find closing ]!", T); 2383 // Set this to end iterator. 2384 IsAtEnd = true; 2385 CurrentEntry = nullptr; 2386 break; 2387 default: 2388 if (!WasPreviousTokenFlowEntry) { 2389 setError("Expected , between entries!", T); 2390 IsAtEnd = true; 2391 CurrentEntry = nullptr; 2392 break; 2393 } 2394 // Otherwise it must be a flow entry. 2395 CurrentEntry = parseBlockNode(); 2396 if (!CurrentEntry) { 2397 IsAtEnd = true; 2398 } 2399 WasPreviousTokenFlowEntry = false; 2400 break; 2401 } 2402 } 2403 } 2404 2405 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2406 // Tag maps starts with two default mappings. 2407 TagMap["!"] = "!"; 2408 TagMap["!!"] = "tag:yaml.org,2002:"; 2409 2410 if (parseDirectives()) 2411 expectToken(Token::TK_DocumentStart); 2412 Token &T = peekNext(); 2413 if (T.Kind == Token::TK_DocumentStart) 2414 getNext(); 2415 } 2416 2417 bool Document::skip() { 2418 if (stream.scanner->failed()) 2419 return false; 2420 if (!Root && !getRoot()) 2421 return false; 2422 Root->skip(); 2423 Token &T = peekNext(); 2424 if (T.Kind == Token::TK_StreamEnd) 2425 return false; 2426 if (T.Kind == Token::TK_DocumentEnd) { 2427 getNext(); 2428 return skip(); 2429 } 2430 return true; 2431 } 2432 2433 Token &Document::peekNext() { 2434 return stream.scanner->peekNext(); 2435 } 2436 2437 Token Document::getNext() { 2438 return stream.scanner->getNext(); 2439 } 2440 2441 void Document::setError(const Twine &Message, Token &Location) const { 2442 stream.scanner->setError(Message, Location.Range.begin()); 2443 } 2444 2445 bool Document::failed() const { 2446 return stream.scanner->failed(); 2447 } 2448 2449 Node *Document::parseBlockNode() { 2450 Token T = peekNext(); 2451 // Handle properties. 2452 Token AnchorInfo; 2453 Token TagInfo; 2454 parse_property: 2455 switch (T.Kind) { 2456 case Token::TK_Alias: 2457 getNext(); 2458 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2459 case Token::TK_Anchor: 2460 if (AnchorInfo.Kind == Token::TK_Anchor) { 2461 setError("Already encountered an anchor for this node!", T); 2462 return nullptr; 2463 } 2464 AnchorInfo = getNext(); // Consume TK_Anchor. 2465 T = peekNext(); 2466 goto parse_property; 2467 case Token::TK_Tag: 2468 if (TagInfo.Kind == Token::TK_Tag) { 2469 setError("Already encountered a tag for this node!", T); 2470 return nullptr; 2471 } 2472 TagInfo = getNext(); // Consume TK_Tag. 2473 T = peekNext(); 2474 goto parse_property; 2475 default: 2476 break; 2477 } 2478 2479 switch (T.Kind) { 2480 case Token::TK_BlockEntry: 2481 // We got an unindented BlockEntry sequence. This is not terminated with 2482 // a BlockEnd. 2483 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2484 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2485 , AnchorInfo.Range.substr(1) 2486 , TagInfo.Range 2487 , SequenceNode::ST_Indentless); 2488 case Token::TK_BlockSequenceStart: 2489 getNext(); 2490 return new (NodeAllocator) 2491 SequenceNode( stream.CurrentDoc 2492 , AnchorInfo.Range.substr(1) 2493 , TagInfo.Range 2494 , SequenceNode::ST_Block); 2495 case Token::TK_BlockMappingStart: 2496 getNext(); 2497 return new (NodeAllocator) 2498 MappingNode( stream.CurrentDoc 2499 , AnchorInfo.Range.substr(1) 2500 , TagInfo.Range 2501 , MappingNode::MT_Block); 2502 case Token::TK_FlowSequenceStart: 2503 getNext(); 2504 return new (NodeAllocator) 2505 SequenceNode( stream.CurrentDoc 2506 , AnchorInfo.Range.substr(1) 2507 , TagInfo.Range 2508 , SequenceNode::ST_Flow); 2509 case Token::TK_FlowMappingStart: 2510 getNext(); 2511 return new (NodeAllocator) 2512 MappingNode( stream.CurrentDoc 2513 , AnchorInfo.Range.substr(1) 2514 , TagInfo.Range 2515 , MappingNode::MT_Flow); 2516 case Token::TK_Scalar: 2517 getNext(); 2518 return new (NodeAllocator) 2519 ScalarNode( stream.CurrentDoc 2520 , AnchorInfo.Range.substr(1) 2521 , TagInfo.Range 2522 , T.Range); 2523 case Token::TK_BlockScalar: { 2524 getNext(); 2525 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2526 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2527 return new (NodeAllocator) 2528 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2529 TagInfo.Range, StrCopy, T.Range); 2530 } 2531 case Token::TK_Key: 2532 // Don't eat the TK_Key, KeyValueNode expects it. 2533 return new (NodeAllocator) 2534 MappingNode( stream.CurrentDoc 2535 , AnchorInfo.Range.substr(1) 2536 , TagInfo.Range 2537 , MappingNode::MT_Inline); 2538 case Token::TK_DocumentStart: 2539 case Token::TK_DocumentEnd: 2540 case Token::TK_StreamEnd: 2541 default: 2542 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2543 // !!null null. 2544 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2545 case Token::TK_FlowMappingEnd: 2546 case Token::TK_FlowSequenceEnd: 2547 case Token::TK_FlowEntry: { 2548 if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root))) 2549 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2550 2551 setError("Unexpected token", T); 2552 return nullptr; 2553 } 2554 case Token::TK_Error: 2555 return nullptr; 2556 } 2557 llvm_unreachable("Control flow shouldn't reach here."); 2558 return nullptr; 2559 } 2560 2561 bool Document::parseDirectives() { 2562 bool isDirective = false; 2563 while (true) { 2564 Token T = peekNext(); 2565 if (T.Kind == Token::TK_TagDirective) { 2566 parseTAGDirective(); 2567 isDirective = true; 2568 } else if (T.Kind == Token::TK_VersionDirective) { 2569 parseYAMLDirective(); 2570 isDirective = true; 2571 } else 2572 break; 2573 } 2574 return isDirective; 2575 } 2576 2577 void Document::parseYAMLDirective() { 2578 getNext(); // Eat %YAML <version> 2579 } 2580 2581 void Document::parseTAGDirective() { 2582 Token Tag = getNext(); // %TAG <handle> <prefix> 2583 StringRef T = Tag.Range; 2584 // Strip %TAG 2585 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2586 std::size_t HandleEnd = T.find_first_of(" \t"); 2587 StringRef TagHandle = T.substr(0, HandleEnd); 2588 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2589 TagMap[TagHandle] = TagPrefix; 2590 } 2591 2592 bool Document::expectToken(int TK) { 2593 Token T = getNext(); 2594 if (T.Kind != TK) { 2595 setError("Unexpected token", T); 2596 return false; 2597 } 2598 return true; 2599 } 2600