xref: /freebsd/contrib/llvm-project/llvm/lib/Support/YAMLParser.cpp (revision 1165fc9a526630487a1feb63daef65c5aee1a583)
1 //===- YAMLParser.cpp - Simple YAML parser --------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //  This file implements a YAML parser.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/Support/YAMLParser.h"
14 #include "llvm/ADT/AllocatorList.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/None.h"
17 #include "llvm/ADT/STLExtras.h"
18 #include "llvm/ADT/SmallString.h"
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/StringExtras.h"
21 #include "llvm/ADT/StringRef.h"
22 #include "llvm/ADT/Twine.h"
23 #include "llvm/Support/Compiler.h"
24 #include "llvm/Support/ErrorHandling.h"
25 #include "llvm/Support/MemoryBuffer.h"
26 #include "llvm/Support/SMLoc.h"
27 #include "llvm/Support/SourceMgr.h"
28 #include "llvm/Support/Unicode.h"
29 #include "llvm/Support/raw_ostream.h"
30 #include <cassert>
31 #include <cstddef>
32 #include <cstdint>
33 #include <map>
34 #include <memory>
35 #include <string>
36 #include <system_error>
37 #include <utility>
38 
39 using namespace llvm;
40 using namespace yaml;
41 
42 enum UnicodeEncodingForm {
43   UEF_UTF32_LE, ///< UTF-32 Little Endian
44   UEF_UTF32_BE, ///< UTF-32 Big Endian
45   UEF_UTF16_LE, ///< UTF-16 Little Endian
46   UEF_UTF16_BE, ///< UTF-16 Big Endian
47   UEF_UTF8,     ///< UTF-8 or ascii.
48   UEF_Unknown   ///< Not a valid Unicode encoding.
49 };
50 
51 /// EncodingInfo - Holds the encoding type and length of the byte order mark if
52 ///                it exists. Length is in {0, 2, 3, 4}.
53 using EncodingInfo = std::pair<UnicodeEncodingForm, unsigned>;
54 
55 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode
56 ///                      encoding form of \a Input.
57 ///
58 /// @param Input A string of length 0 or more.
59 /// @returns An EncodingInfo indicating the Unicode encoding form of the input
60 ///          and how long the byte order mark is if one exists.
61 static EncodingInfo getUnicodeEncoding(StringRef Input) {
62   if (Input.empty())
63     return std::make_pair(UEF_Unknown, 0);
64 
65   switch (uint8_t(Input[0])) {
66   case 0x00:
67     if (Input.size() >= 4) {
68       if (  Input[1] == 0
69          && uint8_t(Input[2]) == 0xFE
70          && uint8_t(Input[3]) == 0xFF)
71         return std::make_pair(UEF_UTF32_BE, 4);
72       if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0)
73         return std::make_pair(UEF_UTF32_BE, 0);
74     }
75 
76     if (Input.size() >= 2 && Input[1] != 0)
77       return std::make_pair(UEF_UTF16_BE, 0);
78     return std::make_pair(UEF_Unknown, 0);
79   case 0xFF:
80     if (  Input.size() >= 4
81        && uint8_t(Input[1]) == 0xFE
82        && Input[2] == 0
83        && Input[3] == 0)
84       return std::make_pair(UEF_UTF32_LE, 4);
85 
86     if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE)
87       return std::make_pair(UEF_UTF16_LE, 2);
88     return std::make_pair(UEF_Unknown, 0);
89   case 0xFE:
90     if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF)
91       return std::make_pair(UEF_UTF16_BE, 2);
92     return std::make_pair(UEF_Unknown, 0);
93   case 0xEF:
94     if (  Input.size() >= 3
95        && uint8_t(Input[1]) == 0xBB
96        && uint8_t(Input[2]) == 0xBF)
97       return std::make_pair(UEF_UTF8, 3);
98     return std::make_pair(UEF_Unknown, 0);
99   }
100 
101   // It could still be utf-32 or utf-16.
102   if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0)
103     return std::make_pair(UEF_UTF32_LE, 0);
104 
105   if (Input.size() >= 2 && Input[1] == 0)
106     return std::make_pair(UEF_UTF16_LE, 0);
107 
108   return std::make_pair(UEF_UTF8, 0);
109 }
110 
111 /// Pin the vtables to this file.
112 void Node::anchor() {}
113 void NullNode::anchor() {}
114 void ScalarNode::anchor() {}
115 void BlockScalarNode::anchor() {}
116 void KeyValueNode::anchor() {}
117 void MappingNode::anchor() {}
118 void SequenceNode::anchor() {}
119 void AliasNode::anchor() {}
120 
121 namespace llvm {
122 namespace yaml {
123 
124 /// Token - A single YAML token.
125 struct Token {
126   enum TokenKind {
127     TK_Error, // Uninitialized token.
128     TK_StreamStart,
129     TK_StreamEnd,
130     TK_VersionDirective,
131     TK_TagDirective,
132     TK_DocumentStart,
133     TK_DocumentEnd,
134     TK_BlockEntry,
135     TK_BlockEnd,
136     TK_BlockSequenceStart,
137     TK_BlockMappingStart,
138     TK_FlowEntry,
139     TK_FlowSequenceStart,
140     TK_FlowSequenceEnd,
141     TK_FlowMappingStart,
142     TK_FlowMappingEnd,
143     TK_Key,
144     TK_Value,
145     TK_Scalar,
146     TK_BlockScalar,
147     TK_Alias,
148     TK_Anchor,
149     TK_Tag
150   } Kind = TK_Error;
151 
152   /// A string of length 0 or more whose begin() points to the logical location
153   /// of the token in the input.
154   StringRef Range;
155 
156   /// The value of a block scalar node.
157   std::string Value;
158 
159   Token() = default;
160 };
161 
162 } // end namespace yaml
163 } // end namespace llvm
164 
165 using TokenQueueT = BumpPtrList<Token>;
166 
167 namespace {
168 
169 /// This struct is used to track simple keys.
170 ///
171 /// Simple keys are handled by creating an entry in SimpleKeys for each Token
172 /// which could legally be the start of a simple key. When peekNext is called,
173 /// if the Token To be returned is referenced by a SimpleKey, we continue
174 /// tokenizing until that potential simple key has either been found to not be
175 /// a simple key (we moved on to the next line or went further than 1024 chars).
176 /// Or when we run into a Value, and then insert a Key token (and possibly
177 /// others) before the SimpleKey's Tok.
178 struct SimpleKey {
179   TokenQueueT::iterator Tok;
180   unsigned Column = 0;
181   unsigned Line = 0;
182   unsigned FlowLevel = 0;
183   bool IsRequired = false;
184 
185   bool operator ==(const SimpleKey &Other) {
186     return Tok == Other.Tok;
187   }
188 };
189 
190 } // end anonymous namespace
191 
192 /// The Unicode scalar value of a UTF-8 minimal well-formed code unit
193 ///        subsequence and the subsequence's length in code units (uint8_t).
194 ///        A length of 0 represents an error.
195 using UTF8Decoded = std::pair<uint32_t, unsigned>;
196 
197 static UTF8Decoded decodeUTF8(StringRef Range) {
198   StringRef::iterator Position= Range.begin();
199   StringRef::iterator End = Range.end();
200   // 1 byte: [0x00, 0x7f]
201   // Bit pattern: 0xxxxxxx
202   if (Position < End && (*Position & 0x80) == 0) {
203     return std::make_pair(*Position, 1);
204   }
205   // 2 bytes: [0x80, 0x7ff]
206   // Bit pattern: 110xxxxx 10xxxxxx
207   if (Position + 1 < End && ((*Position & 0xE0) == 0xC0) &&
208       ((*(Position + 1) & 0xC0) == 0x80)) {
209     uint32_t codepoint = ((*Position & 0x1F) << 6) |
210                           (*(Position + 1) & 0x3F);
211     if (codepoint >= 0x80)
212       return std::make_pair(codepoint, 2);
213   }
214   // 3 bytes: [0x8000, 0xffff]
215   // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx
216   if (Position + 2 < End && ((*Position & 0xF0) == 0xE0) &&
217       ((*(Position + 1) & 0xC0) == 0x80) &&
218       ((*(Position + 2) & 0xC0) == 0x80)) {
219     uint32_t codepoint = ((*Position & 0x0F) << 12) |
220                          ((*(Position + 1) & 0x3F) << 6) |
221                           (*(Position + 2) & 0x3F);
222     // Codepoints between 0xD800 and 0xDFFF are invalid, as
223     // they are high / low surrogate halves used by UTF-16.
224     if (codepoint >= 0x800 &&
225         (codepoint < 0xD800 || codepoint > 0xDFFF))
226       return std::make_pair(codepoint, 3);
227   }
228   // 4 bytes: [0x10000, 0x10FFFF]
229   // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
230   if (Position + 3 < End && ((*Position & 0xF8) == 0xF0) &&
231       ((*(Position + 1) & 0xC0) == 0x80) &&
232       ((*(Position + 2) & 0xC0) == 0x80) &&
233       ((*(Position + 3) & 0xC0) == 0x80)) {
234     uint32_t codepoint = ((*Position & 0x07) << 18) |
235                          ((*(Position + 1) & 0x3F) << 12) |
236                          ((*(Position + 2) & 0x3F) << 6) |
237                           (*(Position + 3) & 0x3F);
238     if (codepoint >= 0x10000 && codepoint <= 0x10FFFF)
239       return std::make_pair(codepoint, 4);
240   }
241   return std::make_pair(0, 0);
242 }
243 
244 namespace llvm {
245 namespace yaml {
246 
247 /// Scans YAML tokens from a MemoryBuffer.
248 class Scanner {
249 public:
250   Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true,
251           std::error_code *EC = nullptr);
252   Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true,
253           std::error_code *EC = nullptr);
254 
255   /// Parse the next token and return it without popping it.
256   Token &peekNext();
257 
258   /// Parse the next token and pop it from the queue.
259   Token getNext();
260 
261   void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
262                   ArrayRef<SMRange> Ranges = None) {
263     SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors);
264   }
265 
266   void setError(const Twine &Message, StringRef::iterator Position) {
267     if (Position >= End)
268       Position = End - 1;
269 
270     // propagate the error if possible
271     if (EC)
272       *EC = make_error_code(std::errc::invalid_argument);
273 
274     // Don't print out more errors after the first one we encounter. The rest
275     // are just the result of the first, and have no meaning.
276     if (!Failed)
277       printError(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, Message);
278     Failed = true;
279   }
280 
281   /// Returns true if an error occurred while parsing.
282   bool failed() {
283     return Failed;
284   }
285 
286 private:
287   void init(MemoryBufferRef Buffer);
288 
289   StringRef currentInput() {
290     return StringRef(Current, End - Current);
291   }
292 
293   /// Decode a UTF-8 minimal well-formed code unit subsequence starting
294   ///        at \a Position.
295   ///
296   /// If the UTF-8 code units starting at Position do not form a well-formed
297   /// code unit subsequence, then the Unicode scalar value is 0, and the length
298   /// is 0.
299   UTF8Decoded decodeUTF8(StringRef::iterator Position) {
300     return ::decodeUTF8(StringRef(Position, End - Position));
301   }
302 
303   // The following functions are based on the gramar rules in the YAML spec. The
304   // style of the function names it meant to closely match how they are written
305   // in the spec. The number within the [] is the number of the grammar rule in
306   // the spec.
307   //
308   // See 4.2 [Production Naming Conventions] for the meaning of the prefixes.
309   //
310   // c-
311   //   A production starting and ending with a special character.
312   // b-
313   //   A production matching a single line break.
314   // nb-
315   //   A production starting and ending with a non-break character.
316   // s-
317   //   A production starting and ending with a white space character.
318   // ns-
319   //   A production starting and ending with a non-space character.
320   // l-
321   //   A production matching complete line(s).
322 
323   /// Skip a single nb-char[27] starting at Position.
324   ///
325   /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE]
326   ///                  | [0xFF00-0xFFFD] | [0x10000-0x10FFFF]
327   ///
328   /// @returns The code unit after the nb-char, or Position if it's not an
329   ///          nb-char.
330   StringRef::iterator skip_nb_char(StringRef::iterator Position);
331 
332   /// Skip a single b-break[28] starting at Position.
333   ///
334   /// A b-break is 0xD 0xA | 0xD | 0xA
335   ///
336   /// @returns The code unit after the b-break, or Position if it's not a
337   ///          b-break.
338   StringRef::iterator skip_b_break(StringRef::iterator Position);
339 
340   /// Skip a single s-space[31] starting at Position.
341   ///
342   /// An s-space is 0x20
343   ///
344   /// @returns The code unit after the s-space, or Position if it's not a
345   ///          s-space.
346   StringRef::iterator skip_s_space(StringRef::iterator Position);
347 
348   /// Skip a single s-white[33] starting at Position.
349   ///
350   /// A s-white is 0x20 | 0x9
351   ///
352   /// @returns The code unit after the s-white, or Position if it's not a
353   ///          s-white.
354   StringRef::iterator skip_s_white(StringRef::iterator Position);
355 
356   /// Skip a single ns-char[34] starting at Position.
357   ///
358   /// A ns-char is nb-char - s-white
359   ///
360   /// @returns The code unit after the ns-char, or Position if it's not a
361   ///          ns-char.
362   StringRef::iterator skip_ns_char(StringRef::iterator Position);
363 
364   using SkipWhileFunc = StringRef::iterator (Scanner::*)(StringRef::iterator);
365 
366   /// Skip minimal well-formed code unit subsequences until Func
367   ///        returns its input.
368   ///
369   /// @returns The code unit after the last minimal well-formed code unit
370   ///          subsequence that Func accepted.
371   StringRef::iterator skip_while( SkipWhileFunc Func
372                                 , StringRef::iterator Position);
373 
374   /// Skip minimal well-formed code unit subsequences until Func returns its
375   /// input.
376   void advanceWhile(SkipWhileFunc Func);
377 
378   /// Scan ns-uri-char[39]s starting at Cur.
379   ///
380   /// This updates Cur and Column while scanning.
381   void scan_ns_uri_char();
382 
383   /// Consume a minimal well-formed code unit subsequence starting at
384   ///        \a Cur. Return false if it is not the same Unicode scalar value as
385   ///        \a Expected. This updates \a Column.
386   bool consume(uint32_t Expected);
387 
388   /// Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column.
389   void skip(uint32_t Distance);
390 
391   /// Return true if the minimal well-formed code unit subsequence at
392   ///        Pos is whitespace or a new line
393   bool isBlankOrBreak(StringRef::iterator Position);
394 
395   /// Consume a single b-break[28] if it's present at the current position.
396   ///
397   /// Return false if the code unit at the current position isn't a line break.
398   bool consumeLineBreakIfPresent();
399 
400   /// If IsSimpleKeyAllowed, create and push_back a new SimpleKey.
401   void saveSimpleKeyCandidate( TokenQueueT::iterator Tok
402                              , unsigned AtColumn
403                              , bool IsRequired);
404 
405   /// Remove simple keys that can no longer be valid simple keys.
406   ///
407   /// Invalid simple keys are not on the current line or are further than 1024
408   /// columns back.
409   void removeStaleSimpleKeyCandidates();
410 
411   /// Remove all simple keys on FlowLevel \a Level.
412   void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level);
413 
414   /// Unroll indentation in \a Indents back to \a Col. Creates BlockEnd
415   ///        tokens if needed.
416   bool unrollIndent(int ToColumn);
417 
418   /// Increase indent to \a Col. Creates \a Kind token at \a InsertPoint
419   ///        if needed.
420   bool rollIndent( int ToColumn
421                  , Token::TokenKind Kind
422                  , TokenQueueT::iterator InsertPoint);
423 
424   /// Skip a single-line comment when the comment starts at the current
425   /// position of the scanner.
426   void skipComment();
427 
428   /// Skip whitespace and comments until the start of the next token.
429   void scanToNextToken();
430 
431   /// Must be the first token generated.
432   bool scanStreamStart();
433 
434   /// Generate tokens needed to close out the stream.
435   bool scanStreamEnd();
436 
437   /// Scan a %BLAH directive.
438   bool scanDirective();
439 
440   /// Scan a ... or ---.
441   bool scanDocumentIndicator(bool IsStart);
442 
443   /// Scan a [ or { and generate the proper flow collection start token.
444   bool scanFlowCollectionStart(bool IsSequence);
445 
446   /// Scan a ] or } and generate the proper flow collection end token.
447   bool scanFlowCollectionEnd(bool IsSequence);
448 
449   /// Scan the , that separates entries in a flow collection.
450   bool scanFlowEntry();
451 
452   /// Scan the - that starts block sequence entries.
453   bool scanBlockEntry();
454 
455   /// Scan an explicit ? indicating a key.
456   bool scanKey();
457 
458   /// Scan an explicit : indicating a value.
459   bool scanValue();
460 
461   /// Scan a quoted scalar.
462   bool scanFlowScalar(bool IsDoubleQuoted);
463 
464   /// Scan an unquoted scalar.
465   bool scanPlainScalar();
466 
467   /// Scan an Alias or Anchor starting with * or &.
468   bool scanAliasOrAnchor(bool IsAlias);
469 
470   /// Scan a block scalar starting with | or >.
471   bool scanBlockScalar(bool IsLiteral);
472 
473   /// Scan a chomping indicator in a block scalar header.
474   char scanBlockChompingIndicator();
475 
476   /// Scan an indentation indicator in a block scalar header.
477   unsigned scanBlockIndentationIndicator();
478 
479   /// Scan a block scalar header.
480   ///
481   /// Return false if an error occurred.
482   bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator,
483                              bool &IsDone);
484 
485   /// Look for the indentation level of a block scalar.
486   ///
487   /// Return false if an error occurred.
488   bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent,
489                              unsigned &LineBreaks, bool &IsDone);
490 
491   /// Scan the indentation of a text line in a block scalar.
492   ///
493   /// Return false if an error occurred.
494   bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent,
495                              bool &IsDone);
496 
497   /// Scan a tag of the form !stuff.
498   bool scanTag();
499 
500   /// Dispatch to the next scanning function based on \a *Cur.
501   bool fetchMoreTokens();
502 
503   /// The SourceMgr used for diagnostics and buffer management.
504   SourceMgr &SM;
505 
506   /// The original input.
507   MemoryBufferRef InputBuffer;
508 
509   /// The current position of the scanner.
510   StringRef::iterator Current;
511 
512   /// The end of the input (one past the last character).
513   StringRef::iterator End;
514 
515   /// Current YAML indentation level in spaces.
516   int Indent;
517 
518   /// Current column number in Unicode code points.
519   unsigned Column;
520 
521   /// Current line number.
522   unsigned Line;
523 
524   /// How deep we are in flow style containers. 0 Means at block level.
525   unsigned FlowLevel;
526 
527   /// Are we at the start of the stream?
528   bool IsStartOfStream;
529 
530   /// Can the next token be the start of a simple key?
531   bool IsSimpleKeyAllowed;
532 
533   /// True if an error has occurred.
534   bool Failed;
535 
536   /// Should colors be used when printing out the diagnostic messages?
537   bool ShowColors;
538 
539   /// Queue of tokens. This is required to queue up tokens while looking
540   ///        for the end of a simple key. And for cases where a single character
541   ///        can produce multiple tokens (e.g. BlockEnd).
542   TokenQueueT TokenQueue;
543 
544   /// Indentation levels.
545   SmallVector<int, 4> Indents;
546 
547   /// Potential simple keys.
548   SmallVector<SimpleKey, 4> SimpleKeys;
549 
550   std::error_code *EC;
551 };
552 
553 } // end namespace yaml
554 } // end namespace llvm
555 
556 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result.
557 static void encodeUTF8( uint32_t UnicodeScalarValue
558                       , SmallVectorImpl<char> &Result) {
559   if (UnicodeScalarValue <= 0x7F) {
560     Result.push_back(UnicodeScalarValue & 0x7F);
561   } else if (UnicodeScalarValue <= 0x7FF) {
562     uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6);
563     uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F);
564     Result.push_back(FirstByte);
565     Result.push_back(SecondByte);
566   } else if (UnicodeScalarValue <= 0xFFFF) {
567     uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12);
568     uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
569     uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F);
570     Result.push_back(FirstByte);
571     Result.push_back(SecondByte);
572     Result.push_back(ThirdByte);
573   } else if (UnicodeScalarValue <= 0x10FFFF) {
574     uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18);
575     uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12);
576     uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6);
577     uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F);
578     Result.push_back(FirstByte);
579     Result.push_back(SecondByte);
580     Result.push_back(ThirdByte);
581     Result.push_back(FourthByte);
582   }
583 }
584 
585 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) {
586   SourceMgr SM;
587   Scanner scanner(Input, SM);
588   while (true) {
589     Token T = scanner.getNext();
590     switch (T.Kind) {
591     case Token::TK_StreamStart:
592       OS << "Stream-Start: ";
593       break;
594     case Token::TK_StreamEnd:
595       OS << "Stream-End: ";
596       break;
597     case Token::TK_VersionDirective:
598       OS << "Version-Directive: ";
599       break;
600     case Token::TK_TagDirective:
601       OS << "Tag-Directive: ";
602       break;
603     case Token::TK_DocumentStart:
604       OS << "Document-Start: ";
605       break;
606     case Token::TK_DocumentEnd:
607       OS << "Document-End: ";
608       break;
609     case Token::TK_BlockEntry:
610       OS << "Block-Entry: ";
611       break;
612     case Token::TK_BlockEnd:
613       OS << "Block-End: ";
614       break;
615     case Token::TK_BlockSequenceStart:
616       OS << "Block-Sequence-Start: ";
617       break;
618     case Token::TK_BlockMappingStart:
619       OS << "Block-Mapping-Start: ";
620       break;
621     case Token::TK_FlowEntry:
622       OS << "Flow-Entry: ";
623       break;
624     case Token::TK_FlowSequenceStart:
625       OS << "Flow-Sequence-Start: ";
626       break;
627     case Token::TK_FlowSequenceEnd:
628       OS << "Flow-Sequence-End: ";
629       break;
630     case Token::TK_FlowMappingStart:
631       OS << "Flow-Mapping-Start: ";
632       break;
633     case Token::TK_FlowMappingEnd:
634       OS << "Flow-Mapping-End: ";
635       break;
636     case Token::TK_Key:
637       OS << "Key: ";
638       break;
639     case Token::TK_Value:
640       OS << "Value: ";
641       break;
642     case Token::TK_Scalar:
643       OS << "Scalar: ";
644       break;
645     case Token::TK_BlockScalar:
646       OS << "Block Scalar: ";
647       break;
648     case Token::TK_Alias:
649       OS << "Alias: ";
650       break;
651     case Token::TK_Anchor:
652       OS << "Anchor: ";
653       break;
654     case Token::TK_Tag:
655       OS << "Tag: ";
656       break;
657     case Token::TK_Error:
658       break;
659     }
660     OS << T.Range << "\n";
661     if (T.Kind == Token::TK_StreamEnd)
662       break;
663     else if (T.Kind == Token::TK_Error)
664       return false;
665   }
666   return true;
667 }
668 
669 bool yaml::scanTokens(StringRef Input) {
670   SourceMgr SM;
671   Scanner scanner(Input, SM);
672   while (true) {
673     Token T = scanner.getNext();
674     if (T.Kind == Token::TK_StreamEnd)
675       break;
676     else if (T.Kind == Token::TK_Error)
677       return false;
678   }
679   return true;
680 }
681 
682 std::string yaml::escape(StringRef Input, bool EscapePrintable) {
683   std::string EscapedInput;
684   for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) {
685     if (*i == '\\')
686       EscapedInput += "\\\\";
687     else if (*i == '"')
688       EscapedInput += "\\\"";
689     else if (*i == 0)
690       EscapedInput += "\\0";
691     else if (*i == 0x07)
692       EscapedInput += "\\a";
693     else if (*i == 0x08)
694       EscapedInput += "\\b";
695     else if (*i == 0x09)
696       EscapedInput += "\\t";
697     else if (*i == 0x0A)
698       EscapedInput += "\\n";
699     else if (*i == 0x0B)
700       EscapedInput += "\\v";
701     else if (*i == 0x0C)
702       EscapedInput += "\\f";
703     else if (*i == 0x0D)
704       EscapedInput += "\\r";
705     else if (*i == 0x1B)
706       EscapedInput += "\\e";
707     else if ((unsigned char)*i < 0x20) { // Control characters not handled above.
708       std::string HexStr = utohexstr(*i);
709       EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
710     } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence.
711       UTF8Decoded UnicodeScalarValue
712         = decodeUTF8(StringRef(i, Input.end() - i));
713       if (UnicodeScalarValue.second == 0) {
714         // Found invalid char.
715         SmallString<4> Val;
716         encodeUTF8(0xFFFD, Val);
717         llvm::append_range(EscapedInput, Val);
718         // FIXME: Error reporting.
719         return EscapedInput;
720       }
721       if (UnicodeScalarValue.first == 0x85)
722         EscapedInput += "\\N";
723       else if (UnicodeScalarValue.first == 0xA0)
724         EscapedInput += "\\_";
725       else if (UnicodeScalarValue.first == 0x2028)
726         EscapedInput += "\\L";
727       else if (UnicodeScalarValue.first == 0x2029)
728         EscapedInput += "\\P";
729       else if (!EscapePrintable &&
730                sys::unicode::isPrintable(UnicodeScalarValue.first))
731         EscapedInput += StringRef(i, UnicodeScalarValue.second);
732       else {
733         std::string HexStr = utohexstr(UnicodeScalarValue.first);
734         if (HexStr.size() <= 2)
735           EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr;
736         else if (HexStr.size() <= 4)
737           EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr;
738         else if (HexStr.size() <= 8)
739           EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr;
740       }
741       i += UnicodeScalarValue.second - 1;
742     } else
743       EscapedInput.push_back(*i);
744   }
745   return EscapedInput;
746 }
747 
748 llvm::Optional<bool> yaml::parseBool(StringRef S) {
749   switch (S.size()) {
750   case 1:
751     switch (S.front()) {
752     case 'y':
753     case 'Y':
754       return true;
755     case 'n':
756     case 'N':
757       return false;
758     default:
759       return None;
760     }
761   case 2:
762     switch (S.front()) {
763     case 'O':
764       if (S[1] == 'N') // ON
765         return true;
766       LLVM_FALLTHROUGH;
767     case 'o':
768       if (S[1] == 'n') //[Oo]n
769         return true;
770       return None;
771     case 'N':
772       if (S[1] == 'O') // NO
773         return false;
774       LLVM_FALLTHROUGH;
775     case 'n':
776       if (S[1] == 'o') //[Nn]o
777         return false;
778       return None;
779     default:
780       return None;
781     }
782   case 3:
783     switch (S.front()) {
784     case 'O':
785       if (S.drop_front() == "FF") // OFF
786         return false;
787       LLVM_FALLTHROUGH;
788     case 'o':
789       if (S.drop_front() == "ff") //[Oo]ff
790         return false;
791       return None;
792     case 'Y':
793       if (S.drop_front() == "ES") // YES
794         return true;
795       LLVM_FALLTHROUGH;
796     case 'y':
797       if (S.drop_front() == "es") //[Yy]es
798         return true;
799       return None;
800     default:
801       return None;
802     }
803   case 4:
804     switch (S.front()) {
805     case 'T':
806       if (S.drop_front() == "RUE") // TRUE
807         return true;
808       LLVM_FALLTHROUGH;
809     case 't':
810       if (S.drop_front() == "rue") //[Tt]rue
811         return true;
812       return None;
813     default:
814       return None;
815     }
816   case 5:
817     switch (S.front()) {
818     case 'F':
819       if (S.drop_front() == "ALSE") // FALSE
820         return false;
821       LLVM_FALLTHROUGH;
822     case 'f':
823       if (S.drop_front() == "alse") //[Ff]alse
824         return false;
825       return None;
826     default:
827       return None;
828     }
829   default:
830     return None;
831   }
832 }
833 
834 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors,
835                  std::error_code *EC)
836     : SM(sm), ShowColors(ShowColors), EC(EC) {
837   init(MemoryBufferRef(Input, "YAML"));
838 }
839 
840 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors,
841                  std::error_code *EC)
842     : SM(SM_), ShowColors(ShowColors), EC(EC) {
843   init(Buffer);
844 }
845 
846 void Scanner::init(MemoryBufferRef Buffer) {
847   InputBuffer = Buffer;
848   Current = InputBuffer.getBufferStart();
849   End = InputBuffer.getBufferEnd();
850   Indent = -1;
851   Column = 0;
852   Line = 0;
853   FlowLevel = 0;
854   IsStartOfStream = true;
855   IsSimpleKeyAllowed = true;
856   Failed = false;
857   std::unique_ptr<MemoryBuffer> InputBufferOwner =
858       MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false);
859   SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc());
860 }
861 
862 Token &Scanner::peekNext() {
863   // If the current token is a possible simple key, keep parsing until we
864   // can confirm.
865   bool NeedMore = false;
866   while (true) {
867     if (TokenQueue.empty() || NeedMore) {
868       if (!fetchMoreTokens()) {
869         TokenQueue.clear();
870         SimpleKeys.clear();
871         TokenQueue.push_back(Token());
872         return TokenQueue.front();
873       }
874     }
875     assert(!TokenQueue.empty() &&
876             "fetchMoreTokens lied about getting tokens!");
877 
878     removeStaleSimpleKeyCandidates();
879     SimpleKey SK;
880     SK.Tok = TokenQueue.begin();
881     if (!is_contained(SimpleKeys, SK))
882       break;
883     else
884       NeedMore = true;
885   }
886   return TokenQueue.front();
887 }
888 
889 Token Scanner::getNext() {
890   Token Ret = peekNext();
891   // TokenQueue can be empty if there was an error getting the next token.
892   if (!TokenQueue.empty())
893     TokenQueue.pop_front();
894 
895   // There cannot be any referenced Token's if the TokenQueue is empty. So do a
896   // quick deallocation of them all.
897   if (TokenQueue.empty())
898     TokenQueue.resetAlloc();
899 
900   return Ret;
901 }
902 
903 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) {
904   if (Position == End)
905     return Position;
906   // Check 7 bit c-printable - b-char.
907   if (   *Position == 0x09
908       || (*Position >= 0x20 && *Position <= 0x7E))
909     return Position + 1;
910 
911   // Check for valid UTF-8.
912   if (uint8_t(*Position) & 0x80) {
913     UTF8Decoded u8d = decodeUTF8(Position);
914     if (   u8d.second != 0
915         && u8d.first != 0xFEFF
916         && ( u8d.first == 0x85
917           || ( u8d.first >= 0xA0
918             && u8d.first <= 0xD7FF)
919           || ( u8d.first >= 0xE000
920             && u8d.first <= 0xFFFD)
921           || ( u8d.first >= 0x10000
922             && u8d.first <= 0x10FFFF)))
923       return Position + u8d.second;
924   }
925   return Position;
926 }
927 
928 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) {
929   if (Position == End)
930     return Position;
931   if (*Position == 0x0D) {
932     if (Position + 1 != End && *(Position + 1) == 0x0A)
933       return Position + 2;
934     return Position + 1;
935   }
936 
937   if (*Position == 0x0A)
938     return Position + 1;
939   return Position;
940 }
941 
942 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) {
943   if (Position == End)
944     return Position;
945   if (*Position == ' ')
946     return Position + 1;
947   return Position;
948 }
949 
950 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) {
951   if (Position == End)
952     return Position;
953   if (*Position == ' ' || *Position == '\t')
954     return Position + 1;
955   return Position;
956 }
957 
958 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) {
959   if (Position == End)
960     return Position;
961   if (*Position == ' ' || *Position == '\t')
962     return Position;
963   return skip_nb_char(Position);
964 }
965 
966 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func
967                                        , StringRef::iterator Position) {
968   while (true) {
969     StringRef::iterator i = (this->*Func)(Position);
970     if (i == Position)
971       break;
972     Position = i;
973   }
974   return Position;
975 }
976 
977 void Scanner::advanceWhile(SkipWhileFunc Func) {
978   auto Final = skip_while(Func, Current);
979   Column += Final - Current;
980   Current = Final;
981 }
982 
983 static bool is_ns_hex_digit(const char C) { return isAlnum(C); }
984 
985 static bool is_ns_word_char(const char C) { return C == '-' || isAlpha(C); }
986 
987 void Scanner::scan_ns_uri_char() {
988   while (true) {
989     if (Current == End)
990       break;
991     if ((   *Current == '%'
992           && Current + 2 < End
993           && is_ns_hex_digit(*(Current + 1))
994           && is_ns_hex_digit(*(Current + 2)))
995         || is_ns_word_char(*Current)
996         || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]")
997           != StringRef::npos) {
998       ++Current;
999       ++Column;
1000     } else
1001       break;
1002   }
1003 }
1004 
1005 bool Scanner::consume(uint32_t Expected) {
1006   if (Expected >= 0x80) {
1007     setError("Cannot consume non-ascii characters", Current);
1008     return false;
1009   }
1010   if (Current == End)
1011     return false;
1012   if (uint8_t(*Current) >= 0x80) {
1013     setError("Cannot consume non-ascii characters", Current);
1014     return false;
1015   }
1016   if (uint8_t(*Current) == Expected) {
1017     ++Current;
1018     ++Column;
1019     return true;
1020   }
1021   return false;
1022 }
1023 
1024 void Scanner::skip(uint32_t Distance) {
1025   Current += Distance;
1026   Column += Distance;
1027   assert(Current <= End && "Skipped past the end");
1028 }
1029 
1030 bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
1031   if (Position == End)
1032     return false;
1033   return *Position == ' ' || *Position == '\t' || *Position == '\r' ||
1034          *Position == '\n';
1035 }
1036 
1037 bool Scanner::consumeLineBreakIfPresent() {
1038   auto Next = skip_b_break(Current);
1039   if (Next == Current)
1040     return false;
1041   Column = 0;
1042   ++Line;
1043   Current = Next;
1044   return true;
1045 }
1046 
1047 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok
1048                                     , unsigned AtColumn
1049                                     , bool IsRequired) {
1050   if (IsSimpleKeyAllowed) {
1051     SimpleKey SK;
1052     SK.Tok = Tok;
1053     SK.Line = Line;
1054     SK.Column = AtColumn;
1055     SK.IsRequired = IsRequired;
1056     SK.FlowLevel = FlowLevel;
1057     SimpleKeys.push_back(SK);
1058   }
1059 }
1060 
1061 void Scanner::removeStaleSimpleKeyCandidates() {
1062   for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin();
1063                                             i != SimpleKeys.end();) {
1064     if (i->Line != Line || i->Column + 1024 < Column) {
1065       if (i->IsRequired)
1066         setError( "Could not find expected : for simple key"
1067                 , i->Tok->Range.begin());
1068       i = SimpleKeys.erase(i);
1069     } else
1070       ++i;
1071   }
1072 }
1073 
1074 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) {
1075   if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level)
1076     SimpleKeys.pop_back();
1077 }
1078 
1079 bool Scanner::unrollIndent(int ToColumn) {
1080   Token T;
1081   // Indentation is ignored in flow.
1082   if (FlowLevel != 0)
1083     return true;
1084 
1085   while (Indent > ToColumn) {
1086     T.Kind = Token::TK_BlockEnd;
1087     T.Range = StringRef(Current, 1);
1088     TokenQueue.push_back(T);
1089     Indent = Indents.pop_back_val();
1090   }
1091 
1092   return true;
1093 }
1094 
1095 bool Scanner::rollIndent( int ToColumn
1096                         , Token::TokenKind Kind
1097                         , TokenQueueT::iterator InsertPoint) {
1098   if (FlowLevel)
1099     return true;
1100   if (Indent < ToColumn) {
1101     Indents.push_back(Indent);
1102     Indent = ToColumn;
1103 
1104     Token T;
1105     T.Kind = Kind;
1106     T.Range = StringRef(Current, 0);
1107     TokenQueue.insert(InsertPoint, T);
1108   }
1109   return true;
1110 }
1111 
1112 void Scanner::skipComment() {
1113   if (Current == End || *Current != '#')
1114     return;
1115   while (true) {
1116     // This may skip more than one byte, thus Column is only incremented
1117     // for code points.
1118     StringRef::iterator I = skip_nb_char(Current);
1119     if (I == Current)
1120       break;
1121     Current = I;
1122     ++Column;
1123   }
1124 }
1125 
1126 void Scanner::scanToNextToken() {
1127   while (true) {
1128     while (Current != End && (*Current == ' ' || *Current == '\t')) {
1129       skip(1);
1130     }
1131 
1132     skipComment();
1133 
1134     // Skip EOL.
1135     StringRef::iterator i = skip_b_break(Current);
1136     if (i == Current)
1137       break;
1138     Current = i;
1139     ++Line;
1140     Column = 0;
1141     // New lines may start a simple key.
1142     if (!FlowLevel)
1143       IsSimpleKeyAllowed = true;
1144   }
1145 }
1146 
1147 bool Scanner::scanStreamStart() {
1148   IsStartOfStream = false;
1149 
1150   EncodingInfo EI = getUnicodeEncoding(currentInput());
1151 
1152   Token T;
1153   T.Kind = Token::TK_StreamStart;
1154   T.Range = StringRef(Current, EI.second);
1155   TokenQueue.push_back(T);
1156   Current += EI.second;
1157   return true;
1158 }
1159 
1160 bool Scanner::scanStreamEnd() {
1161   // Force an ending new line if one isn't present.
1162   if (Column != 0) {
1163     Column = 0;
1164     ++Line;
1165   }
1166 
1167   unrollIndent(-1);
1168   SimpleKeys.clear();
1169   IsSimpleKeyAllowed = false;
1170 
1171   Token T;
1172   T.Kind = Token::TK_StreamEnd;
1173   T.Range = StringRef(Current, 0);
1174   TokenQueue.push_back(T);
1175   return true;
1176 }
1177 
1178 bool Scanner::scanDirective() {
1179   // Reset the indentation level.
1180   unrollIndent(-1);
1181   SimpleKeys.clear();
1182   IsSimpleKeyAllowed = false;
1183 
1184   StringRef::iterator Start = Current;
1185   consume('%');
1186   StringRef::iterator NameStart = Current;
1187   Current = skip_while(&Scanner::skip_ns_char, Current);
1188   StringRef Name(NameStart, Current - NameStart);
1189   Current = skip_while(&Scanner::skip_s_white, Current);
1190 
1191   Token T;
1192   if (Name == "YAML") {
1193     Current = skip_while(&Scanner::skip_ns_char, Current);
1194     T.Kind = Token::TK_VersionDirective;
1195     T.Range = StringRef(Start, Current - Start);
1196     TokenQueue.push_back(T);
1197     return true;
1198   } else if(Name == "TAG") {
1199     Current = skip_while(&Scanner::skip_ns_char, Current);
1200     Current = skip_while(&Scanner::skip_s_white, Current);
1201     Current = skip_while(&Scanner::skip_ns_char, Current);
1202     T.Kind = Token::TK_TagDirective;
1203     T.Range = StringRef(Start, Current - Start);
1204     TokenQueue.push_back(T);
1205     return true;
1206   }
1207   return false;
1208 }
1209 
1210 bool Scanner::scanDocumentIndicator(bool IsStart) {
1211   unrollIndent(-1);
1212   SimpleKeys.clear();
1213   IsSimpleKeyAllowed = false;
1214 
1215   Token T;
1216   T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
1217   T.Range = StringRef(Current, 3);
1218   skip(3);
1219   TokenQueue.push_back(T);
1220   return true;
1221 }
1222 
1223 bool Scanner::scanFlowCollectionStart(bool IsSequence) {
1224   Token T;
1225   T.Kind = IsSequence ? Token::TK_FlowSequenceStart
1226                       : Token::TK_FlowMappingStart;
1227   T.Range = StringRef(Current, 1);
1228   skip(1);
1229   TokenQueue.push_back(T);
1230 
1231   // [ and { may begin a simple key.
1232   saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false);
1233 
1234   // And may also be followed by a simple key.
1235   IsSimpleKeyAllowed = true;
1236   ++FlowLevel;
1237   return true;
1238 }
1239 
1240 bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
1241   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1242   IsSimpleKeyAllowed = false;
1243   Token T;
1244   T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
1245                       : Token::TK_FlowMappingEnd;
1246   T.Range = StringRef(Current, 1);
1247   skip(1);
1248   TokenQueue.push_back(T);
1249   if (FlowLevel)
1250     --FlowLevel;
1251   return true;
1252 }
1253 
1254 bool Scanner::scanFlowEntry() {
1255   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1256   IsSimpleKeyAllowed = true;
1257   Token T;
1258   T.Kind = Token::TK_FlowEntry;
1259   T.Range = StringRef(Current, 1);
1260   skip(1);
1261   TokenQueue.push_back(T);
1262   return true;
1263 }
1264 
1265 bool Scanner::scanBlockEntry() {
1266   rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
1267   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1268   IsSimpleKeyAllowed = true;
1269   Token T;
1270   T.Kind = Token::TK_BlockEntry;
1271   T.Range = StringRef(Current, 1);
1272   skip(1);
1273   TokenQueue.push_back(T);
1274   return true;
1275 }
1276 
1277 bool Scanner::scanKey() {
1278   if (!FlowLevel)
1279     rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1280 
1281   removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
1282   IsSimpleKeyAllowed = !FlowLevel;
1283 
1284   Token T;
1285   T.Kind = Token::TK_Key;
1286   T.Range = StringRef(Current, 1);
1287   skip(1);
1288   TokenQueue.push_back(T);
1289   return true;
1290 }
1291 
1292 bool Scanner::scanValue() {
1293   // If the previous token could have been a simple key, insert the key token
1294   // into the token queue.
1295   if (!SimpleKeys.empty()) {
1296     SimpleKey SK = SimpleKeys.pop_back_val();
1297     Token T;
1298     T.Kind = Token::TK_Key;
1299     T.Range = SK.Tok->Range;
1300     TokenQueueT::iterator i, e;
1301     for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) {
1302       if (i == SK.Tok)
1303         break;
1304     }
1305     if (i == e) {
1306       Failed = true;
1307       return false;
1308     }
1309     i = TokenQueue.insert(i, T);
1310 
1311     // We may also need to add a Block-Mapping-Start token.
1312     rollIndent(SK.Column, Token::TK_BlockMappingStart, i);
1313 
1314     IsSimpleKeyAllowed = false;
1315   } else {
1316     if (!FlowLevel)
1317       rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
1318     IsSimpleKeyAllowed = !FlowLevel;
1319   }
1320 
1321   Token T;
1322   T.Kind = Token::TK_Value;
1323   T.Range = StringRef(Current, 1);
1324   skip(1);
1325   TokenQueue.push_back(T);
1326   return true;
1327 }
1328 
1329 // Forbidding inlining improves performance by roughly 20%.
1330 // FIXME: Remove once llvm optimizes this to the faster version without hints.
1331 LLVM_ATTRIBUTE_NOINLINE static bool
1332 wasEscaped(StringRef::iterator First, StringRef::iterator Position);
1333 
1334 // Returns whether a character at 'Position' was escaped with a leading '\'.
1335 // 'First' specifies the position of the first character in the string.
1336 static bool wasEscaped(StringRef::iterator First,
1337                        StringRef::iterator Position) {
1338   assert(Position - 1 >= First);
1339   StringRef::iterator I = Position - 1;
1340   // We calculate the number of consecutive '\'s before the current position
1341   // by iterating backwards through our string.
1342   while (I >= First && *I == '\\') --I;
1343   // (Position - 1 - I) now contains the number of '\'s before the current
1344   // position. If it is odd, the character at 'Position' was escaped.
1345   return (Position - 1 - I) % 2 == 1;
1346 }
1347 
1348 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
1349   StringRef::iterator Start = Current;
1350   unsigned ColStart = Column;
1351   if (IsDoubleQuoted) {
1352     do {
1353       ++Current;
1354       while (Current != End && *Current != '"')
1355         ++Current;
1356       // Repeat until the previous character was not a '\' or was an escaped
1357       // backslash.
1358     } while (   Current != End
1359              && *(Current - 1) == '\\'
1360              && wasEscaped(Start + 1, Current));
1361   } else {
1362     skip(1);
1363     while (Current != End) {
1364       // Skip a ' followed by another '.
1365       if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') {
1366         skip(2);
1367         continue;
1368       } else if (*Current == '\'')
1369         break;
1370       StringRef::iterator i = skip_nb_char(Current);
1371       if (i == Current) {
1372         i = skip_b_break(Current);
1373         if (i == Current)
1374           break;
1375         Current = i;
1376         Column = 0;
1377         ++Line;
1378       } else {
1379         if (i == End)
1380           break;
1381         Current = i;
1382         ++Column;
1383       }
1384     }
1385   }
1386 
1387   if (Current == End) {
1388     setError("Expected quote at end of scalar", Current);
1389     return false;
1390   }
1391 
1392   skip(1); // Skip ending quote.
1393   Token T;
1394   T.Kind = Token::TK_Scalar;
1395   T.Range = StringRef(Start, Current - Start);
1396   TokenQueue.push_back(T);
1397 
1398   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1399 
1400   IsSimpleKeyAllowed = false;
1401 
1402   return true;
1403 }
1404 
1405 bool Scanner::scanPlainScalar() {
1406   StringRef::iterator Start = Current;
1407   unsigned ColStart = Column;
1408   unsigned LeadingBlanks = 0;
1409   assert(Indent >= -1 && "Indent must be >= -1 !");
1410   unsigned indent = static_cast<unsigned>(Indent + 1);
1411   while (Current != End) {
1412     if (*Current == '#')
1413       break;
1414 
1415     while (Current != End && !isBlankOrBreak(Current)) {
1416       if (FlowLevel && *Current == ':' &&
1417           (Current + 1 == End ||
1418            !(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) {
1419         setError("Found unexpected ':' while scanning a plain scalar", Current);
1420         return false;
1421       }
1422 
1423       // Check for the end of the plain scalar.
1424       if (  (*Current == ':' && isBlankOrBreak(Current + 1))
1425           || (  FlowLevel
1426           && (StringRef(Current, 1).find_first_of(",:?[]{}")
1427               != StringRef::npos)))
1428         break;
1429 
1430       StringRef::iterator i = skip_nb_char(Current);
1431       if (i == Current)
1432         break;
1433       Current = i;
1434       ++Column;
1435     }
1436 
1437     // Are we at the end?
1438     if (!isBlankOrBreak(Current))
1439       break;
1440 
1441     // Eat blanks.
1442     StringRef::iterator Tmp = Current;
1443     while (isBlankOrBreak(Tmp)) {
1444       StringRef::iterator i = skip_s_white(Tmp);
1445       if (i != Tmp) {
1446         if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
1447           setError("Found invalid tab character in indentation", Tmp);
1448           return false;
1449         }
1450         Tmp = i;
1451         ++Column;
1452       } else {
1453         i = skip_b_break(Tmp);
1454         if (!LeadingBlanks)
1455           LeadingBlanks = 1;
1456         Tmp = i;
1457         Column = 0;
1458         ++Line;
1459       }
1460     }
1461 
1462     if (!FlowLevel && Column < indent)
1463       break;
1464 
1465     Current = Tmp;
1466   }
1467   if (Start == Current) {
1468     setError("Got empty plain scalar", Start);
1469     return false;
1470   }
1471   Token T;
1472   T.Kind = Token::TK_Scalar;
1473   T.Range = StringRef(Start, Current - Start);
1474   TokenQueue.push_back(T);
1475 
1476   // Plain scalars can be simple keys.
1477   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1478 
1479   IsSimpleKeyAllowed = false;
1480 
1481   return true;
1482 }
1483 
1484 bool Scanner::scanAliasOrAnchor(bool IsAlias) {
1485   StringRef::iterator Start = Current;
1486   unsigned ColStart = Column;
1487   skip(1);
1488   while (Current != End) {
1489     if (   *Current == '[' || *Current == ']'
1490         || *Current == '{' || *Current == '}'
1491         || *Current == ','
1492         || *Current == ':')
1493       break;
1494     StringRef::iterator i = skip_ns_char(Current);
1495     if (i == Current)
1496       break;
1497     Current = i;
1498     ++Column;
1499   }
1500 
1501   if (Start + 1 == Current) {
1502     setError("Got empty alias or anchor", Start);
1503     return false;
1504   }
1505 
1506   Token T;
1507   T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor;
1508   T.Range = StringRef(Start, Current - Start);
1509   TokenQueue.push_back(T);
1510 
1511   // Alias and anchors can be simple keys.
1512   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1513 
1514   IsSimpleKeyAllowed = false;
1515 
1516   return true;
1517 }
1518 
1519 char Scanner::scanBlockChompingIndicator() {
1520   char Indicator = ' ';
1521   if (Current != End && (*Current == '+' || *Current == '-')) {
1522     Indicator = *Current;
1523     skip(1);
1524   }
1525   return Indicator;
1526 }
1527 
1528 /// Get the number of line breaks after chomping.
1529 ///
1530 /// Return the number of trailing line breaks to emit, depending on
1531 /// \p ChompingIndicator.
1532 static unsigned getChompedLineBreaks(char ChompingIndicator,
1533                                      unsigned LineBreaks, StringRef Str) {
1534   if (ChompingIndicator == '-') // Strip all line breaks.
1535     return 0;
1536   if (ChompingIndicator == '+') // Keep all line breaks.
1537     return LineBreaks;
1538   // Clip trailing lines.
1539   return Str.empty() ? 0 : 1;
1540 }
1541 
1542 unsigned Scanner::scanBlockIndentationIndicator() {
1543   unsigned Indent = 0;
1544   if (Current != End && (*Current >= '1' && *Current <= '9')) {
1545     Indent = unsigned(*Current - '0');
1546     skip(1);
1547   }
1548   return Indent;
1549 }
1550 
1551 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator,
1552                                     unsigned &IndentIndicator, bool &IsDone) {
1553   auto Start = Current;
1554 
1555   ChompingIndicator = scanBlockChompingIndicator();
1556   IndentIndicator = scanBlockIndentationIndicator();
1557   // Check for the chomping indicator once again.
1558   if (ChompingIndicator == ' ')
1559     ChompingIndicator = scanBlockChompingIndicator();
1560   Current = skip_while(&Scanner::skip_s_white, Current);
1561   skipComment();
1562 
1563   if (Current == End) { // EOF, we have an empty scalar.
1564     Token T;
1565     T.Kind = Token::TK_BlockScalar;
1566     T.Range = StringRef(Start, Current - Start);
1567     TokenQueue.push_back(T);
1568     IsDone = true;
1569     return true;
1570   }
1571 
1572   if (!consumeLineBreakIfPresent()) {
1573     setError("Expected a line break after block scalar header", Current);
1574     return false;
1575   }
1576   return true;
1577 }
1578 
1579 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent,
1580                                     unsigned BlockExitIndent,
1581                                     unsigned &LineBreaks, bool &IsDone) {
1582   unsigned MaxAllSpaceLineCharacters = 0;
1583   StringRef::iterator LongestAllSpaceLine;
1584 
1585   while (true) {
1586     advanceWhile(&Scanner::skip_s_space);
1587     if (skip_nb_char(Current) != Current) {
1588       // This line isn't empty, so try and find the indentation.
1589       if (Column <= BlockExitIndent) { // End of the block literal.
1590         IsDone = true;
1591         return true;
1592       }
1593       // We found the block's indentation.
1594       BlockIndent = Column;
1595       if (MaxAllSpaceLineCharacters > BlockIndent) {
1596         setError(
1597             "Leading all-spaces line must be smaller than the block indent",
1598             LongestAllSpaceLine);
1599         return false;
1600       }
1601       return true;
1602     }
1603     if (skip_b_break(Current) != Current &&
1604         Column > MaxAllSpaceLineCharacters) {
1605       // Record the longest all-space line in case it's longer than the
1606       // discovered block indent.
1607       MaxAllSpaceLineCharacters = Column;
1608       LongestAllSpaceLine = Current;
1609     }
1610 
1611     // Check for EOF.
1612     if (Current == End) {
1613       IsDone = true;
1614       return true;
1615     }
1616 
1617     if (!consumeLineBreakIfPresent()) {
1618       IsDone = true;
1619       return true;
1620     }
1621     ++LineBreaks;
1622   }
1623   return true;
1624 }
1625 
1626 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent,
1627                                     unsigned BlockExitIndent, bool &IsDone) {
1628   // Skip the indentation.
1629   while (Column < BlockIndent) {
1630     auto I = skip_s_space(Current);
1631     if (I == Current)
1632       break;
1633     Current = I;
1634     ++Column;
1635   }
1636 
1637   if (skip_nb_char(Current) == Current)
1638     return true;
1639 
1640   if (Column <= BlockExitIndent) { // End of the block literal.
1641     IsDone = true;
1642     return true;
1643   }
1644 
1645   if (Column < BlockIndent) {
1646     if (Current != End && *Current == '#') { // Trailing comment.
1647       IsDone = true;
1648       return true;
1649     }
1650     setError("A text line is less indented than the block scalar", Current);
1651     return false;
1652   }
1653   return true; // A normal text line.
1654 }
1655 
1656 bool Scanner::scanBlockScalar(bool IsLiteral) {
1657   // Eat '|' or '>'
1658   assert(*Current == '|' || *Current == '>');
1659   skip(1);
1660 
1661   char ChompingIndicator;
1662   unsigned BlockIndent;
1663   bool IsDone = false;
1664   if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone))
1665     return false;
1666   if (IsDone)
1667     return true;
1668 
1669   auto Start = Current;
1670   unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent;
1671   unsigned LineBreaks = 0;
1672   if (BlockIndent == 0) {
1673     if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks,
1674                                IsDone))
1675       return false;
1676   }
1677 
1678   // Scan the block's scalars body.
1679   SmallString<256> Str;
1680   while (!IsDone) {
1681     if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone))
1682       return false;
1683     if (IsDone)
1684       break;
1685 
1686     // Parse the current line.
1687     auto LineStart = Current;
1688     advanceWhile(&Scanner::skip_nb_char);
1689     if (LineStart != Current) {
1690       Str.append(LineBreaks, '\n');
1691       Str.append(StringRef(LineStart, Current - LineStart));
1692       LineBreaks = 0;
1693     }
1694 
1695     // Check for EOF.
1696     if (Current == End)
1697       break;
1698 
1699     if (!consumeLineBreakIfPresent())
1700       break;
1701     ++LineBreaks;
1702   }
1703 
1704   if (Current == End && !LineBreaks)
1705     // Ensure that there is at least one line break before the end of file.
1706     LineBreaks = 1;
1707   Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n');
1708 
1709   // New lines may start a simple key.
1710   if (!FlowLevel)
1711     IsSimpleKeyAllowed = true;
1712 
1713   Token T;
1714   T.Kind = Token::TK_BlockScalar;
1715   T.Range = StringRef(Start, Current - Start);
1716   T.Value = std::string(Str);
1717   TokenQueue.push_back(T);
1718   return true;
1719 }
1720 
1721 bool Scanner::scanTag() {
1722   StringRef::iterator Start = Current;
1723   unsigned ColStart = Column;
1724   skip(1); // Eat !.
1725   if (Current == End || isBlankOrBreak(Current)); // An empty tag.
1726   else if (*Current == '<') {
1727     skip(1);
1728     scan_ns_uri_char();
1729     if (!consume('>'))
1730       return false;
1731   } else {
1732     // FIXME: Actually parse the c-ns-shorthand-tag rule.
1733     Current = skip_while(&Scanner::skip_ns_char, Current);
1734   }
1735 
1736   Token T;
1737   T.Kind = Token::TK_Tag;
1738   T.Range = StringRef(Start, Current - Start);
1739   TokenQueue.push_back(T);
1740 
1741   // Tags can be simple keys.
1742   saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);
1743 
1744   IsSimpleKeyAllowed = false;
1745 
1746   return true;
1747 }
1748 
1749 bool Scanner::fetchMoreTokens() {
1750   if (IsStartOfStream)
1751     return scanStreamStart();
1752 
1753   scanToNextToken();
1754 
1755   if (Current == End)
1756     return scanStreamEnd();
1757 
1758   removeStaleSimpleKeyCandidates();
1759 
1760   unrollIndent(Column);
1761 
1762   if (Column == 0 && *Current == '%')
1763     return scanDirective();
1764 
1765   if (Column == 0 && Current + 4 <= End
1766       && *Current == '-'
1767       && *(Current + 1) == '-'
1768       && *(Current + 2) == '-'
1769       && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1770     return scanDocumentIndicator(true);
1771 
1772   if (Column == 0 && Current + 4 <= End
1773       && *Current == '.'
1774       && *(Current + 1) == '.'
1775       && *(Current + 2) == '.'
1776       && (Current + 3 == End || isBlankOrBreak(Current + 3)))
1777     return scanDocumentIndicator(false);
1778 
1779   if (*Current == '[')
1780     return scanFlowCollectionStart(true);
1781 
1782   if (*Current == '{')
1783     return scanFlowCollectionStart(false);
1784 
1785   if (*Current == ']')
1786     return scanFlowCollectionEnd(true);
1787 
1788   if (*Current == '}')
1789     return scanFlowCollectionEnd(false);
1790 
1791   if (*Current == ',')
1792     return scanFlowEntry();
1793 
1794   if (*Current == '-' && isBlankOrBreak(Current + 1))
1795     return scanBlockEntry();
1796 
1797   if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
1798     return scanKey();
1799 
1800   if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
1801     return scanValue();
1802 
1803   if (*Current == '*')
1804     return scanAliasOrAnchor(true);
1805 
1806   if (*Current == '&')
1807     return scanAliasOrAnchor(false);
1808 
1809   if (*Current == '!')
1810     return scanTag();
1811 
1812   if (*Current == '|' && !FlowLevel)
1813     return scanBlockScalar(true);
1814 
1815   if (*Current == '>' && !FlowLevel)
1816     return scanBlockScalar(false);
1817 
1818   if (*Current == '\'')
1819     return scanFlowScalar(false);
1820 
1821   if (*Current == '"')
1822     return scanFlowScalar(true);
1823 
1824   // Get a plain scalar.
1825   StringRef FirstChar(Current, 1);
1826   if (!(isBlankOrBreak(Current)
1827         || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
1828       || (*Current == '-' && !isBlankOrBreak(Current + 1))
1829       || (!FlowLevel && (*Current == '?' || *Current == ':')
1830           && isBlankOrBreak(Current + 1))
1831       || (!FlowLevel && *Current == ':'
1832                       && Current + 2 < End
1833                       && *(Current + 1) == ':'
1834                       && !isBlankOrBreak(Current + 2)))
1835     return scanPlainScalar();
1836 
1837   setError("Unrecognized character while tokenizing.", Current);
1838   return false;
1839 }
1840 
1841 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors,
1842                std::error_code *EC)
1843     : scanner(new Scanner(Input, SM, ShowColors, EC)), CurrentDoc() {}
1844 
1845 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors,
1846                std::error_code *EC)
1847     : scanner(new Scanner(InputBuffer, SM, ShowColors, EC)), CurrentDoc() {}
1848 
1849 Stream::~Stream() = default;
1850 
1851 bool Stream::failed() { return scanner->failed(); }
1852 
1853 void Stream::printError(Node *N, const Twine &Msg, SourceMgr::DiagKind Kind) {
1854   printError(N ? N->getSourceRange() : SMRange(), Msg, Kind);
1855 }
1856 
1857 void Stream::printError(const SMRange &Range, const Twine &Msg,
1858                         SourceMgr::DiagKind Kind) {
1859   scanner->printError(Range.Start, Kind, Msg, Range);
1860 }
1861 
1862 document_iterator Stream::begin() {
1863   if (CurrentDoc)
1864     report_fatal_error("Can only iterate over the stream once");
1865 
1866   // Skip Stream-Start.
1867   scanner->getNext();
1868 
1869   CurrentDoc.reset(new Document(*this));
1870   return document_iterator(CurrentDoc);
1871 }
1872 
1873 document_iterator Stream::end() {
1874   return document_iterator();
1875 }
1876 
1877 void Stream::skip() {
1878   for (Document &Doc : *this)
1879     Doc.skip();
1880 }
1881 
1882 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A,
1883            StringRef T)
1884     : Doc(D), TypeID(Type), Anchor(A), Tag(T) {
1885   SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin());
1886   SourceRange = SMRange(Start, Start);
1887 }
1888 
1889 std::string Node::getVerbatimTag() const {
1890   StringRef Raw = getRawTag();
1891   if (!Raw.empty() && Raw != "!") {
1892     std::string Ret;
1893     if (Raw.find_last_of('!') == 0) {
1894       Ret = std::string(Doc->getTagMap().find("!")->second);
1895       Ret += Raw.substr(1);
1896       return Ret;
1897     } else if (Raw.startswith("!!")) {
1898       Ret = std::string(Doc->getTagMap().find("!!")->second);
1899       Ret += Raw.substr(2);
1900       return Ret;
1901     } else {
1902       StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1);
1903       std::map<StringRef, StringRef>::const_iterator It =
1904           Doc->getTagMap().find(TagHandle);
1905       if (It != Doc->getTagMap().end())
1906         Ret = std::string(It->second);
1907       else {
1908         Token T;
1909         T.Kind = Token::TK_Tag;
1910         T.Range = TagHandle;
1911         setError(Twine("Unknown tag handle ") + TagHandle, T);
1912       }
1913       Ret += Raw.substr(Raw.find_last_of('!') + 1);
1914       return Ret;
1915     }
1916   }
1917 
1918   switch (getType()) {
1919   case NK_Null:
1920     return "tag:yaml.org,2002:null";
1921   case NK_Scalar:
1922   case NK_BlockScalar:
1923     // TODO: Tag resolution.
1924     return "tag:yaml.org,2002:str";
1925   case NK_Mapping:
1926     return "tag:yaml.org,2002:map";
1927   case NK_Sequence:
1928     return "tag:yaml.org,2002:seq";
1929   }
1930 
1931   return "";
1932 }
1933 
1934 Token &Node::peekNext() {
1935   return Doc->peekNext();
1936 }
1937 
1938 Token Node::getNext() {
1939   return Doc->getNext();
1940 }
1941 
1942 Node *Node::parseBlockNode() {
1943   return Doc->parseBlockNode();
1944 }
1945 
1946 BumpPtrAllocator &Node::getAllocator() {
1947   return Doc->NodeAllocator;
1948 }
1949 
1950 void Node::setError(const Twine &Msg, Token &Tok) const {
1951   Doc->setError(Msg, Tok);
1952 }
1953 
1954 bool Node::failed() const {
1955   return Doc->failed();
1956 }
1957 
1958 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const {
1959   // TODO: Handle newlines properly. We need to remove leading whitespace.
1960   if (Value[0] == '"') { // Double quoted.
1961     // Pull off the leading and trailing "s.
1962     StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1963     // Search for characters that would require unescaping the value.
1964     StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n");
1965     if (i != StringRef::npos)
1966       return unescapeDoubleQuoted(UnquotedValue, i, Storage);
1967     return UnquotedValue;
1968   } else if (Value[0] == '\'') { // Single quoted.
1969     // Pull off the leading and trailing 's.
1970     StringRef UnquotedValue = Value.substr(1, Value.size() - 2);
1971     StringRef::size_type i = UnquotedValue.find('\'');
1972     if (i != StringRef::npos) {
1973       // We're going to need Storage.
1974       Storage.clear();
1975       Storage.reserve(UnquotedValue.size());
1976       for (; i != StringRef::npos; i = UnquotedValue.find('\'')) {
1977         StringRef Valid(UnquotedValue.begin(), i);
1978         llvm::append_range(Storage, Valid);
1979         Storage.push_back('\'');
1980         UnquotedValue = UnquotedValue.substr(i + 2);
1981       }
1982       llvm::append_range(Storage, UnquotedValue);
1983       return StringRef(Storage.begin(), Storage.size());
1984     }
1985     return UnquotedValue;
1986   }
1987   // Plain or block.
1988   return Value.rtrim(' ');
1989 }
1990 
1991 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue
1992                                           , StringRef::size_type i
1993                                           , SmallVectorImpl<char> &Storage)
1994                                           const {
1995   // Use Storage to build proper value.
1996   Storage.clear();
1997   Storage.reserve(UnquotedValue.size());
1998   for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) {
1999     // Insert all previous chars into Storage.
2000     StringRef Valid(UnquotedValue.begin(), i);
2001     llvm::append_range(Storage, Valid);
2002     // Chop off inserted chars.
2003     UnquotedValue = UnquotedValue.substr(i);
2004 
2005     assert(!UnquotedValue.empty() && "Can't be empty!");
2006 
2007     // Parse escape or line break.
2008     switch (UnquotedValue[0]) {
2009     case '\r':
2010     case '\n':
2011       Storage.push_back('\n');
2012       if (   UnquotedValue.size() > 1
2013           && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
2014         UnquotedValue = UnquotedValue.substr(1);
2015       UnquotedValue = UnquotedValue.substr(1);
2016       break;
2017     default:
2018       if (UnquotedValue.size() == 1) {
2019         Token T;
2020         T.Range = StringRef(UnquotedValue.begin(), 1);
2021         setError("Unrecognized escape code", T);
2022         return "";
2023       }
2024       UnquotedValue = UnquotedValue.substr(1);
2025       switch (UnquotedValue[0]) {
2026       default: {
2027           Token T;
2028           T.Range = StringRef(UnquotedValue.begin(), 1);
2029           setError("Unrecognized escape code", T);
2030           return "";
2031         }
2032       case '\r':
2033       case '\n':
2034         // Remove the new line.
2035         if (   UnquotedValue.size() > 1
2036             && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n'))
2037           UnquotedValue = UnquotedValue.substr(1);
2038         // If this was just a single byte newline, it will get skipped
2039         // below.
2040         break;
2041       case '0':
2042         Storage.push_back(0x00);
2043         break;
2044       case 'a':
2045         Storage.push_back(0x07);
2046         break;
2047       case 'b':
2048         Storage.push_back(0x08);
2049         break;
2050       case 't':
2051       case 0x09:
2052         Storage.push_back(0x09);
2053         break;
2054       case 'n':
2055         Storage.push_back(0x0A);
2056         break;
2057       case 'v':
2058         Storage.push_back(0x0B);
2059         break;
2060       case 'f':
2061         Storage.push_back(0x0C);
2062         break;
2063       case 'r':
2064         Storage.push_back(0x0D);
2065         break;
2066       case 'e':
2067         Storage.push_back(0x1B);
2068         break;
2069       case ' ':
2070         Storage.push_back(0x20);
2071         break;
2072       case '"':
2073         Storage.push_back(0x22);
2074         break;
2075       case '/':
2076         Storage.push_back(0x2F);
2077         break;
2078       case '\\':
2079         Storage.push_back(0x5C);
2080         break;
2081       case 'N':
2082         encodeUTF8(0x85, Storage);
2083         break;
2084       case '_':
2085         encodeUTF8(0xA0, Storage);
2086         break;
2087       case 'L':
2088         encodeUTF8(0x2028, Storage);
2089         break;
2090       case 'P':
2091         encodeUTF8(0x2029, Storage);
2092         break;
2093       case 'x': {
2094           if (UnquotedValue.size() < 3)
2095             // TODO: Report error.
2096             break;
2097           unsigned int UnicodeScalarValue;
2098           if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue))
2099             // TODO: Report error.
2100             UnicodeScalarValue = 0xFFFD;
2101           encodeUTF8(UnicodeScalarValue, Storage);
2102           UnquotedValue = UnquotedValue.substr(2);
2103           break;
2104         }
2105       case 'u': {
2106           if (UnquotedValue.size() < 5)
2107             // TODO: Report error.
2108             break;
2109           unsigned int UnicodeScalarValue;
2110           if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue))
2111             // TODO: Report error.
2112             UnicodeScalarValue = 0xFFFD;
2113           encodeUTF8(UnicodeScalarValue, Storage);
2114           UnquotedValue = UnquotedValue.substr(4);
2115           break;
2116         }
2117       case 'U': {
2118           if (UnquotedValue.size() < 9)
2119             // TODO: Report error.
2120             break;
2121           unsigned int UnicodeScalarValue;
2122           if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue))
2123             // TODO: Report error.
2124             UnicodeScalarValue = 0xFFFD;
2125           encodeUTF8(UnicodeScalarValue, Storage);
2126           UnquotedValue = UnquotedValue.substr(8);
2127           break;
2128         }
2129       }
2130       UnquotedValue = UnquotedValue.substr(1);
2131     }
2132   }
2133   llvm::append_range(Storage, UnquotedValue);
2134   return StringRef(Storage.begin(), Storage.size());
2135 }
2136 
2137 Node *KeyValueNode::getKey() {
2138   if (Key)
2139     return Key;
2140   // Handle implicit null keys.
2141   {
2142     Token &t = peekNext();
2143     if (   t.Kind == Token::TK_BlockEnd
2144         || t.Kind == Token::TK_Value
2145         || t.Kind == Token::TK_Error) {
2146       return Key = new (getAllocator()) NullNode(Doc);
2147     }
2148     if (t.Kind == Token::TK_Key)
2149       getNext(); // skip TK_Key.
2150   }
2151 
2152   // Handle explicit null keys.
2153   Token &t = peekNext();
2154   if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) {
2155     return Key = new (getAllocator()) NullNode(Doc);
2156   }
2157 
2158   // We've got a normal key.
2159   return Key = parseBlockNode();
2160 }
2161 
2162 Node *KeyValueNode::getValue() {
2163   if (Value)
2164     return Value;
2165 
2166   if (Node* Key = getKey())
2167     Key->skip();
2168   else {
2169     setError("Null key in Key Value.", peekNext());
2170     return Value = new (getAllocator()) NullNode(Doc);
2171   }
2172 
2173   if (failed())
2174     return Value = new (getAllocator()) NullNode(Doc);
2175 
2176   // Handle implicit null values.
2177   {
2178     Token &t = peekNext();
2179     if (   t.Kind == Token::TK_BlockEnd
2180         || t.Kind == Token::TK_FlowMappingEnd
2181         || t.Kind == Token::TK_Key
2182         || t.Kind == Token::TK_FlowEntry
2183         || t.Kind == Token::TK_Error) {
2184       return Value = new (getAllocator()) NullNode(Doc);
2185     }
2186 
2187     if (t.Kind != Token::TK_Value) {
2188       setError("Unexpected token in Key Value.", t);
2189       return Value = new (getAllocator()) NullNode(Doc);
2190     }
2191     getNext(); // skip TK_Value.
2192   }
2193 
2194   // Handle explicit null values.
2195   Token &t = peekNext();
2196   if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) {
2197     return Value = new (getAllocator()) NullNode(Doc);
2198   }
2199 
2200   // We got a normal value.
2201   return Value = parseBlockNode();
2202 }
2203 
2204 void MappingNode::increment() {
2205   if (failed()) {
2206     IsAtEnd = true;
2207     CurrentEntry = nullptr;
2208     return;
2209   }
2210   if (CurrentEntry) {
2211     CurrentEntry->skip();
2212     if (Type == MT_Inline) {
2213       IsAtEnd = true;
2214       CurrentEntry = nullptr;
2215       return;
2216     }
2217   }
2218   Token T = peekNext();
2219   if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) {
2220     // KeyValueNode eats the TK_Key. That way it can detect null keys.
2221     CurrentEntry = new (getAllocator()) KeyValueNode(Doc);
2222   } else if (Type == MT_Block) {
2223     switch (T.Kind) {
2224     case Token::TK_BlockEnd:
2225       getNext();
2226       IsAtEnd = true;
2227       CurrentEntry = nullptr;
2228       break;
2229     default:
2230       setError("Unexpected token. Expected Key or Block End", T);
2231       LLVM_FALLTHROUGH;
2232     case Token::TK_Error:
2233       IsAtEnd = true;
2234       CurrentEntry = nullptr;
2235     }
2236   } else {
2237     switch (T.Kind) {
2238     case Token::TK_FlowEntry:
2239       // Eat the flow entry and recurse.
2240       getNext();
2241       return increment();
2242     case Token::TK_FlowMappingEnd:
2243       getNext();
2244       LLVM_FALLTHROUGH;
2245     case Token::TK_Error:
2246       // Set this to end iterator.
2247       IsAtEnd = true;
2248       CurrentEntry = nullptr;
2249       break;
2250     default:
2251       setError( "Unexpected token. Expected Key, Flow Entry, or Flow "
2252                 "Mapping End."
2253               , T);
2254       IsAtEnd = true;
2255       CurrentEntry = nullptr;
2256     }
2257   }
2258 }
2259 
2260 void SequenceNode::increment() {
2261   if (failed()) {
2262     IsAtEnd = true;
2263     CurrentEntry = nullptr;
2264     return;
2265   }
2266   if (CurrentEntry)
2267     CurrentEntry->skip();
2268   Token T = peekNext();
2269   if (SeqType == ST_Block) {
2270     switch (T.Kind) {
2271     case Token::TK_BlockEntry:
2272       getNext();
2273       CurrentEntry = parseBlockNode();
2274       if (!CurrentEntry) { // An error occurred.
2275         IsAtEnd = true;
2276         CurrentEntry = nullptr;
2277       }
2278       break;
2279     case Token::TK_BlockEnd:
2280       getNext();
2281       IsAtEnd = true;
2282       CurrentEntry = nullptr;
2283       break;
2284     default:
2285       setError( "Unexpected token. Expected Block Entry or Block End."
2286               , T);
2287       LLVM_FALLTHROUGH;
2288     case Token::TK_Error:
2289       IsAtEnd = true;
2290       CurrentEntry = nullptr;
2291     }
2292   } else if (SeqType == ST_Indentless) {
2293     switch (T.Kind) {
2294     case Token::TK_BlockEntry:
2295       getNext();
2296       CurrentEntry = parseBlockNode();
2297       if (!CurrentEntry) { // An error occurred.
2298         IsAtEnd = true;
2299         CurrentEntry = nullptr;
2300       }
2301       break;
2302     default:
2303     case Token::TK_Error:
2304       IsAtEnd = true;
2305       CurrentEntry = nullptr;
2306     }
2307   } else if (SeqType == ST_Flow) {
2308     switch (T.Kind) {
2309     case Token::TK_FlowEntry:
2310       // Eat the flow entry and recurse.
2311       getNext();
2312       WasPreviousTokenFlowEntry = true;
2313       return increment();
2314     case Token::TK_FlowSequenceEnd:
2315       getNext();
2316       LLVM_FALLTHROUGH;
2317     case Token::TK_Error:
2318       // Set this to end iterator.
2319       IsAtEnd = true;
2320       CurrentEntry = nullptr;
2321       break;
2322     case Token::TK_StreamEnd:
2323     case Token::TK_DocumentEnd:
2324     case Token::TK_DocumentStart:
2325       setError("Could not find closing ]!", T);
2326       // Set this to end iterator.
2327       IsAtEnd = true;
2328       CurrentEntry = nullptr;
2329       break;
2330     default:
2331       if (!WasPreviousTokenFlowEntry) {
2332         setError("Expected , between entries!", T);
2333         IsAtEnd = true;
2334         CurrentEntry = nullptr;
2335         break;
2336       }
2337       // Otherwise it must be a flow entry.
2338       CurrentEntry = parseBlockNode();
2339       if (!CurrentEntry) {
2340         IsAtEnd = true;
2341       }
2342       WasPreviousTokenFlowEntry = false;
2343       break;
2344     }
2345   }
2346 }
2347 
2348 Document::Document(Stream &S) : stream(S), Root(nullptr) {
2349   // Tag maps starts with two default mappings.
2350   TagMap["!"] = "!";
2351   TagMap["!!"] = "tag:yaml.org,2002:";
2352 
2353   if (parseDirectives())
2354     expectToken(Token::TK_DocumentStart);
2355   Token &T = peekNext();
2356   if (T.Kind == Token::TK_DocumentStart)
2357     getNext();
2358 }
2359 
2360 bool Document::skip()  {
2361   if (stream.scanner->failed())
2362     return false;
2363   if (!Root && !getRoot())
2364     return false;
2365   Root->skip();
2366   Token &T = peekNext();
2367   if (T.Kind == Token::TK_StreamEnd)
2368     return false;
2369   if (T.Kind == Token::TK_DocumentEnd) {
2370     getNext();
2371     return skip();
2372   }
2373   return true;
2374 }
2375 
2376 Token &Document::peekNext() {
2377   return stream.scanner->peekNext();
2378 }
2379 
2380 Token Document::getNext() {
2381   return stream.scanner->getNext();
2382 }
2383 
2384 void Document::setError(const Twine &Message, Token &Location) const {
2385   stream.scanner->setError(Message, Location.Range.begin());
2386 }
2387 
2388 bool Document::failed() const {
2389   return stream.scanner->failed();
2390 }
2391 
2392 Node *Document::parseBlockNode() {
2393   Token T = peekNext();
2394   // Handle properties.
2395   Token AnchorInfo;
2396   Token TagInfo;
2397 parse_property:
2398   switch (T.Kind) {
2399   case Token::TK_Alias:
2400     getNext();
2401     return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1));
2402   case Token::TK_Anchor:
2403     if (AnchorInfo.Kind == Token::TK_Anchor) {
2404       setError("Already encountered an anchor for this node!", T);
2405       return nullptr;
2406     }
2407     AnchorInfo = getNext(); // Consume TK_Anchor.
2408     T = peekNext();
2409     goto parse_property;
2410   case Token::TK_Tag:
2411     if (TagInfo.Kind == Token::TK_Tag) {
2412       setError("Already encountered a tag for this node!", T);
2413       return nullptr;
2414     }
2415     TagInfo = getNext(); // Consume TK_Tag.
2416     T = peekNext();
2417     goto parse_property;
2418   default:
2419     break;
2420   }
2421 
2422   switch (T.Kind) {
2423   case Token::TK_BlockEntry:
2424     // We got an unindented BlockEntry sequence. This is not terminated with
2425     // a BlockEnd.
2426     // Don't eat the TK_BlockEntry, SequenceNode needs it.
2427     return new (NodeAllocator) SequenceNode( stream.CurrentDoc
2428                                            , AnchorInfo.Range.substr(1)
2429                                            , TagInfo.Range
2430                                            , SequenceNode::ST_Indentless);
2431   case Token::TK_BlockSequenceStart:
2432     getNext();
2433     return new (NodeAllocator)
2434       SequenceNode( stream.CurrentDoc
2435                   , AnchorInfo.Range.substr(1)
2436                   , TagInfo.Range
2437                   , SequenceNode::ST_Block);
2438   case Token::TK_BlockMappingStart:
2439     getNext();
2440     return new (NodeAllocator)
2441       MappingNode( stream.CurrentDoc
2442                  , AnchorInfo.Range.substr(1)
2443                  , TagInfo.Range
2444                  , MappingNode::MT_Block);
2445   case Token::TK_FlowSequenceStart:
2446     getNext();
2447     return new (NodeAllocator)
2448       SequenceNode( stream.CurrentDoc
2449                   , AnchorInfo.Range.substr(1)
2450                   , TagInfo.Range
2451                   , SequenceNode::ST_Flow);
2452   case Token::TK_FlowMappingStart:
2453     getNext();
2454     return new (NodeAllocator)
2455       MappingNode( stream.CurrentDoc
2456                  , AnchorInfo.Range.substr(1)
2457                  , TagInfo.Range
2458                  , MappingNode::MT_Flow);
2459   case Token::TK_Scalar:
2460     getNext();
2461     return new (NodeAllocator)
2462       ScalarNode( stream.CurrentDoc
2463                 , AnchorInfo.Range.substr(1)
2464                 , TagInfo.Range
2465                 , T.Range);
2466   case Token::TK_BlockScalar: {
2467     getNext();
2468     StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1);
2469     StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back();
2470     return new (NodeAllocator)
2471         BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1),
2472                         TagInfo.Range, StrCopy, T.Range);
2473   }
2474   case Token::TK_Key:
2475     // Don't eat the TK_Key, KeyValueNode expects it.
2476     return new (NodeAllocator)
2477       MappingNode( stream.CurrentDoc
2478                  , AnchorInfo.Range.substr(1)
2479                  , TagInfo.Range
2480                  , MappingNode::MT_Inline);
2481   case Token::TK_DocumentStart:
2482   case Token::TK_DocumentEnd:
2483   case Token::TK_StreamEnd:
2484   default:
2485     // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not
2486     //       !!null null.
2487     return new (NodeAllocator) NullNode(stream.CurrentDoc);
2488   case Token::TK_FlowMappingEnd:
2489   case Token::TK_FlowSequenceEnd:
2490   case Token::TK_FlowEntry: {
2491     if (Root && (isa<MappingNode>(Root) || isa<SequenceNode>(Root)))
2492       return new (NodeAllocator) NullNode(stream.CurrentDoc);
2493 
2494     setError("Unexpected token", T);
2495     return nullptr;
2496   }
2497   case Token::TK_Error:
2498     return nullptr;
2499   }
2500   llvm_unreachable("Control flow shouldn't reach here.");
2501   return nullptr;
2502 }
2503 
2504 bool Document::parseDirectives() {
2505   bool isDirective = false;
2506   while (true) {
2507     Token T = peekNext();
2508     if (T.Kind == Token::TK_TagDirective) {
2509       parseTAGDirective();
2510       isDirective = true;
2511     } else if (T.Kind == Token::TK_VersionDirective) {
2512       parseYAMLDirective();
2513       isDirective = true;
2514     } else
2515       break;
2516   }
2517   return isDirective;
2518 }
2519 
2520 void Document::parseYAMLDirective() {
2521   getNext(); // Eat %YAML <version>
2522 }
2523 
2524 void Document::parseTAGDirective() {
2525   Token Tag = getNext(); // %TAG <handle> <prefix>
2526   StringRef T = Tag.Range;
2527   // Strip %TAG
2528   T = T.substr(T.find_first_of(" \t")).ltrim(" \t");
2529   std::size_t HandleEnd = T.find_first_of(" \t");
2530   StringRef TagHandle = T.substr(0, HandleEnd);
2531   StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t");
2532   TagMap[TagHandle] = TagPrefix;
2533 }
2534 
2535 bool Document::expectToken(int TK) {
2536   Token T = getNext();
2537   if (T.Kind != TK) {
2538     setError("Unexpected token", T);
2539     return false;
2540   }
2541   return true;
2542 }
2543