xref: /freebsd/contrib/llvm-project/clang/lib/Lex/LiteralSupport.cpp (revision 78cd75393ec79565c63927bf200f06f839a1dc05)
1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the NumericLiteralParser, CharLiteralParser, and
10 // StringLiteralParser interfaces.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "clang/Lex/LiteralSupport.h"
15 #include "clang/Basic/CharInfo.h"
16 #include "clang/Basic/LangOptions.h"
17 #include "clang/Basic/SourceLocation.h"
18 #include "clang/Basic/TargetInfo.h"
19 #include "clang/Lex/LexDiagnostic.h"
20 #include "clang/Lex/Lexer.h"
21 #include "clang/Lex/Preprocessor.h"
22 #include "clang/Lex/Token.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/ADT/StringExtras.h"
26 #include "llvm/ADT/StringSwitch.h"
27 #include "llvm/Support/ConvertUTF.h"
28 #include "llvm/Support/Error.h"
29 #include "llvm/Support/ErrorHandling.h"
30 #include "llvm/Support/Unicode.h"
31 #include <algorithm>
32 #include <cassert>
33 #include <cstddef>
34 #include <cstdint>
35 #include <cstring>
36 #include <string>
37 
38 using namespace clang;
39 
40 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
41   switch (kind) {
42   default: llvm_unreachable("Unknown token type!");
43   case tok::char_constant:
44   case tok::string_literal:
45   case tok::utf8_char_constant:
46   case tok::utf8_string_literal:
47     return Target.getCharWidth();
48   case tok::wide_char_constant:
49   case tok::wide_string_literal:
50     return Target.getWCharWidth();
51   case tok::utf16_char_constant:
52   case tok::utf16_string_literal:
53     return Target.getChar16Width();
54   case tok::utf32_char_constant:
55   case tok::utf32_string_literal:
56     return Target.getChar32Width();
57   }
58 }
59 
60 static unsigned getEncodingPrefixLen(tok::TokenKind kind) {
61   switch (kind) {
62   default:
63     llvm_unreachable("Unknown token type!");
64   case tok::char_constant:
65   case tok::string_literal:
66     return 0;
67   case tok::utf8_char_constant:
68   case tok::utf8_string_literal:
69     return 2;
70   case tok::wide_char_constant:
71   case tok::wide_string_literal:
72   case tok::utf16_char_constant:
73   case tok::utf16_string_literal:
74   case tok::utf32_char_constant:
75   case tok::utf32_string_literal:
76     return 1;
77   }
78 }
79 
80 static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
81                                            FullSourceLoc TokLoc,
82                                            const char *TokBegin,
83                                            const char *TokRangeBegin,
84                                            const char *TokRangeEnd) {
85   SourceLocation Begin =
86     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
87                                    TokLoc.getManager(), Features);
88   SourceLocation End =
89     Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
90                                    TokLoc.getManager(), Features);
91   return CharSourceRange::getCharRange(Begin, End);
92 }
93 
94 /// Produce a diagnostic highlighting some portion of a literal.
95 ///
96 /// Emits the diagnostic \p DiagID, highlighting the range of characters from
97 /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
98 /// a substring of a spelling buffer for the token beginning at \p TokBegin.
99 static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
100                               const LangOptions &Features, FullSourceLoc TokLoc,
101                               const char *TokBegin, const char *TokRangeBegin,
102                               const char *TokRangeEnd, unsigned DiagID) {
103   SourceLocation Begin =
104     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
105                                    TokLoc.getManager(), Features);
106   return Diags->Report(Begin, DiagID) <<
107     MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
108 }
109 
/// Return true if \p Escape, the character that follows the backslash, names
/// one of the simple escape sequences accepted in an unevaluated string
/// literal: \' \" \? \\ \a \b \f \n \r \t \v.
static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
  static const char SimpleEscapes[] = {'\'', '"', '?', '\\', 'a', 'b',
                                       'f',  'n', 'r', 't',  'v'};
  for (char Candidate : SimpleEscapes)
    if (Candidate == Escape)
      return true;
  return false;
}
127 
128 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
129 /// either a character or a string literal.
130 static unsigned ProcessCharEscape(const char *ThisTokBegin,
131                                   const char *&ThisTokBuf,
132                                   const char *ThisTokEnd, bool &HadError,
133                                   FullSourceLoc Loc, unsigned CharWidth,
134                                   DiagnosticsEngine *Diags,
135                                   const LangOptions &Features,
136                                   StringLiteralEvalMethod EvalMethod) {
137   const char *EscapeBegin = ThisTokBuf;
138   bool Delimited = false;
139   bool EndDelimiterFound = false;
140 
141   // Skip the '\' char.
142   ++ThisTokBuf;
143 
144   // We know that this character can't be off the end of the buffer, because
145   // that would have been \", which would not have been the end of string.
146   unsigned ResultChar = *ThisTokBuf++;
147   char Escape = ResultChar;
148   switch (ResultChar) {
149   // These map to themselves.
150   case '\\': case '\'': case '"': case '?': break;
151 
152     // These have fixed mappings.
153   case 'a':
154     // TODO: K&R: the meaning of '\\a' is different in traditional C
155     ResultChar = 7;
156     break;
157   case 'b':
158     ResultChar = 8;
159     break;
160   case 'e':
161     if (Diags)
162       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
163            diag::ext_nonstandard_escape) << "e";
164     ResultChar = 27;
165     break;
166   case 'E':
167     if (Diags)
168       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
169            diag::ext_nonstandard_escape) << "E";
170     ResultChar = 27;
171     break;
172   case 'f':
173     ResultChar = 12;
174     break;
175   case 'n':
176     ResultChar = 10;
177     break;
178   case 'r':
179     ResultChar = 13;
180     break;
181   case 't':
182     ResultChar = 9;
183     break;
184   case 'v':
185     ResultChar = 11;
186     break;
187   case 'x': { // Hex escape.
188     ResultChar = 0;
189     if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
190       Delimited = true;
191       ThisTokBuf++;
192       if (*ThisTokBuf == '}') {
193         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
194              diag::err_delimited_escape_empty);
195         return ResultChar;
196       }
197     } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
198       if (Diags)
199         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
200              diag::err_hex_escape_no_digits) << "x";
201       return ResultChar;
202     }
203 
204     // Hex escapes are a maximal series of hex digits.
205     bool Overflow = false;
206     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
207       if (Delimited && *ThisTokBuf == '}') {
208         ThisTokBuf++;
209         EndDelimiterFound = true;
210         break;
211       }
212       int CharVal = llvm::hexDigitValue(*ThisTokBuf);
213       if (CharVal == -1) {
214         // Non delimited hex escape sequences stop at the first non-hex digit.
215         if (!Delimited)
216           break;
217         HadError = true;
218         if (Diags)
219           Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
220                diag::err_delimited_escape_invalid)
221               << StringRef(ThisTokBuf, 1);
222         continue;
223       }
224       // About to shift out a digit?
225       if (ResultChar & 0xF0000000)
226         Overflow = true;
227       ResultChar <<= 4;
228       ResultChar |= CharVal;
229     }
230     // See if any bits will be truncated when evaluated as a character.
231     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
232       Overflow = true;
233       ResultChar &= ~0U >> (32-CharWidth);
234     }
235 
236     // Check for overflow.
237     if (!HadError && Overflow) { // Too many digits to fit in
238       HadError = true;
239       if (Diags)
240         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
241              diag::err_escape_too_large)
242             << 0;
243     }
244     break;
245   }
246   case '0': case '1': case '2': case '3':
247   case '4': case '5': case '6': case '7': {
248     // Octal escapes.
249     --ThisTokBuf;
250     ResultChar = 0;
251 
252     // Octal escapes are a series of octal digits with maximum length 3.
253     // "\0123" is a two digit sequence equal to "\012" "3".
254     unsigned NumDigits = 0;
255     do {
256       ResultChar <<= 3;
257       ResultChar |= *ThisTokBuf++ - '0';
258       ++NumDigits;
259     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
260              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
261 
262     // Check for overflow.  Reject '\777', but not L'\777'.
263     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
264       if (Diags)
265         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
266              diag::err_escape_too_large) << 1;
267       ResultChar &= ~0U >> (32-CharWidth);
268     }
269     break;
270   }
271   case 'o': {
272     bool Overflow = false;
273     if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
274       HadError = true;
275       if (Diags)
276         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
277              diag::err_delimited_escape_missing_brace)
278             << "o";
279 
280       break;
281     }
282     ResultChar = 0;
283     Delimited = true;
284     ++ThisTokBuf;
285     if (*ThisTokBuf == '}') {
286       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
287            diag::err_delimited_escape_empty);
288       return ResultChar;
289     }
290 
291     while (ThisTokBuf != ThisTokEnd) {
292       if (*ThisTokBuf == '}') {
293         EndDelimiterFound = true;
294         ThisTokBuf++;
295         break;
296       }
297       if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
298         HadError = true;
299         if (Diags)
300           Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
301                diag::err_delimited_escape_invalid)
302               << StringRef(ThisTokBuf, 1);
303         ThisTokBuf++;
304         continue;
305       }
306       // Check if one of the top three bits is set before shifting them out.
307       if (ResultChar & 0xE0000000)
308         Overflow = true;
309 
310       ResultChar <<= 3;
311       ResultChar |= *ThisTokBuf++ - '0';
312     }
313     // Check for overflow.  Reject '\777', but not L'\777'.
314     if (!HadError &&
315         (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
316       HadError = true;
317       if (Diags)
318         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
319              diag::err_escape_too_large)
320             << 1;
321       ResultChar &= ~0U >> (32 - CharWidth);
322     }
323     break;
324   }
325     // Otherwise, these are not valid escapes.
326   case '(': case '{': case '[': case '%':
327     // GCC accepts these as extensions.  We warn about them as such though.
328     if (Diags)
329       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
330            diag::ext_nonstandard_escape)
331         << std::string(1, ResultChar);
332     break;
333   default:
334     if (!Diags)
335       break;
336 
337     if (isPrintable(ResultChar))
338       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
339            diag::ext_unknown_escape)
340         << std::string(1, ResultChar);
341     else
342       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
343            diag::ext_unknown_escape)
344         << "x" + llvm::utohexstr(ResultChar);
345     break;
346   }
347 
348   if (Delimited && Diags) {
349     if (!EndDelimiterFound)
350       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
351            diag::err_expected)
352           << tok::r_brace;
353     else if (!HadError) {
354       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
355            Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
356                                 : diag::ext_delimited_escape_sequence)
357           << /*delimited*/ 0 << (Features.CPlusPlus ? 1 : 0);
358     }
359   }
360 
361   if (EvalMethod == StringLiteralEvalMethod::Unevaluated &&
362       !IsEscapeValidInUnevaluatedStringLiteral(Escape)) {
363     Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
364          diag::err_unevaluated_string_invalid_escape_sequence)
365         << StringRef(EscapeBegin, ThisTokBuf - EscapeBegin);
366     HadError = true;
367   }
368 
369   return ResultChar;
370 }
371 
372 static void appendCodePoint(unsigned Codepoint,
373                             llvm::SmallVectorImpl<char> &Str) {
374   char ResultBuf[4];
375   char *ResultPtr = ResultBuf;
376   if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))
377     Str.append(ResultBuf, ResultPtr);
378 }
379 
380 void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
381   for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
382     if (*I != '\\') {
383       Buf.push_back(*I);
384       continue;
385     }
386 
387     ++I;
388     char Kind = *I;
389     ++I;
390 
391     assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
392     uint32_t CodePoint = 0;
393 
394     if (Kind == 'u' && *I == '{') {
395       for (++I; *I != '}'; ++I) {
396         unsigned Value = llvm::hexDigitValue(*I);
397         assert(Value != -1U);
398         CodePoint <<= 4;
399         CodePoint += Value;
400       }
401       appendCodePoint(CodePoint, Buf);
402       continue;
403     }
404 
405     if (Kind == 'N') {
406       assert(*I == '{');
407       ++I;
408       auto Delim = std::find(I, Input.end(), '}');
409       assert(Delim != Input.end());
410       std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
411           llvm::sys::unicode::nameToCodepointLooseMatching(
412               StringRef(I, std::distance(I, Delim)));
413       assert(Res);
414       CodePoint = Res->CodePoint;
415       assert(CodePoint != 0xFFFFFFFF);
416       appendCodePoint(CodePoint, Buf);
417       I = Delim;
418       continue;
419     }
420 
421     unsigned NumHexDigits;
422     if (Kind == 'u')
423       NumHexDigits = 4;
424     else
425       NumHexDigits = 8;
426 
427     assert(I + NumHexDigits <= E);
428 
429     for (; NumHexDigits != 0; ++I, --NumHexDigits) {
430       unsigned Value = llvm::hexDigitValue(*I);
431       assert(Value != -1U);
432 
433       CodePoint <<= 4;
434       CodePoint += Value;
435     }
436 
437     appendCodePoint(CodePoint, Buf);
438     --I;
439   }
440 }
441 
442 static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
443                                     const char *&ThisTokBuf,
444                                     const char *ThisTokEnd, uint32_t &UcnVal,
445                                     unsigned short &UcnLen, bool &Delimited,
446                                     FullSourceLoc Loc, DiagnosticsEngine *Diags,
447                                     const LangOptions &Features,
448                                     bool in_char_string_literal = false) {
449   const char *UcnBegin = ThisTokBuf;
450   bool HasError = false;
451   bool EndDelimiterFound = false;
452 
453   // Skip the '\u' char's.
454   ThisTokBuf += 2;
455   Delimited = false;
456   if (UcnBegin[1] == 'u' && in_char_string_literal &&
457       ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
458     Delimited = true;
459     ThisTokBuf++;
460   } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
461     if (Diags)
462       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
463            diag::err_hex_escape_no_digits)
464           << StringRef(&ThisTokBuf[-1], 1);
465     return false;
466   }
467   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
468 
469   bool Overflow = false;
470   unsigned short Count = 0;
471   for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
472        ++ThisTokBuf) {
473     if (Delimited && *ThisTokBuf == '}') {
474       ++ThisTokBuf;
475       EndDelimiterFound = true;
476       break;
477     }
478     int CharVal = llvm::hexDigitValue(*ThisTokBuf);
479     if (CharVal == -1) {
480       HasError = true;
481       if (!Delimited)
482         break;
483       if (Diags) {
484         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
485              diag::err_delimited_escape_invalid)
486             << StringRef(ThisTokBuf, 1);
487       }
488       Count++;
489       continue;
490     }
491     if (UcnVal & 0xF0000000) {
492       Overflow = true;
493       continue;
494     }
495     UcnVal <<= 4;
496     UcnVal |= CharVal;
497     Count++;
498   }
499 
500   if (Overflow) {
501     if (Diags)
502       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
503            diag::err_escape_too_large)
504           << 0;
505     return false;
506   }
507 
508   if (Delimited && !EndDelimiterFound) {
509     if (Diags) {
510       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
511            diag::err_expected)
512           << tok::r_brace;
513     }
514     return false;
515   }
516 
517   // If we didn't consume the proper number of digits, there is a problem.
518   if (Count == 0 || (!Delimited && Count != UcnLen)) {
519     if (Diags)
520       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
521            Delimited ? diag::err_delimited_escape_empty
522                      : diag::err_ucn_escape_incomplete);
523     return false;
524   }
525   return !HasError;
526 }
527 
528 static void DiagnoseInvalidUnicodeCharacterName(
529     DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
530     const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
531     llvm::StringRef Name) {
532 
533   Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
534        diag::err_invalid_ucn_name)
535       << Name;
536 
537   namespace u = llvm::sys::unicode;
538 
539   std::optional<u::LooseMatchingResult> Res =
540       u::nameToCodepointLooseMatching(Name);
541   if (Res) {
542     Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
543          diag::note_invalid_ucn_name_loose_matching)
544         << FixItHint::CreateReplacement(
545                MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
546                                    TokRangeEnd),
547                Res->Name);
548     return;
549   }
550 
551   unsigned Distance = 0;
552   SmallVector<u::MatchForCodepointName> Matches =
553       u::nearestMatchesForCodepointName(Name, 5);
554   assert(!Matches.empty() && "No unicode characters found");
555 
556   for (const auto &Match : Matches) {
557     if (Distance == 0)
558       Distance = Match.Distance;
559     if (std::max(Distance, Match.Distance) -
560             std::min(Distance, Match.Distance) >
561         3)
562       break;
563     Distance = Match.Distance;
564 
565     std::string Str;
566     llvm::UTF32 V = Match.Value;
567     bool Converted =
568         llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
569     (void)Converted;
570     assert(Converted && "Found a match wich is not a unicode character");
571 
572     Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
573          diag::note_invalid_ucn_name_candidate)
574         << Match.Name << llvm::utohexstr(Match.Value)
575         << Str // FIXME: Fix the rendering of non printable characters
576         << FixItHint::CreateReplacement(
577                MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
578                                    TokRangeEnd),
579                Match.Name);
580   }
581 }
582 
583 static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
584                                   const char *&ThisTokBuf,
585                                   const char *ThisTokEnd, uint32_t &UcnVal,
586                                   unsigned short &UcnLen, FullSourceLoc Loc,
587                                   DiagnosticsEngine *Diags,
588                                   const LangOptions &Features) {
589   const char *UcnBegin = ThisTokBuf;
590   assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
591   ThisTokBuf += 2;
592   if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
593     if (Diags) {
594       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
595            diag::err_delimited_escape_missing_brace)
596           << StringRef(&ThisTokBuf[-1], 1);
597     }
598     return false;
599   }
600   ThisTokBuf++;
601   const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) {
602     return C == '}' || isVerticalWhitespace(C);
603   });
604   bool Incomplete = ClosingBrace == ThisTokEnd;
605   bool Empty = ClosingBrace == ThisTokBuf;
606   if (Incomplete || Empty) {
607     if (Diags) {
608       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
609            Incomplete ? diag::err_ucn_escape_incomplete
610                       : diag::err_delimited_escape_empty)
611           << StringRef(&UcnBegin[1], 1);
612     }
613     ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
614     return false;
615   }
616   StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
617   ThisTokBuf = ClosingBrace + 1;
618   std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name);
619   if (!Res) {
620     if (Diags)
621       DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
622                                           &UcnBegin[3], ClosingBrace, Name);
623     return false;
624   }
625   UcnVal = *Res;
626   UcnLen = UcnVal > 0xFFFF ? 8 : 4;
627   return true;
628 }
629 
630 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
631 /// return the UTF32.
632 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
633                              const char *ThisTokEnd, uint32_t &UcnVal,
634                              unsigned short &UcnLen, FullSourceLoc Loc,
635                              DiagnosticsEngine *Diags,
636                              const LangOptions &Features,
637                              bool in_char_string_literal = false) {
638 
639   bool HasError;
640   const char *UcnBegin = ThisTokBuf;
641   bool IsDelimitedEscapeSequence = false;
642   bool IsNamedEscapeSequence = false;
643   if (ThisTokBuf[1] == 'N') {
644     IsNamedEscapeSequence = true;
645     HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
646                                       UcnVal, UcnLen, Loc, Diags, Features);
647   } else {
648     HasError =
649         !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
650                                  UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
651                                  Features, in_char_string_literal);
652   }
653   if (HasError)
654     return false;
655 
656   // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
657   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
658       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
659     if (Diags)
660       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
661            diag::err_ucn_escape_invalid);
662     return false;
663   }
664 
665   // C2x and C++11 allow UCNs that refer to control characters
666   // and basic source characters inside character and string literals
667   if (UcnVal < 0xa0 &&
668       // $, @, ` are allowed in all language modes
669       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {
670     bool IsError =
671         (!(Features.CPlusPlus11 || Features.C2x) || !in_char_string_literal);
672     if (Diags) {
673       char BasicSCSChar = UcnVal;
674       if (UcnVal >= 0x20 && UcnVal < 0x7f)
675         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
676              IsError ? diag::err_ucn_escape_basic_scs
677              : Features.CPlusPlus
678                  ? diag::warn_cxx98_compat_literal_ucn_escape_basic_scs
679                  : diag::warn_c2x_compat_literal_ucn_escape_basic_scs)
680             << StringRef(&BasicSCSChar, 1);
681       else
682         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
683              IsError ? diag::err_ucn_control_character
684              : Features.CPlusPlus
685                  ? diag::warn_cxx98_compat_literal_ucn_control_character
686                  : diag::warn_c2x_compat_literal_ucn_control_character);
687     }
688     if (IsError)
689       return false;
690   }
691 
692   if (!Features.CPlusPlus && !Features.C99 && Diags)
693     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
694          diag::warn_ucn_not_valid_in_c89_literal);
695 
696   if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
697     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
698          Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
699                               : diag::ext_delimited_escape_sequence)
700         << (IsNamedEscapeSequence ? 1 : 0) << (Features.CPlusPlus ? 1 : 0);
701 
702   return true;
703 }
704 
705 /// MeasureUCNEscape - Determine the number of bytes within the resulting string
706 /// which this UCN will occupy.
707 static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
708                             const char *ThisTokEnd, unsigned CharByteWidth,
709                             const LangOptions &Features, bool &HadError) {
710   // UTF-32: 4 bytes per escape.
711   if (CharByteWidth == 4)
712     return 4;
713 
714   uint32_t UcnVal = 0;
715   unsigned short UcnLen = 0;
716   FullSourceLoc Loc;
717 
718   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
719                         UcnLen, Loc, nullptr, Features, true)) {
720     HadError = true;
721     return 0;
722   }
723 
724   // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
725   if (CharByteWidth == 2)
726     return UcnVal <= 0xFFFF ? 2 : 4;
727 
728   // UTF-8.
729   if (UcnVal < 0x80)
730     return 1;
731   if (UcnVal < 0x800)
732     return 2;
733   if (UcnVal < 0x10000)
734     return 3;
735   return 4;
736 }
737 
738 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
739 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
740 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
741 /// we will likely rework our support for UCN's.
742 static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
743                             const char *ThisTokEnd,
744                             char *&ResultBuf, bool &HadError,
745                             FullSourceLoc Loc, unsigned CharByteWidth,
746                             DiagnosticsEngine *Diags,
747                             const LangOptions &Features) {
748   typedef uint32_t UTF32;
749   UTF32 UcnVal = 0;
750   unsigned short UcnLen = 0;
751   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
752                         Loc, Diags, Features, true)) {
753     HadError = true;
754     return;
755   }
756 
757   assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
758          "only character widths of 1, 2, or 4 bytes supported");
759 
760   (void)UcnLen;
761   assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
762 
763   if (CharByteWidth == 4) {
764     // FIXME: Make the type of the result buffer correct instead of
765     // using reinterpret_cast.
766     llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
767     *ResultPtr = UcnVal;
768     ResultBuf += 4;
769     return;
770   }
771 
772   if (CharByteWidth == 2) {
773     // FIXME: Make the type of the result buffer correct instead of
774     // using reinterpret_cast.
775     llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
776 
777     if (UcnVal <= (UTF32)0xFFFF) {
778       *ResultPtr = UcnVal;
779       ResultBuf += 2;
780       return;
781     }
782 
783     // Convert to UTF16.
784     UcnVal -= 0x10000;
785     *ResultPtr     = 0xD800 + (UcnVal >> 10);
786     *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
787     ResultBuf += 4;
788     return;
789   }
790 
791   assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
792 
793   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
794   // The conversion below was inspired by:
795   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
796   // First, we determine how many bytes the result will require.
797   typedef uint8_t UTF8;
798 
799   unsigned short bytesToWrite = 0;
800   if (UcnVal < (UTF32)0x80)
801     bytesToWrite = 1;
802   else if (UcnVal < (UTF32)0x800)
803     bytesToWrite = 2;
804   else if (UcnVal < (UTF32)0x10000)
805     bytesToWrite = 3;
806   else
807     bytesToWrite = 4;
808 
809   const unsigned byteMask = 0xBF;
810   const unsigned byteMark = 0x80;
811 
812   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
813   // into the first byte, depending on how many bytes follow.
814   static const UTF8 firstByteMark[5] = {
815     0x00, 0x00, 0xC0, 0xE0, 0xF0
816   };
817   // Finally, we write the bytes into ResultBuf.
818   ResultBuf += bytesToWrite;
819   switch (bytesToWrite) { // note: everything falls through.
820   case 4:
821     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
822     [[fallthrough]];
823   case 3:
824     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
825     [[fallthrough]];
826   case 2:
827     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
828     [[fallthrough]];
829   case 1:
830     *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
831   }
832   // Update the buffer.
833   ResultBuf += bytesToWrite;
834 }
835 
/// Parses the spelling of a numeric literal token. The constructor determines
/// the radix, scans the digit sequence, and then classifies any suffix,
/// recording the outcome in the parser's flags (isLong, isUnsigned, isFloat,
/// saw_ud_suffix, ...). Whenever a problem is diagnosed, hadError is set.
///
836 ///       integer-constant: [C99 6.4.4.1]
837 ///         decimal-constant integer-suffix
838 ///         octal-constant integer-suffix
839 ///         hexadecimal-constant integer-suffix
840 ///         binary-literal integer-suffix [GNU, C++1y]
841 ///       user-defined-integer-literal: [C++11 lex.ext]
842 ///         decimal-literal ud-suffix
843 ///         octal-literal ud-suffix
844 ///         hexadecimal-literal ud-suffix
845 ///         binary-literal ud-suffix [GNU, C++1y]
846 ///       decimal-constant:
847 ///         nonzero-digit
848 ///         decimal-constant digit
849 ///       octal-constant:
850 ///         0
851 ///         octal-constant octal-digit
852 ///       hexadecimal-constant:
853 ///         hexadecimal-prefix hexadecimal-digit
854 ///         hexadecimal-constant hexadecimal-digit
855 ///       hexadecimal-prefix: one of
856 ///         0x 0X
857 ///       binary-literal:
858 ///         0b binary-digit
859 ///         0B binary-digit
860 ///         binary-literal binary-digit
861 ///       integer-suffix:
862 ///         unsigned-suffix [long-suffix]
863 ///         unsigned-suffix [long-long-suffix]
864 ///         long-suffix [unsigned-suffix]
865 ///         long-long-suffix [unsigned-suffix]
866 ///       nonzero-digit:
867 ///         1 2 3 4 5 6 7 8 9
868 ///       octal-digit:
869 ///         0 1 2 3 4 5 6 7
870 ///       hexadecimal-digit:
871 ///         0 1 2 3 4 5 6 7 8 9
872 ///         a b c d e f
873 ///         A B C D E F
874 ///       binary-digit:
875 ///         0
876 ///         1
877 ///       unsigned-suffix: one of
878 ///         u U
879 ///       long-suffix: one of
880 ///         l L
881 ///       long-long-suffix: one of
882 ///         ll LL
883 ///
884 ///       floating-constant: [C99 6.4.4.2]
885 ///         TODO: add rules...
886 ///
887 NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
888                                            SourceLocation TokLoc,
889                                            const SourceManager &SM,
890                                            const LangOptions &LangOpts,
891                                            const TargetInfo &Target,
892                                            DiagnosticsEngine &Diags)
893     : SM(SM), LangOpts(LangOpts), Diags(Diags),
894       ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
895
      // Scanning starts at the first character of the spelling, and every
      // classification flag starts out cleared.
896   s = DigitsBegin = ThisTokBegin;
897   saw_exponent = false;
898   saw_period = false;
899   saw_ud_suffix = false;
900   saw_fixed_point_suffix = false;
901   isLong = false;
902   isUnsigned = false;
903   isLongLong = false;
904   isSizeT = false;
905   isHalf = false;
906   isFloat = false;
907   isImaginary = false;
908   isFloat16 = false;
909   isFloat128 = false;
910   MicrosoftInteger = 0;
911   isFract = false;
912   isAccum = false;
913   hadError = false;
914   isBitInt = false;
915
916   // This routine assumes that the range begin/end matches the regex for integer
917   // and FP constants (specifically, the 'pp-number' regex), and assumes that
918   // the byte at "*end" is both valid and not part of the regex.  Because of
919   // this, it doesn't have to check for 'overscan' in various places.
920   if (isPreprocessingNumberBody(*ThisTokEnd)) {
921     Diags.Report(TokLoc, diag::err_lexing_numeric);
922     hadError = true;
923     return;
924   }
925
926   if (*s == '0') { // parse radix
927     ParseNumberStartingWithZero(TokLoc);
928     if (hadError)
929       return;
930   } else { // the first digit is non-zero
931     radix = 10;
932     s = SkipDigits(s);
933     if (s == ThisTokEnd) {
934       // Done.
935     } else {
936       ParseDecimalOrOctalCommon(TokLoc);
937       if (hadError)
938         return;
939     }
940   }
941
      // The digit sequence ends here; everything from s up to ThisTokEnd is
      // treated as suffix text from now on.
942   SuffixBegin = s;
943   checkSeparator(TokLoc, s, CSK_AfterDigits);
944
945   // Initial scan to lookahead for fixed point suffix.
946   if (LangOpts.FixedPoint) {
947     for (const char *c = s; c != ThisTokEnd; ++c) {
948       if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') {
949         saw_fixed_point_suffix = true;
950         break;
951       }
952     }
953   }
954
955   // Parse the suffix.  At this point we can classify whether we have an FP or
956   // integer constant.
957   bool isFixedPointConstant = isFixedPointLiteral();
958   bool isFPConstant = isFloatingLiteral();
      // HasSize records that a size-determining suffix piece has already been
      // accepted, so that a second one is rejected.
959   bool HasSize = false;
960
961   // Loop over all of the characters of the suffix.  If we see something bad,
962   // we break out of the loop.
      // Within the switch, 'continue' accepts the current suffix piece and moves
      // on to the next character, while 'break' leaves the switch and reaches
      // the unconditional 'break' after it, which ends the loop.
963   for (; s != ThisTokEnd; ++s) {
964     switch (*s) {
965     case 'R':
966     case 'r':
967       if (!LangOpts.FixedPoint)
968         break;
969       if (isFract || isAccum) break;
970       if (!(saw_period || saw_exponent)) break;
971       isFract = true;
972       continue;
973     case 'K':
974     case 'k':
975       if (!LangOpts.FixedPoint)
976         break;
977       if (isFract || isAccum) break;
978       if (!(saw_period || saw_exponent)) break;
979       isAccum = true;
980       continue;
981     case 'h':      // FP Suffix for "half".
982     case 'H':
983       // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
984       if (!(LangOpts.Half || LangOpts.FixedPoint))
985         break;
986       if (isIntegerLiteral()) break;  // Error for integer constant.
987       if (HasSize)
988         break;
989       HasSize = true;
990       isHalf = true;
991       continue;  // Success.
992     case 'f':      // FP Suffix for "float"
993     case 'F':
994       if (!isFPConstant) break;  // Error for integer constant.
995       if (HasSize)
996         break;
997       HasSize = true;
998
999       // CUDA host and device may have different _Float16 support, therefore
1000       // allows f16 literals to avoid false alarm.
1001       // When we compile for OpenMP target offloading on NVPTX, f16 suffix
1002       // should also be supported.
1003       // ToDo: more precise check for CUDA.
1004       // TODO: AMDGPU might also support it in the future.
1005       if ((Target.hasFloat16Type() || LangOpts.CUDA ||
1006            (LangOpts.OpenMPIsTargetDevice && Target.getTriple().isNVPTX())) &&
1007           s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') {
1008         s += 2; // success, eat up 2 characters.
1009         isFloat16 = true;
1010         continue;
1011       }
1012
1013       isFloat = true;
1014       continue;  // Success.
1015     case 'q':    // FP Suffix for "__float128"
1016     case 'Q':
1017       if (!isFPConstant) break;  // Error for integer constant.
1018       if (HasSize)
1019         break;
1020       HasSize = true;
1021       isFloat128 = true;
1022       continue;  // Success.
1023     case 'u':
1024     case 'U':
1025       if (isFPConstant) break;  // Error for floating constant.
1026       if (isUnsigned) break;    // Cannot be repeated.
1027       isUnsigned = true;
1028       continue;  // Success.
1029     case 'l':
1030     case 'L':
1031       if (HasSize)
1032         break;
1033       HasSize = true;
1034
1035       // Check for long long.  The L's need to be adjacent and the same case.
1036       if (s[1] == s[0]) {
1037         assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
1038         if (isFPConstant) break;        // long long invalid for floats.
1039         isLongLong = true;
1040         ++s;  // Eat both of them.
1041       } else {
1042         isLong = true;
1043       }
1044       continue; // Success.
1045     case 'z':
1046     case 'Z':
1047       if (isFPConstant)
1048         break; // Invalid for floats.
1049       if (HasSize)
1050         break;
1051       HasSize = true;
1052       isSizeT = true;
1053       continue;
1054     case 'i':
1055     case 'I':
1056       if (LangOpts.MicrosoftExt && !isFPConstant) {
1057         // Allow i8, i16, i32, and i64. First, look ahead and check if
1058         // suffixes are Microsoft integers and not the imaginary unit.
1059         uint8_t Bits = 0;
1060         size_t ToSkip = 0;
1061         switch (s[1]) {
1062         case '8': // i8 suffix
1063           Bits = 8;
1064           ToSkip = 2;
1065           break;
1066         case '1':
1067           if (s[2] == '6') { // i16 suffix
1068             Bits = 16;
1069             ToSkip = 3;
1070           }
1071           break;
1072         case '3':
1073           if (s[2] == '2') { // i32 suffix
1074             Bits = 32;
1075             ToSkip = 3;
1076           }
1077           break;
1078         case '6':
1079           if (s[2] == '4') { // i64 suffix
1080             Bits = 64;
1081             ToSkip = 3;
1082           }
1083           break;
1084         default:
1085           break;
1086         }
1087         if (Bits) {
1088           if (HasSize)
1089             break;
1090           HasSize = true;
1091           MicrosoftInteger = Bits;
1092           s += ToSkip;
1093           assert(s <= ThisTokEnd && "didn't maximally munch?");
               // s already points past the whole iN suffix, so stop scanning
               // here instead of continuing (which would advance s again).
1094           break;
1095         }
1096       }
           // Not a Microsoft integer suffix: treat 'i'/'I' as the imaginary unit.
1097       [[fallthrough]];
1098     case 'j':
1099     case 'J':
1100       if (isImaginary) break;   // Cannot be repeated.
1101       isImaginary = true;
1102       continue;  // Success.
1103     case 'w':
1104     case 'W':
1105       if (isFPConstant)
1106         break; // Invalid for floats.
1107       if (HasSize)
1108         break; // Invalid if we already have a size for the literal.
1109
1110       // wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
1111       // explicitly do not support the suffix in C++ as an extension because a
1112       // library-based UDL that resolves to a library type may be more
1113       // appropriate there.
1114       if (!LangOpts.CPlusPlus && ((s[0] == 'w' && s[1] == 'b') ||
1115           (s[0] == 'W' && s[1] == 'B'))) {
1116         isBitInt = true;
1117         HasSize = true;
1118         ++s; // Skip both characters (2nd char skipped on continue).
1119         continue; // Success.
1120       }
1121     }
1122     // If we reached here, there was an error or a ud-suffix.
1123     break;
1124   }
1125
1126   // "i", "if", and "il" are user-defined suffixes in C++1y.
       // So the ud-suffix check runs both when part of the suffix was left
       // unconsumed and when an imaginary suffix was accepted.
1127   if (s != ThisTokEnd || isImaginary) {
1128     // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
1129     expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
1130     if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
1131       if (!isImaginary) {
1132         // Any suffix pieces we might have parsed are actually part of the
1133         // ud-suffix.
1134         isLong = false;
1135         isUnsigned = false;
1136         isLongLong = false;
1137         isSizeT = false;
1138         isFloat = false;
1139         isFloat16 = false;
1140         isHalf = false;
1141         isImaginary = false;
1142         isBitInt = false;
1143         MicrosoftInteger = 0;
1144         saw_fixed_point_suffix = false;
1145         isFract = false;
1146         isAccum = false;
1147       }
1148
1149       saw_ud_suffix = true;
1150       return;
1151     }
1152
1153     if (s != ThisTokEnd) {
1154       // Report an error if there are any.
           // The diagnostic points at the start of the suffix and quotes the
           // entire suffix text.
1155       Diags.Report(Lexer::AdvanceToTokenCharacter(
1156                        TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
1157                    diag::err_invalid_suffix_constant)
1158           << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
1159           << (isFixedPointConstant ? 2 : isFPConstant);
1160       hadError = true;
1161     }
1162   }
1163
       // Sanity check: if the lookahead found a fixed-point suffix letter and no
       // error was diagnosed, the suffix loop must have classified the literal
       // as _Fract or _Accum.
1164   if (!hadError && saw_fixed_point_suffix) {
1165     assert(isFract || isAccum);
1166   }
1167 }
1168 
1169 /// ParseDecimalOrOctalCommon - This method is called for decimal or octal
1170 /// numbers. It issues an error for illegal digits, and handles floating point
1171 /// parsing. If it detects a floating point number, the radix is set to 10.
1172 void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
1173   assert((radix == 8 || radix == 10) && "Unexpected radix");
1174 
1175   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
1176   // the code is using an incorrect base.
1177   if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
1178       !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
1179     Diags.Report(
1180         Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
1181         diag::err_invalid_digit)
1182         << StringRef(s, 1) << (radix == 8 ? 1 : 0);
1183     hadError = true;
1184     return;
1185   }
1186 
1187   if (*s == '.') {
1188     checkSeparator(TokLoc, s, CSK_AfterDigits);
1189     s++;
1190     radix = 10;
1191     saw_period = true;
1192     checkSeparator(TokLoc, s, CSK_BeforeDigits);
1193     s = SkipDigits(s); // Skip suffix.
1194   }
1195   if (*s == 'e' || *s == 'E') { // exponent
1196     checkSeparator(TokLoc, s, CSK_AfterDigits);
1197     const char *Exponent = s;
1198     s++;
1199     radix = 10;
1200     saw_exponent = true;
1201     if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
1202     const char *first_non_digit = SkipDigits(s);
1203     if (containsDigits(s, first_non_digit)) {
1204       checkSeparator(TokLoc, s, CSK_BeforeDigits);
1205       s = first_non_digit;
1206     } else {
1207       if (!hadError) {
1208         Diags.Report(Lexer::AdvanceToTokenCharacter(
1209                          TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1210                      diag::err_exponent_has_no_digits);
1211         hadError = true;
1212       }
1213       return;
1214     }
1215   }
1216 }
1217 
1218 /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1219 /// suffixes as ud-suffixes, because the diagnostic experience is better if we
1220 /// treat it as an invalid suffix.
1221 bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1222                                            StringRef Suffix) {
1223   if (!LangOpts.CPlusPlus11 || Suffix.empty())
1224     return false;
1225 
1226   // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
1227   if (Suffix[0] == '_')
1228     return true;
1229 
1230   // In C++11, there are no library suffixes.
1231   if (!LangOpts.CPlusPlus14)
1232     return false;
1233 
1234   // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
1235   // Per tweaked N3660, "il", "i", and "if" are also used in the library.
1236   // In C++2a "d" and "y" are used in the library.
1237   return llvm::StringSwitch<bool>(Suffix)
1238       .Cases("h", "min", "s", true)
1239       .Cases("ms", "us", "ns", true)
1240       .Cases("il", "i", "if", true)
1241       .Cases("d", "y", LangOpts.CPlusPlus20)
1242       .Default(false);
1243 }
1244 
1245 void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
1246                                           const char *Pos,
1247                                           CheckSeparatorKind IsAfterDigits) {
1248   if (IsAfterDigits == CSK_AfterDigits) {
1249     if (Pos == ThisTokBegin)
1250       return;
1251     --Pos;
1252   } else if (Pos == ThisTokEnd)
1253     return;
1254 
1255   if (isDigitSeparator(*Pos)) {
1256     Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
1257                                                 LangOpts),
1258                  diag::err_digit_separator_not_between_digits)
1259         << IsAfterDigits;
1260     hadError = true;
1261   }
1262 }
1263 
1264 /// ParseNumberStartingWithZero - This method is called when the first character
1265 /// of the number is found to be a zero.  This means it is either an octal
1266 /// number (like '04'), a hex number ('0x123a'), a binary number ('0b1010') or
1267 /// a floating point number (01239.123e4).  Eat the prefix, determining the
1268 /// radix etc.  Problems are diagnosed at (or relative to) TokLoc and recorded
     /// by setting hadError.
1269 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
1270   assert(s[0] == '0' && "Invalid method call");
1271   s++;
1272
       // The character right after the leading '0' selects the radix.
1273   int c1 = s[0];
1274
1275   // Handle a hex number like 0x1234.
       // The prefix only counts as hex when a hex digit or a '.' follows it.
1276   if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
1277     s++;
1278     assert(s < ThisTokEnd && "didn't maximally munch?");
1279     radix = 16;
1280     DigitsBegin = s;
1281     s = SkipHexDigits(s);
         // Digits may appear before and/or after the period; at least one is
         // required somewhere in the significand.
1282     bool HasSignificandDigits = containsDigits(DigitsBegin, s);
1283     if (s == ThisTokEnd) {
1284       // Done.
1285     } else if (*s == '.') {
1286       s++;
1287       saw_period = true;
1288       const char *floatDigitsBegin = s;
1289       s = SkipHexDigits(s);
1290       if (containsDigits(floatDigitsBegin, s))
1291         HasSignificandDigits = true;
1292       if (HasSignificandDigits)
1293         checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
1294     }
1295
1296     if (!HasSignificandDigits) {
1297       Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1298                                                   LangOpts),
1299                    diag::err_hex_constant_requires)
1300           << LangOpts.CPlusPlus << 1;
1301       hadError = true;
1302       return;
1303     }
1304
1305     // A binary exponent can appear with or without a '.'. If dotted, the
1306     // binary exponent is required.
1307     if (*s == 'p' || *s == 'P') {
1308       checkSeparator(TokLoc, s, CSK_AfterDigits);
1309       const char *Exponent = s;
1310       s++;
1311       saw_exponent = true;
1312       if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
1313       const char *first_non_digit = SkipDigits(s);
1314       if (!containsDigits(s, first_non_digit)) {
             // Only report the missing exponent digits if nothing else has
             // been diagnosed yet.
1315         if (!hadError) {
1316           Diags.Report(Lexer::AdvanceToTokenCharacter(
1317                            TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1318                        diag::err_exponent_has_no_digits);
1319           hadError = true;
1320         }
1321         return;
1322       }
1323       checkSeparator(TokLoc, s, CSK_BeforeDigits);
1324       s = first_non_digit;
1325
           // A hex literal with an exponent is a hexadecimal floating literal;
           // diagnose it according to the language options.
1326       if (!LangOpts.HexFloats)
1327         Diags.Report(TokLoc, LangOpts.CPlusPlus
1328                                  ? diag::ext_hex_literal_invalid
1329                                  : diag::ext_hex_constant_invalid);
1330       else if (LangOpts.CPlusPlus17)
1331         Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
1332     } else if (saw_period) {
           // A period without a following binary exponent is an error.
1333       Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1334                                                   LangOpts),
1335                    diag::err_hex_constant_requires)
1336           << LangOpts.CPlusPlus << 0;
1337       hadError = true;
1338     }
1339     return;
1340   }
1341
1342   // Handle simple binary numbers 0b01010
1343   if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
1344     // 0b101010 is a C++1y / GCC extension.
1345     Diags.Report(TokLoc, LangOpts.CPlusPlus14
1346                              ? diag::warn_cxx11_compat_binary_literal
1347                          : LangOpts.CPlusPlus ? diag::ext_binary_literal_cxx14
1348                                               : diag::ext_binary_literal);
1349     ++s;
1350     assert(s < ThisTokEnd && "didn't maximally munch?");
1351     radix = 2;
1352     DigitsBegin = s;
1353     s = SkipBinaryDigits(s);
1354     if (s == ThisTokEnd) {
1355       // Done.
1356     } else if (isHexDigit(*s) &&
1357                !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
           // A hex digit right after the binary digits is reported as an invalid
           // digit, unless the rest of the token is a valid ud-suffix.
1358       Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1359                                                   LangOpts),
1360                    diag::err_invalid_digit)
1361           << StringRef(s, 1) << 2;
1362       hadError = true;
1363     }
1364     // Other suffixes will be diagnosed by the caller.
1365     return;
1366   }
1367
1368   // For now, the radix is set to 8. If we discover that we have a
1369   // floating point constant, the radix will change to 10. Octal floating
1370   // point constants are not permitted (only decimal and hexadecimal).
1371   radix = 8;
1372   const char *PossibleNewDigitStart = s;
1373   s = SkipOctalDigits(s);
1374   // When the value is 0 followed by a suffix (like 0wb), we want to leave 0
1375   // as the start of the digits. So if skipping octal digits does not skip
1376   // anything, we leave the digit start where it was.
1377   if (s != PossibleNewDigitStart)
1378     DigitsBegin = PossibleNewDigitStart;
1379
1380   if (s == ThisTokEnd)
1381     return; // Done, simple octal number like 01234
1382
1383   // If we have some other non-octal digit that *is* a decimal digit, see if
1384   // this is part of a floating point number like 094.123 or 09e1.
1385   if (isDigit(*s)) {
1386     const char *EndDecimal = SkipDigits(s);
1387     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
1388       s = EndDecimal;
1389       radix = 10;
1390     }
1391   }
1392
       // Invalid digits, a fractional part and an exponent are handled by the
       // code shared with decimal literals.
1393   ParseDecimalOrOctalCommon(TokLoc);
1394 }
1395 
1396 static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
1397   switch (Radix) {
1398   case 2:
1399     return NumDigits <= 64;
1400   case 8:
1401     return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
1402   case 10:
1403     return NumDigits <= 19; // floor(log10(2^64))
1404   case 16:
1405     return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
1406   default:
1407     llvm_unreachable("impossible Radix");
1408   }
1409 }
1410 
1411 /// GetIntegerValue - Convert this numeric literal value to an APInt that
1412 /// matches Val's input width.  If there is an overflow, set Val to the low bits
1413 /// of the result and return true.  Otherwise, return false.
1414 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
1415   // Fast path: Compute a conservative bound on the maximum number of
1416   // bits per digit in this radix. If we can't possibly overflow a
1417   // uint64 based on that bound then do the simple conversion to
1418   // integer. This avoids the expensive overflow checking below, and
1419   // handles the common cases that matter (small decimal integers and
1420   // hex/octal values which don't overflow).
1421   const unsigned NumDigits = SuffixBegin - DigitsBegin;
1422   if (alwaysFitsInto64Bits(radix, NumDigits)) {
1423     uint64_t N = 0;
1424     for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
1425       if (!isDigitSeparator(*Ptr))
1426         N = N * radix + llvm::hexDigitValue(*Ptr);
1427 
1428     // This will truncate the value to Val's input width. Simply check
1429     // for overflow by comparing.
1430     Val = N;
1431     return Val.getZExtValue() != N;
1432   }
1433 
1434   Val = 0;
1435   const char *Ptr = DigitsBegin;
1436 
1437   llvm::APInt RadixVal(Val.getBitWidth(), radix);
1438   llvm::APInt CharVal(Val.getBitWidth(), 0);
1439   llvm::APInt OldVal = Val;
1440 
1441   bool OverflowOccurred = false;
1442   while (Ptr < SuffixBegin) {
1443     if (isDigitSeparator(*Ptr)) {
1444       ++Ptr;
1445       continue;
1446     }
1447 
1448     unsigned C = llvm::hexDigitValue(*Ptr++);
1449 
1450     // If this letter is out of bound for this radix, reject it.
1451     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1452 
1453     CharVal = C;
1454 
1455     // Add the digit to the value in the appropriate radix.  If adding in digits
1456     // made the value smaller, then this overflowed.
1457     OldVal = Val;
1458 
1459     // Multiply by radix, did overflow occur on the multiply?
1460     Val *= RadixVal;
1461     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
1462 
1463     // Add value, did overflow occur on the value?
1464     //   (a + b) ult b  <=> overflow
1465     Val += CharVal;
1466     OverflowOccurred |= Val.ult(CharVal);
1467   }
1468   return OverflowOccurred;
1469 }
1470 
1471 llvm::APFloat::opStatus
1472 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
1473   using llvm::APFloat;
1474 
1475   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
1476 
1477   llvm::SmallString<16> Buffer;
1478   StringRef Str(ThisTokBegin, n);
1479   if (Str.contains('\'')) {
1480     Buffer.reserve(n);
1481     std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
1482                         &isDigitSeparator);
1483     Str = Buffer;
1484   }
1485 
1486   auto StatusOrErr =
1487       Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
1488   assert(StatusOrErr && "Invalid floating point representation");
1489   return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
1490                                                : APFloat::opInvalidOp;
1491 }
1492 
/// Returns true if \p c is a character that introduces an exponent: 'e'/'E'
/// or 'p'/'P'.
static inline bool IsExponentPart(char c) {
  switch (c) {
  case 'e':
  case 'E':
  case 'p':
  case 'P':
    return true;
  default:
    return false;
  }
}
1496 
/// Computes the value of this literal as a fixed-point number with \p Scale
/// fractional bits (the literal's value multiplied by 2^Scale) and stores it
/// in \p StoreVal, converted to StoreVal's existing bit width.
///
/// \returns true if the computed value does not fit into StoreVal's width or
/// if the exponent was too large to be processed; false otherwise.
1497 bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
1498   assert(radix == 16 || radix == 10);
1499
1500   // Find how many digits are needed to store the whole literal.
       // The count spans DigitsBegin..SuffixBegin, so besides the period that is
       // subtracted here it also includes any exponent characters.
1501   unsigned NumDigits = SuffixBegin - DigitsBegin;
1502   if (saw_period) --NumDigits;
1503
1504   // Initial scan of the exponent if it exists
1505   bool ExpOverflowOccurred = false;
1506   bool NegativeExponent = false;
       // ExponentBegin is only assigned, and only read, when saw_exponent is set.
1507   const char *ExponentBegin;
1508   uint64_t Exponent = 0;
       // BaseShift accumulates the power of the base (10, or 2 for hex literals)
       // that the digit value must still be multiplied by.
1509   int64_t BaseShift = 0;
1510   if (saw_exponent) {
1511     const char *Ptr = DigitsBegin;
1512
1513     while (!IsExponentPart(*Ptr)) ++Ptr;
1514     ExponentBegin = Ptr;
1515     ++Ptr;
1516     NegativeExponent = *Ptr == '-';
1517     if (NegativeExponent) ++Ptr;
1518
         // NOTE(review): only a leading '-' is skipped above, so a leading '+'
         // stays inside ExpStr and is counted in NumExpDigits; this presumably
         // relies on APInt's string parsing accepting a sign - confirm.
         // The exponent digits are parsed as decimal even when radix is 16.
1519     unsigned NumExpDigits = SuffixBegin - Ptr;
1520     if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
1521       llvm::StringRef ExpStr(Ptr, NumExpDigits);
1522       llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
1523       Exponent = ExpInt.getZExtValue();
1524     } else {
1525       ExpOverflowOccurred = true;
1526     }
1527
1528     if (NegativeExponent) BaseShift -= Exponent;
1529     else BaseShift += Exponent;
1530   }
1531
1532   // Number of bits needed for decimal literal is
1533   //   ceil(NumDigits * log2(10))       Integral part
1534   // + Scale                            Fractional part
1535   // + ceil(Exponent * log2(10))        Exponent
1536   // --------------------------------------------------
1537   //   ceil((NumDigits + Exponent) * log2(10)) + Scale
1538   //
1539   // But for simplicity in handling integers, we can round up log2(10) to 4,
1540   // making:
1541   // 4 * (NumDigits + Exponent) + Scale
1542   //
1543   // Number of bits needed for hexadecimal literal is
1544   //   4 * NumDigits                    Integral part
1545   // + Scale                            Fractional part
1546   // + Exponent                         Exponent
1547   // --------------------------------------------------
1548   //   (4 * NumDigits) + Scale + Exponent
1549   uint64_t NumBitsNeeded;
1550   if (radix == 10)
1551     NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
1552   else
1553     NumBitsNeeded = 4 * NumDigits + Exponent + Scale;
1554
       // A width that does not fit in 'unsigned' is reported as overflow; the
       // working value is still created, with the width truncated by the cast.
1555   if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
1556     ExpOverflowOccurred = true;
1557   llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);
1558
1559   bool FoundDecimal = false;
1560
       // Accumulate all significand digits into Val as one integer, counting
       // (negatively) how many of them came after the radix point.
1561   int64_t FractBaseShift = 0;
1562   const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
1563   for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
1564     if (*Ptr == '.') {
1565       FoundDecimal = true;
1566       continue;
1567     }
1568
1569     // Normal reading of an integer
1570     unsigned C = llvm::hexDigitValue(*Ptr);
1571     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1572
1573     Val *= radix;
1574     Val += C;
1575
1576     if (FoundDecimal)
1577       // Keep track of how much we will need to adjust this value by from the
1578       // number of digits past the radix point.
1579       --FractBaseShift;
1580   }
1581
1582   // For a radix of 16, we will be multiplying by 2 instead of 16.
1583   if (radix == 16) FractBaseShift *= 4;
1584   BaseShift += FractBaseShift;
1585
       // Multiply by 2^Scale to make room for the fractional bits.
1586   Val <<= Scale;
1587
       // Apply the combined exponent/fraction shift, one factor of Base at a
       // time. The division loop stops early once the value reaches zero.
1588   uint64_t Base = (radix == 16) ? 2 : 10;
1589   if (BaseShift > 0) {
1590     for (int64_t i = 0; i < BaseShift; ++i) {
1591       Val *= Base;
1592     }
1593   } else if (BaseShift < 0) {
1594     for (int64_t i = BaseShift; i < 0 && !Val.isZero(); ++i)
1595       Val = Val.udiv(Base);
1596   }
1597
       // Convert the working value to StoreVal's width, flagging overflow when
       // it exceeds the largest value representable in that width.
1598   bool IntOverflowOccurred = false;
1599   auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
1600   if (Val.getBitWidth() > StoreVal.getBitWidth()) {
1601     IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
1602     StoreVal = Val.trunc(StoreVal.getBitWidth());
1603   } else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
         // A narrower value zero-extended to StoreVal's width cannot exceed
         // MaxVal, so this comparison never reports overflow.
1604     IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
1605     StoreVal = Val.zext(StoreVal.getBitWidth());
1606   } else {
1607     StoreVal = Val;
1608   }
1609
1610   return IntOverflowOccurred || ExpOverflowOccurred;
1611 }
1612 
1613 /// \verbatim
1614 ///       user-defined-character-literal: [C++11 lex.ext]
1615 ///         character-literal ud-suffix
1616 ///       ud-suffix:
1617 ///         identifier
1618 ///       character-literal: [C++11 lex.ccon]
1619 ///         ' c-char-sequence '
1620 ///         u' c-char-sequence '
1621 ///         U' c-char-sequence '
1622 ///         L' c-char-sequence '
1623 ///         u8' c-char-sequence ' [C++1z lex.ccon]
1624 ///       c-char-sequence:
1625 ///         c-char
1626 ///         c-char-sequence c-char
1627 ///       c-char:
1628 ///         any member of the source character set except the single-quote ',
1629 ///           backslash \, or new-line character
1630 ///         escape-sequence
1631 ///         universal-character-name
1632 ///       escape-sequence:
1633 ///         simple-escape-sequence
1634 ///         octal-escape-sequence
1635 ///         hexadecimal-escape-sequence
1636 ///       simple-escape-sequence:
1637 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1638 ///       octal-escape-sequence:
1639 ///         \ octal-digit
1640 ///         \ octal-digit octal-digit
1641 ///         \ octal-digit octal-digit octal-digit
1642 ///       hexadecimal-escape-sequence:
1643 ///         \x hexadecimal-digit
1644 ///         hexadecimal-escape-sequence hexadecimal-digit
1645 ///       universal-character-name: [C++11 lex.charset]
1646 ///         \u hex-quad
1647 ///         \U hex-quad hex-quad
1648 ///       hex-quad:
1649 ///         hex-digit hex-digit hex-digit hex-digit
1650 /// \endverbatim
1651 ///
1652 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
1653                                      SourceLocation Loc, Preprocessor &PP,
1654                                      tok::TokenKind kind) {
1655   // At this point we know that the character matches the regex "(L|u|U)?'.*'".
1656   HadError = false;
1657 
1658   Kind = kind;
1659 
1660   const char *TokBegin = begin;
1661 
1662   // Skip over wide character determinant.
1663   if (Kind != tok::char_constant)
1664     ++begin;
1665   if (Kind == tok::utf8_char_constant)
1666     ++begin;
1667 
1668   // Skip over the entry quote.
1669   if (begin[0] != '\'') {
1670     PP.Diag(Loc, diag::err_lexing_char);
1671     HadError = true;
1672     return;
1673   }
1674 
1675   ++begin;
1676 
1677   // Remove an optional ud-suffix.
1678   if (end[-1] != '\'') {
1679     const char *UDSuffixEnd = end;
1680     do {
1681       --end;
1682     } while (end[-1] != '\'');
1683     // FIXME: Don't bother with this if !tok.hasUCN().
1684     expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
1685     UDSuffixOffset = end - TokBegin;
1686   }
1687 
1688   // Trim the ending quote.
1689   assert(end != begin && "Invalid token lexed");
1690   --end;
1691 
1692   // FIXME: The "Value" is an uint64_t so we can handle char literals of
1693   // up to 64-bits.
1694   // FIXME: This extensively assumes that 'char' is 8-bits.
1695   assert(PP.getTargetInfo().getCharWidth() == 8 &&
1696          "Assumes char is 8 bits");
1697   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
1698          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
1699          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
1700   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
1701          "Assumes sizeof(wchar) on target is <= 64");
1702 
1703   SmallVector<uint32_t, 4> codepoint_buffer;
1704   codepoint_buffer.resize(end - begin);
1705   uint32_t *buffer_begin = &codepoint_buffer.front();
1706   uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1707 
1708   // Unicode escapes representing characters that cannot be correctly
1709   // represented in a single code unit are disallowed in character literals
1710   // by this implementation.
1711   uint32_t largest_character_for_kind;
1712   if (tok::wide_char_constant == Kind) {
1713     largest_character_for_kind =
1714         0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
1715   } else if (tok::utf8_char_constant == Kind) {
1716     largest_character_for_kind = 0x7F;
1717   } else if (tok::utf16_char_constant == Kind) {
1718     largest_character_for_kind = 0xFFFF;
1719   } else if (tok::utf32_char_constant == Kind) {
1720     largest_character_for_kind = 0x10FFFF;
1721   } else {
1722     largest_character_for_kind = 0x7Fu;
1723   }
1724 
1725   while (begin != end) {
1726     // Is this a span of non-escape characters?
1727     if (begin[0] != '\\') {
1728       char const *start = begin;
1729       do {
1730         ++begin;
1731       } while (begin != end && *begin != '\\');
1732 
1733       char const *tmp_in_start = start;
1734       uint32_t *tmp_out_start = buffer_begin;
1735       llvm::ConversionResult res =
1736           llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
1737                              reinterpret_cast<llvm::UTF8 const *>(begin),
1738                              &buffer_begin, buffer_end, llvm::strictConversion);
1739       if (res != llvm::conversionOK) {
1740         // If we see bad encoding for unprefixed character literals, warn and
1741         // simply copy the byte values, for compatibility with gcc and
1742         // older versions of clang.
1743         bool NoErrorOnBadEncoding = isOrdinary();
1744         unsigned Msg = diag::err_bad_character_encoding;
1745         if (NoErrorOnBadEncoding)
1746           Msg = diag::warn_bad_character_encoding;
1747         PP.Diag(Loc, Msg);
1748         if (NoErrorOnBadEncoding) {
1749           start = tmp_in_start;
1750           buffer_begin = tmp_out_start;
1751           for (; start != begin; ++start, ++buffer_begin)
1752             *buffer_begin = static_cast<uint8_t>(*start);
1753         } else {
1754           HadError = true;
1755         }
1756       } else {
1757         for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
1758           if (*tmp_out_start > largest_character_for_kind) {
1759             HadError = true;
1760             PP.Diag(Loc, diag::err_character_too_large);
1761           }
1762         }
1763       }
1764 
1765       continue;
1766     }
1767     // Is this a Universal Character Name escape?
1768     if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
1769       unsigned short UcnLen = 0;
1770       if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
1771                             FullSourceLoc(Loc, PP.getSourceManager()),
1772                             &PP.getDiagnostics(), PP.getLangOpts(), true)) {
1773         HadError = true;
1774       } else if (*buffer_begin > largest_character_for_kind) {
1775         HadError = true;
1776         PP.Diag(Loc, diag::err_character_too_large);
1777       }
1778 
1779       ++buffer_begin;
1780       continue;
1781     }
1782     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
1783     uint64_t result =
1784         ProcessCharEscape(TokBegin, begin, end, HadError,
1785                           FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
1786                           &PP.getDiagnostics(), PP.getLangOpts(),
1787                           StringLiteralEvalMethod::Evaluated);
1788     *buffer_begin++ = result;
1789   }
1790 
1791   unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
1792 
1793   if (NumCharsSoFar > 1) {
1794     if (isOrdinary() && NumCharsSoFar == 4)
1795       PP.Diag(Loc, diag::warn_four_char_character_literal);
1796     else if (isOrdinary())
1797       PP.Diag(Loc, diag::warn_multichar_character_literal);
1798     else {
1799       PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
1800       HadError = true;
1801     }
1802     IsMultiChar = true;
1803   } else {
1804     IsMultiChar = false;
1805   }
1806 
1807   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1808 
1809   // Narrow character literals act as though their value is concatenated
1810   // in this implementation, but warn on overflow.
1811   bool multi_char_too_long = false;
1812   if (isOrdinary() && isMultiChar()) {
1813     LitVal = 0;
1814     for (size_t i = 0; i < NumCharsSoFar; ++i) {
1815       // check for enough leading zeros to shift into
1816       multi_char_too_long |= (LitVal.countl_zero() < 8);
1817       LitVal <<= 8;
1818       LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1819     }
1820   } else if (NumCharsSoFar > 0) {
1821     // otherwise just take the last character
1822     LitVal = buffer_begin[-1];
1823   }
1824 
1825   if (!HadError && multi_char_too_long) {
1826     PP.Diag(Loc, diag::warn_char_constant_too_large);
1827   }
1828 
1829   // Transfer the value from APInt to uint64_t
1830   Value = LitVal.getZExtValue();
1831 
1832   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1833   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
1834   // character constants are not sign extended in the this implementation:
1835   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
1836   if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
1837       PP.getLangOpts().CharIsSigned)
1838     Value = (signed char)Value;
1839 }
1840 
1841 /// \verbatim
1842 ///       string-literal: [C++0x lex.string]
1843 ///         encoding-prefix " [s-char-sequence] "
1844 ///         encoding-prefix R raw-string
1845 ///       encoding-prefix:
1846 ///         u8
1847 ///         u
1848 ///         U
1849 ///         L
1850 ///       s-char-sequence:
1851 ///         s-char
1852 ///         s-char-sequence s-char
1853 ///       s-char:
1854 ///         any member of the source character set except the double-quote ",
1855 ///           backslash \, or new-line character
1856 ///         escape-sequence
1857 ///         universal-character-name
1858 ///       raw-string:
1859 ///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
1860 ///       r-char-sequence:
1861 ///         r-char
1862 ///         r-char-sequence r-char
1863 ///       r-char:
1864 ///         any member of the source character set, except a right parenthesis )
1865 ///           followed by the initial d-char-sequence (which may be empty)
1866 ///           followed by a double quote ".
1867 ///       d-char-sequence:
1868 ///         d-char
1869 ///         d-char-sequence d-char
1870 ///       d-char:
1871 ///         any member of the basic source character set except:
1872 ///           space, the left parenthesis (, the right parenthesis ),
1873 ///           the backslash \, and the control characters representing horizontal
1874 ///           tab, vertical tab, form feed, and newline.
1875 ///       escape-sequence: [C++0x lex.ccon]
1876 ///         simple-escape-sequence
1877 ///         octal-escape-sequence
1878 ///         hexadecimal-escape-sequence
1879 ///       simple-escape-sequence:
1880 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1881 ///       octal-escape-sequence:
1882 ///         \ octal-digit
1883 ///         \ octal-digit octal-digit
1884 ///         \ octal-digit octal-digit octal-digit
1885 ///       hexadecimal-escape-sequence:
1886 ///         \x hexadecimal-digit
1887 ///         hexadecimal-escape-sequence hexadecimal-digit
1888 ///       universal-character-name:
1889 ///         \u hex-quad
1890 ///         \U hex-quad hex-quad
1891 ///       hex-quad:
1892 ///         hex-digit hex-digit hex-digit hex-digit
1893 /// \endverbatim
1894 ///
1895 StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
1896                                          Preprocessor &PP,
1897                                          StringLiteralEvalMethod EvalMethod)
1898     : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1899       Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
1900       MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1901       ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
1902       Pascal(false) {
1903   init(StringToks);
1904 }
1905 
1906 void StringLiteralParser::init(ArrayRef<Token> StringToks){
1907   // The literal token may have come from an invalid source location (e.g. due
1908   // to a PCH error), in which case the token length will be 0.
1909   if (StringToks.empty() || StringToks[0].getLength() < 2)
1910     return DiagnoseLexingError(SourceLocation());
1911 
1912   // Scan all of the string portions, remember the max individual token length,
1913   // computing a bound on the concatenated string length, and see whether any
1914   // piece is a wide-string.  If any of the string portions is a wide-string
1915   // literal, the result is a wide-string literal [C99 6.4.5p4].
1916   assert(!StringToks.empty() && "expected at least one token");
1917   MaxTokenLength = StringToks[0].getLength();
1918   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
1919   SizeBound = StringToks[0].getLength() - 2; // -2 for "".
1920   hadError = false;
1921 
1922   // Determines the kind of string from the prefix
1923   Kind = tok::string_literal;
1924 
1925   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
1926   for (const Token &Tok : StringToks) {
1927     if (Tok.getLength() < 2)
1928       return DiagnoseLexingError(Tok.getLocation());
1929 
1930     // The string could be shorter than this if it needs cleaning, but this is a
1931     // reasonable bound, which is all we need.
1932     assert(Tok.getLength() >= 2 && "literal token is invalid!");
1933     SizeBound += Tok.getLength() - 2; // -2 for "".
1934 
1935     // Remember maximum string piece length.
1936     if (Tok.getLength() > MaxTokenLength)
1937       MaxTokenLength = Tok.getLength();
1938 
1939     // Remember if we see any wide or utf-8/16/32 strings.
1940     // Also check for illegal concatenations.
1941     if (isUnevaluated() && Tok.getKind() != tok::string_literal) {
1942       if (Diags) {
1943         SourceLocation PrefixEndLoc = Lexer::AdvanceToTokenCharacter(
1944             Tok.getLocation(), getEncodingPrefixLen(Tok.getKind()), SM,
1945             Features);
1946         CharSourceRange Range =
1947             CharSourceRange::getCharRange({Tok.getLocation(), PrefixEndLoc});
1948         StringRef Prefix(SM.getCharacterData(Tok.getLocation()),
1949                          getEncodingPrefixLen(Tok.getKind()));
1950         Diags->Report(Tok.getLocation(),
1951                       Features.CPlusPlus26
1952                           ? diag::err_unevaluated_string_prefix
1953                           : diag::warn_unevaluated_string_prefix)
1954             << Prefix << Features.CPlusPlus << FixItHint::CreateRemoval(Range);
1955       }
1956       if (Features.CPlusPlus26)
1957         hadError = true;
1958     } else if (Tok.isNot(Kind) && Tok.isNot(tok::string_literal)) {
1959       if (isOrdinary()) {
1960         Kind = Tok.getKind();
1961       } else {
1962         if (Diags)
1963           Diags->Report(Tok.getLocation(), diag::err_unsupported_string_concat);
1964         hadError = true;
1965       }
1966     }
1967   }
1968 
1969   // Include space for the null terminator.
1970   ++SizeBound;
1971 
1972   // TODO: K&R warning: "traditional C rejects string constant concatenation"
1973 
1974   // Get the width in bytes of char/wchar_t/char16_t/char32_t
1975   CharByteWidth = getCharWidth(Kind, Target);
1976   assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
1977   CharByteWidth /= 8;
1978 
1979   // The output buffer size needs to be large enough to hold wide characters.
1980   // This is a worst-case assumption which basically corresponds to L"" "long".
1981   SizeBound *= CharByteWidth;
1982 
1983   // Size the temporary buffer to hold the result string data.
1984   ResultBuf.resize(SizeBound);
1985 
1986   // Likewise, but for each string piece.
1987   SmallString<512> TokenBuf;
1988   TokenBuf.resize(MaxTokenLength);
1989 
1990   // Loop over all the strings, getting their spelling, and expanding them to
1991   // wide strings as appropriate.
1992   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
1993 
1994   Pascal = false;
1995 
1996   SourceLocation UDSuffixTokLoc;
1997 
1998   for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
1999     const char *ThisTokBuf = &TokenBuf[0];
2000     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
2001     // that ThisTokBuf points to a buffer that is big enough for the whole token
2002     // and 'spelled' tokens can only shrink.
2003     bool StringInvalid = false;
2004     unsigned ThisTokLen =
2005       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
2006                          &StringInvalid);
2007     if (StringInvalid)
2008       return DiagnoseLexingError(StringToks[i].getLocation());
2009 
2010     const char *ThisTokBegin = ThisTokBuf;
2011     const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
2012 
2013     // Remove an optional ud-suffix.
2014     if (ThisTokEnd[-1] != '"') {
2015       const char *UDSuffixEnd = ThisTokEnd;
2016       do {
2017         --ThisTokEnd;
2018       } while (ThisTokEnd[-1] != '"');
2019 
2020       StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
2021 
2022       if (UDSuffixBuf.empty()) {
2023         if (StringToks[i].hasUCN())
2024           expandUCNs(UDSuffixBuf, UDSuffix);
2025         else
2026           UDSuffixBuf.assign(UDSuffix);
2027         UDSuffixToken = i;
2028         UDSuffixOffset = ThisTokEnd - ThisTokBuf;
2029         UDSuffixTokLoc = StringToks[i].getLocation();
2030       } else {
2031         SmallString<32> ExpandedUDSuffix;
2032         if (StringToks[i].hasUCN()) {
2033           expandUCNs(ExpandedUDSuffix, UDSuffix);
2034           UDSuffix = ExpandedUDSuffix;
2035         }
2036 
2037         // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
2038         // result of a concatenation involving at least one user-defined-string-
2039         // literal, all the participating user-defined-string-literals shall
2040         // have the same ud-suffix.
2041         bool UnevaluatedStringHasUDL = isUnevaluated() && !UDSuffix.empty();
2042         if (UDSuffixBuf != UDSuffix || UnevaluatedStringHasUDL) {
2043           if (Diags) {
2044             SourceLocation TokLoc = StringToks[i].getLocation();
2045             if (UnevaluatedStringHasUDL) {
2046               Diags->Report(TokLoc, diag::err_unevaluated_string_udl)
2047                   << SourceRange(TokLoc, TokLoc);
2048             } else {
2049               Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
2050                   << UDSuffixBuf << UDSuffix
2051                   << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc);
2052             }
2053           }
2054           hadError = true;
2055         }
2056       }
2057     }
2058 
2059     // Strip the end quote.
2060     --ThisTokEnd;
2061 
2062     // TODO: Input character set mapping support.
2063 
2064     // Skip marker for wide or unicode strings.
2065     if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
2066       ++ThisTokBuf;
2067       // Skip 8 of u8 marker for utf8 strings.
2068       if (ThisTokBuf[0] == '8')
2069         ++ThisTokBuf;
2070     }
2071 
2072     // Check for raw string
2073     if (ThisTokBuf[0] == 'R') {
2074       if (ThisTokBuf[1] != '"') {
2075         // The file may have come from PCH and then changed after loading the
2076         // PCH; Fail gracefully.
2077         return DiagnoseLexingError(StringToks[i].getLocation());
2078       }
2079       ThisTokBuf += 2; // skip R"
2080 
2081       // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
2082       // characters.
2083       constexpr unsigned MaxRawStrDelimLen = 16;
2084 
2085       const char *Prefix = ThisTokBuf;
2086       while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
2087              ThisTokBuf[0] != '(')
2088         ++ThisTokBuf;
2089       if (ThisTokBuf[0] != '(')
2090         return DiagnoseLexingError(StringToks[i].getLocation());
2091       ++ThisTokBuf; // skip '('
2092 
2093       // Remove same number of characters from the end
2094       ThisTokEnd -= ThisTokBuf - Prefix;
2095       if (ThisTokEnd < ThisTokBuf)
2096         return DiagnoseLexingError(StringToks[i].getLocation());
2097 
2098       // C++14 [lex.string]p4: A source-file new-line in a raw string literal
2099       // results in a new-line in the resulting execution string-literal.
2100       StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
2101       while (!RemainingTokenSpan.empty()) {
2102         // Split the string literal on \r\n boundaries.
2103         size_t CRLFPos = RemainingTokenSpan.find("\r\n");
2104         StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
2105         StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
2106 
2107         // Copy everything before the \r\n sequence into the string literal.
2108         if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
2109           hadError = true;
2110 
2111         // Point into the \n inside the \r\n sequence and operate on the
2112         // remaining portion of the literal.
2113         RemainingTokenSpan = AfterCRLF.substr(1);
2114       }
2115     } else {
2116       if (ThisTokBuf[0] != '"') {
2117         // The file may have come from PCH and then changed after loading the
2118         // PCH; Fail gracefully.
2119         return DiagnoseLexingError(StringToks[i].getLocation());
2120       }
2121       ++ThisTokBuf; // skip "
2122 
2123       // Check if this is a pascal string
2124       if (!isUnevaluated() && Features.PascalStrings &&
2125           ThisTokBuf + 1 != ThisTokEnd && ThisTokBuf[0] == '\\' &&
2126           ThisTokBuf[1] == 'p') {
2127 
2128         // If the \p sequence is found in the first token, we have a pascal string
2129         // Otherwise, if we already have a pascal string, ignore the first \p
2130         if (i == 0) {
2131           ++ThisTokBuf;
2132           Pascal = true;
2133         } else if (Pascal)
2134           ThisTokBuf += 2;
2135       }
2136 
2137       while (ThisTokBuf != ThisTokEnd) {
2138         // Is this a span of non-escape characters?
2139         if (ThisTokBuf[0] != '\\') {
2140           const char *InStart = ThisTokBuf;
2141           do {
2142             ++ThisTokBuf;
2143           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
2144 
2145           // Copy the character span over.
2146           if (CopyStringFragment(StringToks[i], ThisTokBegin,
2147                                  StringRef(InStart, ThisTokBuf - InStart)))
2148             hadError = true;
2149           continue;
2150         }
2151         // Is this a Universal Character Name escape?
2152         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
2153             ThisTokBuf[1] == 'N') {
2154           EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
2155                           ResultPtr, hadError,
2156                           FullSourceLoc(StringToks[i].getLocation(), SM),
2157                           CharByteWidth, Diags, Features);
2158           continue;
2159         }
2160         // Otherwise, this is a non-UCN escape character.  Process it.
2161         unsigned ResultChar =
2162             ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
2163                               FullSourceLoc(StringToks[i].getLocation(), SM),
2164                               CharByteWidth * 8, Diags, Features, EvalMethod);
2165 
2166         if (CharByteWidth == 4) {
2167           // FIXME: Make the type of the result buffer correct instead of
2168           // using reinterpret_cast.
2169           llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
2170           *ResultWidePtr = ResultChar;
2171           ResultPtr += 4;
2172         } else if (CharByteWidth == 2) {
2173           // FIXME: Make the type of the result buffer correct instead of
2174           // using reinterpret_cast.
2175           llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
2176           *ResultWidePtr = ResultChar & 0xFFFF;
2177           ResultPtr += 2;
2178         } else {
2179           assert(CharByteWidth == 1 && "Unexpected char width");
2180           *ResultPtr++ = ResultChar & 0xFF;
2181         }
2182       }
2183     }
2184   }
2185 
2186   assert((!Pascal || !isUnevaluated()) &&
2187          "Pascal string in unevaluated context");
2188   if (Pascal) {
2189     if (CharByteWidth == 4) {
2190       // FIXME: Make the type of the result buffer correct instead of
2191       // using reinterpret_cast.
2192       llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
2193       ResultWidePtr[0] = GetNumStringChars() - 1;
2194     } else if (CharByteWidth == 2) {
2195       // FIXME: Make the type of the result buffer correct instead of
2196       // using reinterpret_cast.
2197       llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
2198       ResultWidePtr[0] = GetNumStringChars() - 1;
2199     } else {
2200       assert(CharByteWidth == 1 && "Unexpected char width");
2201       ResultBuf[0] = GetNumStringChars() - 1;
2202     }
2203 
2204     // Verify that pascal strings aren't too large.
2205     if (GetStringLength() > 256) {
2206       if (Diags)
2207         Diags->Report(StringToks.front().getLocation(),
2208                       diag::err_pascal_string_too_long)
2209           << SourceRange(StringToks.front().getLocation(),
2210                          StringToks.back().getLocation());
2211       hadError = true;
2212       return;
2213     }
2214   } else if (Diags) {
2215     // Complain if this string literal has too many characters.
2216     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
2217 
2218     if (GetNumStringChars() > MaxChars)
2219       Diags->Report(StringToks.front().getLocation(),
2220                     diag::ext_string_too_long)
2221         << GetNumStringChars() << MaxChars
2222         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
2223         << SourceRange(StringToks.front().getLocation(),
2224                        StringToks.back().getLocation());
2225   }
2226 }
2227 
2228 static const char *resyncUTF8(const char *Err, const char *End) {
2229   if (Err == End)
2230     return End;
2231   End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
2232   while (++Err != End && (*Err & 0xC0) == 0x80)
2233     ;
2234   return Err;
2235 }
2236 
2237 /// This function copies from Fragment, which is a sequence of bytes
2238 /// within Tok's contents (which begin at TokBegin) into ResultPtr.
2239 /// Performs widening for multi-byte characters.
2240 bool StringLiteralParser::CopyStringFragment(const Token &Tok,
2241                                              const char *TokBegin,
2242                                              StringRef Fragment) {
2243   const llvm::UTF8 *ErrorPtrTmp;
2244   if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
2245     return false;
2246 
2247   // If we see bad encoding for unprefixed string literals, warn and
2248   // simply copy the byte values, for compatibility with gcc and older
2249   // versions of clang.
2250   bool NoErrorOnBadEncoding = isOrdinary();
2251   if (NoErrorOnBadEncoding) {
2252     memcpy(ResultPtr, Fragment.data(), Fragment.size());
2253     ResultPtr += Fragment.size();
2254   }
2255 
2256   if (Diags) {
2257     const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2258 
2259     FullSourceLoc SourceLoc(Tok.getLocation(), SM);
2260     const DiagnosticBuilder &Builder =
2261       Diag(Diags, Features, SourceLoc, TokBegin,
2262            ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
2263            NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
2264                                 : diag::err_bad_string_encoding);
2265 
2266     const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2267     StringRef NextFragment(NextStart, Fragment.end()-NextStart);
2268 
2269     // Decode into a dummy buffer.
2270     SmallString<512> Dummy;
2271     Dummy.reserve(Fragment.size() * CharByteWidth);
2272     char *Ptr = Dummy.data();
2273 
2274     while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
2275       const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2276       NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2277       Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
2278                                      ErrorPtr, NextStart);
2279       NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
2280     }
2281   }
2282   return !NoErrorOnBadEncoding;
2283 }
2284 
2285 void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
2286   hadError = true;
2287   if (Diags)
2288     Diags->Report(Loc, diag::err_lexing_string);
2289 }
2290 
2291 /// getOffsetOfStringByte - This function returns the offset of the
2292 /// specified byte of the string data represented by Token.  This handles
2293 /// advancing over escape sequences in the string.
2294 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
2295                                                     unsigned ByteNo) const {
2296   // Get the spelling of the token.
2297   SmallString<32> SpellingBuffer;
2298   SpellingBuffer.resize(Tok.getLength());
2299 
2300   bool StringInvalid = false;
2301   const char *SpellingPtr = &SpellingBuffer[0];
2302   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
2303                                        &StringInvalid);
2304   if (StringInvalid)
2305     return 0;
2306 
2307   const char *SpellingStart = SpellingPtr;
2308   const char *SpellingEnd = SpellingPtr+TokLen;
2309 
2310   // Handle UTF-8 strings just like narrow strings.
2311   if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
2312     SpellingPtr += 2;
2313 
2314   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
2315          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
2316 
2317   // For raw string literals, this is easy.
2318   if (SpellingPtr[0] == 'R') {
2319     assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
2320     // Skip 'R"'.
2321     SpellingPtr += 2;
2322     while (*SpellingPtr != '(') {
2323       ++SpellingPtr;
2324       assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
2325     }
2326     // Skip '('.
2327     ++SpellingPtr;
2328     return SpellingPtr - SpellingStart + ByteNo;
2329   }
2330 
2331   // Skip over the leading quote
2332   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
2333   ++SpellingPtr;
2334 
2335   // Skip over bytes until we find the offset we're looking for.
2336   while (ByteNo) {
2337     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
2338 
2339     // Step over non-escapes simply.
2340     if (*SpellingPtr != '\\') {
2341       ++SpellingPtr;
2342       --ByteNo;
2343       continue;
2344     }
2345 
2346     // Otherwise, this is an escape character.  Advance over it.
2347     bool HadError = false;
2348     if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
2349         SpellingPtr[1] == 'N') {
2350       const char *EscapePtr = SpellingPtr;
2351       unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
2352                                       1, Features, HadError);
2353       if (Len > ByteNo) {
2354         // ByteNo is somewhere within the escape sequence.
2355         SpellingPtr = EscapePtr;
2356         break;
2357       }
2358       ByteNo -= Len;
2359     } else {
2360       ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
2361                         FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
2362                         Diags, Features, StringLiteralEvalMethod::Evaluated);
2363       --ByteNo;
2364     }
2365     assert(!HadError && "This method isn't valid on erroneous strings");
2366   }
2367 
2368   return SpellingPtr-SpellingStart;
2369 }
2370 
2371 /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
2372 /// suffixes as ud-suffixes, because the diagnostic experience is better if we
2373 /// treat it as an invalid suffix.
2374 bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
2375                                           StringRef Suffix) {
2376   return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
2377          Suffix == "sv";
2378 }
2379