xref: /freebsd/contrib/llvm-project/clang/lib/Lex/LiteralSupport.cpp (revision 9c77fb6aaa366cbabc80ee1b834bcfe4df135491)
1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the NumericLiteralParser, CharLiteralParser, and
10 // StringLiteralParser interfaces.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "clang/Lex/LiteralSupport.h"
15 #include "clang/Basic/CharInfo.h"
16 #include "clang/Basic/LangOptions.h"
17 #include "clang/Basic/SourceLocation.h"
18 #include "clang/Basic/TargetInfo.h"
19 #include "clang/Lex/LexDiagnostic.h"
20 #include "clang/Lex/Lexer.h"
21 #include "clang/Lex/Preprocessor.h"
22 #include "clang/Lex/Token.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/ScopeExit.h"
25 #include "llvm/ADT/SmallVector.h"
26 #include "llvm/ADT/StringExtras.h"
27 #include "llvm/ADT/StringSwitch.h"
28 #include "llvm/Support/ConvertUTF.h"
29 #include "llvm/Support/Error.h"
30 #include "llvm/Support/ErrorHandling.h"
31 #include "llvm/Support/Unicode.h"
32 #include <algorithm>
33 #include <cassert>
34 #include <cstddef>
35 #include <cstdint>
36 #include <cstring>
37 #include <string>
38 
39 using namespace clang;
40 
41 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
42   switch (kind) {
43   default: llvm_unreachable("Unknown token type!");
44   case tok::char_constant:
45   case tok::string_literal:
46   case tok::utf8_char_constant:
47   case tok::utf8_string_literal:
48     return Target.getCharWidth();
49   case tok::wide_char_constant:
50   case tok::wide_string_literal:
51     return Target.getWCharWidth();
52   case tok::utf16_char_constant:
53   case tok::utf16_string_literal:
54     return Target.getChar16Width();
55   case tok::utf32_char_constant:
56   case tok::utf32_string_literal:
57     return Target.getChar32Width();
58   }
59 }
60 
61 static unsigned getEncodingPrefixLen(tok::TokenKind kind) {
62   switch (kind) {
63   default:
64     llvm_unreachable("Unknown token type!");
65   case tok::char_constant:
66   case tok::string_literal:
67     return 0;
68   case tok::utf8_char_constant:
69   case tok::utf8_string_literal:
70     return 2;
71   case tok::wide_char_constant:
72   case tok::wide_string_literal:
73   case tok::utf16_char_constant:
74   case tok::utf16_string_literal:
75   case tok::utf32_char_constant:
76   case tok::utf32_string_literal:
77     return 1;
78   }
79 }
80 
81 static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
82                                            FullSourceLoc TokLoc,
83                                            const char *TokBegin,
84                                            const char *TokRangeBegin,
85                                            const char *TokRangeEnd) {
86   SourceLocation Begin =
87     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
88                                    TokLoc.getManager(), Features);
89   SourceLocation End =
90     Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
91                                    TokLoc.getManager(), Features);
92   return CharSourceRange::getCharRange(Begin, End);
93 }
94 
95 /// Produce a diagnostic highlighting some portion of a literal.
96 ///
97 /// Emits the diagnostic \p DiagID, highlighting the range of characters from
98 /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
99 /// a substring of a spelling buffer for the token beginning at \p TokBegin.
100 static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
101                               const LangOptions &Features, FullSourceLoc TokLoc,
102                               const char *TokBegin, const char *TokRangeBegin,
103                               const char *TokRangeEnd, unsigned DiagID) {
104   SourceLocation Begin =
105     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
106                                    TokLoc.getManager(), Features);
107   return Diags->Report(Begin, DiagID) <<
108     MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
109 }
110 
/// Return true if \p Escape (the character following the backslash) names one
/// of the escape sequences accepted in an unevaluated string literal:
/// \' \" \? \\ \a \b \f \n \r \t \v.
static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
  static constexpr char Accepted[] = {'\'', '"', '?', '\\', 'a', 'b',
                                      'f',  'n', 'r', 't',  'v'};
  for (const char Candidate : Accepted)
    if (Candidate == Escape)
      return true;
  return false;
}
128 
129 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
130 /// either a character or a string literal.
131 static unsigned ProcessCharEscape(const char *ThisTokBegin,
132                                   const char *&ThisTokBuf,
133                                   const char *ThisTokEnd, bool &HadError,
134                                   FullSourceLoc Loc, unsigned CharWidth,
135                                   DiagnosticsEngine *Diags,
136                                   const LangOptions &Features,
137                                   StringLiteralEvalMethod EvalMethod) {
138   const char *EscapeBegin = ThisTokBuf;
139   bool Delimited = false;
140   bool EndDelimiterFound = false;
141 
142   // Skip the '\' char.
143   ++ThisTokBuf;
144 
145   // We know that this character can't be off the end of the buffer, because
146   // that would have been \", which would not have been the end of string.
147   unsigned ResultChar = *ThisTokBuf++;
148   char Escape = ResultChar;
149   switch (ResultChar) {
150   // These map to themselves.
151   case '\\': case '\'': case '"': case '?': break;
152 
153     // These have fixed mappings.
154   case 'a':
155     // TODO: K&R: the meaning of '\\a' is different in traditional C
156     ResultChar = 7;
157     break;
158   case 'b':
159     ResultChar = 8;
160     break;
161   case 'e':
162     if (Diags)
163       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
164            diag::ext_nonstandard_escape) << "e";
165     ResultChar = 27;
166     break;
167   case 'E':
168     if (Diags)
169       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
170            diag::ext_nonstandard_escape) << "E";
171     ResultChar = 27;
172     break;
173   case 'f':
174     ResultChar = 12;
175     break;
176   case 'n':
177     ResultChar = 10;
178     break;
179   case 'r':
180     ResultChar = 13;
181     break;
182   case 't':
183     ResultChar = 9;
184     break;
185   case 'v':
186     ResultChar = 11;
187     break;
188   case 'x': { // Hex escape.
189     ResultChar = 0;
190     if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
191       Delimited = true;
192       ThisTokBuf++;
193       if (*ThisTokBuf == '}') {
194         HadError = true;
195         if (Diags)
196           Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
197                diag::err_delimited_escape_empty);
198       }
199     } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
200       if (Diags)
201         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
202              diag::err_hex_escape_no_digits) << "x";
203       return ResultChar;
204     }
205 
206     // Hex escapes are a maximal series of hex digits.
207     bool Overflow = false;
208     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
209       if (Delimited && *ThisTokBuf == '}') {
210         ThisTokBuf++;
211         EndDelimiterFound = true;
212         break;
213       }
214       int CharVal = llvm::hexDigitValue(*ThisTokBuf);
215       if (CharVal == -1) {
216         // Non delimited hex escape sequences stop at the first non-hex digit.
217         if (!Delimited)
218           break;
219         HadError = true;
220         if (Diags)
221           Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
222                diag::err_delimited_escape_invalid)
223               << StringRef(ThisTokBuf, 1);
224         continue;
225       }
226       // About to shift out a digit?
227       if (ResultChar & 0xF0000000)
228         Overflow = true;
229       ResultChar <<= 4;
230       ResultChar |= CharVal;
231     }
232     // See if any bits will be truncated when evaluated as a character.
233     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
234       Overflow = true;
235       ResultChar &= ~0U >> (32-CharWidth);
236     }
237 
238     // Check for overflow.
239     if (!HadError && Overflow) { // Too many digits to fit in
240       HadError = true;
241       if (Diags)
242         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
243              diag::err_escape_too_large)
244             << 0;
245     }
246     break;
247   }
248   case '0': case '1': case '2': case '3':
249   case '4': case '5': case '6': case '7': {
250     // Octal escapes.
251     --ThisTokBuf;
252     ResultChar = 0;
253 
254     // Octal escapes are a series of octal digits with maximum length 3.
255     // "\0123" is a two digit sequence equal to "\012" "3".
256     unsigned NumDigits = 0;
257     do {
258       ResultChar <<= 3;
259       ResultChar |= *ThisTokBuf++ - '0';
260       ++NumDigits;
261     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
262              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
263 
264     // Check for overflow.  Reject '\777', but not L'\777'.
265     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
266       if (Diags)
267         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
268              diag::err_escape_too_large) << 1;
269       ResultChar &= ~0U >> (32-CharWidth);
270     }
271     break;
272   }
273   case 'o': {
274     bool Overflow = false;
275     if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
276       HadError = true;
277       if (Diags)
278         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
279              diag::err_delimited_escape_missing_brace)
280             << "o";
281 
282       break;
283     }
284     ResultChar = 0;
285     Delimited = true;
286     ++ThisTokBuf;
287     if (*ThisTokBuf == '}') {
288       HadError = true;
289       if (Diags)
290         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
291              diag::err_delimited_escape_empty);
292     }
293 
294     while (ThisTokBuf != ThisTokEnd) {
295       if (*ThisTokBuf == '}') {
296         EndDelimiterFound = true;
297         ThisTokBuf++;
298         break;
299       }
300       if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
301         HadError = true;
302         if (Diags)
303           Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
304                diag::err_delimited_escape_invalid)
305               << StringRef(ThisTokBuf, 1);
306         ThisTokBuf++;
307         continue;
308       }
309       // Check if one of the top three bits is set before shifting them out.
310       if (ResultChar & 0xE0000000)
311         Overflow = true;
312 
313       ResultChar <<= 3;
314       ResultChar |= *ThisTokBuf++ - '0';
315     }
316     // Check for overflow.  Reject '\777', but not L'\777'.
317     if (!HadError &&
318         (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
319       HadError = true;
320       if (Diags)
321         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
322              diag::err_escape_too_large)
323             << 1;
324       ResultChar &= ~0U >> (32 - CharWidth);
325     }
326     break;
327   }
328     // Otherwise, these are not valid escapes.
329   case '(': case '{': case '[': case '%':
330     // GCC accepts these as extensions.  We warn about them as such though.
331     if (Diags)
332       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
333            diag::ext_nonstandard_escape)
334         << std::string(1, ResultChar);
335     break;
336   default:
337     if (!Diags)
338       break;
339 
340     if (isPrintable(ResultChar))
341       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
342            diag::ext_unknown_escape)
343         << std::string(1, ResultChar);
344     else
345       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
346            diag::ext_unknown_escape)
347         << "x" + llvm::utohexstr(ResultChar);
348     break;
349   }
350 
351   if (Delimited && Diags) {
352     if (!EndDelimiterFound)
353       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
354            diag::err_expected)
355           << tok::r_brace;
356     else if (!HadError) {
357       Lexer::DiagnoseDelimitedOrNamedEscapeSequence(Loc, false, Features,
358                                                     *Diags);
359     }
360   }
361 
362   if (EvalMethod == StringLiteralEvalMethod::Unevaluated &&
363       !IsEscapeValidInUnevaluatedStringLiteral(Escape)) {
364     Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
365          diag::err_unevaluated_string_invalid_escape_sequence)
366         << StringRef(EscapeBegin, ThisTokBuf - EscapeBegin);
367     HadError = true;
368   }
369 
370   return ResultChar;
371 }
372 
373 static void appendCodePoint(unsigned Codepoint,
374                             llvm::SmallVectorImpl<char> &Str) {
375   char ResultBuf[4];
376   char *ResultPtr = ResultBuf;
377   if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))
378     Str.append(ResultBuf, ResultPtr);
379 }
380 
381 void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
382   for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
383     if (*I != '\\') {
384       Buf.push_back(*I);
385       continue;
386     }
387 
388     ++I;
389     char Kind = *I;
390     ++I;
391 
392     assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
393     uint32_t CodePoint = 0;
394 
395     if (Kind == 'u' && *I == '{') {
396       for (++I; *I != '}'; ++I) {
397         unsigned Value = llvm::hexDigitValue(*I);
398         assert(Value != -1U);
399         CodePoint <<= 4;
400         CodePoint += Value;
401       }
402       appendCodePoint(CodePoint, Buf);
403       continue;
404     }
405 
406     if (Kind == 'N') {
407       assert(*I == '{');
408       ++I;
409       auto Delim = std::find(I, Input.end(), '}');
410       assert(Delim != Input.end());
411       StringRef Name(I, std::distance(I, Delim));
412       std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
413           llvm::sys::unicode::nameToCodepointLooseMatching(Name);
414       assert(Res && "could not find a codepoint that was previously found");
415       CodePoint = Res->CodePoint;
416       assert(CodePoint != 0xFFFFFFFF);
417       appendCodePoint(CodePoint, Buf);
418       I = Delim;
419       continue;
420     }
421 
422     unsigned NumHexDigits;
423     if (Kind == 'u')
424       NumHexDigits = 4;
425     else
426       NumHexDigits = 8;
427 
428     assert(I + NumHexDigits <= E);
429 
430     for (; NumHexDigits != 0; ++I, --NumHexDigits) {
431       unsigned Value = llvm::hexDigitValue(*I);
432       assert(Value != -1U);
433 
434       CodePoint <<= 4;
435       CodePoint += Value;
436     }
437 
438     appendCodePoint(CodePoint, Buf);
439     --I;
440   }
441 }
442 
443 bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K,
444                                               const LangOptions &LO) {
445   return LO.MicrosoftExt &&
446          (K == tok::kw___FUNCTION__ || K == tok::kw_L__FUNCTION__ ||
447           K == tok::kw___FUNCSIG__ || K == tok::kw_L__FUNCSIG__ ||
448           K == tok::kw___FUNCDNAME__);
449 }
450 
451 bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) {
452   return tok::isStringLiteral(Tok.getKind()) ||
453          isFunctionLocalStringLiteralMacro(Tok.getKind(), LO);
454 }
455 
456 static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
457                                     const char *&ThisTokBuf,
458                                     const char *ThisTokEnd, uint32_t &UcnVal,
459                                     unsigned short &UcnLen, bool &Delimited,
460                                     FullSourceLoc Loc, DiagnosticsEngine *Diags,
461                                     const LangOptions &Features,
462                                     bool in_char_string_literal = false) {
463   const char *UcnBegin = ThisTokBuf;
464   bool HasError = false;
465   bool EndDelimiterFound = false;
466 
467   // Skip the '\u' char's.
468   ThisTokBuf += 2;
469   Delimited = false;
470   if (UcnBegin[1] == 'u' && in_char_string_literal &&
471       ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
472     Delimited = true;
473     ThisTokBuf++;
474   } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
475     if (Diags)
476       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
477            diag::err_hex_escape_no_digits)
478           << StringRef(&ThisTokBuf[-1], 1);
479     return false;
480   }
481   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
482 
483   bool Overflow = false;
484   unsigned short Count = 0;
485   for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
486        ++ThisTokBuf) {
487     if (Delimited && *ThisTokBuf == '}') {
488       ++ThisTokBuf;
489       EndDelimiterFound = true;
490       break;
491     }
492     int CharVal = llvm::hexDigitValue(*ThisTokBuf);
493     if (CharVal == -1) {
494       HasError = true;
495       if (!Delimited)
496         break;
497       if (Diags) {
498         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
499              diag::err_delimited_escape_invalid)
500             << StringRef(ThisTokBuf, 1);
501       }
502       Count++;
503       continue;
504     }
505     if (UcnVal & 0xF0000000) {
506       Overflow = true;
507       continue;
508     }
509     UcnVal <<= 4;
510     UcnVal |= CharVal;
511     Count++;
512   }
513 
514   if (Overflow) {
515     if (Diags)
516       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
517            diag::err_escape_too_large)
518           << 0;
519     return false;
520   }
521 
522   if (Delimited && !EndDelimiterFound) {
523     if (Diags) {
524       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
525            diag::err_expected)
526           << tok::r_brace;
527     }
528     return false;
529   }
530 
531   // If we didn't consume the proper number of digits, there is a problem.
532   if (Count == 0 || (!Delimited && Count != UcnLen)) {
533     if (Diags)
534       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
535            Delimited ? diag::err_delimited_escape_empty
536                      : diag::err_ucn_escape_incomplete);
537     return false;
538   }
539   return !HasError;
540 }
541 
542 static void DiagnoseInvalidUnicodeCharacterName(
543     DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
544     const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
545     llvm::StringRef Name) {
546 
547   Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
548        diag::err_invalid_ucn_name)
549       << Name;
550 
551   namespace u = llvm::sys::unicode;
552 
553   std::optional<u::LooseMatchingResult> Res =
554       u::nameToCodepointLooseMatching(Name);
555   if (Res) {
556     Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
557          diag::note_invalid_ucn_name_loose_matching)
558         << FixItHint::CreateReplacement(
559                MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
560                                    TokRangeEnd),
561                Res->Name);
562     return;
563   }
564 
565   unsigned Distance = 0;
566   SmallVector<u::MatchForCodepointName> Matches =
567       u::nearestMatchesForCodepointName(Name, 5);
568   assert(!Matches.empty() && "No unicode characters found");
569 
570   for (const auto &Match : Matches) {
571     if (Distance == 0)
572       Distance = Match.Distance;
573     if (std::max(Distance, Match.Distance) -
574             std::min(Distance, Match.Distance) >
575         3)
576       break;
577     Distance = Match.Distance;
578 
579     std::string Str;
580     llvm::UTF32 V = Match.Value;
581     bool Converted =
582         llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
583     (void)Converted;
584     assert(Converted && "Found a match wich is not a unicode character");
585 
586     Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
587          diag::note_invalid_ucn_name_candidate)
588         << Match.Name << llvm::utohexstr(Match.Value)
589         << Str // FIXME: Fix the rendering of non printable characters
590         << FixItHint::CreateReplacement(
591                MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
592                                    TokRangeEnd),
593                Match.Name);
594   }
595 }
596 
597 static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
598                                   const char *&ThisTokBuf,
599                                   const char *ThisTokEnd, uint32_t &UcnVal,
600                                   unsigned short &UcnLen, FullSourceLoc Loc,
601                                   DiagnosticsEngine *Diags,
602                                   const LangOptions &Features) {
603   const char *UcnBegin = ThisTokBuf;
604   assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
605   ThisTokBuf += 2;
606   if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
607     if (Diags) {
608       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
609            diag::err_delimited_escape_missing_brace)
610           << StringRef(&ThisTokBuf[-1], 1);
611     }
612     return false;
613   }
614   ThisTokBuf++;
615   const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) {
616     return C == '}' || isVerticalWhitespace(C);
617   });
618   bool Incomplete = ClosingBrace == ThisTokEnd;
619   bool Empty = ClosingBrace == ThisTokBuf;
620   if (Incomplete || Empty) {
621     if (Diags) {
622       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
623            Incomplete ? diag::err_ucn_escape_incomplete
624                       : diag::err_delimited_escape_empty)
625           << StringRef(&UcnBegin[1], 1);
626     }
627     ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
628     return false;
629   }
630   StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
631   ThisTokBuf = ClosingBrace + 1;
632   std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name);
633   if (!Res) {
634     if (Diags)
635       DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
636                                           &UcnBegin[3], ClosingBrace, Name);
637     return false;
638   }
639   UcnVal = *Res;
640   UcnLen = UcnVal > 0xFFFF ? 8 : 4;
641   return true;
642 }
643 
644 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
645 /// return the UTF32.
646 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
647                              const char *ThisTokEnd, uint32_t &UcnVal,
648                              unsigned short &UcnLen, FullSourceLoc Loc,
649                              DiagnosticsEngine *Diags,
650                              const LangOptions &Features,
651                              bool in_char_string_literal = false) {
652 
653   bool HasError;
654   const char *UcnBegin = ThisTokBuf;
655   bool IsDelimitedEscapeSequence = false;
656   bool IsNamedEscapeSequence = false;
657   if (ThisTokBuf[1] == 'N') {
658     IsNamedEscapeSequence = true;
659     HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
660                                       UcnVal, UcnLen, Loc, Diags, Features);
661   } else {
662     HasError =
663         !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
664                                  UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
665                                  Features, in_char_string_literal);
666   }
667   if (HasError)
668     return false;
669 
670   // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
671   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
672       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
673     if (Diags)
674       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
675            diag::err_ucn_escape_invalid);
676     return false;
677   }
678 
679   // C23 and C++11 allow UCNs that refer to control characters
680   // and basic source characters inside character and string literals
681   if (UcnVal < 0xa0 &&
682       // $, @, ` are allowed in all language modes
683       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {
684     bool IsError =
685         (!(Features.CPlusPlus11 || Features.C23) || !in_char_string_literal);
686     if (Diags) {
687       char BasicSCSChar = UcnVal;
688       if (UcnVal >= 0x20 && UcnVal < 0x7f)
689         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
690              IsError ? diag::err_ucn_escape_basic_scs
691              : Features.CPlusPlus
692                  ? diag::warn_cxx98_compat_literal_ucn_escape_basic_scs
693                  : diag::warn_c23_compat_literal_ucn_escape_basic_scs)
694             << StringRef(&BasicSCSChar, 1);
695       else
696         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
697              IsError ? diag::err_ucn_control_character
698              : Features.CPlusPlus
699                  ? diag::warn_cxx98_compat_literal_ucn_control_character
700                  : diag::warn_c23_compat_literal_ucn_control_character);
701     }
702     if (IsError)
703       return false;
704   }
705 
706   if (!Features.CPlusPlus && !Features.C99 && Diags)
707     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
708          diag::warn_ucn_not_valid_in_c89_literal);
709 
710   if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
711     Lexer::DiagnoseDelimitedOrNamedEscapeSequence(Loc, IsNamedEscapeSequence,
712                                                   Features, *Diags);
713   return true;
714 }
715 
716 /// MeasureUCNEscape - Determine the number of bytes within the resulting string
717 /// which this UCN will occupy.
718 static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
719                             const char *ThisTokEnd, unsigned CharByteWidth,
720                             const LangOptions &Features, bool &HadError) {
721   // UTF-32: 4 bytes per escape.
722   if (CharByteWidth == 4)
723     return 4;
724 
725   uint32_t UcnVal = 0;
726   unsigned short UcnLen = 0;
727   FullSourceLoc Loc;
728 
729   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
730                         UcnLen, Loc, nullptr, Features, true)) {
731     HadError = true;
732     return 0;
733   }
734 
735   // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
736   if (CharByteWidth == 2)
737     return UcnVal <= 0xFFFF ? 2 : 4;
738 
739   // UTF-8.
740   if (UcnVal < 0x80)
741     return 1;
742   if (UcnVal < 0x800)
743     return 2;
744   if (UcnVal < 0x10000)
745     return 3;
746   return 4;
747 }
748 
749 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
750 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
751 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
752 /// we will likely rework our support for UCN's.
753 static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
754                             const char *ThisTokEnd,
755                             char *&ResultBuf, bool &HadError,
756                             FullSourceLoc Loc, unsigned CharByteWidth,
757                             DiagnosticsEngine *Diags,
758                             const LangOptions &Features) {
759   typedef uint32_t UTF32;
760   UTF32 UcnVal = 0;
761   unsigned short UcnLen = 0;
762   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
763                         Loc, Diags, Features, true)) {
764     HadError = true;
765     return;
766   }
767 
768   assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
769          "only character widths of 1, 2, or 4 bytes supported");
770 
771   (void)UcnLen;
772   assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
773 
774   if (CharByteWidth == 4) {
775     // FIXME: Make the type of the result buffer correct instead of
776     // using reinterpret_cast.
777     llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
778     *ResultPtr = UcnVal;
779     ResultBuf += 4;
780     return;
781   }
782 
783   if (CharByteWidth == 2) {
784     // FIXME: Make the type of the result buffer correct instead of
785     // using reinterpret_cast.
786     llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
787 
788     if (UcnVal <= (UTF32)0xFFFF) {
789       *ResultPtr = UcnVal;
790       ResultBuf += 2;
791       return;
792     }
793 
794     // Convert to UTF16.
795     UcnVal -= 0x10000;
796     *ResultPtr     = 0xD800 + (UcnVal >> 10);
797     *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
798     ResultBuf += 4;
799     return;
800   }
801 
802   assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
803 
804   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
805   // The conversion below was inspired by:
806   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
807   // First, we determine how many bytes the result will require.
808   typedef uint8_t UTF8;
809 
810   unsigned short bytesToWrite = 0;
811   if (UcnVal < (UTF32)0x80)
812     bytesToWrite = 1;
813   else if (UcnVal < (UTF32)0x800)
814     bytesToWrite = 2;
815   else if (UcnVal < (UTF32)0x10000)
816     bytesToWrite = 3;
817   else
818     bytesToWrite = 4;
819 
820   const unsigned byteMask = 0xBF;
821   const unsigned byteMark = 0x80;
822 
823   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
824   // into the first byte, depending on how many bytes follow.
825   static const UTF8 firstByteMark[5] = {
826     0x00, 0x00, 0xC0, 0xE0, 0xF0
827   };
828   // Finally, we write the bytes into ResultBuf.
829   ResultBuf += bytesToWrite;
830   switch (bytesToWrite) { // note: everything falls through.
831   case 4:
832     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
833     [[fallthrough]];
834   case 3:
835     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
836     [[fallthrough]];
837   case 2:
838     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
839     [[fallthrough]];
840   case 1:
841     *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
842   }
843   // Update the buffer.
844   ResultBuf += bytesToWrite;
845 }
846 
847 ///       integer-constant: [C99 6.4.4.1]
848 ///         decimal-constant integer-suffix
849 ///         octal-constant integer-suffix
850 ///         hexadecimal-constant integer-suffix
851 ///         binary-literal integer-suffix [GNU, C++1y]
852 ///       user-defined-integer-literal: [C++11 lex.ext]
853 ///         decimal-literal ud-suffix
854 ///         octal-literal ud-suffix
855 ///         hexadecimal-literal ud-suffix
856 ///         binary-literal ud-suffix [GNU, C++1y]
857 ///       decimal-constant:
858 ///         nonzero-digit
859 ///         decimal-constant digit
860 ///       octal-constant:
861 ///         0
862 ///         octal-constant octal-digit
863 ///       hexadecimal-constant:
864 ///         hexadecimal-prefix hexadecimal-digit
865 ///         hexadecimal-constant hexadecimal-digit
866 ///       hexadecimal-prefix: one of
867 ///         0x 0X
868 ///       binary-literal:
869 ///         0b binary-digit
870 ///         0B binary-digit
871 ///         binary-literal binary-digit
872 ///       integer-suffix:
873 ///         unsigned-suffix [long-suffix]
874 ///         unsigned-suffix [long-long-suffix]
875 ///         long-suffix [unsigned-suffix]
///         long-long-suffix [unsigned-suffix]
877 ///       nonzero-digit:
878 ///         1 2 3 4 5 6 7 8 9
879 ///       octal-digit:
880 ///         0 1 2 3 4 5 6 7
881 ///       hexadecimal-digit:
882 ///         0 1 2 3 4 5 6 7 8 9
883 ///         a b c d e f
884 ///         A B C D E F
885 ///       binary-digit:
886 ///         0
887 ///         1
888 ///       unsigned-suffix: one of
889 ///         u U
890 ///       long-suffix: one of
891 ///         l L
892 ///       long-long-suffix: one of
893 ///         ll LL
894 ///
895 ///       floating-constant: [C99 6.4.4.2]
896 ///         TODO: add rules...
897 ///
/// Construct a parser over the spelling of a single numeric literal token.
///
/// All lexical classification happens here: the radix is determined, the
/// digit range and the start of the suffix are recorded, and every suffix that
/// is recognised is reflected in the is*/saw_* members. Problems are reported
/// through \p Diags and leave hadError set.
///
/// \param TokSpelling the characters of the token. The byte one past its end
///        is read (see the pp-number comment in the body).
/// \param TokLoc location of the first character of the token; diagnostics
///        are positioned relative to it.
/// \param SM used to compute diagnostic locations inside the token.
/// \param LangOpts gates the language-specific suffixes and extensions.
/// \param Target queried for _Float16 support and the target triple when an
///        "f16" suffix is seen.
/// \param Diags receives every diagnostic produced while parsing.
NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
                                           SourceLocation TokLoc,
                                           const SourceManager &SM,
                                           const LangOptions &LangOpts,
                                           const TargetInfo &Target,
                                           DiagnosticsEngine &Diags)
    : SM(SM), LangOpts(LangOpts), Diags(Diags),
      ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {

  // Start from a clean slate: the cursor and the digit range both begin at the
  // first character, and no suffix or floating-point marker has been seen.
  s = DigitsBegin = ThisTokBegin;
  saw_exponent = false;
  saw_period = false;
  saw_ud_suffix = false;
  saw_fixed_point_suffix = false;
  isLong = false;
  isUnsigned = false;
  isLongLong = false;
  isSizeT = false;
  isHalf = false;
  isFloat = false;
  isImaginary = false;
  isFloat16 = false;
  isFloat128 = false;
  MicrosoftInteger = 0;
  isFract = false;
  isAccum = false;
  hadError = false;
  isBitInt = false;

  // This routine assumes that the range begin/end matches the regex for integer
  // and FP constants (specifically, the 'pp-number' regex), and assumes that
  // the byte at "*end" is both valid and not part of the regex.  Because of
  // this, it doesn't have to check for 'overscan' in various places.
  // Note: For HLSL, the end token is allowed to be '.' which would be in the
  // 'pp-number' regex. This is required to support vector swizzles on numeric
  // constants (i.e. 1.xx or 1.5f.rrr).
  if (isPreprocessingNumberBody(*ThisTokEnd) &&
      !(LangOpts.HLSL && *ThisTokEnd == '.')) {
    Diags.Report(TokLoc, diag::err_lexing_numeric);
    hadError = true;
    return;
  }

  if (*s == '0') { // parse radix
    ParseNumberStartingWithZero(TokLoc);
    if (hadError)
      return;
  } else { // the first digit is non-zero
    radix = 10;
    s = SkipDigits(s);
    if (s == ThisTokEnd) {
      // Done.
    } else {
      ParseDecimalOrOctalCommon(TokLoc);
      if (hadError)
        return;
    }
  }

  // The cursor now sits just past the digits (and any fraction/exponent);
  // everything from here to ThisTokEnd is treated as the suffix.
  SuffixBegin = s;
  checkSeparator(TokLoc, s, CSK_AfterDigits);

  // Initial scan to lookahead for fixed point suffix.
  if (LangOpts.FixedPoint) {
    for (const char *c = s; c != ThisTokEnd; ++c) {
      if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') {
        saw_fixed_point_suffix = true;
        break;
      }
    }
  }

  // Parse the suffix.  At this point we can classify whether we have an FP or
  // integer constant.
  bool isFixedPointConstant = isFixedPointLiteral();
  bool isFPConstant = isFloatingLiteral();
  // Set once any size-determining suffix has been accepted, so that a second
  // one is rejected.
  bool HasSize = false;
  // Set once a "__" prefix has been consumed by the '_' case below.
  bool DoubleUnderscore = false;

  // Loop over all of the characters of the suffix.  If we see something bad,
  // we break out of the loop.
  //
  // Inside the switch, `continue` accepts the current character(s) and moves
  // on to the next one, while `break` leaves the switch and reaches the
  // unconditional `break` at the bottom of the loop body, which ends the scan
  // with s left where it is.
  for (; s != ThisTokEnd; ++s) {
    switch (*s) {
    case 'R':
    case 'r':
      if (!LangOpts.FixedPoint)
        break;
      if (isFract || isAccum) break;
      if (!(saw_period || saw_exponent)) break;
      isFract = true;
      continue;
    case 'K':
    case 'k':
      if (!LangOpts.FixedPoint)
        break;
      if (isFract || isAccum) break;
      if (!(saw_period || saw_exponent)) break;
      isAccum = true;
      continue;
    case 'h':      // FP Suffix for "half".
    case 'H':
      // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
      if (!(LangOpts.Half || LangOpts.FixedPoint))
        break;
      if (isIntegerLiteral()) break;  // Error for integer constant.
      if (HasSize)
        break;
      HasSize = true;
      isHalf = true;
      continue;  // Success.
    case 'f':      // FP Suffix for "float"
    case 'F':
      if (!isFPConstant) break;  // Error for integer constant.
      if (HasSize)
        break;
      HasSize = true;

      // CUDA host and device may have different _Float16 support, therefore
      // allows f16 literals to avoid false alarm.
      // When we compile for OpenMP target offloading on NVPTX, f16 suffix
      // should also be supported.
      // ToDo: more precise check for CUDA.
      // TODO: AMDGPU might also support it in the future.
      if ((Target.hasFloat16Type() || LangOpts.CUDA ||
           (LangOpts.OpenMPIsTargetDevice && Target.getTriple().isNVPTX())) &&
          s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') {
        s += 2; // success, eat up 2 characters.
        isFloat16 = true;
        continue;
      }

      isFloat = true;
      continue;  // Success.
    case 'q':    // FP Suffix for "__float128"
    case 'Q':
      if (!isFPConstant) break;  // Error for integer constant.
      if (HasSize)
        break;
      HasSize = true;
      isFloat128 = true;
      continue;  // Success.
    case 'u':
    case 'U':
      if (isFPConstant) break;  // Error for floating constant.
      if (isUnsigned) break;    // Cannot be repeated.
      isUnsigned = true;
      continue;  // Success.
    case 'l':
    case 'L':
      if (HasSize)
        break;
      HasSize = true;

      // Check for long long.  The L's need to be adjacent and the same case.
      // s[1] may be the byte at ThisTokEnd; the pp-number precondition above
      // is what makes that read acceptable.
      if (s[1] == s[0]) {
        assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
        if (isFPConstant) break;        // long long invalid for floats.
        isLongLong = true;
        ++s;  // Eat both of them.
      } else {
        isLong = true;
      }
      continue; // Success.
    case 'z':
    case 'Z':
      if (isFPConstant)
        break; // Invalid for floats.
      if (HasSize)
        break;
      HasSize = true;
      isSizeT = true;
      continue;
    case 'i':
    case 'I':
      if (LangOpts.MicrosoftExt && s + 1 < ThisTokEnd && !isFPConstant) {
        // Allow i8, i16, i32, i64, and i128. First, look ahead and check if
        // suffixes are Microsoft integers and not the imaginary unit.
        uint8_t Bits = 0;
        size_t ToSkip = 0;
        switch (s[1]) {
        case '8': // i8 suffix
          Bits = 8;
          ToSkip = 2;
          break;
        case '1':
          if (s + 2 < ThisTokEnd && s[2] == '6') { // i16 suffix
            Bits = 16;
            ToSkip = 3;
          } else if (s + 3 < ThisTokEnd && s[2] == '2' &&
                     s[3] == '8') { // i128 suffix
            Bits = 128;
            ToSkip = 4;
          }
          break;
        case '3':
          if (s + 2 < ThisTokEnd && s[2] == '2') { // i32 suffix
            Bits = 32;
            ToSkip = 3;
          }
          break;
        case '6':
          if (s + 2 < ThisTokEnd && s[2] == '4') { // i64 suffix
            Bits = 64;
            ToSkip = 3;
          }
          break;
        default:
          break;
        }
        if (Bits) {
          // Both `break`s below leave the outer switch and therefore end the
          // suffix scan. On the HasSize path s still points at the 'i'; on
          // the success path s has been moved past the whole iN suffix, so
          // the check after the loop decides whether anything is left over.
          if (HasSize)
            break;
          HasSize = true;
          MicrosoftInteger = Bits;
          s += ToSkip;
          assert(s <= ThisTokEnd && "didn't maximally munch?");
          break;
        }
      }
      // Not a Microsoft integer suffix: treat 'i'/'I' as the imaginary unit.
      [[fallthrough]];
    case 'j':
    case 'J':
      if (isImaginary) break;   // Cannot be repeated.
      isImaginary = true;
      continue;  // Success.
    case '_':
      if (isFPConstant)
        break; // Invalid for floats
      if (HasSize)
        break;
      // There is currently no way to reach this with DoubleUnderscore set.
      // If new double underscore literals are added handle it here as above.
      assert(!DoubleUnderscore && "unhandled double underscore case");
      if (LangOpts.CPlusPlus && s + 2 < ThisTokEnd &&
          s[1] == '_') { // s + 2 < ThisTokEnd to ensure some character exists
                         // after __
        DoubleUnderscore = true;
        s += 2; // Skip both '_'
        if (s + 1 < ThisTokEnd &&
            (*s == 'u' || *s == 'U')) { // Ensure some character after 'u'/'U'
          isUnsigned = true;
          ++s;
        }
        // Note that the comparison below advances s as a side effect when the
        // first character matches; on success s is left on the 'b'/'B' and the
        // loop increment steps past it.
        if (s + 1 < ThisTokEnd &&
            ((*s == 'w' && *(++s) == 'b') || (*s == 'W' && *(++s) == 'B'))) {
          isBitInt = true;
          HasSize = true;
          continue;
        }
      }
      break;
    case 'w':
    case 'W':
      if (isFPConstant)
        break; // Invalid for floats.
      if (HasSize)
        break; // Invalid if we already have a size for the literal.

      // wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
      // explicitly do not support the suffix in C++ as an extension because a
      // library-based UDL that resolves to a library type may be more
      // appropriate there. The same rules apply for __wb/__WB.
      if ((!LangOpts.CPlusPlus || DoubleUnderscore) && s + 1 < ThisTokEnd &&
          ((s[0] == 'w' && s[1] == 'b') || (s[0] == 'W' && s[1] == 'B'))) {
        isBitInt = true;
        HasSize = true;
        ++s; // Skip both characters (2nd char skipped on continue).
        continue; // Success.
      }
    }
    // If we reached here, there was an error or a ud-suffix.
    break;
  }

  // "i", "if", and "il" are user-defined suffixes in C++1y.
  if (s != ThisTokEnd || isImaginary) {
    // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
    expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
    if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
      if (!isImaginary) {
        // Any suffix pieces we might have parsed are actually part of the
        // ud-suffix.
        isLong = false;
        isUnsigned = false;
        isLongLong = false;
        isSizeT = false;
        isFloat = false;
        isFloat16 = false;
        isHalf = false;
        isImaginary = false;
        isBitInt = false;
        MicrosoftInteger = 0;
        saw_fixed_point_suffix = false;
        isFract = false;
        isAccum = false;
      }

      saw_ud_suffix = true;
      return;
    }

    // Not a ud-suffix. If the scan stopped early, the whole suffix (from
    // SuffixBegin) is reported as invalid.
    if (s != ThisTokEnd) {
      // Report an error if there are any.
      Diags.Report(Lexer::AdvanceToTokenCharacter(
                       TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
                   diag::err_invalid_suffix_constant)
          << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
          << (isFixedPointConstant ? 2 : isFPConstant);
      hadError = true;
    }
  }

  // Sanity check: the lookahead and the suffix scan must agree on whether a
  // fixed-point suffix is present.
  if (!hadError && saw_fixed_point_suffix) {
    assert(isFract || isAccum);
  }
}
1214 
1215 /// ParseDecimalOrOctalCommon - This method is called for decimal or octal
1216 /// numbers. It issues an error for illegal digits, and handles floating point
1217 /// parsing. If it detects a floating point number, the radix is set to 10.
1218 void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
1219   assert((radix == 8 || radix == 10) && "Unexpected radix");
1220 
1221   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
1222   // the code is using an incorrect base.
1223   if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
1224       !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
1225     Diags.Report(
1226         Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
1227         diag::err_invalid_digit)
1228         << StringRef(s, 1) << (radix == 8 ? 1 : 0);
1229     hadError = true;
1230     return;
1231   }
1232 
1233   if (*s == '.') {
1234     checkSeparator(TokLoc, s, CSK_AfterDigits);
1235     s++;
1236     radix = 10;
1237     saw_period = true;
1238     checkSeparator(TokLoc, s, CSK_BeforeDigits);
1239     s = SkipDigits(s); // Skip suffix.
1240   }
1241   if (*s == 'e' || *s == 'E') { // exponent
1242     checkSeparator(TokLoc, s, CSK_AfterDigits);
1243     const char *Exponent = s;
1244     s++;
1245     radix = 10;
1246     saw_exponent = true;
1247     if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
1248     const char *first_non_digit = SkipDigits(s);
1249     if (containsDigits(s, first_non_digit)) {
1250       checkSeparator(TokLoc, s, CSK_BeforeDigits);
1251       s = first_non_digit;
1252     } else {
1253       if (!hadError) {
1254         Diags.Report(Lexer::AdvanceToTokenCharacter(
1255                          TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1256                      diag::err_exponent_has_no_digits);
1257         hadError = true;
1258       }
1259       return;
1260     }
1261   }
1262 }
1263 
1264 /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1265 /// suffixes as ud-suffixes, because the diagnostic experience is better if we
1266 /// treat it as an invalid suffix.
1267 bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1268                                            StringRef Suffix) {
1269   if (!LangOpts.CPlusPlus11 || Suffix.empty())
1270     return false;
1271 
1272   // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
1273   // Suffixes starting with '__' (double underscore) are for use by
1274   // the implementation.
1275   if (Suffix.starts_with("_") && !Suffix.starts_with("__"))
1276     return true;
1277 
1278   // In C++11, there are no library suffixes.
1279   if (!LangOpts.CPlusPlus14)
1280     return false;
1281 
1282   // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
1283   // Per tweaked N3660, "il", "i", and "if" are also used in the library.
1284   // In C++2a "d" and "y" are used in the library.
1285   return llvm::StringSwitch<bool>(Suffix)
1286       .Cases("h", "min", "s", true)
1287       .Cases("ms", "us", "ns", true)
1288       .Cases("il", "i", "if", true)
1289       .Cases("d", "y", LangOpts.CPlusPlus20)
1290       .Default(false);
1291 }
1292 
1293 void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
1294                                           const char *Pos,
1295                                           CheckSeparatorKind IsAfterDigits) {
1296   if (IsAfterDigits == CSK_AfterDigits) {
1297     if (Pos == ThisTokBegin)
1298       return;
1299     --Pos;
1300   } else if (Pos == ThisTokEnd)
1301     return;
1302 
1303   if (isDigitSeparator(*Pos)) {
1304     Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
1305                                                 LangOpts),
1306                  diag::err_digit_separator_not_between_digits)
1307         << IsAfterDigits;
1308     hadError = true;
1309   }
1310 }
1311 
1312 /// ParseNumberStartingWithZero - This method is called when the first character
1313 /// of the number is found to be a zero.  This means it is either an octal
1314 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
1315 /// a floating point number (01239.123e4).  Eat the prefix, determining the
1316 /// radix etc.
void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
  assert(s[0] == '0' && "Invalid method call");
  // Step over the leading '0'; c1 is the character that follows it and
  // selects which kind of literal this is.
  s++;

  int c1 = s[0];

  // Handle a hex number like 0x1234.
  // A '.' directly after the prefix is let through here as well; whether any
  // significand digit is actually present is checked further down.
  if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
    s++;
    assert(s < ThisTokEnd && "didn't maximally munch?");
    radix = 16;
    DigitsBegin = s;
    s = SkipHexDigits(s);
    bool HasSignificandDigits = containsDigits(DigitsBegin, s);
    if (s == ThisTokEnd) {
      // Done.
    } else if (*s == '.') {
      s++;
      saw_period = true;
      const char *floatDigitsBegin = s;
      s = SkipHexDigits(s);
      // Digits on either side of the '.' are enough.
      if (containsDigits(floatDigitsBegin, s))
        HasSignificandDigits = true;
      if (HasSignificandDigits)
        checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
    }

    // Neither the integer part nor the fraction contributed a digit.
    if (!HasSignificandDigits) {
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
                                                  LangOpts),
                   diag::err_hex_constant_requires)
          << LangOpts.CPlusPlus << 1;
      hadError = true;
      return;
    }

    // A binary exponent can appear with or without a '.'. If dotted, the
    // binary exponent is required.
    if (*s == 'p' || *s == 'P') {
      checkSeparator(TokLoc, s, CSK_AfterDigits);
      const char *Exponent = s;
      s++;
      saw_exponent = true;
      if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
      const char *first_non_digit = SkipDigits(s);
      if (!containsDigits(s, first_non_digit)) {
        // Only report the empty exponent if nothing was diagnosed before.
        if (!hadError) {
          Diags.Report(Lexer::AdvanceToTokenCharacter(
                           TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
                       diag::err_exponent_has_no_digits);
          hadError = true;
        }
        return;
      }
      checkSeparator(TokLoc, s, CSK_BeforeDigits);
      s = first_non_digit;

      // The literal is well-formed; the remaining diagnostics depend only on
      // the language options, and none of them sets hadError.
      if (!LangOpts.HexFloats)
        Diags.Report(TokLoc, LangOpts.CPlusPlus
                                 ? diag::ext_hex_literal_invalid
                                 : diag::ext_hex_constant_invalid);
      else if (LangOpts.CPlusPlus17)
        Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
    } else if (saw_period) {
      // A '.' was seen but no binary exponent follows it.
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
                                                  LangOpts),
                   diag::err_hex_constant_requires)
          << LangOpts.CPlusPlus << 0;
      hadError = true;
    }
    return;
  }

  // Handle simple binary numbers 0b01010
  if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
    // 0b101010 is a C++14 and C23 extension.
    unsigned DiagId;
    if (LangOpts.CPlusPlus14)
      DiagId = diag::warn_cxx11_compat_binary_literal;
    else if (LangOpts.C23)
      DiagId = diag::warn_c23_compat_binary_literal;
    else if (LangOpts.CPlusPlus)
      DiagId = diag::ext_binary_literal_cxx14;
    else
      DiagId = diag::ext_binary_literal;
    Diags.Report(TokLoc, DiagId);
    ++s;
    assert(s < ThisTokEnd && "didn't maximally munch?");
    radix = 2;
    DigitsBegin = s;
    s = SkipBinaryDigits(s);
    if (s == ThisTokEnd) {
      // Done.
    } else if (isHexDigit(*s) &&
               !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
      // A hex digit that is neither 0 nor 1 and does not start a ud-suffix.
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
                                                  LangOpts),
                   diag::err_invalid_digit)
          << StringRef(s, 1) << 2;
      hadError = true;
    }
    // Other suffixes will be diagnosed by the caller.
    return;
  }

  // Parse a potential octal literal prefix.
  bool IsSingleZero = false;
  if ((c1 == 'O' || c1 == 'o') && (s[1] >= '0' && s[1] <= '7')) {
    unsigned DiagId;
    if (LangOpts.C2y)
      DiagId = diag::warn_c2y_compat_octal_literal;
    else if (LangOpts.CPlusPlus)
      DiagId = diag::ext_cpp_octal_literal;
    else
      DiagId = diag::ext_octal_literal;
    Diags.Report(TokLoc, DiagId);
    ++s;
    DigitsBegin = s;
    radix = 8;
    s = SkipOctalDigits(s);
    if (s == ThisTokEnd) {
      // Done
    } else if ((isHexDigit(*s) && *s != 'e' && *s != 'E' && *s != '.') &&
               !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
      auto InvalidDigitLoc = Lexer::AdvanceToTokenCharacter(
          TokLoc, s - ThisTokBegin, SM, LangOpts);
      Diags.Report(InvalidDigitLoc, diag::err_invalid_digit)
          << StringRef(s, 1) << 1;
      hadError = true;
    }
    // Other suffixes will be diagnosed by the caller.
    return;
  }

  // The callback captures by reference and runs when this function is left,
  // so it observes the final values of radix, hadError and IsSingleZero.
  auto _ = llvm::make_scope_exit([&] {
    // If we still have an octal value but we did not see an octal prefix,
    // diagnose as being an obsolescent feature starting in C2y.
    if (radix == 8 && LangOpts.C2y && !hadError && !IsSingleZero)
      Diags.Report(TokLoc, diag::warn_unprefixed_octal_deprecated);
  });

  // For now, the radix is set to 8. If we discover that we have a
  // floating point constant, the radix will change to 10. Octal floating
  // point constants are not permitted (only decimal and hexadecimal).
  radix = 8;
  const char *PossibleNewDigitStart = s;
  s = SkipOctalDigits(s);
  // When the value is 0 followed by a suffix (like 0wb), we want to leave 0
  // as the start of the digits. So if skipping octal digits does not skip
  // anything, we leave the digit start where it was.
  if (s != PossibleNewDigitStart)
    DigitsBegin = PossibleNewDigitStart;
  else
    IsSingleZero = (s == ThisTokBegin + 1);

  if (s == ThisTokEnd)
    return; // Done, simple octal number like 01234

  // If we have some other non-octal digit that *is* a decimal digit, see if
  // this is part of a floating point number like 094.123 or 09e1.
  if (isDigit(*s)) {
    const char *EndDecimal = SkipDigits(s);
    if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
      s = EndDecimal;
      radix = 10;
    }
  }

  // Anything still unconsumed (bad digit, fraction, exponent) is handled by
  // the shared decimal/octal tail.
  ParseDecimalOrOctalCommon(TokLoc);
}
1487 
1488 static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
1489   switch (Radix) {
1490   case 2:
1491     return NumDigits <= 64;
1492   case 8:
1493     return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
1494   case 10:
1495     return NumDigits <= 19; // floor(log10(2^64))
1496   case 16:
1497     return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
1498   default:
1499     llvm_unreachable("impossible Radix");
1500   }
1501 }
1502 
1503 /// GetIntegerValue - Convert this numeric literal value to an APInt that
1504 /// matches Val's input width.  If there is an overflow, set Val to the low bits
1505 /// of the result and return true.  Otherwise, return false.
1506 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
1507   // Fast path: Compute a conservative bound on the maximum number of
1508   // bits per digit in this radix. If we can't possibly overflow a
1509   // uint64 based on that bound then do the simple conversion to
1510   // integer. This avoids the expensive overflow checking below, and
1511   // handles the common cases that matter (small decimal integers and
1512   // hex/octal values which don't overflow).
1513   const unsigned NumDigits = SuffixBegin - DigitsBegin;
1514   if (alwaysFitsInto64Bits(radix, NumDigits)) {
1515     uint64_t N = 0;
1516     for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
1517       if (!isDigitSeparator(*Ptr))
1518         N = N * radix + llvm::hexDigitValue(*Ptr);
1519 
1520     // This will truncate the value to Val's input width. Simply check
1521     // for overflow by comparing.
1522     Val = N;
1523     return Val.getZExtValue() != N;
1524   }
1525 
1526   Val = 0;
1527   const char *Ptr = DigitsBegin;
1528 
1529   llvm::APInt RadixVal(Val.getBitWidth(), radix);
1530   llvm::APInt CharVal(Val.getBitWidth(), 0);
1531   llvm::APInt OldVal = Val;
1532 
1533   bool OverflowOccurred = false;
1534   while (Ptr < SuffixBegin) {
1535     if (isDigitSeparator(*Ptr)) {
1536       ++Ptr;
1537       continue;
1538     }
1539 
1540     unsigned C = llvm::hexDigitValue(*Ptr++);
1541 
1542     // If this letter is out of bound for this radix, reject it.
1543     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1544 
1545     CharVal = C;
1546 
1547     // Add the digit to the value in the appropriate radix.  If adding in digits
1548     // made the value smaller, then this overflowed.
1549     OldVal = Val;
1550 
1551     // Multiply by radix, did overflow occur on the multiply?
1552     Val *= RadixVal;
1553     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
1554 
1555     // Add value, did overflow occur on the value?
1556     //   (a + b) ult b  <=> overflow
1557     Val += CharVal;
1558     OverflowOccurred |= Val.ult(CharVal);
1559   }
1560   return OverflowOccurred;
1561 }
1562 
1563 llvm::APFloat::opStatus
1564 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result,
1565                                     llvm::RoundingMode RM) {
1566   using llvm::APFloat;
1567 
1568   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
1569 
1570   llvm::SmallString<16> Buffer;
1571   StringRef Str(ThisTokBegin, n);
1572   if (Str.contains('\'')) {
1573     Buffer.reserve(n);
1574     std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
1575                         &isDigitSeparator);
1576     Str = Buffer;
1577   }
1578 
1579   auto StatusOrErr = Result.convertFromString(Str, RM);
1580   assert(StatusOrErr && "Invalid floating point representation");
1581   return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
1582                                                : APFloat::opInvalidOp;
1583 }
1584 
/// Return true if \p c introduces an exponent: 'p'/'P' when \p isHex is set,
/// 'e'/'E' otherwise.
static inline bool IsExponentPart(char c, bool isHex) {
  const char LowerMarker = isHex ? 'p' : 'e';
  const char UpperMarker = isHex ? 'P' : 'E';
  return c == LowerMarker || c == UpperMarker;
}
1590 
1591 bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
1592   assert(radix == 16 || radix == 10);
1593 
1594   // Find how many digits are needed to store the whole literal.
1595   unsigned NumDigits = SuffixBegin - DigitsBegin;
1596   if (saw_period) --NumDigits;
1597 
1598   // Initial scan of the exponent if it exists
1599   bool ExpOverflowOccurred = false;
1600   bool NegativeExponent = false;
1601   const char *ExponentBegin;
1602   uint64_t Exponent = 0;
1603   int64_t BaseShift = 0;
1604   if (saw_exponent) {
1605     const char *Ptr = DigitsBegin;
1606 
1607     while (!IsExponentPart(*Ptr, radix == 16))
1608       ++Ptr;
1609     ExponentBegin = Ptr;
1610     ++Ptr;
1611     NegativeExponent = *Ptr == '-';
1612     if (NegativeExponent) ++Ptr;
1613 
1614     unsigned NumExpDigits = SuffixBegin - Ptr;
1615     if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
1616       llvm::StringRef ExpStr(Ptr, NumExpDigits);
1617       llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
1618       Exponent = ExpInt.getZExtValue();
1619     } else {
1620       ExpOverflowOccurred = true;
1621     }
1622 
1623     if (NegativeExponent) BaseShift -= Exponent;
1624     else BaseShift += Exponent;
1625   }
1626 
1627   // Number of bits needed for decimal literal is
1628   //   ceil(NumDigits * log2(10))       Integral part
1629   // + Scale                            Fractional part
1630   // + ceil(Exponent * log2(10))        Exponent
1631   // --------------------------------------------------
1632   //   ceil((NumDigits + Exponent) * log2(10)) + Scale
1633   //
1634   // But for simplicity in handling integers, we can round up log2(10) to 4,
1635   // making:
1636   // 4 * (NumDigits + Exponent) + Scale
1637   //
1638   // Number of digits needed for hexadecimal literal is
1639   //   4 * NumDigits                    Integral part
1640   // + Scale                            Fractional part
1641   // + Exponent                         Exponent
1642   // --------------------------------------------------
1643   //   (4 * NumDigits) + Scale + Exponent
1644   uint64_t NumBitsNeeded;
1645   if (radix == 10)
1646     NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
1647   else
1648     NumBitsNeeded = 4 * NumDigits + Exponent + Scale;
1649 
1650   if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
1651     ExpOverflowOccurred = true;
1652   llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);
1653 
1654   bool FoundDecimal = false;
1655 
1656   int64_t FractBaseShift = 0;
1657   const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
1658   for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
1659     if (*Ptr == '.') {
1660       FoundDecimal = true;
1661       continue;
1662     }
1663 
1664     // Normal reading of an integer
1665     unsigned C = llvm::hexDigitValue(*Ptr);
1666     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1667 
1668     Val *= radix;
1669     Val += C;
1670 
1671     if (FoundDecimal)
1672       // Keep track of how much we will need to adjust this value by from the
1673       // number of digits past the radix point.
1674       --FractBaseShift;
1675   }
1676 
1677   // For a radix of 16, we will be multiplying by 2 instead of 16.
1678   if (radix == 16) FractBaseShift *= 4;
1679   BaseShift += FractBaseShift;
1680 
1681   Val <<= Scale;
1682 
1683   uint64_t Base = (radix == 16) ? 2 : 10;
1684   if (BaseShift > 0) {
1685     for (int64_t i = 0; i < BaseShift; ++i) {
1686       Val *= Base;
1687     }
1688   } else if (BaseShift < 0) {
1689     for (int64_t i = BaseShift; i < 0 && !Val.isZero(); ++i)
1690       Val = Val.udiv(Base);
1691   }
1692 
1693   bool IntOverflowOccurred = false;
1694   auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
1695   if (Val.getBitWidth() > StoreVal.getBitWidth()) {
1696     IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
1697     StoreVal = Val.trunc(StoreVal.getBitWidth());
1698   } else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
1699     IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
1700     StoreVal = Val.zext(StoreVal.getBitWidth());
1701   } else {
1702     StoreVal = Val;
1703   }
1704 
1705   return IntOverflowOccurred || ExpOverflowOccurred;
1706 }
1707 
1708 /// \verbatim
1709 ///       user-defined-character-literal: [C++11 lex.ext]
1710 ///         character-literal ud-suffix
1711 ///       ud-suffix:
1712 ///         identifier
1713 ///       character-literal: [C++11 lex.ccon]
1714 ///         ' c-char-sequence '
1715 ///         u' c-char-sequence '
1716 ///         U' c-char-sequence '
1717 ///         L' c-char-sequence '
1718 ///         u8' c-char-sequence ' [C++1z lex.ccon]
1719 ///       c-char-sequence:
1720 ///         c-char
1721 ///         c-char-sequence c-char
1722 ///       c-char:
1723 ///         any member of the source character set except the single-quote ',
1724 ///           backslash \, or new-line character
1725 ///         escape-sequence
1726 ///         universal-character-name
1727 ///       escape-sequence:
1728 ///         simple-escape-sequence
1729 ///         octal-escape-sequence
1730 ///         hexadecimal-escape-sequence
1731 ///       simple-escape-sequence:
1732 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1733 ///       octal-escape-sequence:
1734 ///         \ octal-digit
1735 ///         \ octal-digit octal-digit
1736 ///         \ octal-digit octal-digit octal-digit
1737 ///       hexadecimal-escape-sequence:
1738 ///         \x hexadecimal-digit
1739 ///         hexadecimal-escape-sequence hexadecimal-digit
1740 ///       universal-character-name: [C++11 lex.charset]
1741 ///         \u hex-quad
1742 ///         \U hex-quad hex-quad
1743 ///       hex-quad:
1744 ///         hex-digit hex-digit hex-digit hex-digit
1745 /// \endverbatim
1746 ///
/// Parse the spelling [begin, end) of one character-literal token.
///
/// Sets Kind and HadError, and - unless the opening quote is missing, in which
/// case the constructor returns early - IsMultiChar and Value.  When the token
/// carries a ud-suffix, UDSuffixBuf and UDSuffixOffset are filled in as well.
///
/// \param begin First character of the token spelling (the encoding prefix if
///              there is one, otherwise the opening quote).
/// \param end   One past the last character of the token spelling.
/// \param Loc   Location passed to every diagnostic.
/// \param PP    Supplies diagnostics, target widths and language options.
/// \param kind  Token kind; selects the prefix length and the largest code
///              point the literal may contain.
1747 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
1748                                      SourceLocation Loc, Preprocessor &PP,
1749                                      tok::TokenKind kind) {
1750   // The token spelling is expected to match the regex "(L|u8|u|U)?'.*'",
  // optionally followed by a ud-suffix.
1751   HadError = false;
1752 
1753   Kind = kind;
1754 
1755   const char *TokBegin = begin;
1756 
1757   // Skip over the encoding prefix: one character for every prefixed kind,
  // plus a second one for the '8' of u8.
1758   if (Kind != tok::char_constant)
1759     ++begin;
1760   if (Kind == tok::utf8_char_constant)
1761     ++begin;
1762 
1763   // Skip over the entry quote, diagnosing a lexing error if it is missing.
1764   if (begin[0] != '\'') {
1765     PP.Diag(Loc, diag::err_lexing_char);
1766     HadError = true;
1767     return;
1768   }
1769 
1770   ++begin;
1771 
1772   // Remove an optional ud-suffix: walk `end` back to just past the closing
  // quote, then record the suffix text and its offset within the token.
1773   if (end[-1] != '\'') {
1774     const char *UDSuffixEnd = end;
1775     do {
1776       --end;
1777     } while (end[-1] != '\'');
1778     // FIXME: Don't bother with this if !tok.hasUCN().
1779     expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
1780     UDSuffixOffset = end - TokBegin;
1781   }
1782 
1783   // Trim the ending quote.
1784   assert(end != begin && "Invalid token lexed");
1785   --end;
1786 
1787   // FIXME: The "Value" is an uint64_t so we can handle char literals of
1788   // up to 64-bits.
1789   // FIXME: This extensively assumes that 'char' is 8-bits.
1790   assert(PP.getTargetInfo().getCharWidth() == 8 &&
1791          "Assumes char is 8 bits");
1792   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
1793          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
1794          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
1795   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
1796          "Assumes sizeof(wchar) on target is <= 64");
1797 
  // Decoded characters are collected here, one 32-bit slot per character.  The
  // buffer gets one slot per byte between the quotes; buffer_begin is the
  // write cursor and buffer_end the limit handed to the UTF-8 converter.
1798   SmallVector<uint32_t, 4> codepoint_buffer;
1799   codepoint_buffer.resize(end - begin);
1800   uint32_t *buffer_begin = &codepoint_buffer.front();
1801   uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1802 
1803   // Unicode escapes representing characters that cannot be correctly
1804   // represented in a single code unit are disallowed in character literals
1805   // by this implementation.
1806   uint32_t largest_character_for_kind;
1807   if (tok::wide_char_constant == Kind) {
    // NOTE(review): the shift count 32 - getWCharWidth() is only valid for
    // wchar_t widths of at most 32 bits, although the assert above admits
    // widths up to 64.
1808     largest_character_for_kind =
1809         0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
1810   } else if (tok::utf8_char_constant == Kind) {
1811     largest_character_for_kind = 0x7F;
1812   } else if (tok::utf16_char_constant == Kind) {
1813     largest_character_for_kind = 0xFFFF;
1814   } else if (tok::utf32_char_constant == Kind) {
1815     largest_character_for_kind = 0x10FFFF;
1816   } else {
1817     largest_character_for_kind = 0x7Fu;
1818   }
1819 
  // Decode the contents piecewise: runs of plain characters, UCN escapes
  // (\u, \U, \N) and all other escape sequences are handled separately.
1820   while (begin != end) {
1821     // Is this a span of non-escape characters?
1822     if (begin[0] != '\\') {
1823       char const *start = begin;
1824       do {
1825         ++begin;
1826       } while (begin != end && *begin != '\\');
1827 
      // Remember both cursors so that a failed conversion can be redone as a
      // plain byte copy.
1828       char const *tmp_in_start = start;
1829       uint32_t *tmp_out_start = buffer_begin;
1830       llvm::ConversionResult res =
1831           llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
1832                              reinterpret_cast<llvm::UTF8 const *>(begin),
1833                              &buffer_begin, buffer_end, llvm::strictConversion);
1834       if (res != llvm::conversionOK) {
1835         // If we see bad encoding for unprefixed character literals, warn and
1836         // simply copy the byte values, for compatibility with gcc and
1837         // older versions of clang.
1838         bool NoErrorOnBadEncoding = isOrdinary();
1839         unsigned Msg = diag::err_bad_character_encoding;
1840         if (NoErrorOnBadEncoding)
1841           Msg = diag::warn_bad_character_encoding;
1842         PP.Diag(Loc, Msg);
1843         if (NoErrorOnBadEncoding) {
1844           start = tmp_in_start;
1845           buffer_begin = tmp_out_start;
1846           for (; start != begin; ++start, ++buffer_begin)
1847             *buffer_begin = static_cast<uint8_t>(*start);
1848         } else {
1849           HadError = true;
1850         }
1851       } else {
        // Well-formed UTF-8: reject every code point that is too large for
        // this kind of literal.
1852         for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
1853           if (*tmp_out_start > largest_character_for_kind) {
1854             HadError = true;
1855             PP.Diag(Loc, diag::err_character_too_large);
1856           }
1857         }
1858       }
1859 
1860       continue;
1861     }
1862     // Is this a Universal Character Name escape?
1863     if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
1864       unsigned short UcnLen = 0;
1865       if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
1866                             FullSourceLoc(Loc, PP.getSourceManager()),
1867                             &PP.getDiagnostics(), PP.getLangOpts(), true)) {
1868         HadError = true;
1869       } else if (*buffer_begin > largest_character_for_kind) {
1870         HadError = true;
1871         PP.Diag(Loc, diag::err_character_too_large);
1872       }
1873 
1874       ++buffer_begin;
1875       continue;
1876     }
    // Otherwise this is a non-UCN escape sequence; store its value as one
    // character.
1877     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
1878     uint64_t result =
1879         ProcessCharEscape(TokBegin, begin, end, HadError,
1880                           FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
1881                           &PP.getDiagnostics(), PP.getLangOpts(),
1882                           StringLiteralEvalMethod::Evaluated);
1883     *buffer_begin++ = result;
1884   }
1885 
1886   unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
1887 
  // More than one character: ordinary literals only get a warning (a
  // dedicated one for exactly four characters), every other kind is an error.
1888   if (NumCharsSoFar > 1) {
1889     if (isOrdinary() && NumCharsSoFar == 4)
1890       PP.Diag(Loc, diag::warn_four_char_character_literal);
1891     else if (isOrdinary())
1892       PP.Diag(Loc, diag::warn_multichar_character_literal);
1893     else {
1894       PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
1895       HadError = true;
1896     }
1897     IsMultiChar = true;
1898   } else {
1899     IsMultiChar = false;
1900   }
1901 
1902   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1903 
1904   // Narrow character literals act as though their value is concatenated
1905   // in this implementation, but warn on overflow.
1906   bool multi_char_too_long = false;
1907   if (isOrdinary() && isMultiChar()) {
1908     LitVal = 0;
1909     for (size_t i = 0; i < NumCharsSoFar; ++i) {
1910       // check for enough leading zeros to shift into
1911       multi_char_too_long |= (LitVal.countl_zero() < 8);
1912       LitVal <<= 8;
1913       LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1914     }
1915   } else if (NumCharsSoFar > 0) {
1916     // otherwise just take the last character
1917     LitVal = buffer_begin[-1];
1918   }
1919 
1920   if (!HadError && multi_char_too_long) {
1921     PP.Diag(Loc, diag::warn_char_constant_too_large);
1922   }
1923 
1924   // Transfer the value from APInt to uint64_t
1925   Value = LitVal.getZExtValue();
1926 
1927   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1928   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
1929   // character constants are not sign extended in this implementation:
1930   // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
1931   if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
1932       PP.getLangOpts().CharIsSigned)
1933     Value = (signed char)Value;
1934 }
1935 
1936 /// \verbatim
1937 ///       string-literal: [C++0x lex.string]
1938 ///         encoding-prefix " [s-char-sequence] "
1939 ///         encoding-prefix R raw-string
1940 ///       encoding-prefix:
1941 ///         u8
1942 ///         u
1943 ///         U
1944 ///         L
1945 ///       s-char-sequence:
1946 ///         s-char
1947 ///         s-char-sequence s-char
1948 ///       s-char:
1949 ///         any member of the source character set except the double-quote ",
1950 ///           backslash \, or new-line character
1951 ///         escape-sequence
1952 ///         universal-character-name
1953 ///       raw-string:
1954 ///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
1955 ///       r-char-sequence:
1956 ///         r-char
1957 ///         r-char-sequence r-char
1958 ///       r-char:
1959 ///         any member of the source character set, except a right parenthesis )
1960 ///           followed by the initial d-char-sequence (which may be empty)
1961 ///           followed by a double quote ".
1962 ///       d-char-sequence:
1963 ///         d-char
1964 ///         d-char-sequence d-char
1965 ///       d-char:
1966 ///         any member of the basic source character set except:
1967 ///           space, the left parenthesis (, the right parenthesis ),
1968 ///           the backslash \, and the control characters representing horizontal
1969 ///           tab, vertical tab, form feed, and newline.
1970 ///       escape-sequence: [C++0x lex.ccon]
1971 ///         simple-escape-sequence
1972 ///         octal-escape-sequence
1973 ///         hexadecimal-escape-sequence
1974 ///       simple-escape-sequence:
1975 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1976 ///       octal-escape-sequence:
1977 ///         \ octal-digit
1978 ///         \ octal-digit octal-digit
1979 ///         \ octal-digit octal-digit octal-digit
1980 ///       hexadecimal-escape-sequence:
1981 ///         \x hexadecimal-digit
1982 ///         hexadecimal-escape-sequence hexadecimal-digit
1983 ///       universal-character-name:
1984 ///         \u hex-quad
1985 ///         \U hex-quad hex-quad
1986 ///       hex-quad:
1987 ///         hex-digit hex-digit hex-digit hex-digit
1988 /// \endverbatim
1989 ///
/// Build a parser for the string-literal tokens \p StringToks.
///
/// The source manager, language options, target info and diagnostics engine
/// are all taken from \p PP, the bookkeeping members are reset, and init() is
/// then called to do the actual parsing.
///
/// \param StringToks The string-literal tokens to parse and concatenate.
/// \param PP         Preprocessor supplying the context listed above.
/// \param EvalMethod Stored in EvalMethod and later forwarded to
///                   ProcessCharEscape by init().
1990 StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
1991                                          Preprocessor &PP,
1992                                          StringLiteralEvalMethod EvalMethod)
1993     : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1994       Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
1995       MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1996       ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
1997       Pascal(false) {
1998   init(StringToks);
1999 }
2000 
/// Concatenate the string-literal tokens in \p StringToks into ResultBuf.
///
/// A first pass over the tokens computes an upper bound on the output size,
/// determines the kind of the resulting literal and diagnoses encoding-prefix
/// problems.  A second pass fetches each token's spelling, records and checks
/// ud-suffixes, and appends the token's contents (raw, or with escape
/// sequences processed) at ResultPtr.  Finally either the Pascal length
/// element is filled in, or the literal's length is checked against the
/// language-dependent maximum.  hadError is set whenever an error is
/// diagnosed; lexing errors abort the parse immediately.
2001 void StringLiteralParser::init(ArrayRef<Token> StringToks){
2002   // The literal token may have come from an invalid source location (e.g. due
2003   // to a PCH error), in which case the token length will be 0.
2004   if (StringToks.empty() || StringToks[0].getLength() < 2)
2005     return DiagnoseLexingError(SourceLocation());
2006 
2007   // Scan all of the string portions, remember the max individual token length,
2008   // computing a bound on the concatenated string length, and see whether any
2009   // piece is a wide-string.  If any of the string portions is a wide-string
2010   // literal, the result is a wide-string literal [C99 6.4.5p4].
2011   assert(!StringToks.empty() && "expected at least one token");
2012   MaxTokenLength = StringToks[0].getLength();
2013   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
2014   SizeBound = StringToks[0].getLength() - 2; // -2 for "".
2015   hadError = false;
2016 
2017   // Determines the kind of string from the prefix
2018   Kind = tok::string_literal;
2019 
2020   // Implement translation phase 6, the concatenation of adjacent string
  // literals (C99 5.1.1.2p1).  The common case is only one string fragment.
2021   for (const Token &Tok : StringToks) {
2022     if (Tok.getLength() < 2)
2023       return DiagnoseLexingError(Tok.getLocation());
2024 
2025     // The string could be shorter than this if it needs cleaning, but this is a
2026     // reasonable bound, which is all we need.
    // NOTE: token 0 was already counted when SizeBound was initialized above,
    // so it is counted twice; this only makes the bound more generous.
2027     assert(Tok.getLength() >= 2 && "literal token is invalid!");
2028     SizeBound += Tok.getLength() - 2; // -2 for "".
2029 
2030     // Remember maximum string piece length.
2031     if (Tok.getLength() > MaxTokenLength)
2032       MaxTokenLength = Tok.getLength();
2033 
2034     // Remember if we see any wide or utf-8/16/32 strings.
2035     // Also check for illegal concatenations.  In an unevaluated string every
    // encoding prefix is diagnosed: as an error in C++26, as a warning
    // otherwise, with a fix-it that removes the prefix.
2036     if (isUnevaluated() && Tok.getKind() != tok::string_literal) {
2037       if (Diags) {
2038         SourceLocation PrefixEndLoc = Lexer::AdvanceToTokenCharacter(
2039             Tok.getLocation(), getEncodingPrefixLen(Tok.getKind()), SM,
2040             Features);
2041         CharSourceRange Range =
2042             CharSourceRange::getCharRange({Tok.getLocation(), PrefixEndLoc});
2043         StringRef Prefix(SM.getCharacterData(Tok.getLocation()),
2044                          getEncodingPrefixLen(Tok.getKind()));
2045         Diags->Report(Tok.getLocation(),
2046                       Features.CPlusPlus26
2047                           ? diag::err_unevaluated_string_prefix
2048                           : diag::warn_unevaluated_string_prefix)
2049             << Prefix << Features.CPlusPlus << FixItHint::CreateRemoval(Range);
2050       }
2051       if (Features.CPlusPlus26)
2052         hadError = true;
2053     } else if (Tok.isNot(Kind) && Tok.isNot(tok::string_literal)) {
      // The first prefixed token decides the kind of the whole literal; a
      // later token with a different prefix cannot be concatenated.
2054       if (isOrdinary()) {
2055         Kind = Tok.getKind();
2056       } else {
2057         if (Diags)
2058           Diags->Report(Tok.getLocation(), diag::err_unsupported_string_concat);
2059         hadError = true;
2060       }
2061     }
2062   }
2063 
2064   // Include space for the null terminator.
2065   ++SizeBound;
2066 
2067   // TODO: K&R warning: "traditional C rejects string constant concatenation"
2068 
2069   // Get the width in bytes of char/wchar_t/char16_t/char32_t
2070   CharByteWidth = getCharWidth(Kind, Target);
2071   assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
2072   CharByteWidth /= 8;
2073 
2074   // The output buffer size needs to be large enough to hold wide characters.
2075   // This is a worst-case assumption which basically corresponds to L"" "long".
2076   SizeBound *= CharByteWidth;
2077 
2078   // Size the temporary buffer to hold the result string data.
2079   ResultBuf.resize(SizeBound);
2080 
2081   // Likewise, but for each string piece.
2082   SmallString<512> TokenBuf;
2083   TokenBuf.resize(MaxTokenLength);
2084 
2085   // Loop over all the strings, getting their spelling, and expanding them to
2086   // wide strings as appropriate.
2087   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
2088 
2089   Pascal = false;
2090 
  // Location of the token that supplied the first ud-suffix, used as the
  // source range of mixed-suffix diagnostics.
2091   SourceLocation UDSuffixTokLoc;
2092 
2093   for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
2094     const char *ThisTokBuf = &TokenBuf[0];
2095     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
2096     // that ThisTokBuf points to a buffer that is big enough for the whole token
2097     // and 'spelled' tokens can only shrink.
2098     bool StringInvalid = false;
2099     unsigned ThisTokLen =
2100       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
2101                          &StringInvalid);
2102     if (StringInvalid)
2103       return DiagnoseLexingError(StringToks[i].getLocation());
2104 
2105     const char *ThisTokBegin = ThisTokBuf;
2106     const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
2107 
2108     // Remove an optional ud-suffix.
2109     if (ThisTokEnd[-1] != '"') {
2110       const char *UDSuffixEnd = ThisTokEnd;
2111       do {
2112         --ThisTokEnd;
2113       } while (ThisTokEnd[-1] != '"');
2114 
2115       StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
2116 
      // The first suffix seen is recorded; every later one is compared
      // against it.
2117       if (UDSuffixBuf.empty()) {
2118         if (StringToks[i].hasUCN())
2119           expandUCNs(UDSuffixBuf, UDSuffix);
2120         else
2121           UDSuffixBuf.assign(UDSuffix);
2122         UDSuffixToken = i;
2123         UDSuffixOffset = ThisTokEnd - ThisTokBuf;
2124         UDSuffixTokLoc = StringToks[i].getLocation();
2125       } else {
2126         SmallString<32> ExpandedUDSuffix;
2127         if (StringToks[i].hasUCN()) {
2128           expandUCNs(ExpandedUDSuffix, UDSuffix);
2129           UDSuffix = ExpandedUDSuffix;
2130         }
2131 
2132         // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
2133         // result of a concatenation involving at least one user-defined-string-
2134         // literal, all the participating user-defined-string-literals shall
2135         // have the same ud-suffix.
2136         bool UnevaluatedStringHasUDL = isUnevaluated() && !UDSuffix.empty();
2137         if (UDSuffixBuf != UDSuffix || UnevaluatedStringHasUDL) {
2138           if (Diags) {
2139             SourceLocation TokLoc = StringToks[i].getLocation();
2140             if (UnevaluatedStringHasUDL) {
2141               Diags->Report(TokLoc, diag::err_unevaluated_string_udl)
2142                   << SourceRange(TokLoc, TokLoc);
2143             } else {
2144               Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
2145                   << UDSuffixBuf << UDSuffix
2146                   << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc);
2147             }
2148           }
2149           hadError = true;
2150         }
2151       }
2152     }
2153 
2154     // Strip the end quote.
2155     --ThisTokEnd;
2156 
2157     // TODO: Input character set mapping support.
2158 
2159     // Skip marker for wide or unicode strings.
2160     if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
2161       ++ThisTokBuf;
2162       // Skip 8 of u8 marker for utf8 strings.
2163       if (ThisTokBuf[0] == '8')
2164         ++ThisTokBuf;
2165     }
2166 
2167     // Check for raw string
2168     if (ThisTokBuf[0] == 'R') {
2169       if (ThisTokBuf[1] != '"') {
2170         // The file may have come from PCH and then changed after loading the
2171         // PCH; Fail gracefully.
2172         return DiagnoseLexingError(StringToks[i].getLocation());
2173       }
2174       ThisTokBuf += 2; // skip R"
2175 
2176       // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
2177       // characters.
2178       constexpr unsigned MaxRawStrDelimLen = 16;
2179 
      // Scan over the delimiter, giving up once it exceeds the maximum length.
2180       const char *Prefix = ThisTokBuf;
2181       while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
2182              ThisTokBuf[0] != '(')
2183         ++ThisTokBuf;
2184       if (ThisTokBuf[0] != '(')
2185         return DiagnoseLexingError(StringToks[i].getLocation());
2186       ++ThisTokBuf; // skip '('
2187 
2188       // Remove same number of characters from the end
2189       ThisTokEnd -= ThisTokBuf - Prefix;
2190       if (ThisTokEnd < ThisTokBuf)
2191         return DiagnoseLexingError(StringToks[i].getLocation());
2192 
2193       // C++14 [lex.string]p4: A source-file new-line in a raw string literal
2194       // results in a new-line in the resulting execution string-literal.
2195       StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
2196       while (!RemainingTokenSpan.empty()) {
2197         // Split the string literal on \r\n boundaries.
2198         size_t CRLFPos = RemainingTokenSpan.find("\r\n");
2199         StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
2200         StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
2201 
2202         // Copy everything before the \r\n sequence into the string literal.
2203         if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
2204           hadError = true;
2205 
2206         // Point into the \n inside the \r\n sequence and operate on the
2207         // remaining portion of the literal.
2208         RemainingTokenSpan = AfterCRLF.substr(1);
2209       }
2210     } else {
2211       if (ThisTokBuf[0] != '"') {
2212         // The file may have come from PCH and then changed after loading the
2213         // PCH; Fail gracefully.
2214         return DiagnoseLexingError(StringToks[i].getLocation());
2215       }
2216       ++ThisTokBuf; // skip "
2217 
2218       // Check if this is a pascal string
2219       if (!isUnevaluated() && Features.PascalStrings &&
2220           ThisTokBuf + 1 != ThisTokEnd && ThisTokBuf[0] == '\\' &&
2221           ThisTokBuf[1] == 'p') {
2222 
2223         // If the \p sequence is found in the first token, we have a pascal string
2224         // Otherwise, if we already have a pascal string, ignore the first \p
        // In the first token only the backslash is skipped: the 'p' is copied
        // to the output like any other character and so reserves the first
        // element, which is overwritten with the length after the loop.
2225         if (i == 0) {
2226           ++ThisTokBuf;
2227           Pascal = true;
2228         } else if (Pascal)
2229           ThisTokBuf += 2;
2230       }
2231 
2232       while (ThisTokBuf != ThisTokEnd) {
2233         // Is this a span of non-escape characters?
2234         if (ThisTokBuf[0] != '\\') {
2235           const char *InStart = ThisTokBuf;
2236           do {
2237             ++ThisTokBuf;
2238           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
2239 
2240           // Copy the character span over.
2241           if (CopyStringFragment(StringToks[i], ThisTokBegin,
2242                                  StringRef(InStart, ThisTokBuf - InStart)))
2243             hadError = true;
2244           continue;
2245         }
2246         // Is this a Universal Character Name escape?
2247         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
2248             ThisTokBuf[1] == 'N') {
2249           EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
2250                           ResultPtr, hadError,
2251                           FullSourceLoc(StringToks[i].getLocation(), SM),
2252                           CharByteWidth, Diags, Features);
2253           continue;
2254         }
2255         // Otherwise, this is a non-UCN escape character.  Process it.
2256         unsigned ResultChar =
2257             ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
2258                               FullSourceLoc(StringToks[i].getLocation(), SM),
2259                               CharByteWidth * 8, Diags, Features, EvalMethod);
2260 
        // Store the value as one code unit of the literal's character width.
2261         if (CharByteWidth == 4) {
2262           // FIXME: Make the type of the result buffer correct instead of
2263           // using reinterpret_cast.
2264           llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
2265           *ResultWidePtr = ResultChar;
2266           ResultPtr += 4;
2267         } else if (CharByteWidth == 2) {
2268           // FIXME: Make the type of the result buffer correct instead of
2269           // using reinterpret_cast.
2270           llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
2271           *ResultWidePtr = ResultChar & 0xFFFF;
2272           ResultPtr += 2;
2273         } else {
2274           assert(CharByteWidth == 1 && "Unexpected char width");
2275           *ResultPtr++ = ResultChar & 0xFF;
2276         }
2277       }
2278     }
2279   }
2280 
2281   assert((!Pascal || !isUnevaluated()) &&
2282          "Pascal string in unevaluated context");
2283   if (Pascal) {
    // Overwrite the first element with the number of characters that follow it.
2284     if (CharByteWidth == 4) {
2285       // FIXME: Make the type of the result buffer correct instead of
2286       // using reinterpret_cast.
2287       llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
2288       ResultWidePtr[0] = GetNumStringChars() - 1;
2289     } else if (CharByteWidth == 2) {
2290       // FIXME: Make the type of the result buffer correct instead of
2291       // using reinterpret_cast.
2292       llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
2293       ResultWidePtr[0] = GetNumStringChars() - 1;
2294     } else {
2295       assert(CharByteWidth == 1 && "Unexpected char width");
2296       ResultBuf[0] = GetNumStringChars() - 1;
2297     }
2298 
2299     // Verify that pascal strings aren't too large.
2300     if (GetStringLength() > 256) {
2301       if (Diags)
2302         Diags->Report(StringToks.front().getLocation(),
2303                       diag::err_pascal_string_too_long)
2304           << SourceRange(StringToks.front().getLocation(),
2305                          StringToks.back().getLocation());
2306       hadError = true;
2307       return;
2308     }
2309   } else if (Diags) {
2310     // Complain if this string literal has too many characters.
2311     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
2312 
2313     if (GetNumStringChars() > MaxChars)
2314       Diags->Report(StringToks.front().getLocation(),
2315                     diag::ext_string_too_long)
2316         << GetNumStringChars() << MaxChars
2317         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
2318         << SourceRange(StringToks.front().getLocation(),
2319                        StringToks.back().getLocation());
2320   }
2321 }
2322 
2323 static const char *resyncUTF8(const char *Err, const char *End) {
2324   if (Err == End)
2325     return End;
2326   End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
2327   while (++Err != End && (*Err & 0xC0) == 0x80)
2328     ;
2329   return Err;
2330 }
2331 
2332 /// This function copies from Fragment, which is a sequence of bytes
2333 /// within Tok's contents (which begin at TokBegin) into ResultPtr.
2334 /// Performs widening for multi-byte characters.
2335 bool StringLiteralParser::CopyStringFragment(const Token &Tok,
2336                                              const char *TokBegin,
2337                                              StringRef Fragment) {
2338   const llvm::UTF8 *ErrorPtrTmp;
2339   if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
2340     return false;
2341 
2342   // If we see bad encoding for unprefixed string literals, warn and
2343   // simply copy the byte values, for compatibility with gcc and older
2344   // versions of clang.
2345   bool NoErrorOnBadEncoding = isOrdinary();
2346   if (NoErrorOnBadEncoding) {
2347     memcpy(ResultPtr, Fragment.data(), Fragment.size());
2348     ResultPtr += Fragment.size();
2349   }
2350 
2351   if (Diags) {
2352     const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2353 
2354     FullSourceLoc SourceLoc(Tok.getLocation(), SM);
2355     const DiagnosticBuilder &Builder =
2356       Diag(Diags, Features, SourceLoc, TokBegin,
2357            ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
2358            NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
2359                                 : diag::err_bad_string_encoding);
2360 
2361     const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2362     StringRef NextFragment(NextStart, Fragment.end()-NextStart);
2363 
2364     // Decode into a dummy buffer.
2365     SmallString<512> Dummy;
2366     Dummy.reserve(Fragment.size() * CharByteWidth);
2367     char *Ptr = Dummy.data();
2368 
2369     while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
2370       const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2371       NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2372       Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
2373                                      ErrorPtr, NextStart);
2374       NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
2375     }
2376   }
2377   return !NoErrorOnBadEncoding;
2378 }
2379 
2380 void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
2381   hadError = true;
2382   if (Diags)
2383     Diags->Report(Loc, diag::err_lexing_string);
2384 }
2385 
2386 /// getOffsetOfStringByte - This function returns the offset of the
2387 /// specified byte of the string data represented by Token.  This handles
2388 /// advancing over escape sequences in the string.
2389 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
2390                                                     unsigned ByteNo) const {
2391   // Get the spelling of the token.
2392   SmallString<32> SpellingBuffer;
2393   SpellingBuffer.resize(Tok.getLength());
2394 
2395   bool StringInvalid = false;
2396   const char *SpellingPtr = &SpellingBuffer[0];
2397   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
2398                                        &StringInvalid);
2399   if (StringInvalid)
2400     return 0;
2401 
2402   const char *SpellingStart = SpellingPtr;
2403   const char *SpellingEnd = SpellingPtr+TokLen;
2404 
2405   // Handle UTF-8 strings just like narrow strings.
2406   if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
2407     SpellingPtr += 2;
2408 
2409   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
2410          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
2411 
2412   // For raw string literals, this is easy.
2413   if (SpellingPtr[0] == 'R') {
2414     assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
2415     // Skip 'R"'.
2416     SpellingPtr += 2;
2417     while (*SpellingPtr != '(') {
2418       ++SpellingPtr;
2419       assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
2420     }
2421     // Skip '('.
2422     ++SpellingPtr;
2423     return SpellingPtr - SpellingStart + ByteNo;
2424   }
2425 
2426   // Skip over the leading quote
2427   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
2428   ++SpellingPtr;
2429 
2430   // Skip over bytes until we find the offset we're looking for.
2431   while (ByteNo) {
2432     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
2433 
2434     // Step over non-escapes simply.
2435     if (*SpellingPtr != '\\') {
2436       ++SpellingPtr;
2437       --ByteNo;
2438       continue;
2439     }
2440 
2441     // Otherwise, this is an escape character.  Advance over it.
2442     bool HadError = false;
2443     if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
2444         SpellingPtr[1] == 'N') {
2445       const char *EscapePtr = SpellingPtr;
2446       unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
2447                                       1, Features, HadError);
2448       if (Len > ByteNo) {
2449         // ByteNo is somewhere within the escape sequence.
2450         SpellingPtr = EscapePtr;
2451         break;
2452       }
2453       ByteNo -= Len;
2454     } else {
2455       ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
2456                         FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
2457                         Diags, Features, StringLiteralEvalMethod::Evaluated);
2458       --ByteNo;
2459     }
2460     assert(!HadError && "This method isn't valid on erroneous strings");
2461   }
2462 
2463   return SpellingPtr-SpellingStart;
2464 }
2465 
2466 /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
2467 /// suffixes as ud-suffixes, because the diagnostic experience is better if we
2468 /// treat it as an invalid suffix.
2469 bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
2470                                           StringRef Suffix) {
2471   return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
2472          Suffix == "sv";
2473 }
2474