xref: /freebsd/contrib/llvm-project/clang/lib/Lex/LiteralSupport.cpp (revision fcaf7f8644a9988098ac6be2165bce3ea4786e91)
1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements the NumericLiteralParser, CharLiteralParser, and
10 // StringLiteralParser interfaces.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "clang/Lex/LiteralSupport.h"
15 #include "clang/Basic/CharInfo.h"
16 #include "clang/Basic/LangOptions.h"
17 #include "clang/Basic/SourceLocation.h"
18 #include "clang/Basic/TargetInfo.h"
19 #include "clang/Lex/LexDiagnostic.h"
20 #include "clang/Lex/Lexer.h"
21 #include "clang/Lex/Preprocessor.h"
22 #include "clang/Lex/Token.h"
23 #include "llvm/ADT/APInt.h"
24 #include "llvm/ADT/SmallVector.h"
25 #include "llvm/ADT/StringExtras.h"
26 #include "llvm/ADT/StringSwitch.h"
27 #include "llvm/Support/ConvertUTF.h"
28 #include "llvm/Support/Error.h"
29 #include "llvm/Support/ErrorHandling.h"
30 #include "llvm/Support/Unicode.h"
31 #include <algorithm>
32 #include <cassert>
33 #include <cstddef>
34 #include <cstdint>
35 #include <cstring>
36 #include <string>
37 
38 using namespace clang;
39 
40 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
41   switch (kind) {
42   default: llvm_unreachable("Unknown token type!");
43   case tok::char_constant:
44   case tok::string_literal:
45   case tok::utf8_char_constant:
46   case tok::utf8_string_literal:
47     return Target.getCharWidth();
48   case tok::wide_char_constant:
49   case tok::wide_string_literal:
50     return Target.getWCharWidth();
51   case tok::utf16_char_constant:
52   case tok::utf16_string_literal:
53     return Target.getChar16Width();
54   case tok::utf32_char_constant:
55   case tok::utf32_string_literal:
56     return Target.getChar32Width();
57   }
58 }
59 
60 static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
61                                            FullSourceLoc TokLoc,
62                                            const char *TokBegin,
63                                            const char *TokRangeBegin,
64                                            const char *TokRangeEnd) {
65   SourceLocation Begin =
66     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
67                                    TokLoc.getManager(), Features);
68   SourceLocation End =
69     Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
70                                    TokLoc.getManager(), Features);
71   return CharSourceRange::getCharRange(Begin, End);
72 }
73 
74 /// Produce a diagnostic highlighting some portion of a literal.
75 ///
76 /// Emits the diagnostic \p DiagID, highlighting the range of characters from
77 /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
78 /// a substring of a spelling buffer for the token beginning at \p TokBegin.
79 static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
80                               const LangOptions &Features, FullSourceLoc TokLoc,
81                               const char *TokBegin, const char *TokRangeBegin,
82                               const char *TokRangeEnd, unsigned DiagID) {
83   SourceLocation Begin =
84     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
85                                    TokLoc.getManager(), Features);
86   return Diags->Report(Begin, DiagID) <<
87     MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
88 }
89 
90 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
91 /// either a character or a string literal.
92 static unsigned ProcessCharEscape(const char *ThisTokBegin,
93                                   const char *&ThisTokBuf,
94                                   const char *ThisTokEnd, bool &HadError,
95                                   FullSourceLoc Loc, unsigned CharWidth,
96                                   DiagnosticsEngine *Diags,
97                                   const LangOptions &Features) {
98   const char *EscapeBegin = ThisTokBuf;
99   bool Delimited = false;
100   bool EndDelimiterFound = false;
101 
102   // Skip the '\' char.
103   ++ThisTokBuf;
104 
105   // We know that this character can't be off the end of the buffer, because
106   // that would have been \", which would not have been the end of string.
107   unsigned ResultChar = *ThisTokBuf++;
108   switch (ResultChar) {
109   // These map to themselves.
110   case '\\': case '\'': case '"': case '?': break;
111 
112     // These have fixed mappings.
113   case 'a':
114     // TODO: K&R: the meaning of '\\a' is different in traditional C
115     ResultChar = 7;
116     break;
117   case 'b':
118     ResultChar = 8;
119     break;
120   case 'e':
121     if (Diags)
122       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
123            diag::ext_nonstandard_escape) << "e";
124     ResultChar = 27;
125     break;
126   case 'E':
127     if (Diags)
128       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
129            diag::ext_nonstandard_escape) << "E";
130     ResultChar = 27;
131     break;
132   case 'f':
133     ResultChar = 12;
134     break;
135   case 'n':
136     ResultChar = 10;
137     break;
138   case 'r':
139     ResultChar = 13;
140     break;
141   case 't':
142     ResultChar = 9;
143     break;
144   case 'v':
145     ResultChar = 11;
146     break;
147   case 'x': { // Hex escape.
148     ResultChar = 0;
149     if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
150       Delimited = true;
151       ThisTokBuf++;
152       if (*ThisTokBuf == '}') {
153         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
154              diag::err_delimited_escape_empty);
155         return ResultChar;
156       }
157     } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
158       if (Diags)
159         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
160              diag::err_hex_escape_no_digits) << "x";
161       return ResultChar;
162     }
163 
164     // Hex escapes are a maximal series of hex digits.
165     bool Overflow = false;
166     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
167       if (Delimited && *ThisTokBuf == '}') {
168         ThisTokBuf++;
169         EndDelimiterFound = true;
170         break;
171       }
172       int CharVal = llvm::hexDigitValue(*ThisTokBuf);
173       if (CharVal == -1) {
174         // Non delimited hex escape sequences stop at the first non-hex digit.
175         if (!Delimited)
176           break;
177         HadError = true;
178         if (Diags)
179           Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
180                diag::err_delimited_escape_invalid)
181               << StringRef(ThisTokBuf, 1);
182         continue;
183       }
184       // About to shift out a digit?
185       if (ResultChar & 0xF0000000)
186         Overflow = true;
187       ResultChar <<= 4;
188       ResultChar |= CharVal;
189     }
190     // See if any bits will be truncated when evaluated as a character.
191     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
192       Overflow = true;
193       ResultChar &= ~0U >> (32-CharWidth);
194     }
195 
196     // Check for overflow.
197     if (!HadError && Overflow) { // Too many digits to fit in
198       HadError = true;
199       if (Diags)
200         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
201              diag::err_escape_too_large)
202             << 0;
203     }
204     break;
205   }
206   case '0': case '1': case '2': case '3':
207   case '4': case '5': case '6': case '7': {
208     // Octal escapes.
209     --ThisTokBuf;
210     ResultChar = 0;
211 
212     // Octal escapes are a series of octal digits with maximum length 3.
213     // "\0123" is a two digit sequence equal to "\012" "3".
214     unsigned NumDigits = 0;
215     do {
216       ResultChar <<= 3;
217       ResultChar |= *ThisTokBuf++ - '0';
218       ++NumDigits;
219     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
220              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
221 
222     // Check for overflow.  Reject '\777', but not L'\777'.
223     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
224       if (Diags)
225         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
226              diag::err_escape_too_large) << 1;
227       ResultChar &= ~0U >> (32-CharWidth);
228     }
229     break;
230   }
231   case 'o': {
232     bool Overflow = false;
233     if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
234       HadError = true;
235       if (Diags)
236         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
237              diag::err_delimited_escape_missing_brace)
238             << "o";
239 
240       break;
241     }
242     ResultChar = 0;
243     Delimited = true;
244     ++ThisTokBuf;
245     if (*ThisTokBuf == '}') {
246       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
247            diag::err_delimited_escape_empty);
248       return ResultChar;
249     }
250 
251     while (ThisTokBuf != ThisTokEnd) {
252       if (*ThisTokBuf == '}') {
253         EndDelimiterFound = true;
254         ThisTokBuf++;
255         break;
256       }
257       if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
258         HadError = true;
259         if (Diags)
260           Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
261                diag::err_delimited_escape_invalid)
262               << StringRef(ThisTokBuf, 1);
263         ThisTokBuf++;
264         continue;
265       }
266       if (ResultChar & 0x020000000)
267         Overflow = true;
268 
269       ResultChar <<= 3;
270       ResultChar |= *ThisTokBuf++ - '0';
271     }
272     // Check for overflow.  Reject '\777', but not L'\777'.
273     if (!HadError &&
274         (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
275       HadError = true;
276       if (Diags)
277         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
278              diag::err_escape_too_large)
279             << 1;
280       ResultChar &= ~0U >> (32 - CharWidth);
281     }
282     break;
283   }
284     // Otherwise, these are not valid escapes.
285   case '(': case '{': case '[': case '%':
286     // GCC accepts these as extensions.  We warn about them as such though.
287     if (Diags)
288       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
289            diag::ext_nonstandard_escape)
290         << std::string(1, ResultChar);
291     break;
292   default:
293     if (!Diags)
294       break;
295 
296     if (isPrintable(ResultChar))
297       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
298            diag::ext_unknown_escape)
299         << std::string(1, ResultChar);
300     else
301       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
302            diag::ext_unknown_escape)
303         << "x" + llvm::utohexstr(ResultChar);
304     break;
305   }
306 
307   if (Delimited && Diags) {
308     if (!EndDelimiterFound)
309       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
310            diag::err_expected)
311           << tok::r_brace;
312     else if (!HadError) {
313       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
314            Features.CPlusPlus2b ? diag::warn_cxx2b_delimited_escape_sequence
315                                 : diag::ext_delimited_escape_sequence)
316           << /*delimited*/ 0 << (Features.CPlusPlus ? 1 : 0);
317     }
318   }
319 
320   return ResultChar;
321 }
322 
323 static void appendCodePoint(unsigned Codepoint,
324                             llvm::SmallVectorImpl<char> &Str) {
325   char ResultBuf[4];
326   char *ResultPtr = ResultBuf;
327   if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))
328     Str.append(ResultBuf, ResultPtr);
329 }
330 
331 void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
332   for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
333     if (*I != '\\') {
334       Buf.push_back(*I);
335       continue;
336     }
337 
338     ++I;
339     char Kind = *I;
340     ++I;
341 
342     assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
343     uint32_t CodePoint = 0;
344 
345     if (Kind == 'u' && *I == '{') {
346       for (++I; *I != '}'; ++I) {
347         unsigned Value = llvm::hexDigitValue(*I);
348         assert(Value != -1U);
349         CodePoint <<= 4;
350         CodePoint += Value;
351       }
352       appendCodePoint(CodePoint, Buf);
353       continue;
354     }
355 
356     if (Kind == 'N') {
357       assert(*I == '{');
358       ++I;
359       auto Delim = std::find(I, Input.end(), '}');
360       assert(Delim != Input.end());
361       llvm::Optional<llvm::sys::unicode::LooseMatchingResult> Res =
362           llvm::sys::unicode::nameToCodepointLooseMatching(
363               StringRef(I, std::distance(I, Delim)));
364       assert(Res);
365       CodePoint = Res->CodePoint;
366       assert(CodePoint != 0xFFFFFFFF);
367       appendCodePoint(CodePoint, Buf);
368       I = Delim;
369       continue;
370     }
371 
372     unsigned NumHexDigits;
373     if (Kind == 'u')
374       NumHexDigits = 4;
375     else
376       NumHexDigits = 8;
377 
378     assert(I + NumHexDigits <= E);
379 
380     for (; NumHexDigits != 0; ++I, --NumHexDigits) {
381       unsigned Value = llvm::hexDigitValue(*I);
382       assert(Value != -1U);
383 
384       CodePoint <<= 4;
385       CodePoint += Value;
386     }
387 
388     appendCodePoint(CodePoint, Buf);
389     --I;
390   }
391 }
392 
393 static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
394                                     const char *&ThisTokBuf,
395                                     const char *ThisTokEnd, uint32_t &UcnVal,
396                                     unsigned short &UcnLen, bool &Delimited,
397                                     FullSourceLoc Loc, DiagnosticsEngine *Diags,
398                                     const LangOptions &Features,
399                                     bool in_char_string_literal = false) {
400   const char *UcnBegin = ThisTokBuf;
401   bool HasError = false;
402   bool EndDelimiterFound = false;
403 
404   // Skip the '\u' char's.
405   ThisTokBuf += 2;
406   Delimited = false;
407   if (UcnBegin[1] == 'u' && in_char_string_literal &&
408       ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
409     Delimited = true;
410     ThisTokBuf++;
411   } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
412     if (Diags)
413       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
414            diag::err_hex_escape_no_digits)
415           << StringRef(&ThisTokBuf[-1], 1);
416     return false;
417   }
418   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
419 
420   bool Overflow = false;
421   unsigned short Count = 0;
422   for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
423        ++ThisTokBuf) {
424     if (Delimited && *ThisTokBuf == '}') {
425       ++ThisTokBuf;
426       EndDelimiterFound = true;
427       break;
428     }
429     int CharVal = llvm::hexDigitValue(*ThisTokBuf);
430     if (CharVal == -1) {
431       HasError = true;
432       if (!Delimited)
433         break;
434       if (Diags) {
435         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
436              diag::err_delimited_escape_invalid)
437             << StringRef(ThisTokBuf, 1);
438       }
439       Count++;
440       continue;
441     }
442     if (UcnVal & 0xF0000000) {
443       Overflow = true;
444       continue;
445     }
446     UcnVal <<= 4;
447     UcnVal |= CharVal;
448     Count++;
449   }
450 
451   if (Overflow) {
452     if (Diags)
453       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
454            diag::err_escape_too_large)
455           << 0;
456     return false;
457   }
458 
459   if (Delimited && !EndDelimiterFound) {
460     if (Diags) {
461       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
462            diag::err_expected)
463           << tok::r_brace;
464     }
465     return false;
466   }
467 
468   // If we didn't consume the proper number of digits, there is a problem.
469   if (Count == 0 || (!Delimited && Count != UcnLen)) {
470     if (Diags)
471       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
472            Delimited ? diag::err_delimited_escape_empty
473                      : diag::err_ucn_escape_incomplete);
474     return false;
475   }
476   return !HasError;
477 }
478 
479 static void DiagnoseInvalidUnicodeCharacterName(
480     DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
481     const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
482     llvm::StringRef Name) {
483 
484   Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
485        diag::err_invalid_ucn_name)
486       << Name;
487 
488   namespace u = llvm::sys::unicode;
489 
490   llvm::Optional<u::LooseMatchingResult> Res =
491       u::nameToCodepointLooseMatching(Name);
492   if (Res) {
493     Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
494          diag::note_invalid_ucn_name_loose_matching)
495         << FixItHint::CreateReplacement(
496                MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
497                                    TokRangeEnd),
498                Res->Name);
499     return;
500   }
501 
502   unsigned Distance = 0;
503   SmallVector<u::MatchForCodepointName> Matches =
504       u::nearestMatchesForCodepointName(Name, 5);
505   assert(!Matches.empty() && "No unicode characters found");
506 
507   for (const auto &Match : Matches) {
508     if (Distance == 0)
509       Distance = Match.Distance;
510     if (std::max(Distance, Match.Distance) -
511             std::min(Distance, Match.Distance) >
512         3)
513       break;
514     Distance = Match.Distance;
515 
516     std::string Str;
517     llvm::UTF32 V = Match.Value;
518     LLVM_ATTRIBUTE_UNUSED bool Converted =
519         llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
520     assert(Converted && "Found a match wich is not a unicode character");
521 
522     Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
523          diag::note_invalid_ucn_name_candidate)
524         << Match.Name << llvm::utohexstr(Match.Value)
525         << Str // FIXME: Fix the rendering of non printable characters
526         << FixItHint::CreateReplacement(
527                MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
528                                    TokRangeEnd),
529                Match.Name);
530   }
531 }
532 
533 static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
534                                   const char *&ThisTokBuf,
535                                   const char *ThisTokEnd, uint32_t &UcnVal,
536                                   unsigned short &UcnLen, FullSourceLoc Loc,
537                                   DiagnosticsEngine *Diags,
538                                   const LangOptions &Features) {
539   const char *UcnBegin = ThisTokBuf;
540   assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
541   ThisTokBuf += 2;
542   if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
543     if (Diags) {
544       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
545            diag::err_delimited_escape_missing_brace)
546           << StringRef(&ThisTokBuf[-1], 1);
547     }
548     ThisTokBuf++;
549     return false;
550   }
551   ThisTokBuf++;
552   const char *ClosingBrace =
553       std::find_if_not(ThisTokBuf, ThisTokEnd, [](char C) {
554         return llvm::isAlnum(C) || llvm::isSpace(C) || C == '_' || C == '-';
555       });
556   bool Incomplete = ClosingBrace == ThisTokEnd || *ClosingBrace != '}';
557   bool Empty = ClosingBrace == ThisTokBuf;
558   if (Incomplete || Empty) {
559     if (Diags) {
560       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
561            Incomplete ? diag::err_ucn_escape_incomplete
562                       : diag::err_delimited_escape_empty)
563           << StringRef(&UcnBegin[1], 1);
564     }
565     ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
566     return false;
567   }
568   StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
569   ThisTokBuf = ClosingBrace + 1;
570   llvm::Optional<char32_t> Res =
571       llvm::sys::unicode::nameToCodepointStrict(Name);
572   if (!Res) {
573     if (Diags)
574       DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
575                                           &UcnBegin[3], ClosingBrace, Name);
576     return false;
577   }
578   UcnVal = *Res;
579   UcnLen = UcnVal > 0xFFFF ? 8 : 4;
580   return true;
581 }
582 
583 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
584 /// return the UTF32.
585 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
586                              const char *ThisTokEnd, uint32_t &UcnVal,
587                              unsigned short &UcnLen, FullSourceLoc Loc,
588                              DiagnosticsEngine *Diags,
589                              const LangOptions &Features,
590                              bool in_char_string_literal = false) {
591 
592   bool HasError;
593   const char *UcnBegin = ThisTokBuf;
594   bool IsDelimitedEscapeSequence = false;
595   bool IsNamedEscapeSequence = false;
596   if (ThisTokBuf[1] == 'N') {
597     IsNamedEscapeSequence = true;
598     HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
599                                       UcnVal, UcnLen, Loc, Diags, Features);
600   } else {
601     HasError =
602         !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
603                                  UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
604                                  Features, in_char_string_literal);
605   }
606   if (HasError)
607     return false;
608 
609   // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
610   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
611       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
612     if (Diags)
613       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
614            diag::err_ucn_escape_invalid);
615     return false;
616   }
617 
618   // C++11 allows UCNs that refer to control characters and basic source
619   // characters inside character and string literals
620   if (UcnVal < 0xa0 &&
621       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
622     bool IsError = (!Features.CPlusPlus11 || !in_char_string_literal);
623     if (Diags) {
624       char BasicSCSChar = UcnVal;
625       if (UcnVal >= 0x20 && UcnVal < 0x7f)
626         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
627              IsError ? diag::err_ucn_escape_basic_scs :
628                        diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
629             << StringRef(&BasicSCSChar, 1);
630       else
631         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
632              IsError ? diag::err_ucn_control_character :
633                        diag::warn_cxx98_compat_literal_ucn_control_character);
634     }
635     if (IsError)
636       return false;
637   }
638 
639   if (!Features.CPlusPlus && !Features.C99 && Diags)
640     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
641          diag::warn_ucn_not_valid_in_c89_literal);
642 
643   if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
644     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
645          Features.CPlusPlus2b ? diag::warn_cxx2b_delimited_escape_sequence
646                               : diag::ext_delimited_escape_sequence)
647         << (IsNamedEscapeSequence ? 1 : 0) << (Features.CPlusPlus ? 1 : 0);
648 
649   return true;
650 }
651 
652 /// MeasureUCNEscape - Determine the number of bytes within the resulting string
653 /// which this UCN will occupy.
654 static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
655                             const char *ThisTokEnd, unsigned CharByteWidth,
656                             const LangOptions &Features, bool &HadError) {
657   // UTF-32: 4 bytes per escape.
658   if (CharByteWidth == 4)
659     return 4;
660 
661   uint32_t UcnVal = 0;
662   unsigned short UcnLen = 0;
663   FullSourceLoc Loc;
664 
665   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
666                         UcnLen, Loc, nullptr, Features, true)) {
667     HadError = true;
668     return 0;
669   }
670 
671   // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
672   if (CharByteWidth == 2)
673     return UcnVal <= 0xFFFF ? 2 : 4;
674 
675   // UTF-8.
676   if (UcnVal < 0x80)
677     return 1;
678   if (UcnVal < 0x800)
679     return 2;
680   if (UcnVal < 0x10000)
681     return 3;
682   return 4;
683 }
684 
685 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
686 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
687 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
688 /// we will likely rework our support for UCN's.
689 static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
690                             const char *ThisTokEnd,
691                             char *&ResultBuf, bool &HadError,
692                             FullSourceLoc Loc, unsigned CharByteWidth,
693                             DiagnosticsEngine *Diags,
694                             const LangOptions &Features) {
695   typedef uint32_t UTF32;
696   UTF32 UcnVal = 0;
697   unsigned short UcnLen = 0;
698   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
699                         Loc, Diags, Features, true)) {
700     HadError = true;
701     return;
702   }
703 
704   assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
705          "only character widths of 1, 2, or 4 bytes supported");
706 
707   (void)UcnLen;
708   assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
709 
710   if (CharByteWidth == 4) {
711     // FIXME: Make the type of the result buffer correct instead of
712     // using reinterpret_cast.
713     llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
714     *ResultPtr = UcnVal;
715     ResultBuf += 4;
716     return;
717   }
718 
719   if (CharByteWidth == 2) {
720     // FIXME: Make the type of the result buffer correct instead of
721     // using reinterpret_cast.
722     llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
723 
724     if (UcnVal <= (UTF32)0xFFFF) {
725       *ResultPtr = UcnVal;
726       ResultBuf += 2;
727       return;
728     }
729 
730     // Convert to UTF16.
731     UcnVal -= 0x10000;
732     *ResultPtr     = 0xD800 + (UcnVal >> 10);
733     *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
734     ResultBuf += 4;
735     return;
736   }
737 
738   assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
739 
740   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
741   // The conversion below was inspired by:
742   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
743   // First, we determine how many bytes the result will require.
744   typedef uint8_t UTF8;
745 
746   unsigned short bytesToWrite = 0;
747   if (UcnVal < (UTF32)0x80)
748     bytesToWrite = 1;
749   else if (UcnVal < (UTF32)0x800)
750     bytesToWrite = 2;
751   else if (UcnVal < (UTF32)0x10000)
752     bytesToWrite = 3;
753   else
754     bytesToWrite = 4;
755 
756   const unsigned byteMask = 0xBF;
757   const unsigned byteMark = 0x80;
758 
759   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
760   // into the first byte, depending on how many bytes follow.
761   static const UTF8 firstByteMark[5] = {
762     0x00, 0x00, 0xC0, 0xE0, 0xF0
763   };
764   // Finally, we write the bytes into ResultBuf.
765   ResultBuf += bytesToWrite;
766   switch (bytesToWrite) { // note: everything falls through.
767   case 4:
768     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
769     LLVM_FALLTHROUGH;
770   case 3:
771     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
772     LLVM_FALLTHROUGH;
773   case 2:
774     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
775     LLVM_FALLTHROUGH;
776   case 1:
777     *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
778   }
779   // Update the buffer.
780   ResultBuf += bytesToWrite;
781 }
782 
783 ///       integer-constant: [C99 6.4.4.1]
784 ///         decimal-constant integer-suffix
785 ///         octal-constant integer-suffix
786 ///         hexadecimal-constant integer-suffix
787 ///         binary-literal integer-suffix [GNU, C++1y]
788 ///       user-defined-integer-literal: [C++11 lex.ext]
789 ///         decimal-literal ud-suffix
790 ///         octal-literal ud-suffix
791 ///         hexadecimal-literal ud-suffix
792 ///         binary-literal ud-suffix [GNU, C++1y]
793 ///       decimal-constant:
794 ///         nonzero-digit
795 ///         decimal-constant digit
796 ///       octal-constant:
797 ///         0
798 ///         octal-constant octal-digit
799 ///       hexadecimal-constant:
800 ///         hexadecimal-prefix hexadecimal-digit
801 ///         hexadecimal-constant hexadecimal-digit
802 ///       hexadecimal-prefix: one of
803 ///         0x 0X
804 ///       binary-literal:
805 ///         0b binary-digit
806 ///         0B binary-digit
807 ///         binary-literal binary-digit
808 ///       integer-suffix:
809 ///         unsigned-suffix [long-suffix]
810 ///         unsigned-suffix [long-long-suffix]
811 ///         long-suffix [unsigned-suffix]
812 ///         long-long-suffix [unsigned-suffix]
813 ///       nonzero-digit:
814 ///         1 2 3 4 5 6 7 8 9
815 ///       octal-digit:
816 ///         0 1 2 3 4 5 6 7
817 ///       hexadecimal-digit:
818 ///         0 1 2 3 4 5 6 7 8 9
819 ///         a b c d e f
820 ///         A B C D E F
821 ///       binary-digit:
822 ///         0
823 ///         1
824 ///       unsigned-suffix: one of
825 ///         u U
826 ///       long-suffix: one of
827 ///         l L
828 ///       long-long-suffix: one of
829 ///         ll LL
830 ///
831 ///       floating-constant: [C99 6.4.4.2]
832 ///         TODO: add rules...
833 ///
834 NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
835                                            SourceLocation TokLoc,
836                                            const SourceManager &SM,
837                                            const LangOptions &LangOpts,
838                                            const TargetInfo &Target,
839                                            DiagnosticsEngine &Diags)
840     : SM(SM), LangOpts(LangOpts), Diags(Diags),
841       ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
842 
843   s = DigitsBegin = ThisTokBegin;
844   saw_exponent = false;
845   saw_period = false;
846   saw_ud_suffix = false;
847   saw_fixed_point_suffix = false;
848   isLong = false;
849   isUnsigned = false;
850   isLongLong = false;
851   isSizeT = false;
852   isHalf = false;
853   isFloat = false;
854   isImaginary = false;
855   isFloat16 = false;
856   isFloat128 = false;
857   MicrosoftInteger = 0;
858   isFract = false;
859   isAccum = false;
860   hadError = false;
861   isBitInt = false;
862 
863   // This routine assumes that the range begin/end matches the regex for integer
864   // and FP constants (specifically, the 'pp-number' regex), and assumes that
865   // the byte at "*end" is both valid and not part of the regex.  Because of
866   // this, it doesn't have to check for 'overscan' in various places.
867   if (isPreprocessingNumberBody(*ThisTokEnd)) {
868     Diags.Report(TokLoc, diag::err_lexing_numeric);
869     hadError = true;
870     return;
871   }
872 
873   if (*s == '0') { // parse radix
874     ParseNumberStartingWithZero(TokLoc);
875     if (hadError)
876       return;
877   } else { // the first digit is non-zero
878     radix = 10;
879     s = SkipDigits(s);
880     if (s == ThisTokEnd) {
881       // Done.
882     } else {
883       ParseDecimalOrOctalCommon(TokLoc);
884       if (hadError)
885         return;
886     }
887   }
888 
889   SuffixBegin = s;
890   checkSeparator(TokLoc, s, CSK_AfterDigits);
891 
892   // Initial scan to lookahead for fixed point suffix.
893   if (LangOpts.FixedPoint) {
894     for (const char *c = s; c != ThisTokEnd; ++c) {
895       if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') {
896         saw_fixed_point_suffix = true;
897         break;
898       }
899     }
900   }
901 
902   // Parse the suffix.  At this point we can classify whether we have an FP or
903   // integer constant.
904   bool isFixedPointConstant = isFixedPointLiteral();
905   bool isFPConstant = isFloatingLiteral();
906   bool HasSize = false;
907 
908   // Loop over all of the characters of the suffix.  If we see something bad,
909   // we break out of the loop.
910   for (; s != ThisTokEnd; ++s) {
911     switch (*s) {
912     case 'R':
913     case 'r':
914       if (!LangOpts.FixedPoint)
915         break;
916       if (isFract || isAccum) break;
917       if (!(saw_period || saw_exponent)) break;
918       isFract = true;
919       continue;
920     case 'K':
921     case 'k':
922       if (!LangOpts.FixedPoint)
923         break;
924       if (isFract || isAccum) break;
925       if (!(saw_period || saw_exponent)) break;
926       isAccum = true;
927       continue;
928     case 'h':      // FP Suffix for "half".
929     case 'H':
930       // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
931       if (!(LangOpts.Half || LangOpts.FixedPoint))
932         break;
933       if (isIntegerLiteral()) break;  // Error for integer constant.
934       if (HasSize)
935         break;
936       HasSize = true;
937       isHalf = true;
938       continue;  // Success.
939     case 'f':      // FP Suffix for "float"
940     case 'F':
941       if (!isFPConstant) break;  // Error for integer constant.
942       if (HasSize)
943         break;
944       HasSize = true;
945 
946       // CUDA host and device may have different _Float16 support, therefore
947       // allows f16 literals to avoid false alarm.
948       // ToDo: more precise check for CUDA.
949       if ((Target.hasFloat16Type() || LangOpts.CUDA) && s + 2 < ThisTokEnd &&
950           s[1] == '1' && s[2] == '6') {
951         s += 2; // success, eat up 2 characters.
952         isFloat16 = true;
953         continue;
954       }
955 
956       isFloat = true;
957       continue;  // Success.
958     case 'q':    // FP Suffix for "__float128"
959     case 'Q':
960       if (!isFPConstant) break;  // Error for integer constant.
961       if (HasSize)
962         break;
963       HasSize = true;
964       isFloat128 = true;
965       continue;  // Success.
966     case 'u':
967     case 'U':
968       if (isFPConstant) break;  // Error for floating constant.
969       if (isUnsigned) break;    // Cannot be repeated.
970       isUnsigned = true;
971       continue;  // Success.
972     case 'l':
973     case 'L':
974       if (HasSize)
975         break;
976       HasSize = true;
977 
978       // Check for long long.  The L's need to be adjacent and the same case.
979       if (s[1] == s[0]) {
980         assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
981         if (isFPConstant) break;        // long long invalid for floats.
982         isLongLong = true;
983         ++s;  // Eat both of them.
984       } else {
985         isLong = true;
986       }
987       continue; // Success.
988     case 'z':
989     case 'Z':
990       if (isFPConstant)
991         break; // Invalid for floats.
992       if (HasSize)
993         break;
994       HasSize = true;
995       isSizeT = true;
996       continue;
997     case 'i':
998     case 'I':
999       if (LangOpts.MicrosoftExt && !isFPConstant) {
1000         // Allow i8, i16, i32, and i64. First, look ahead and check if
1001         // suffixes are Microsoft integers and not the imaginary unit.
1002         uint8_t Bits = 0;
1003         size_t ToSkip = 0;
1004         switch (s[1]) {
1005         case '8': // i8 suffix
1006           Bits = 8;
1007           ToSkip = 2;
1008           break;
1009         case '1':
1010           if (s[2] == '6') { // i16 suffix
1011             Bits = 16;
1012             ToSkip = 3;
1013           }
1014           break;
1015         case '3':
1016           if (s[2] == '2') { // i32 suffix
1017             Bits = 32;
1018             ToSkip = 3;
1019           }
1020           break;
1021         case '6':
1022           if (s[2] == '4') { // i64 suffix
1023             Bits = 64;
1024             ToSkip = 3;
1025           }
1026           break;
1027         default:
1028           break;
1029         }
1030         if (Bits) {
1031           if (HasSize)
1032             break;
1033           HasSize = true;
1034           MicrosoftInteger = Bits;
1035           s += ToSkip;
1036           assert(s <= ThisTokEnd && "didn't maximally munch?");
1037           break;
1038         }
1039       }
1040       LLVM_FALLTHROUGH;
1041     case 'j':
1042     case 'J':
1043       if (isImaginary) break;   // Cannot be repeated.
1044       isImaginary = true;
1045       continue;  // Success.
1046     case 'w':
1047     case 'W':
1048       if (isFPConstant)
1049         break; // Invalid for floats.
1050       if (HasSize)
1051         break; // Invalid if we already have a size for the literal.
1052 
1053       // wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
1054       // explicitly do not support the suffix in C++ as an extension because a
1055       // library-based UDL that resolves to a library type may be more
1056       // appropriate there.
1057       if (!LangOpts.CPlusPlus && ((s[0] == 'w' && s[1] == 'b') ||
1058           (s[0] == 'W' && s[1] == 'B'))) {
1059         isBitInt = true;
1060         HasSize = true;
1061         ++s; // Skip both characters (2nd char skipped on continue).
1062         continue; // Success.
1063       }
1064     }
1065     // If we reached here, there was an error or a ud-suffix.
1066     break;
1067   }
1068 
1069   // "i", "if", and "il" are user-defined suffixes in C++1y.
1070   if (s != ThisTokEnd || isImaginary) {
1071     // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
1072     expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
1073     if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
1074       if (!isImaginary) {
1075         // Any suffix pieces we might have parsed are actually part of the
1076         // ud-suffix.
1077         isLong = false;
1078         isUnsigned = false;
1079         isLongLong = false;
1080         isSizeT = false;
1081         isFloat = false;
1082         isFloat16 = false;
1083         isHalf = false;
1084         isImaginary = false;
1085         isBitInt = false;
1086         MicrosoftInteger = 0;
1087         saw_fixed_point_suffix = false;
1088         isFract = false;
1089         isAccum = false;
1090       }
1091 
1092       saw_ud_suffix = true;
1093       return;
1094     }
1095 
1096     if (s != ThisTokEnd) {
1097       // Report an error if there are any.
1098       Diags.Report(Lexer::AdvanceToTokenCharacter(
1099                        TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
1100                    diag::err_invalid_suffix_constant)
1101           << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
1102           << (isFixedPointConstant ? 2 : isFPConstant);
1103       hadError = true;
1104     }
1105   }
1106 
1107   if (!hadError && saw_fixed_point_suffix) {
1108     assert(isFract || isAccum);
1109   }
1110 }
1111 
1112 /// ParseDecimalOrOctalCommon - This method is called for decimal or octal
1113 /// numbers. It issues an error for illegal digits, and handles floating point
1114 /// parsing. If it detects a floating point number, the radix is set to 10.
1115 void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
1116   assert((radix == 8 || radix == 10) && "Unexpected radix");
1117 
1118   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
1119   // the code is using an incorrect base.
1120   if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
1121       !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
1122     Diags.Report(
1123         Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
1124         diag::err_invalid_digit)
1125         << StringRef(s, 1) << (radix == 8 ? 1 : 0);
1126     hadError = true;
1127     return;
1128   }
1129 
1130   if (*s == '.') {
1131     checkSeparator(TokLoc, s, CSK_AfterDigits);
1132     s++;
1133     radix = 10;
1134     saw_period = true;
1135     checkSeparator(TokLoc, s, CSK_BeforeDigits);
1136     s = SkipDigits(s); // Skip suffix.
1137   }
1138   if (*s == 'e' || *s == 'E') { // exponent
1139     checkSeparator(TokLoc, s, CSK_AfterDigits);
1140     const char *Exponent = s;
1141     s++;
1142     radix = 10;
1143     saw_exponent = true;
1144     if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
1145     const char *first_non_digit = SkipDigits(s);
1146     if (containsDigits(s, first_non_digit)) {
1147       checkSeparator(TokLoc, s, CSK_BeforeDigits);
1148       s = first_non_digit;
1149     } else {
1150       if (!hadError) {
1151         Diags.Report(Lexer::AdvanceToTokenCharacter(
1152                          TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1153                      diag::err_exponent_has_no_digits);
1154         hadError = true;
1155       }
1156       return;
1157     }
1158   }
1159 }
1160 
1161 /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
1162 /// suffixes as ud-suffixes, because the diagnostic experience is better if we
1163 /// treat it as an invalid suffix.
1164 bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
1165                                            StringRef Suffix) {
1166   if (!LangOpts.CPlusPlus11 || Suffix.empty())
1167     return false;
1168 
1169   // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
1170   if (Suffix[0] == '_')
1171     return true;
1172 
1173   // In C++11, there are no library suffixes.
1174   if (!LangOpts.CPlusPlus14)
1175     return false;
1176 
1177   // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
1178   // Per tweaked N3660, "il", "i", and "if" are also used in the library.
1179   // In C++2a "d" and "y" are used in the library.
1180   return llvm::StringSwitch<bool>(Suffix)
1181       .Cases("h", "min", "s", true)
1182       .Cases("ms", "us", "ns", true)
1183       .Cases("il", "i", "if", true)
1184       .Cases("d", "y", LangOpts.CPlusPlus20)
1185       .Default(false);
1186 }
1187 
1188 void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
1189                                           const char *Pos,
1190                                           CheckSeparatorKind IsAfterDigits) {
1191   if (IsAfterDigits == CSK_AfterDigits) {
1192     if (Pos == ThisTokBegin)
1193       return;
1194     --Pos;
1195   } else if (Pos == ThisTokEnd)
1196     return;
1197 
1198   if (isDigitSeparator(*Pos)) {
1199     Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
1200                                                 LangOpts),
1201                  diag::err_digit_separator_not_between_digits)
1202         << IsAfterDigits;
1203     hadError = true;
1204   }
1205 }
1206 
1207 /// ParseNumberStartingWithZero - This method is called when the first character
1208 /// of the number is found to be a zero.  This means it is either an octal
1209 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
1210 /// a floating point number (01239.123e4).  Eat the prefix, determining the
1211 /// radix etc.
1212 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
1213   assert(s[0] == '0' && "Invalid method call");
1214   s++;
1215 
1216   int c1 = s[0];
1217 
1218   // Handle a hex number like 0x1234.
1219   if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
1220     s++;
1221     assert(s < ThisTokEnd && "didn't maximally munch?");
1222     radix = 16;
1223     DigitsBegin = s;
1224     s = SkipHexDigits(s);
1225     bool HasSignificandDigits = containsDigits(DigitsBegin, s);
1226     if (s == ThisTokEnd) {
1227       // Done.
1228     } else if (*s == '.') {
1229       s++;
1230       saw_period = true;
1231       const char *floatDigitsBegin = s;
1232       s = SkipHexDigits(s);
1233       if (containsDigits(floatDigitsBegin, s))
1234         HasSignificandDigits = true;
1235       if (HasSignificandDigits)
1236         checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
1237     }
1238 
1239     if (!HasSignificandDigits) {
1240       Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1241                                                   LangOpts),
1242                    diag::err_hex_constant_requires)
1243           << LangOpts.CPlusPlus << 1;
1244       hadError = true;
1245       return;
1246     }
1247 
1248     // A binary exponent can appear with or with a '.'. If dotted, the
1249     // binary exponent is required.
1250     if (*s == 'p' || *s == 'P') {
1251       checkSeparator(TokLoc, s, CSK_AfterDigits);
1252       const char *Exponent = s;
1253       s++;
1254       saw_exponent = true;
1255       if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
1256       const char *first_non_digit = SkipDigits(s);
1257       if (!containsDigits(s, first_non_digit)) {
1258         if (!hadError) {
1259           Diags.Report(Lexer::AdvanceToTokenCharacter(
1260                            TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
1261                        diag::err_exponent_has_no_digits);
1262           hadError = true;
1263         }
1264         return;
1265       }
1266       checkSeparator(TokLoc, s, CSK_BeforeDigits);
1267       s = first_non_digit;
1268 
1269       if (!LangOpts.HexFloats)
1270         Diags.Report(TokLoc, LangOpts.CPlusPlus
1271                                  ? diag::ext_hex_literal_invalid
1272                                  : diag::ext_hex_constant_invalid);
1273       else if (LangOpts.CPlusPlus17)
1274         Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
1275     } else if (saw_period) {
1276       Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1277                                                   LangOpts),
1278                    diag::err_hex_constant_requires)
1279           << LangOpts.CPlusPlus << 0;
1280       hadError = true;
1281     }
1282     return;
1283   }
1284 
1285   // Handle simple binary numbers 0b01010
1286   if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
1287     // 0b101010 is a C++1y / GCC extension.
1288     Diags.Report(TokLoc, LangOpts.CPlusPlus14
1289                              ? diag::warn_cxx11_compat_binary_literal
1290                          : LangOpts.CPlusPlus ? diag::ext_binary_literal_cxx14
1291                                               : diag::ext_binary_literal);
1292     ++s;
1293     assert(s < ThisTokEnd && "didn't maximally munch?");
1294     radix = 2;
1295     DigitsBegin = s;
1296     s = SkipBinaryDigits(s);
1297     if (s == ThisTokEnd) {
1298       // Done.
1299     } else if (isHexDigit(*s) &&
1300                !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
1301       Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
1302                                                   LangOpts),
1303                    diag::err_invalid_digit)
1304           << StringRef(s, 1) << 2;
1305       hadError = true;
1306     }
1307     // Other suffixes will be diagnosed by the caller.
1308     return;
1309   }
1310 
1311   // For now, the radix is set to 8. If we discover that we have a
1312   // floating point constant, the radix will change to 10. Octal floating
1313   // point constants are not permitted (only decimal and hexadecimal).
1314   radix = 8;
1315   const char *PossibleNewDigitStart = s;
1316   s = SkipOctalDigits(s);
1317   // When the value is 0 followed by a suffix (like 0wb), we want to leave 0
1318   // as the start of the digits. So if skipping octal digits does not skip
1319   // anything, we leave the digit start where it was.
1320   if (s != PossibleNewDigitStart)
1321     DigitsBegin = PossibleNewDigitStart;
1322 
1323   if (s == ThisTokEnd)
1324     return; // Done, simple octal number like 01234
1325 
1326   // If we have some other non-octal digit that *is* a decimal digit, see if
1327   // this is part of a floating point number like 094.123 or 09e1.
1328   if (isDigit(*s)) {
1329     const char *EndDecimal = SkipDigits(s);
1330     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
1331       s = EndDecimal;
1332       radix = 10;
1333     }
1334   }
1335 
1336   ParseDecimalOrOctalCommon(TokLoc);
1337 }
1338 
1339 static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
1340   switch (Radix) {
1341   case 2:
1342     return NumDigits <= 64;
1343   case 8:
1344     return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
1345   case 10:
1346     return NumDigits <= 19; // floor(log10(2^64))
1347   case 16:
1348     return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
1349   default:
1350     llvm_unreachable("impossible Radix");
1351   }
1352 }
1353 
1354 /// GetIntegerValue - Convert this numeric literal value to an APInt that
1355 /// matches Val's input width.  If there is an overflow, set Val to the low bits
1356 /// of the result and return true.  Otherwise, return false.
1357 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
1358   // Fast path: Compute a conservative bound on the maximum number of
1359   // bits per digit in this radix. If we can't possibly overflow a
1360   // uint64 based on that bound then do the simple conversion to
1361   // integer. This avoids the expensive overflow checking below, and
1362   // handles the common cases that matter (small decimal integers and
1363   // hex/octal values which don't overflow).
1364   const unsigned NumDigits = SuffixBegin - DigitsBegin;
1365   if (alwaysFitsInto64Bits(radix, NumDigits)) {
1366     uint64_t N = 0;
1367     for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
1368       if (!isDigitSeparator(*Ptr))
1369         N = N * radix + llvm::hexDigitValue(*Ptr);
1370 
1371     // This will truncate the value to Val's input width. Simply check
1372     // for overflow by comparing.
1373     Val = N;
1374     return Val.getZExtValue() != N;
1375   }
1376 
1377   Val = 0;
1378   const char *Ptr = DigitsBegin;
1379 
1380   llvm::APInt RadixVal(Val.getBitWidth(), radix);
1381   llvm::APInt CharVal(Val.getBitWidth(), 0);
1382   llvm::APInt OldVal = Val;
1383 
1384   bool OverflowOccurred = false;
1385   while (Ptr < SuffixBegin) {
1386     if (isDigitSeparator(*Ptr)) {
1387       ++Ptr;
1388       continue;
1389     }
1390 
1391     unsigned C = llvm::hexDigitValue(*Ptr++);
1392 
1393     // If this letter is out of bound for this radix, reject it.
1394     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1395 
1396     CharVal = C;
1397 
1398     // Add the digit to the value in the appropriate radix.  If adding in digits
1399     // made the value smaller, then this overflowed.
1400     OldVal = Val;
1401 
1402     // Multiply by radix, did overflow occur on the multiply?
1403     Val *= RadixVal;
1404     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
1405 
1406     // Add value, did overflow occur on the value?
1407     //   (a + b) ult b  <=> overflow
1408     Val += CharVal;
1409     OverflowOccurred |= Val.ult(CharVal);
1410   }
1411   return OverflowOccurred;
1412 }
1413 
1414 llvm::APFloat::opStatus
1415 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
1416   using llvm::APFloat;
1417 
1418   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
1419 
1420   llvm::SmallString<16> Buffer;
1421   StringRef Str(ThisTokBegin, n);
1422   if (Str.contains('\'')) {
1423     Buffer.reserve(n);
1424     std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
1425                         &isDigitSeparator);
1426     Str = Buffer;
1427   }
1428 
1429   auto StatusOrErr =
1430       Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
1431   assert(StatusOrErr && "Invalid floating point representation");
1432   return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
1433                                                : APFloat::opInvalidOp;
1434 }
1435 
/// Returns true if \p c is one of the exponent markers 'p', 'P', 'e' or 'E'.
static inline bool IsExponentPart(char c) {
  switch (c) {
  case 'p':
  case 'P':
  case 'e':
  case 'E':
    return true;
  default:
    return false;
  }
}
1439 
1440 bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
1441   assert(radix == 16 || radix == 10);
1442 
1443   // Find how many digits are needed to store the whole literal.
1444   unsigned NumDigits = SuffixBegin - DigitsBegin;
1445   if (saw_period) --NumDigits;
1446 
1447   // Initial scan of the exponent if it exists
1448   bool ExpOverflowOccurred = false;
1449   bool NegativeExponent = false;
1450   const char *ExponentBegin;
1451   uint64_t Exponent = 0;
1452   int64_t BaseShift = 0;
1453   if (saw_exponent) {
1454     const char *Ptr = DigitsBegin;
1455 
1456     while (!IsExponentPart(*Ptr)) ++Ptr;
1457     ExponentBegin = Ptr;
1458     ++Ptr;
1459     NegativeExponent = *Ptr == '-';
1460     if (NegativeExponent) ++Ptr;
1461 
1462     unsigned NumExpDigits = SuffixBegin - Ptr;
1463     if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
1464       llvm::StringRef ExpStr(Ptr, NumExpDigits);
1465       llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
1466       Exponent = ExpInt.getZExtValue();
1467     } else {
1468       ExpOverflowOccurred = true;
1469     }
1470 
1471     if (NegativeExponent) BaseShift -= Exponent;
1472     else BaseShift += Exponent;
1473   }
1474 
1475   // Number of bits needed for decimal literal is
1476   //   ceil(NumDigits * log2(10))       Integral part
1477   // + Scale                            Fractional part
1478   // + ceil(Exponent * log2(10))        Exponent
1479   // --------------------------------------------------
1480   //   ceil((NumDigits + Exponent) * log2(10)) + Scale
1481   //
1482   // But for simplicity in handling integers, we can round up log2(10) to 4,
1483   // making:
1484   // 4 * (NumDigits + Exponent) + Scale
1485   //
1486   // Number of digits needed for hexadecimal literal is
1487   //   4 * NumDigits                    Integral part
1488   // + Scale                            Fractional part
1489   // + Exponent                         Exponent
1490   // --------------------------------------------------
1491   //   (4 * NumDigits) + Scale + Exponent
1492   uint64_t NumBitsNeeded;
1493   if (radix == 10)
1494     NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
1495   else
1496     NumBitsNeeded = 4 * NumDigits + Exponent + Scale;
1497 
1498   if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
1499     ExpOverflowOccurred = true;
1500   llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);
1501 
1502   bool FoundDecimal = false;
1503 
1504   int64_t FractBaseShift = 0;
1505   const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
1506   for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
1507     if (*Ptr == '.') {
1508       FoundDecimal = true;
1509       continue;
1510     }
1511 
1512     // Normal reading of an integer
1513     unsigned C = llvm::hexDigitValue(*Ptr);
1514     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
1515 
1516     Val *= radix;
1517     Val += C;
1518 
1519     if (FoundDecimal)
1520       // Keep track of how much we will need to adjust this value by from the
1521       // number of digits past the radix point.
1522       --FractBaseShift;
1523   }
1524 
1525   // For a radix of 16, we will be multiplying by 2 instead of 16.
1526   if (radix == 16) FractBaseShift *= 4;
1527   BaseShift += FractBaseShift;
1528 
1529   Val <<= Scale;
1530 
1531   uint64_t Base = (radix == 16) ? 2 : 10;
1532   if (BaseShift > 0) {
1533     for (int64_t i = 0; i < BaseShift; ++i) {
1534       Val *= Base;
1535     }
1536   } else if (BaseShift < 0) {
1537     for (int64_t i = BaseShift; i < 0 && !Val.isZero(); ++i)
1538       Val = Val.udiv(Base);
1539   }
1540 
1541   bool IntOverflowOccurred = false;
1542   auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
1543   if (Val.getBitWidth() > StoreVal.getBitWidth()) {
1544     IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
1545     StoreVal = Val.trunc(StoreVal.getBitWidth());
1546   } else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
1547     IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
1548     StoreVal = Val.zext(StoreVal.getBitWidth());
1549   } else {
1550     StoreVal = Val;
1551   }
1552 
1553   return IntOverflowOccurred || ExpOverflowOccurred;
1554 }
1555 
1556 /// \verbatim
1557 ///       user-defined-character-literal: [C++11 lex.ext]
1558 ///         character-literal ud-suffix
1559 ///       ud-suffix:
1560 ///         identifier
1561 ///       character-literal: [C++11 lex.ccon]
1562 ///         ' c-char-sequence '
1563 ///         u' c-char-sequence '
1564 ///         U' c-char-sequence '
1565 ///         L' c-char-sequence '
1566 ///         u8' c-char-sequence ' [C++1z lex.ccon]
1567 ///       c-char-sequence:
1568 ///         c-char
1569 ///         c-char-sequence c-char
1570 ///       c-char:
1571 ///         any member of the source character set except the single-quote ',
1572 ///           backslash \, or new-line character
1573 ///         escape-sequence
1574 ///         universal-character-name
1575 ///       escape-sequence:
1576 ///         simple-escape-sequence
1577 ///         octal-escape-sequence
1578 ///         hexadecimal-escape-sequence
1579 ///       simple-escape-sequence:
1580 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1581 ///       octal-escape-sequence:
1582 ///         \ octal-digit
1583 ///         \ octal-digit octal-digit
1584 ///         \ octal-digit octal-digit octal-digit
1585 ///       hexadecimal-escape-sequence:
1586 ///         \x hexadecimal-digit
1587 ///         hexadecimal-escape-sequence hexadecimal-digit
1588 ///       universal-character-name: [C++11 lex.charset]
1589 ///         \u hex-quad
1590 ///         \U hex-quad hex-quad
1591 ///       hex-quad:
1592 ///         hex-digit hex-digit hex-digit hex-digit
1593 /// \endverbatim
1594 ///
1595 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
1596                                      SourceLocation Loc, Preprocessor &PP,
1597                                      tok::TokenKind kind) {
1598   // At this point we know that the character matches the regex "(L|u|U)?'.*'".
1599   HadError = false;
1600 
1601   Kind = kind;
1602 
1603   const char *TokBegin = begin;
1604 
1605   // Skip over wide character determinant.
1606   if (Kind != tok::char_constant)
1607     ++begin;
1608   if (Kind == tok::utf8_char_constant)
1609     ++begin;
1610 
1611   // Skip over the entry quote.
1612   if (begin[0] != '\'') {
1613     PP.Diag(Loc, diag::err_lexing_char);
1614     HadError = true;
1615     return;
1616   }
1617 
1618   ++begin;
1619 
1620   // Remove an optional ud-suffix.
1621   if (end[-1] != '\'') {
1622     const char *UDSuffixEnd = end;
1623     do {
1624       --end;
1625     } while (end[-1] != '\'');
1626     // FIXME: Don't bother with this if !tok.hasUCN().
1627     expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
1628     UDSuffixOffset = end - TokBegin;
1629   }
1630 
1631   // Trim the ending quote.
1632   assert(end != begin && "Invalid token lexed");
1633   --end;
1634 
1635   // FIXME: The "Value" is an uint64_t so we can handle char literals of
1636   // up to 64-bits.
1637   // FIXME: This extensively assumes that 'char' is 8-bits.
1638   assert(PP.getTargetInfo().getCharWidth() == 8 &&
1639          "Assumes char is 8 bits");
1640   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
1641          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
1642          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
1643   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
1644          "Assumes sizeof(wchar) on target is <= 64");
1645 
1646   SmallVector<uint32_t, 4> codepoint_buffer;
1647   codepoint_buffer.resize(end - begin);
1648   uint32_t *buffer_begin = &codepoint_buffer.front();
1649   uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
1650 
1651   // Unicode escapes representing characters that cannot be correctly
1652   // represented in a single code unit are disallowed in character literals
1653   // by this implementation.
1654   uint32_t largest_character_for_kind;
1655   if (tok::wide_char_constant == Kind) {
1656     largest_character_for_kind =
1657         0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
1658   } else if (tok::utf8_char_constant == Kind) {
1659     largest_character_for_kind = 0x7F;
1660   } else if (tok::utf16_char_constant == Kind) {
1661     largest_character_for_kind = 0xFFFF;
1662   } else if (tok::utf32_char_constant == Kind) {
1663     largest_character_for_kind = 0x10FFFF;
1664   } else {
1665     largest_character_for_kind = 0x7Fu;
1666   }
1667 
1668   while (begin != end) {
1669     // Is this a span of non-escape characters?
1670     if (begin[0] != '\\') {
1671       char const *start = begin;
1672       do {
1673         ++begin;
1674       } while (begin != end && *begin != '\\');
1675 
1676       char const *tmp_in_start = start;
1677       uint32_t *tmp_out_start = buffer_begin;
1678       llvm::ConversionResult res =
1679           llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
1680                              reinterpret_cast<llvm::UTF8 const *>(begin),
1681                              &buffer_begin, buffer_end, llvm::strictConversion);
1682       if (res != llvm::conversionOK) {
1683         // If we see bad encoding for unprefixed character literals, warn and
1684         // simply copy the byte values, for compatibility with gcc and
1685         // older versions of clang.
1686         bool NoErrorOnBadEncoding = isOrdinary();
1687         unsigned Msg = diag::err_bad_character_encoding;
1688         if (NoErrorOnBadEncoding)
1689           Msg = diag::warn_bad_character_encoding;
1690         PP.Diag(Loc, Msg);
1691         if (NoErrorOnBadEncoding) {
1692           start = tmp_in_start;
1693           buffer_begin = tmp_out_start;
1694           for (; start != begin; ++start, ++buffer_begin)
1695             *buffer_begin = static_cast<uint8_t>(*start);
1696         } else {
1697           HadError = true;
1698         }
1699       } else {
1700         for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
1701           if (*tmp_out_start > largest_character_for_kind) {
1702             HadError = true;
1703             PP.Diag(Loc, diag::err_character_too_large);
1704           }
1705         }
1706       }
1707 
1708       continue;
1709     }
1710     // Is this a Universal Character Name escape?
1711     if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
1712       unsigned short UcnLen = 0;
1713       if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
1714                             FullSourceLoc(Loc, PP.getSourceManager()),
1715                             &PP.getDiagnostics(), PP.getLangOpts(), true)) {
1716         HadError = true;
1717       } else if (*buffer_begin > largest_character_for_kind) {
1718         HadError = true;
1719         PP.Diag(Loc, diag::err_character_too_large);
1720       }
1721 
1722       ++buffer_begin;
1723       continue;
1724     }
1725     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
1726     uint64_t result =
1727       ProcessCharEscape(TokBegin, begin, end, HadError,
1728                         FullSourceLoc(Loc,PP.getSourceManager()),
1729                         CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
1730     *buffer_begin++ = result;
1731   }
1732 
1733   unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
1734 
1735   if (NumCharsSoFar > 1) {
1736     if (isOrdinary() && NumCharsSoFar == 4)
1737       PP.Diag(Loc, diag::warn_four_char_character_literal);
1738     else if (isOrdinary())
1739       PP.Diag(Loc, diag::warn_multichar_character_literal);
1740     else {
1741       PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
1742       HadError = true;
1743     }
1744     IsMultiChar = true;
1745   } else {
1746     IsMultiChar = false;
1747   }
1748 
1749   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
1750 
1751   // Narrow character literals act as though their value is concatenated
1752   // in this implementation, but warn on overflow.
1753   bool multi_char_too_long = false;
1754   if (isOrdinary() && isMultiChar()) {
1755     LitVal = 0;
1756     for (size_t i = 0; i < NumCharsSoFar; ++i) {
1757       // check for enough leading zeros to shift into
1758       multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
1759       LitVal <<= 8;
1760       LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
1761     }
1762   } else if (NumCharsSoFar > 0) {
1763     // otherwise just take the last character
1764     LitVal = buffer_begin[-1];
1765   }
1766 
1767   if (!HadError && multi_char_too_long) {
1768     PP.Diag(Loc, diag::warn_char_constant_too_large);
1769   }
1770 
1771   // Transfer the value from APInt to uint64_t
1772   Value = LitVal.getZExtValue();
1773 
1774   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
1775   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
1776   // character constants are not sign extended in this implementation:
1777   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
1778   if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
1779       PP.getLangOpts().CharIsSigned)
1780     Value = (signed char)Value;
1781 }
1782 
1783 /// \verbatim
1784 ///       string-literal: [C++0x lex.string]
1785 ///         encoding-prefix " [s-char-sequence] "
1786 ///         encoding-prefix R raw-string
1787 ///       encoding-prefix:
1788 ///         u8
1789 ///         u
1790 ///         U
1791 ///         L
1792 ///       s-char-sequence:
1793 ///         s-char
1794 ///         s-char-sequence s-char
1795 ///       s-char:
1796 ///         any member of the source character set except the double-quote ",
1797 ///           backslash \, or new-line character
1798 ///         escape-sequence
1799 ///         universal-character-name
1800 ///       raw-string:
1801 ///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
1802 ///       r-char-sequence:
1803 ///         r-char
1804 ///         r-char-sequence r-char
1805 ///       r-char:
1806 ///         any member of the source character set, except a right parenthesis )
1807 ///           followed by the initial d-char-sequence (which may be empty)
1808 ///           followed by a double quote ".
1809 ///       d-char-sequence:
1810 ///         d-char
1811 ///         d-char-sequence d-char
1812 ///       d-char:
1813 ///         any member of the basic source character set except:
1814 ///           space, the left parenthesis (, the right parenthesis ),
1815 ///           the backslash \, and the control characters representing horizontal
1816 ///           tab, vertical tab, form feed, and newline.
1817 ///       escape-sequence: [C++0x lex.ccon]
1818 ///         simple-escape-sequence
1819 ///         octal-escape-sequence
1820 ///         hexadecimal-escape-sequence
1821 ///       simple-escape-sequence:
1822 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
1823 ///       octal-escape-sequence:
1824 ///         \ octal-digit
1825 ///         \ octal-digit octal-digit
1826 ///         \ octal-digit octal-digit octal-digit
1827 ///       hexadecimal-escape-sequence:
1828 ///         \x hexadecimal-digit
1829 ///         hexadecimal-escape-sequence hexadecimal-digit
1830 ///       universal-character-name:
1831 ///         \u hex-quad
1832 ///         \U hex-quad hex-quad
1833 ///       hex-quad:
1834 ///         hex-digit hex-digit hex-digit hex-digit
1835 /// \endverbatim
1836 ///
1837 StringLiteralParser::
1838 StringLiteralParser(ArrayRef<Token> StringToks,
1839                     Preprocessor &PP)
1840   : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1841     Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
1842     MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
1843     ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
1844   init(StringToks);
1845 }
1846 
1847 void StringLiteralParser::init(ArrayRef<Token> StringToks){
1848   // The literal token may have come from an invalid source location (e.g. due
1849   // to a PCH error), in which case the token length will be 0.
1850   if (StringToks.empty() || StringToks[0].getLength() < 2)
1851     return DiagnoseLexingError(SourceLocation());
1852 
1853   // Scan all of the string portions, remember the max individual token length,
1854   // computing a bound on the concatenated string length, and see whether any
1855   // piece is a wide-string.  If any of the string portions is a wide-string
1856   // literal, the result is a wide-string literal [C99 6.4.5p4].
1857   assert(!StringToks.empty() && "expected at least one token");
1858   MaxTokenLength = StringToks[0].getLength();
1859   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
1860   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
1861   Kind = StringToks[0].getKind();
1862 
1863   hadError = false;
1864 
1865   // Implement Translation Phase #6: concatenation of string literals
1866   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
1867   for (unsigned i = 1; i != StringToks.size(); ++i) {
1868     if (StringToks[i].getLength() < 2)
1869       return DiagnoseLexingError(StringToks[i].getLocation());
1870 
1871     // The string could be shorter than this if it needs cleaning, but this is a
1872     // reasonable bound, which is all we need.
1873     assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
1874     SizeBound += StringToks[i].getLength()-2;  // -2 for "".
1875 
1876     // Remember maximum string piece length.
1877     if (StringToks[i].getLength() > MaxTokenLength)
1878       MaxTokenLength = StringToks[i].getLength();
1879 
1880     // Remember if we see any wide or utf-8/16/32 strings.
1881     // Also check for illegal concatenations.
1882     if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
1883       if (isOrdinary()) {
1884         Kind = StringToks[i].getKind();
1885       } else {
1886         if (Diags)
1887           Diags->Report(StringToks[i].getLocation(),
1888                         diag::err_unsupported_string_concat);
1889         hadError = true;
1890       }
1891     }
1892   }
1893 
1894   // Include space for the null terminator.
1895   ++SizeBound;
1896 
1897   // TODO: K&R warning: "traditional C rejects string constant concatenation"
1898 
1899   // Get the width in bytes of char/wchar_t/char16_t/char32_t
1900   CharByteWidth = getCharWidth(Kind, Target);
1901   assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
1902   CharByteWidth /= 8;
1903 
1904   // The output buffer size needs to be large enough to hold wide characters.
1905   // This is a worst-case assumption which basically corresponds to L"" "long".
1906   SizeBound *= CharByteWidth;
1907 
1908   // Size the temporary buffer to hold the result string data.
1909   ResultBuf.resize(SizeBound);
1910 
1911   // Likewise, but for each string piece.
1912   SmallString<512> TokenBuf;
1913   TokenBuf.resize(MaxTokenLength);
1914 
1915   // Loop over all the strings, getting their spelling, and expanding them to
1916   // wide strings as appropriate.
1917   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
1918 
1919   Pascal = false;
1920 
1921   SourceLocation UDSuffixTokLoc;
1922 
1923   for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
1924     const char *ThisTokBuf = &TokenBuf[0];
1925     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
1926     // that ThisTokBuf points to a buffer that is big enough for the whole token
1927     // and 'spelled' tokens can only shrink.
1928     bool StringInvalid = false;
1929     unsigned ThisTokLen =
1930       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
1931                          &StringInvalid);
1932     if (StringInvalid)
1933       return DiagnoseLexingError(StringToks[i].getLocation());
1934 
1935     const char *ThisTokBegin = ThisTokBuf;
1936     const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
1937 
1938     // Remove an optional ud-suffix.
1939     if (ThisTokEnd[-1] != '"') {
1940       const char *UDSuffixEnd = ThisTokEnd;
1941       do {
1942         --ThisTokEnd;
1943       } while (ThisTokEnd[-1] != '"');
1944 
1945       StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
1946 
1947       if (UDSuffixBuf.empty()) {
1948         if (StringToks[i].hasUCN())
1949           expandUCNs(UDSuffixBuf, UDSuffix);
1950         else
1951           UDSuffixBuf.assign(UDSuffix);
1952         UDSuffixToken = i;
1953         UDSuffixOffset = ThisTokEnd - ThisTokBuf;
1954         UDSuffixTokLoc = StringToks[i].getLocation();
1955       } else {
1956         SmallString<32> ExpandedUDSuffix;
1957         if (StringToks[i].hasUCN()) {
1958           expandUCNs(ExpandedUDSuffix, UDSuffix);
1959           UDSuffix = ExpandedUDSuffix;
1960         }
1961 
1962         // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
1963         // result of a concatenation involving at least one user-defined-string-
1964         // literal, all the participating user-defined-string-literals shall
1965         // have the same ud-suffix.
1966         if (UDSuffixBuf != UDSuffix) {
1967           if (Diags) {
1968             SourceLocation TokLoc = StringToks[i].getLocation();
1969             Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
1970               << UDSuffixBuf << UDSuffix
1971               << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
1972               << SourceRange(TokLoc, TokLoc);
1973           }
1974           hadError = true;
1975         }
1976       }
1977     }
1978 
1979     // Strip the end quote.
1980     --ThisTokEnd;
1981 
1982     // TODO: Input character set mapping support.
1983 
1984     // Skip marker for wide or unicode strings.
1985     if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
1986       ++ThisTokBuf;
1987       // Skip 8 of u8 marker for utf8 strings.
1988       if (ThisTokBuf[0] == '8')
1989         ++ThisTokBuf;
1990     }
1991 
1992     // Check for raw string
1993     if (ThisTokBuf[0] == 'R') {
1994       if (ThisTokBuf[1] != '"') {
1995         // The file may have come from PCH and then changed after loading the
1996         // PCH; Fail gracefully.
1997         return DiagnoseLexingError(StringToks[i].getLocation());
1998       }
1999       ThisTokBuf += 2; // skip R"
2000 
2001       // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
2002       // characters.
2003       constexpr unsigned MaxRawStrDelimLen = 16;
2004 
2005       const char *Prefix = ThisTokBuf;
2006       while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
2007              ThisTokBuf[0] != '(')
2008         ++ThisTokBuf;
2009       if (ThisTokBuf[0] != '(')
2010         return DiagnoseLexingError(StringToks[i].getLocation());
2011       ++ThisTokBuf; // skip '('
2012 
2013       // Remove same number of characters from the end
2014       ThisTokEnd -= ThisTokBuf - Prefix;
2015       if (ThisTokEnd < ThisTokBuf)
2016         return DiagnoseLexingError(StringToks[i].getLocation());
2017 
2018       // C++14 [lex.string]p4: A source-file new-line in a raw string literal
2019       // results in a new-line in the resulting execution string-literal.
2020       StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
2021       while (!RemainingTokenSpan.empty()) {
2022         // Split the string literal on \r\n boundaries.
2023         size_t CRLFPos = RemainingTokenSpan.find("\r\n");
2024         StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
2025         StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
2026 
2027         // Copy everything before the \r\n sequence into the string literal.
2028         if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
2029           hadError = true;
2030 
2031         // Point into the \n inside the \r\n sequence and operate on the
2032         // remaining portion of the literal.
2033         RemainingTokenSpan = AfterCRLF.substr(1);
2034       }
2035     } else {
2036       if (ThisTokBuf[0] != '"') {
2037         // The file may have come from PCH and then changed after loading the
2038         // PCH; Fail gracefully.
2039         return DiagnoseLexingError(StringToks[i].getLocation());
2040       }
2041       ++ThisTokBuf; // skip "
2042 
2043       // Check if this is a pascal string
2044       if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
2045           ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
2046 
2047         // If the \p sequence is found in the first token, we have a pascal string
2048         // Otherwise, if we already have a pascal string, ignore the first \p
2049         if (i == 0) {
2050           ++ThisTokBuf;
2051           Pascal = true;
2052         } else if (Pascal)
2053           ThisTokBuf += 2;
2054       }
2055 
2056       while (ThisTokBuf != ThisTokEnd) {
2057         // Is this a span of non-escape characters?
2058         if (ThisTokBuf[0] != '\\') {
2059           const char *InStart = ThisTokBuf;
2060           do {
2061             ++ThisTokBuf;
2062           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
2063 
2064           // Copy the character span over.
2065           if (CopyStringFragment(StringToks[i], ThisTokBegin,
2066                                  StringRef(InStart, ThisTokBuf - InStart)))
2067             hadError = true;
2068           continue;
2069         }
2070         // Is this a Universal Character Name escape?
2071         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
2072             ThisTokBuf[1] == 'N') {
2073           EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
2074                           ResultPtr, hadError,
2075                           FullSourceLoc(StringToks[i].getLocation(), SM),
2076                           CharByteWidth, Diags, Features);
2077           continue;
2078         }
2079         // Otherwise, this is a non-UCN escape character.  Process it.
2080         unsigned ResultChar =
2081           ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
2082                             FullSourceLoc(StringToks[i].getLocation(), SM),
2083                             CharByteWidth*8, Diags, Features);
2084 
2085         if (CharByteWidth == 4) {
2086           // FIXME: Make the type of the result buffer correct instead of
2087           // using reinterpret_cast.
2088           llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
2089           *ResultWidePtr = ResultChar;
2090           ResultPtr += 4;
2091         } else if (CharByteWidth == 2) {
2092           // FIXME: Make the type of the result buffer correct instead of
2093           // using reinterpret_cast.
2094           llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
2095           *ResultWidePtr = ResultChar & 0xFFFF;
2096           ResultPtr += 2;
2097         } else {
2098           assert(CharByteWidth == 1 && "Unexpected char width");
2099           *ResultPtr++ = ResultChar & 0xFF;
2100         }
2101       }
2102     }
2103   }
2104 
2105   if (Pascal) {
2106     if (CharByteWidth == 4) {
2107       // FIXME: Make the type of the result buffer correct instead of
2108       // using reinterpret_cast.
2109       llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
2110       ResultWidePtr[0] = GetNumStringChars() - 1;
2111     } else if (CharByteWidth == 2) {
2112       // FIXME: Make the type of the result buffer correct instead of
2113       // using reinterpret_cast.
2114       llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
2115       ResultWidePtr[0] = GetNumStringChars() - 1;
2116     } else {
2117       assert(CharByteWidth == 1 && "Unexpected char width");
2118       ResultBuf[0] = GetNumStringChars() - 1;
2119     }
2120 
2121     // Verify that pascal strings aren't too large.
2122     if (GetStringLength() > 256) {
2123       if (Diags)
2124         Diags->Report(StringToks.front().getLocation(),
2125                       diag::err_pascal_string_too_long)
2126           << SourceRange(StringToks.front().getLocation(),
2127                          StringToks.back().getLocation());
2128       hadError = true;
2129       return;
2130     }
2131   } else if (Diags) {
2132     // Complain if this string literal has too many characters.
2133     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
2134 
2135     if (GetNumStringChars() > MaxChars)
2136       Diags->Report(StringToks.front().getLocation(),
2137                     diag::ext_string_too_long)
2138         << GetNumStringChars() << MaxChars
2139         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
2140         << SourceRange(StringToks.front().getLocation(),
2141                        StringToks.back().getLocation());
2142   }
2143 }
2144 
2145 static const char *resyncUTF8(const char *Err, const char *End) {
2146   if (Err == End)
2147     return End;
2148   End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
2149   while (++Err != End && (*Err & 0xC0) == 0x80)
2150     ;
2151   return Err;
2152 }
2153 
2154 /// This function copies from Fragment, which is a sequence of bytes
2155 /// within Tok's contents (which begin at TokBegin) into ResultPtr.
2156 /// Performs widening for multi-byte characters.
2157 bool StringLiteralParser::CopyStringFragment(const Token &Tok,
2158                                              const char *TokBegin,
2159                                              StringRef Fragment) {
2160   const llvm::UTF8 *ErrorPtrTmp;
2161   if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
2162     return false;
2163 
2164   // If we see bad encoding for unprefixed string literals, warn and
2165   // simply copy the byte values, for compatibility with gcc and older
2166   // versions of clang.
2167   bool NoErrorOnBadEncoding = isOrdinary();
2168   if (NoErrorOnBadEncoding) {
2169     memcpy(ResultPtr, Fragment.data(), Fragment.size());
2170     ResultPtr += Fragment.size();
2171   }
2172 
2173   if (Diags) {
2174     const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2175 
2176     FullSourceLoc SourceLoc(Tok.getLocation(), SM);
2177     const DiagnosticBuilder &Builder =
2178       Diag(Diags, Features, SourceLoc, TokBegin,
2179            ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
2180            NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
2181                                 : diag::err_bad_string_encoding);
2182 
2183     const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2184     StringRef NextFragment(NextStart, Fragment.end()-NextStart);
2185 
2186     // Decode into a dummy buffer.
2187     SmallString<512> Dummy;
2188     Dummy.reserve(Fragment.size() * CharByteWidth);
2189     char *Ptr = Dummy.data();
2190 
2191     while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
2192       const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
2193       NextStart = resyncUTF8(ErrorPtr, Fragment.end());
2194       Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
2195                                      ErrorPtr, NextStart);
2196       NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
2197     }
2198   }
2199   return !NoErrorOnBadEncoding;
2200 }
2201 
2202 void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
2203   hadError = true;
2204   if (Diags)
2205     Diags->Report(Loc, diag::err_lexing_string);
2206 }
2207 
2208 /// getOffsetOfStringByte - This function returns the offset of the
2209 /// specified byte of the string data represented by Token.  This handles
2210 /// advancing over escape sequences in the string.
2211 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
2212                                                     unsigned ByteNo) const {
2213   // Get the spelling of the token.
2214   SmallString<32> SpellingBuffer;
2215   SpellingBuffer.resize(Tok.getLength());
2216 
2217   bool StringInvalid = false;
2218   const char *SpellingPtr = &SpellingBuffer[0];
2219   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
2220                                        &StringInvalid);
2221   if (StringInvalid)
2222     return 0;
2223 
2224   const char *SpellingStart = SpellingPtr;
2225   const char *SpellingEnd = SpellingPtr+TokLen;
2226 
2227   // Handle UTF-8 strings just like narrow strings.
2228   if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
2229     SpellingPtr += 2;
2230 
2231   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
2232          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
2233 
2234   // For raw string literals, this is easy.
2235   if (SpellingPtr[0] == 'R') {
2236     assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
2237     // Skip 'R"'.
2238     SpellingPtr += 2;
2239     while (*SpellingPtr != '(') {
2240       ++SpellingPtr;
2241       assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
2242     }
2243     // Skip '('.
2244     ++SpellingPtr;
2245     return SpellingPtr - SpellingStart + ByteNo;
2246   }
2247 
2248   // Skip over the leading quote
2249   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
2250   ++SpellingPtr;
2251 
2252   // Skip over bytes until we find the offset we're looking for.
2253   while (ByteNo) {
2254     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
2255 
2256     // Step over non-escapes simply.
2257     if (*SpellingPtr != '\\') {
2258       ++SpellingPtr;
2259       --ByteNo;
2260       continue;
2261     }
2262 
2263     // Otherwise, this is an escape character.  Advance over it.
2264     bool HadError = false;
2265     if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
2266         SpellingPtr[1] == 'N') {
2267       const char *EscapePtr = SpellingPtr;
2268       unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
2269                                       1, Features, HadError);
2270       if (Len > ByteNo) {
2271         // ByteNo is somewhere within the escape sequence.
2272         SpellingPtr = EscapePtr;
2273         break;
2274       }
2275       ByteNo -= Len;
2276     } else {
2277       ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
2278                         FullSourceLoc(Tok.getLocation(), SM),
2279                         CharByteWidth*8, Diags, Features);
2280       --ByteNo;
2281     }
2282     assert(!HadError && "This method isn't valid on erroneous strings");
2283   }
2284 
2285   return SpellingPtr-SpellingStart;
2286 }
2287 
2288 /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
2289 /// suffixes as ud-suffixes, because the diagnostic experience is better if we
2290 /// treat it as an invalid suffix.
2291 bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
2292                                           StringRef Suffix) {
2293   return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
2294          Suffix == "sv";
2295 }
2296