1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the NumericLiteralParser, CharLiteralParser, and 10 // StringLiteralParser interfaces. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "clang/Lex/LiteralSupport.h" 15 #include "clang/Basic/CharInfo.h" 16 #include "clang/Basic/LangOptions.h" 17 #include "clang/Basic/SourceLocation.h" 18 #include "clang/Basic/TargetInfo.h" 19 #include "clang/Lex/LexDiagnostic.h" 20 #include "clang/Lex/Lexer.h" 21 #include "clang/Lex/Preprocessor.h" 22 #include "clang/Lex/Token.h" 23 #include "llvm/ADT/APInt.h" 24 #include "llvm/ADT/ScopeExit.h" 25 #include "llvm/ADT/SmallVector.h" 26 #include "llvm/ADT/StringExtras.h" 27 #include "llvm/ADT/StringSwitch.h" 28 #include "llvm/Support/ConvertUTF.h" 29 #include "llvm/Support/Error.h" 30 #include "llvm/Support/ErrorHandling.h" 31 #include "llvm/Support/Unicode.h" 32 #include <algorithm> 33 #include <cassert> 34 #include <cstddef> 35 #include <cstdint> 36 #include <cstring> 37 #include <string> 38 39 using namespace clang; 40 41 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) { 42 switch (kind) { 43 default: llvm_unreachable("Unknown token type!"); 44 case tok::char_constant: 45 case tok::string_literal: 46 case tok::utf8_char_constant: 47 case tok::utf8_string_literal: 48 return Target.getCharWidth(); 49 case tok::wide_char_constant: 50 case tok::wide_string_literal: 51 return Target.getWCharWidth(); 52 case tok::utf16_char_constant: 53 case tok::utf16_string_literal: 54 return Target.getChar16Width(); 55 case tok::utf32_char_constant: 56 case 
tok::utf32_string_literal: 57 return Target.getChar32Width(); 58 } 59 } 60 61 static unsigned getEncodingPrefixLen(tok::TokenKind kind) { 62 switch (kind) { 63 default: 64 llvm_unreachable("Unknown token type!"); 65 case tok::char_constant: 66 case tok::string_literal: 67 return 0; 68 case tok::utf8_char_constant: 69 case tok::utf8_string_literal: 70 return 2; 71 case tok::wide_char_constant: 72 case tok::wide_string_literal: 73 case tok::utf16_char_constant: 74 case tok::utf16_string_literal: 75 case tok::utf32_char_constant: 76 case tok::utf32_string_literal: 77 return 1; 78 } 79 } 80 81 static CharSourceRange MakeCharSourceRange(const LangOptions &Features, 82 FullSourceLoc TokLoc, 83 const char *TokBegin, 84 const char *TokRangeBegin, 85 const char *TokRangeEnd) { 86 SourceLocation Begin = 87 Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin, 88 TokLoc.getManager(), Features); 89 SourceLocation End = 90 Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin, 91 TokLoc.getManager(), Features); 92 return CharSourceRange::getCharRange(Begin, End); 93 } 94 95 /// Produce a diagnostic highlighting some portion of a literal. 96 /// 97 /// Emits the diagnostic \p DiagID, highlighting the range of characters from 98 /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be 99 /// a substring of a spelling buffer for the token beginning at \p TokBegin. 
100 static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, 101 const LangOptions &Features, FullSourceLoc TokLoc, 102 const char *TokBegin, const char *TokRangeBegin, 103 const char *TokRangeEnd, unsigned DiagID) { 104 SourceLocation Begin = 105 Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin, 106 TokLoc.getManager(), Features); 107 return Diags->Report(Begin, DiagID) << 108 MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd); 109 } 110 111 static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) { 112 switch (Escape) { 113 case '\'': 114 case '"': 115 case '?': 116 case '\\': 117 case 'a': 118 case 'b': 119 case 'f': 120 case 'n': 121 case 'r': 122 case 't': 123 case 'v': 124 return true; 125 } 126 return false; 127 } 128 129 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in 130 /// either a character or a string literal. 131 static unsigned ProcessCharEscape(const char *ThisTokBegin, 132 const char *&ThisTokBuf, 133 const char *ThisTokEnd, bool &HadError, 134 FullSourceLoc Loc, unsigned CharWidth, 135 DiagnosticsEngine *Diags, 136 const LangOptions &Features, 137 StringLiteralEvalMethod EvalMethod) { 138 const char *EscapeBegin = ThisTokBuf; 139 bool Delimited = false; 140 bool EndDelimiterFound = false; 141 142 // Skip the '\' char. 143 ++ThisTokBuf; 144 145 // We know that this character can't be off the end of the buffer, because 146 // that would have been \", which would not have been the end of string. 147 unsigned ResultChar = *ThisTokBuf++; 148 char Escape = ResultChar; 149 switch (ResultChar) { 150 // These map to themselves. 151 case '\\': case '\'': case '"': case '?': break; 152 153 // These have fixed mappings. 
154 case 'a': 155 // TODO: K&R: the meaning of '\\a' is different in traditional C 156 ResultChar = 7; 157 break; 158 case 'b': 159 ResultChar = 8; 160 break; 161 case 'e': 162 if (Diags) 163 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 164 diag::ext_nonstandard_escape) << "e"; 165 ResultChar = 27; 166 break; 167 case 'E': 168 if (Diags) 169 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 170 diag::ext_nonstandard_escape) << "E"; 171 ResultChar = 27; 172 break; 173 case 'f': 174 ResultChar = 12; 175 break; 176 case 'n': 177 ResultChar = 10; 178 break; 179 case 'r': 180 ResultChar = 13; 181 break; 182 case 't': 183 ResultChar = 9; 184 break; 185 case 'v': 186 ResultChar = 11; 187 break; 188 case 'x': { // Hex escape. 189 ResultChar = 0; 190 if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') { 191 Delimited = true; 192 ThisTokBuf++; 193 if (*ThisTokBuf == '}') { 194 HadError = true; 195 if (Diags) 196 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 197 diag::err_delimited_escape_empty); 198 } 199 } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) { 200 if (Diags) 201 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 202 diag::err_hex_escape_no_digits) << "x"; 203 return ResultChar; 204 } 205 206 // Hex escapes are a maximal series of hex digits. 207 bool Overflow = false; 208 for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) { 209 if (Delimited && *ThisTokBuf == '}') { 210 ThisTokBuf++; 211 EndDelimiterFound = true; 212 break; 213 } 214 int CharVal = llvm::hexDigitValue(*ThisTokBuf); 215 if (CharVal == -1) { 216 // Non delimited hex escape sequences stop at the first non-hex digit. 217 if (!Delimited) 218 break; 219 HadError = true; 220 if (Diags) 221 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 222 diag::err_delimited_escape_invalid) 223 << StringRef(ThisTokBuf, 1); 224 continue; 225 } 226 // About to shift out a digit? 
227 if (ResultChar & 0xF0000000) 228 Overflow = true; 229 ResultChar <<= 4; 230 ResultChar |= CharVal; 231 } 232 // See if any bits will be truncated when evaluated as a character. 233 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { 234 Overflow = true; 235 ResultChar &= ~0U >> (32-CharWidth); 236 } 237 238 // Check for overflow. 239 if (!HadError && Overflow) { // Too many digits to fit in 240 HadError = true; 241 if (Diags) 242 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 243 diag::err_escape_too_large) 244 << 0; 245 } 246 break; 247 } 248 case '0': case '1': case '2': case '3': 249 case '4': case '5': case '6': case '7': { 250 // Octal escapes. 251 --ThisTokBuf; 252 ResultChar = 0; 253 254 // Octal escapes are a series of octal digits with maximum length 3. 255 // "\0123" is a two digit sequence equal to "\012" "3". 256 unsigned NumDigits = 0; 257 do { 258 ResultChar <<= 3; 259 ResultChar |= *ThisTokBuf++ - '0'; 260 ++NumDigits; 261 } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 && 262 ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7'); 263 264 // Check for overflow. Reject '\777', but not L'\777'. 
265 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { 266 if (Diags) 267 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 268 diag::err_escape_too_large) << 1; 269 ResultChar &= ~0U >> (32-CharWidth); 270 } 271 break; 272 } 273 case 'o': { 274 bool Overflow = false; 275 if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') { 276 HadError = true; 277 if (Diags) 278 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 279 diag::err_delimited_escape_missing_brace) 280 << "o"; 281 282 break; 283 } 284 ResultChar = 0; 285 Delimited = true; 286 ++ThisTokBuf; 287 if (*ThisTokBuf == '}') { 288 HadError = true; 289 if (Diags) 290 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 291 diag::err_delimited_escape_empty); 292 } 293 294 while (ThisTokBuf != ThisTokEnd) { 295 if (*ThisTokBuf == '}') { 296 EndDelimiterFound = true; 297 ThisTokBuf++; 298 break; 299 } 300 if (*ThisTokBuf < '0' || *ThisTokBuf > '7') { 301 HadError = true; 302 if (Diags) 303 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 304 diag::err_delimited_escape_invalid) 305 << StringRef(ThisTokBuf, 1); 306 ThisTokBuf++; 307 continue; 308 } 309 // Check if one of the top three bits is set before shifting them out. 310 if (ResultChar & 0xE0000000) 311 Overflow = true; 312 313 ResultChar <<= 3; 314 ResultChar |= *ThisTokBuf++ - '0'; 315 } 316 // Check for overflow. Reject '\777', but not L'\777'. 317 if (!HadError && 318 (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) { 319 HadError = true; 320 if (Diags) 321 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 322 diag::err_escape_too_large) 323 << 1; 324 ResultChar &= ~0U >> (32 - CharWidth); 325 } 326 break; 327 } 328 // Otherwise, these are not valid escapes. 329 case '(': case '{': case '[': case '%': 330 // GCC accepts these as extensions. We warn about them as such though. 
331 if (Diags) 332 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 333 diag::ext_nonstandard_escape) 334 << std::string(1, ResultChar); 335 break; 336 default: 337 if (!Diags) 338 break; 339 340 if (isPrintable(ResultChar)) 341 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 342 diag::ext_unknown_escape) 343 << std::string(1, ResultChar); 344 else 345 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 346 diag::ext_unknown_escape) 347 << "x" + llvm::utohexstr(ResultChar); 348 break; 349 } 350 351 if (Delimited && Diags) { 352 if (!EndDelimiterFound) 353 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 354 diag::err_expected) 355 << tok::r_brace; 356 else if (!HadError) { 357 Lexer::DiagnoseDelimitedOrNamedEscapeSequence(Loc, false, Features, 358 *Diags); 359 } 360 } 361 362 if (EvalMethod == StringLiteralEvalMethod::Unevaluated && 363 !IsEscapeValidInUnevaluatedStringLiteral(Escape)) { 364 Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, 365 diag::err_unevaluated_string_invalid_escape_sequence) 366 << StringRef(EscapeBegin, ThisTokBuf - EscapeBegin); 367 HadError = true; 368 } 369 370 return ResultChar; 371 } 372 373 static void appendCodePoint(unsigned Codepoint, 374 llvm::SmallVectorImpl<char> &Str) { 375 char ResultBuf[4]; 376 char *ResultPtr = ResultBuf; 377 if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr)) 378 Str.append(ResultBuf, ResultPtr); 379 } 380 381 void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) { 382 for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) { 383 if (*I != '\\') { 384 Buf.push_back(*I); 385 continue; 386 } 387 388 ++I; 389 char Kind = *I; 390 ++I; 391 392 assert(Kind == 'u' || Kind == 'U' || Kind == 'N'); 393 uint32_t CodePoint = 0; 394 395 if (Kind == 'u' && *I == '{') { 396 for (++I; *I != '}'; ++I) { 397 unsigned Value = llvm::hexDigitValue(*I); 398 assert(Value != -1U); 399 CodePoint <<= 4; 400 CodePoint 
+= Value; 401 } 402 appendCodePoint(CodePoint, Buf); 403 continue; 404 } 405 406 if (Kind == 'N') { 407 assert(*I == '{'); 408 ++I; 409 auto Delim = std::find(I, Input.end(), '}'); 410 assert(Delim != Input.end()); 411 StringRef Name(I, std::distance(I, Delim)); 412 std::optional<llvm::sys::unicode::LooseMatchingResult> Res = 413 llvm::sys::unicode::nameToCodepointLooseMatching(Name); 414 assert(Res && "could not find a codepoint that was previously found"); 415 CodePoint = Res->CodePoint; 416 assert(CodePoint != 0xFFFFFFFF); 417 appendCodePoint(CodePoint, Buf); 418 I = Delim; 419 continue; 420 } 421 422 unsigned NumHexDigits; 423 if (Kind == 'u') 424 NumHexDigits = 4; 425 else 426 NumHexDigits = 8; 427 428 assert(I + NumHexDigits <= E); 429 430 for (; NumHexDigits != 0; ++I, --NumHexDigits) { 431 unsigned Value = llvm::hexDigitValue(*I); 432 assert(Value != -1U); 433 434 CodePoint <<= 4; 435 CodePoint += Value; 436 } 437 438 appendCodePoint(CodePoint, Buf); 439 --I; 440 } 441 } 442 443 bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K, 444 const LangOptions &LO) { 445 return LO.MicrosoftExt && 446 (K == tok::kw___FUNCTION__ || K == tok::kw_L__FUNCTION__ || 447 K == tok::kw___FUNCSIG__ || K == tok::kw_L__FUNCSIG__ || 448 K == tok::kw___FUNCDNAME__); 449 } 450 451 bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) { 452 return tok::isStringLiteral(Tok.getKind()) || 453 isFunctionLocalStringLiteralMacro(Tok.getKind(), LO); 454 } 455 456 static bool ProcessNumericUCNEscape(const char *ThisTokBegin, 457 const char *&ThisTokBuf, 458 const char *ThisTokEnd, uint32_t &UcnVal, 459 unsigned short &UcnLen, bool &Delimited, 460 FullSourceLoc Loc, DiagnosticsEngine *Diags, 461 const LangOptions &Features, 462 bool in_char_string_literal = false) { 463 const char *UcnBegin = ThisTokBuf; 464 bool HasError = false; 465 bool EndDelimiterFound = false; 466 467 // Skip the '\u' char's. 
468 ThisTokBuf += 2; 469 Delimited = false; 470 if (UcnBegin[1] == 'u' && in_char_string_literal && 471 ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') { 472 Delimited = true; 473 ThisTokBuf++; 474 } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) { 475 if (Diags) 476 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 477 diag::err_hex_escape_no_digits) 478 << StringRef(&ThisTokBuf[-1], 1); 479 return false; 480 } 481 UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8); 482 483 bool Overflow = false; 484 unsigned short Count = 0; 485 for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen); 486 ++ThisTokBuf) { 487 if (Delimited && *ThisTokBuf == '}') { 488 ++ThisTokBuf; 489 EndDelimiterFound = true; 490 break; 491 } 492 int CharVal = llvm::hexDigitValue(*ThisTokBuf); 493 if (CharVal == -1) { 494 HasError = true; 495 if (!Delimited) 496 break; 497 if (Diags) { 498 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 499 diag::err_delimited_escape_invalid) 500 << StringRef(ThisTokBuf, 1); 501 } 502 Count++; 503 continue; 504 } 505 if (UcnVal & 0xF0000000) { 506 Overflow = true; 507 continue; 508 } 509 UcnVal <<= 4; 510 UcnVal |= CharVal; 511 Count++; 512 } 513 514 if (Overflow) { 515 if (Diags) 516 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 517 diag::err_escape_too_large) 518 << 0; 519 return false; 520 } 521 522 if (Delimited && !EndDelimiterFound) { 523 if (Diags) { 524 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 525 diag::err_expected) 526 << tok::r_brace; 527 } 528 return false; 529 } 530 531 // If we didn't consume the proper number of digits, there is a problem. 532 if (Count == 0 || (!Delimited && Count != UcnLen)) { 533 if (Diags) 534 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 535 Delimited ? 
diag::err_delimited_escape_empty 536 : diag::err_ucn_escape_incomplete); 537 return false; 538 } 539 return !HasError; 540 } 541 542 static void DiagnoseInvalidUnicodeCharacterName( 543 DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc, 544 const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd, 545 llvm::StringRef Name) { 546 547 Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd, 548 diag::err_invalid_ucn_name) 549 << Name; 550 551 namespace u = llvm::sys::unicode; 552 553 std::optional<u::LooseMatchingResult> Res = 554 u::nameToCodepointLooseMatching(Name); 555 if (Res) { 556 Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd, 557 diag::note_invalid_ucn_name_loose_matching) 558 << FixItHint::CreateReplacement( 559 MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin, 560 TokRangeEnd), 561 Res->Name); 562 return; 563 } 564 565 unsigned Distance = 0; 566 SmallVector<u::MatchForCodepointName> Matches = 567 u::nearestMatchesForCodepointName(Name, 5); 568 assert(!Matches.empty() && "No unicode characters found"); 569 570 for (const auto &Match : Matches) { 571 if (Distance == 0) 572 Distance = Match.Distance; 573 if (std::max(Distance, Match.Distance) - 574 std::min(Distance, Match.Distance) > 575 3) 576 break; 577 Distance = Match.Distance; 578 579 std::string Str; 580 llvm::UTF32 V = Match.Value; 581 bool Converted = 582 llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str); 583 (void)Converted; 584 assert(Converted && "Found a match wich is not a unicode character"); 585 586 Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd, 587 diag::note_invalid_ucn_name_candidate) 588 << Match.Name << llvm::utohexstr(Match.Value) 589 << Str // FIXME: Fix the rendering of non printable characters 590 << FixItHint::CreateReplacement( 591 MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin, 592 TokRangeEnd), 593 Match.Name); 594 } 595 } 596 597 static bool 
ProcessNamedUCNEscape(const char *ThisTokBegin, 598 const char *&ThisTokBuf, 599 const char *ThisTokEnd, uint32_t &UcnVal, 600 unsigned short &UcnLen, FullSourceLoc Loc, 601 DiagnosticsEngine *Diags, 602 const LangOptions &Features) { 603 const char *UcnBegin = ThisTokBuf; 604 assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N'); 605 ThisTokBuf += 2; 606 if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') { 607 if (Diags) { 608 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 609 diag::err_delimited_escape_missing_brace) 610 << StringRef(&ThisTokBuf[-1], 1); 611 } 612 return false; 613 } 614 ThisTokBuf++; 615 const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) { 616 return C == '}' || isVerticalWhitespace(C); 617 }); 618 bool Incomplete = ClosingBrace == ThisTokEnd; 619 bool Empty = ClosingBrace == ThisTokBuf; 620 if (Incomplete || Empty) { 621 if (Diags) { 622 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 623 Incomplete ? diag::err_ucn_escape_incomplete 624 : diag::err_delimited_escape_empty) 625 << StringRef(&UcnBegin[1], 1); 626 } 627 ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1; 628 return false; 629 } 630 StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf); 631 ThisTokBuf = ClosingBrace + 1; 632 std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name); 633 if (!Res) { 634 if (Diags) 635 DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin, 636 &UcnBegin[3], ClosingBrace, Name); 637 return false; 638 } 639 UcnVal = *Res; 640 UcnLen = UcnVal > 0xFFFF ? 8 : 4; 641 return true; 642 } 643 644 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and 645 /// return the UTF32. 
646 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, 647 const char *ThisTokEnd, uint32_t &UcnVal, 648 unsigned short &UcnLen, FullSourceLoc Loc, 649 DiagnosticsEngine *Diags, 650 const LangOptions &Features, 651 bool in_char_string_literal = false) { 652 653 bool HasError; 654 const char *UcnBegin = ThisTokBuf; 655 bool IsDelimitedEscapeSequence = false; 656 bool IsNamedEscapeSequence = false; 657 if (ThisTokBuf[1] == 'N') { 658 IsNamedEscapeSequence = true; 659 HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, 660 UcnVal, UcnLen, Loc, Diags, Features); 661 } else { 662 HasError = 663 !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, 664 UcnLen, IsDelimitedEscapeSequence, Loc, Diags, 665 Features, in_char_string_literal); 666 } 667 if (HasError) 668 return false; 669 670 // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2] 671 if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints 672 UcnVal > 0x10FFFF) { // maximum legal UTF32 value 673 if (Diags) 674 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 675 diag::err_ucn_escape_invalid); 676 return false; 677 } 678 679 // C23 and C++11 allow UCNs that refer to control characters 680 // and basic source characters inside character and string literals 681 if (UcnVal < 0xa0 && 682 // $, @, ` are allowed in all language modes 683 (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { 684 bool IsError = 685 (!(Features.CPlusPlus11 || Features.C23) || !in_char_string_literal); 686 if (Diags) { 687 char BasicSCSChar = UcnVal; 688 if (UcnVal >= 0x20 && UcnVal < 0x7f) 689 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 690 IsError ? diag::err_ucn_escape_basic_scs 691 : Features.CPlusPlus 692 ? 
diag::warn_cxx98_compat_literal_ucn_escape_basic_scs 693 : diag::warn_c23_compat_literal_ucn_escape_basic_scs) 694 << StringRef(&BasicSCSChar, 1); 695 else 696 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 697 IsError ? diag::err_ucn_control_character 698 : Features.CPlusPlus 699 ? diag::warn_cxx98_compat_literal_ucn_control_character 700 : diag::warn_c23_compat_literal_ucn_control_character); 701 } 702 if (IsError) 703 return false; 704 } 705 706 if (!Features.CPlusPlus && !Features.C99 && Diags) 707 Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, 708 diag::warn_ucn_not_valid_in_c89_literal); 709 710 if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags) 711 Lexer::DiagnoseDelimitedOrNamedEscapeSequence(Loc, IsNamedEscapeSequence, 712 Features, *Diags); 713 return true; 714 } 715 716 /// MeasureUCNEscape - Determine the number of bytes within the resulting string 717 /// which this UCN will occupy. 718 static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, 719 const char *ThisTokEnd, unsigned CharByteWidth, 720 const LangOptions &Features, bool &HadError) { 721 // UTF-32: 4 bytes per escape. 722 if (CharByteWidth == 4) 723 return 4; 724 725 uint32_t UcnVal = 0; 726 unsigned short UcnLen = 0; 727 FullSourceLoc Loc; 728 729 if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, 730 UcnLen, Loc, nullptr, Features, true)) { 731 HadError = true; 732 return 0; 733 } 734 735 // UTF-16: 2 bytes for BMP, 4 bytes otherwise. 736 if (CharByteWidth == 2) 737 return UcnVal <= 0xFFFF ? 2 : 4; 738 739 // UTF-8. 740 if (UcnVal < 0x80) 741 return 1; 742 if (UcnVal < 0x800) 743 return 2; 744 if (UcnVal < 0x10000) 745 return 3; 746 return 4; 747 } 748 749 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and 750 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of 751 /// StringLiteralParser. 
When we decide to implement UCN's for identifiers, 752 /// we will likely rework our support for UCN's. 753 static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, 754 const char *ThisTokEnd, 755 char *&ResultBuf, bool &HadError, 756 FullSourceLoc Loc, unsigned CharByteWidth, 757 DiagnosticsEngine *Diags, 758 const LangOptions &Features) { 759 typedef uint32_t UTF32; 760 UTF32 UcnVal = 0; 761 unsigned short UcnLen = 0; 762 if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, 763 Loc, Diags, Features, true)) { 764 HadError = true; 765 return; 766 } 767 768 assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) && 769 "only character widths of 1, 2, or 4 bytes supported"); 770 771 (void)UcnLen; 772 assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported"); 773 774 if (CharByteWidth == 4) { 775 // FIXME: Make the type of the result buffer correct instead of 776 // using reinterpret_cast. 777 llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf); 778 *ResultPtr = UcnVal; 779 ResultBuf += 4; 780 return; 781 } 782 783 if (CharByteWidth == 2) { 784 // FIXME: Make the type of the result buffer correct instead of 785 // using reinterpret_cast. 786 llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf); 787 788 if (UcnVal <= (UTF32)0xFFFF) { 789 *ResultPtr = UcnVal; 790 ResultBuf += 2; 791 return; 792 } 793 794 // Convert to UTF16. 795 UcnVal -= 0x10000; 796 *ResultPtr = 0xD800 + (UcnVal >> 10); 797 *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF); 798 ResultBuf += 4; 799 return; 800 } 801 802 assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters"); 803 804 // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8. 805 // The conversion below was inspired by: 806 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c 807 // First, we determine how many bytes the result will require. 
808 typedef uint8_t UTF8; 809 810 unsigned short bytesToWrite = 0; 811 if (UcnVal < (UTF32)0x80) 812 bytesToWrite = 1; 813 else if (UcnVal < (UTF32)0x800) 814 bytesToWrite = 2; 815 else if (UcnVal < (UTF32)0x10000) 816 bytesToWrite = 3; 817 else 818 bytesToWrite = 4; 819 820 const unsigned byteMask = 0xBF; 821 const unsigned byteMark = 0x80; 822 823 // Once the bits are split out into bytes of UTF8, this is a mask OR-ed 824 // into the first byte, depending on how many bytes follow. 825 static const UTF8 firstByteMark[5] = { 826 0x00, 0x00, 0xC0, 0xE0, 0xF0 827 }; 828 // Finally, we write the bytes into ResultBuf. 829 ResultBuf += bytesToWrite; 830 switch (bytesToWrite) { // note: everything falls through. 831 case 4: 832 *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; 833 [[fallthrough]]; 834 case 3: 835 *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; 836 [[fallthrough]]; 837 case 2: 838 *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; 839 [[fallthrough]]; 840 case 1: 841 *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]); 842 } 843 // Update the buffer. 
844 ResultBuf += bytesToWrite; 845 } 846 847 /// integer-constant: [C99 6.4.4.1] 848 /// decimal-constant integer-suffix 849 /// octal-constant integer-suffix 850 /// hexadecimal-constant integer-suffix 851 /// binary-literal integer-suffix [GNU, C++1y] 852 /// user-defined-integer-literal: [C++11 lex.ext] 853 /// decimal-literal ud-suffix 854 /// octal-literal ud-suffix 855 /// hexadecimal-literal ud-suffix 856 /// binary-literal ud-suffix [GNU, C++1y] 857 /// decimal-constant: 858 /// nonzero-digit 859 /// decimal-constant digit 860 /// octal-constant: 861 /// 0 862 /// octal-constant octal-digit 863 /// hexadecimal-constant: 864 /// hexadecimal-prefix hexadecimal-digit 865 /// hexadecimal-constant hexadecimal-digit 866 /// hexadecimal-prefix: one of 867 /// 0x 0X 868 /// binary-literal: 869 /// 0b binary-digit 870 /// 0B binary-digit 871 /// binary-literal binary-digit 872 /// integer-suffix: 873 /// unsigned-suffix [long-suffix] 874 /// unsigned-suffix [long-long-suffix] 875 /// long-suffix [unsigned-suffix] 876 /// long-long-suffix [unsigned-sufix] 877 /// nonzero-digit: 878 /// 1 2 3 4 5 6 7 8 9 879 /// octal-digit: 880 /// 0 1 2 3 4 5 6 7 881 /// hexadecimal-digit: 882 /// 0 1 2 3 4 5 6 7 8 9 883 /// a b c d e f 884 /// A B C D E F 885 /// binary-digit: 886 /// 0 887 /// 1 888 /// unsigned-suffix: one of 889 /// u U 890 /// long-suffix: one of 891 /// l L 892 /// long-long-suffix: one of 893 /// ll LL 894 /// 895 /// floating-constant: [C99 6.4.4.2] 896 /// TODO: add rules... 
897 /// 898 NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling, 899 SourceLocation TokLoc, 900 const SourceManager &SM, 901 const LangOptions &LangOpts, 902 const TargetInfo &Target, 903 DiagnosticsEngine &Diags) 904 : SM(SM), LangOpts(LangOpts), Diags(Diags), 905 ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) { 906 907 s = DigitsBegin = ThisTokBegin; 908 saw_exponent = false; 909 saw_period = false; 910 saw_ud_suffix = false; 911 saw_fixed_point_suffix = false; 912 isLong = false; 913 isUnsigned = false; 914 isLongLong = false; 915 isSizeT = false; 916 isHalf = false; 917 isFloat = false; 918 isImaginary = false; 919 isFloat16 = false; 920 isFloat128 = false; 921 MicrosoftInteger = 0; 922 isFract = false; 923 isAccum = false; 924 hadError = false; 925 isBitInt = false; 926 927 // This routine assumes that the range begin/end matches the regex for integer 928 // and FP constants (specifically, the 'pp-number' regex), and assumes that 929 // the byte at "*end" is both valid and not part of the regex. Because of 930 // this, it doesn't have to check for 'overscan' in various places. 931 // Note: For HLSL, the end token is allowed to be '.' which would be in the 932 // 'pp-number' regex. This is required to support vector swizzles on numeric 933 // constants (i.e. 1.xx or 1.5f.rrr). 934 if (isPreprocessingNumberBody(*ThisTokEnd) && 935 !(LangOpts.HLSL && *ThisTokEnd == '.')) { 936 Diags.Report(TokLoc, diag::err_lexing_numeric); 937 hadError = true; 938 return; 939 } 940 941 if (*s == '0') { // parse radix 942 ParseNumberStartingWithZero(TokLoc); 943 if (hadError) 944 return; 945 } else { // the first digit is non-zero 946 radix = 10; 947 s = SkipDigits(s); 948 if (s == ThisTokEnd) { 949 // Done. 950 } else { 951 ParseDecimalOrOctalCommon(TokLoc); 952 if (hadError) 953 return; 954 } 955 } 956 957 SuffixBegin = s; 958 checkSeparator(TokLoc, s, CSK_AfterDigits); 959 960 // Initial scan to lookahead for fixed point suffix. 
961 if (LangOpts.FixedPoint) { 962 for (const char *c = s; c != ThisTokEnd; ++c) { 963 if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') { 964 saw_fixed_point_suffix = true; 965 break; 966 } 967 } 968 } 969 970 // Parse the suffix. At this point we can classify whether we have an FP or 971 // integer constant. 972 bool isFixedPointConstant = isFixedPointLiteral(); 973 bool isFPConstant = isFloatingLiteral(); 974 bool HasSize = false; 975 bool DoubleUnderscore = false; 976 977 // Loop over all of the characters of the suffix. If we see something bad, 978 // we break out of the loop. 979 for (; s != ThisTokEnd; ++s) { 980 switch (*s) { 981 case 'R': 982 case 'r': 983 if (!LangOpts.FixedPoint) 984 break; 985 if (isFract || isAccum) break; 986 if (!(saw_period || saw_exponent)) break; 987 isFract = true; 988 continue; 989 case 'K': 990 case 'k': 991 if (!LangOpts.FixedPoint) 992 break; 993 if (isFract || isAccum) break; 994 if (!(saw_period || saw_exponent)) break; 995 isAccum = true; 996 continue; 997 case 'h': // FP Suffix for "half". 998 case 'H': 999 // OpenCL Extension v1.2 s9.5 - h or H suffix for half type. 1000 if (!(LangOpts.Half || LangOpts.FixedPoint)) 1001 break; 1002 if (isIntegerLiteral()) break; // Error for integer constant. 1003 if (HasSize) 1004 break; 1005 HasSize = true; 1006 isHalf = true; 1007 continue; // Success. 1008 case 'f': // FP Suffix for "float" 1009 case 'F': 1010 if (!isFPConstant) break; // Error for integer constant. 1011 if (HasSize) 1012 break; 1013 HasSize = true; 1014 1015 // CUDA host and device may have different _Float16 support, therefore 1016 // allows f16 literals to avoid false alarm. 1017 // When we compile for OpenMP target offloading on NVPTX, f16 suffix 1018 // should also be supported. 1019 // ToDo: more precise check for CUDA. 1020 // TODO: AMDGPU might also support it in the future. 
1021 if ((Target.hasFloat16Type() || LangOpts.CUDA || 1022 (LangOpts.OpenMPIsTargetDevice && Target.getTriple().isNVPTX())) && 1023 s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') { 1024 s += 2; // success, eat up 2 characters. 1025 isFloat16 = true; 1026 continue; 1027 } 1028 1029 isFloat = true; 1030 continue; // Success. 1031 case 'q': // FP Suffix for "__float128" 1032 case 'Q': 1033 if (!isFPConstant) break; // Error for integer constant. 1034 if (HasSize) 1035 break; 1036 HasSize = true; 1037 isFloat128 = true; 1038 continue; // Success. 1039 case 'u': 1040 case 'U': 1041 if (isFPConstant) break; // Error for floating constant. 1042 if (isUnsigned) break; // Cannot be repeated. 1043 isUnsigned = true; 1044 continue; // Success. 1045 case 'l': 1046 case 'L': 1047 if (HasSize) 1048 break; 1049 HasSize = true; 1050 1051 // Check for long long. The L's need to be adjacent and the same case. 1052 if (s[1] == s[0]) { 1053 assert(s + 1 < ThisTokEnd && "didn't maximally munch?"); 1054 if (isFPConstant) break; // long long invalid for floats. 1055 isLongLong = true; 1056 ++s; // Eat both of them. 1057 } else { 1058 isLong = true; 1059 } 1060 continue; // Success. 1061 case 'z': 1062 case 'Z': 1063 if (isFPConstant) 1064 break; // Invalid for floats. 1065 if (HasSize) 1066 break; 1067 HasSize = true; 1068 isSizeT = true; 1069 continue; 1070 case 'i': 1071 case 'I': 1072 if (LangOpts.MicrosoftExt && s + 1 < ThisTokEnd && !isFPConstant) { 1073 // Allow i8, i16, i32, i64, and i128. First, look ahead and check if 1074 // suffixes are Microsoft integers and not the imaginary unit. 
1075 uint8_t Bits = 0; 1076 size_t ToSkip = 0; 1077 switch (s[1]) { 1078 case '8': // i8 suffix 1079 Bits = 8; 1080 ToSkip = 2; 1081 break; 1082 case '1': 1083 if (s + 2 < ThisTokEnd && s[2] == '6') { // i16 suffix 1084 Bits = 16; 1085 ToSkip = 3; 1086 } else if (s + 3 < ThisTokEnd && s[2] == '2' && 1087 s[3] == '8') { // i128 suffix 1088 Bits = 128; 1089 ToSkip = 4; 1090 } 1091 break; 1092 case '3': 1093 if (s + 2 < ThisTokEnd && s[2] == '2') { // i32 suffix 1094 Bits = 32; 1095 ToSkip = 3; 1096 } 1097 break; 1098 case '6': 1099 if (s + 2 < ThisTokEnd && s[2] == '4') { // i64 suffix 1100 Bits = 64; 1101 ToSkip = 3; 1102 } 1103 break; 1104 default: 1105 break; 1106 } 1107 if (Bits) { 1108 if (HasSize) 1109 break; 1110 HasSize = true; 1111 MicrosoftInteger = Bits; 1112 s += ToSkip; 1113 assert(s <= ThisTokEnd && "didn't maximally munch?"); 1114 break; 1115 } 1116 } 1117 [[fallthrough]]; 1118 case 'j': 1119 case 'J': 1120 if (isImaginary) break; // Cannot be repeated. 1121 isImaginary = true; 1122 continue; // Success. 1123 case '_': 1124 if (isFPConstant) 1125 break; // Invalid for floats 1126 if (HasSize) 1127 break; 1128 // There is currently no way to reach this with DoubleUnderscore set. 1129 // If new double underscope literals are added handle it here as above. 1130 assert(!DoubleUnderscore && "unhandled double underscore case"); 1131 if (LangOpts.CPlusPlus && s + 2 < ThisTokEnd && 1132 s[1] == '_') { // s + 2 < ThisTokEnd to ensure some character exists 1133 // after __ 1134 DoubleUnderscore = true; 1135 s += 2; // Skip both '_' 1136 if (s + 1 < ThisTokEnd && 1137 (*s == 'u' || *s == 'U')) { // Ensure some character after 'u'/'U' 1138 isUnsigned = true; 1139 ++s; 1140 } 1141 if (s + 1 < ThisTokEnd && 1142 ((*s == 'w' && *(++s) == 'b') || (*s == 'W' && *(++s) == 'B'))) { 1143 isBitInt = true; 1144 HasSize = true; 1145 continue; 1146 } 1147 } 1148 break; 1149 case 'w': 1150 case 'W': 1151 if (isFPConstant) 1152 break; // Invalid for floats. 
1153 if (HasSize) 1154 break; // Invalid if we already have a size for the literal. 1155 1156 // wb and WB are allowed, but a mixture of cases like Wb or wB is not. We 1157 // explicitly do not support the suffix in C++ as an extension because a 1158 // library-based UDL that resolves to a library type may be more 1159 // appropriate there. The same rules apply for __wb/__WB. 1160 if ((!LangOpts.CPlusPlus || DoubleUnderscore) && s + 1 < ThisTokEnd && 1161 ((s[0] == 'w' && s[1] == 'b') || (s[0] == 'W' && s[1] == 'B'))) { 1162 isBitInt = true; 1163 HasSize = true; 1164 ++s; // Skip both characters (2nd char skipped on continue). 1165 continue; // Success. 1166 } 1167 } 1168 // If we reached here, there was an error or a ud-suffix. 1169 break; 1170 } 1171 1172 // "i", "if", and "il" are user-defined suffixes in C++1y. 1173 if (s != ThisTokEnd || isImaginary) { 1174 // FIXME: Don't bother expanding UCNs if !tok.hasUCN(). 1175 expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)); 1176 if (isValidUDSuffix(LangOpts, UDSuffixBuf)) { 1177 if (!isImaginary) { 1178 // Any suffix pieces we might have parsed are actually part of the 1179 // ud-suffix. 1180 isLong = false; 1181 isUnsigned = false; 1182 isLongLong = false; 1183 isSizeT = false; 1184 isFloat = false; 1185 isFloat16 = false; 1186 isHalf = false; 1187 isImaginary = false; 1188 isBitInt = false; 1189 MicrosoftInteger = 0; 1190 saw_fixed_point_suffix = false; 1191 isFract = false; 1192 isAccum = false; 1193 } 1194 1195 saw_ud_suffix = true; 1196 return; 1197 } 1198 1199 if (s != ThisTokEnd) { 1200 // Report an error if there are any. 1201 Diags.Report(Lexer::AdvanceToTokenCharacter( 1202 TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts), 1203 diag::err_invalid_suffix_constant) 1204 << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin) 1205 << (isFixedPointConstant ? 
2 : isFPConstant); 1206 hadError = true; 1207 } 1208 } 1209 1210 if (!hadError && saw_fixed_point_suffix) { 1211 assert(isFract || isAccum); 1212 } 1213 } 1214 1215 /// ParseDecimalOrOctalCommon - This method is called for decimal or octal 1216 /// numbers. It issues an error for illegal digits, and handles floating point 1217 /// parsing. If it detects a floating point number, the radix is set to 10. 1218 void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){ 1219 assert((radix == 8 || radix == 10) && "Unexpected radix"); 1220 1221 // If we have a hex digit other than 'e' (which denotes a FP exponent) then 1222 // the code is using an incorrect base. 1223 if (isHexDigit(*s) && *s != 'e' && *s != 'E' && 1224 !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) { 1225 Diags.Report( 1226 Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts), 1227 diag::err_invalid_digit) 1228 << StringRef(s, 1) << (radix == 8 ? 1 : 0); 1229 hadError = true; 1230 return; 1231 } 1232 1233 if (*s == '.') { 1234 checkSeparator(TokLoc, s, CSK_AfterDigits); 1235 s++; 1236 radix = 10; 1237 saw_period = true; 1238 checkSeparator(TokLoc, s, CSK_BeforeDigits); 1239 s = SkipDigits(s); // Skip suffix. 1240 } 1241 if (*s == 'e' || *s == 'E') { // exponent 1242 checkSeparator(TokLoc, s, CSK_AfterDigits); 1243 const char *Exponent = s; 1244 s++; 1245 radix = 10; 1246 saw_exponent = true; 1247 if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign 1248 const char *first_non_digit = SkipDigits(s); 1249 if (containsDigits(s, first_non_digit)) { 1250 checkSeparator(TokLoc, s, CSK_BeforeDigits); 1251 s = first_non_digit; 1252 } else { 1253 if (!hadError) { 1254 Diags.Report(Lexer::AdvanceToTokenCharacter( 1255 TokLoc, Exponent - ThisTokBegin, SM, LangOpts), 1256 diag::err_exponent_has_no_digits); 1257 hadError = true; 1258 } 1259 return; 1260 } 1261 } 1262 } 1263 1264 /// Determine whether a suffix is a valid ud-suffix. 
We avoid treating reserved 1265 /// suffixes as ud-suffixes, because the diagnostic experience is better if we 1266 /// treat it as an invalid suffix. 1267 bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts, 1268 StringRef Suffix) { 1269 if (!LangOpts.CPlusPlus11 || Suffix.empty()) 1270 return false; 1271 1272 // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid. 1273 // Suffixes starting with '__' (double underscore) are for use by 1274 // the implementation. 1275 if (Suffix.starts_with("_") && !Suffix.starts_with("__")) 1276 return true; 1277 1278 // In C++11, there are no library suffixes. 1279 if (!LangOpts.CPlusPlus14) 1280 return false; 1281 1282 // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library. 1283 // Per tweaked N3660, "il", "i", and "if" are also used in the library. 1284 // In C++2a "d" and "y" are used in the library. 1285 return llvm::StringSwitch<bool>(Suffix) 1286 .Cases("h", "min", "s", true) 1287 .Cases("ms", "us", "ns", true) 1288 .Cases("il", "i", "if", true) 1289 .Cases("d", "y", LangOpts.CPlusPlus20) 1290 .Default(false); 1291 } 1292 1293 void NumericLiteralParser::checkSeparator(SourceLocation TokLoc, 1294 const char *Pos, 1295 CheckSeparatorKind IsAfterDigits) { 1296 if (IsAfterDigits == CSK_AfterDigits) { 1297 if (Pos == ThisTokBegin) 1298 return; 1299 --Pos; 1300 } else if (Pos == ThisTokEnd) 1301 return; 1302 1303 if (isDigitSeparator(*Pos)) { 1304 Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM, 1305 LangOpts), 1306 diag::err_digit_separator_not_between_digits) 1307 << IsAfterDigits; 1308 hadError = true; 1309 } 1310 } 1311 1312 /// ParseNumberStartingWithZero - This method is called when the first character 1313 /// of the number is found to be a zero. This means it is either an octal 1314 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or 1315 /// a floating point number (01239.123e4). 
/// ParseNumberStartingWithZero - This method is called when the first character
/// of the number is found to be a zero. This means it is either an octal
/// number (like '04'), a hex number ('0x123a'), a binary number ('0b1010'), a
/// prefixed octal number ('0o17') or a floating point number (01239.123e4).
/// Eat the prefix, determining the radix etc.
///
/// The paths are tried in this order, and each prefixed path returns on its
/// own:
///   - "0x"/"0X" followed by a hex digit or '.': hexadecimal integer or
///     hexadecimal floating literal (radix 16).
///   - "0b"/"0B" followed by '0' or '1': binary literal (radix 2).
///   - "0o"/"0O" followed by an octal digit: prefixed octal literal (radix 8).
///   - anything else: unprefixed octal (radix 8), switching to radix 10 when
///     the digits turn out to belong to a decimal floating literal; the rest
///     is handled by ParseDecimalOrOctalCommon.
///
/// Errors are reported through Diags and recorded in hadError.
void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
  assert(s[0] == '0' && "Invalid method call");
  s++;

  // The character right after the leading '0' selects the prefix handling.
  int c1 = s[0];

  // Handle a hex number like 0x1234. The 'x' must be followed by a hex digit
  // or a '.'; otherwise this path is not taken and the text falls through to
  // the later paths.
  if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
    s++;
    assert(s < ThisTokEnd && "didn't maximally munch?");
    radix = 16;
    DigitsBegin = s;
    s = SkipHexDigits(s);
    // Tracks whether a digit was seen on either side of the '.'.
    bool HasSignificandDigits = containsDigits(DigitsBegin, s);
    if (s == ThisTokEnd) {
      // Done.
    } else if (*s == '.') {
      s++;
      saw_period = true;
      const char *floatDigitsBegin = s;
      s = SkipHexDigits(s);
      if (containsDigits(floatDigitsBegin, s))
        HasSignificandDigits = true;
      if (HasSignificandDigits)
        checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
    }

    // Something like "0x." has no significand at all.
    if (!HasSignificandDigits) {
      // NOTE(review): the trailing 1 (and the 0 used further down) presumably
      // selects the wording of err_hex_constant_requires - confirm against
      // the diagnostic definition.
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
                                                  LangOpts),
                   diag::err_hex_constant_requires)
          << LangOpts.CPlusPlus << 1;
      hadError = true;
      return;
    }

    // A binary exponent can appear with or without a '.'. If dotted, the
    // binary exponent is required.
    if (*s == 'p' || *s == 'P') {
      checkSeparator(TokLoc, s, CSK_AfterDigits);
      const char *Exponent = s;
      s++;
      saw_exponent = true;
      if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
      const char *first_non_digit = SkipDigits(s);
      if (!containsDigits(s, first_non_digit)) {
        // The exponent has no digits. Report it only if nothing else has been
        // reported for this literal yet.
        if (!hadError) {
          Diags.Report(Lexer::AdvanceToTokenCharacter(
                           TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
                       diag::err_exponent_has_no_digits);
          hadError = true;
        }
        return;
      }
      checkSeparator(TokLoc, s, CSK_BeforeDigits);
      s = first_non_digit;

      // These reports do not set hadError, so parsing of the literal
      // continues normally after them.
      if (!LangOpts.HexFloats)
        Diags.Report(TokLoc, LangOpts.CPlusPlus
                                 ? diag::ext_hex_literal_invalid
                                 : diag::ext_hex_constant_invalid);
      else if (LangOpts.CPlusPlus17)
        Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
    } else if (saw_period) {
      // A '.' was seen but no binary exponent follows.
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
                                                  LangOpts),
                   diag::err_hex_constant_requires)
          << LangOpts.CPlusPlus << 0;
      hadError = true;
    }
    return;
  }

  // Handle simple binary numbers 0b01010
  if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
    // 0b101010 is a C++14 and C23 extension.
    // Pick the diagnostic that matches the active language mode; one is
    // always reported for a binary literal.
    unsigned DiagId;
    if (LangOpts.CPlusPlus14)
      DiagId = diag::warn_cxx11_compat_binary_literal;
    else if (LangOpts.C23)
      DiagId = diag::warn_c23_compat_binary_literal;
    else if (LangOpts.CPlusPlus)
      DiagId = diag::ext_binary_literal_cxx14;
    else
      DiagId = diag::ext_binary_literal;
    Diags.Report(TokLoc, DiagId);
    ++s;
    assert(s < ThisTokEnd && "didn't maximally munch?");
    radix = 2;
    DigitsBegin = s;
    s = SkipBinaryDigits(s);
    if (s == ThisTokEnd) {
      // Done.
    } else if (isHexDigit(*s) &&
               !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
      // A hex digit that is not a binary digit, and that does not start a
      // valid ud-suffix, is an invalid digit for this radix.
      Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
                                                  LangOpts),
                   diag::err_invalid_digit)
          << StringRef(s, 1) << 2;
      hadError = true;
    }
    // Other suffixes will be diagnosed by the caller.
    return;
  }

  // Parse a potential octal literal prefix.
  // IsSingleZero is read by the scope-exit check further down; it is only
  // ever set to true on the unprefixed path.
  bool IsSingleZero = false;
  if ((c1 == 'O' || c1 == 'o') && (s[1] >= '0' && s[1] <= '7')) {
    unsigned DiagId;
    if (LangOpts.C2y)
      DiagId = diag::warn_c2y_compat_octal_literal;
    else if (LangOpts.CPlusPlus)
      DiagId = diag::ext_cpp_octal_literal;
    else
      DiagId = diag::ext_octal_literal;
    Diags.Report(TokLoc, DiagId);
    ++s;
    DigitsBegin = s;
    radix = 8;
    s = SkipOctalDigits(s);
    if (s == ThisTokEnd) {
      // Done
    } else if ((isHexDigit(*s) && *s != 'e' && *s != 'E' && *s != '.') &&
               !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
      // A hex digit other than 'e'/'E' that does not start a valid ud-suffix
      // is an invalid digit for an octal literal.
      auto InvalidDigitLoc = Lexer::AdvanceToTokenCharacter(
          TokLoc, s - ThisTokBegin, SM, LangOpts);
      Diags.Report(InvalidDigitLoc, diag::err_invalid_digit)
          << StringRef(s, 1) << 1;
      hadError = true;
    }
    // Other suffixes will be diagnosed by the caller.
    return;
  }

  // This guard is created only after every prefixed path has returned, so
  // its check runs solely when leaving through the unprefixed path below. It
  // captures by reference and therefore sees the final values of radix,
  // hadError and IsSingleZero.
  auto _ = llvm::make_scope_exit([&] {
    // If we still have an octal value but we did not see an octal prefix,
    // diagnose as being an obsolescent feature starting in C2y.
    if (radix == 8 && LangOpts.C2y && !hadError && !IsSingleZero)
      Diags.Report(TokLoc, diag::warn_unprefixed_octal_deprecated);
  });

  // For now, the radix is set to 8. If we discover that we have a
  // floating point constant, the radix will change to 10.  Octal floating
  // point constants are not permitted (only decimal and hexadecimal).
  radix = 8;
  const char *PossibleNewDigitStart = s;
  s = SkipOctalDigits(s);
  // When the value is 0 followed by a suffix (like 0wb), we want to leave 0
  // as the start of the digits. So if skipping octal digits does not skip
  // anything, we leave the digit start where it was.
  if (s != PossibleNewDigitStart)
    DigitsBegin = PossibleNewDigitStart;
  else
    // No digit follows the leading '0'; the value is a lone zero when that
    // '0' is also the first character of the token.
    IsSingleZero = (s == ThisTokBegin + 1);

  if (s == ThisTokEnd)
    return; // Done, simple octal number like 01234

  // If we have some other non-octal digit that *is* a decimal digit, see if
  // this is part of a floating point number like 094.123 or 09e1.
  if (isDigit(*s)) {
    const char *EndDecimal = SkipDigits(s);
    if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
      s = EndDecimal;
      radix = 10;
    }
  }

  ParseDecimalOrOctalCommon(TokLoc);
}

/// Conservative test for whether a literal spelled with \p NumDigits digits in
/// base \p Radix is guaranteed to fit in 64 bits.
///
/// The answer depends only on the digit count, not on the digit values, so a
/// false result means "might not fit", not "does not fit". Only radices 2, 8,
/// 10 and 16 are supported; any other value is treated as unreachable.
static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
  switch (Radix) {
  case 2:
    return NumDigits <= 64;
  case 8:
    return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
  case 10:
    return NumDigits <= 19; // floor(log10(2^64))
  case 16:
    return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
  default:
    llvm_unreachable("impossible Radix");
  }
}
1513 const unsigned NumDigits = SuffixBegin - DigitsBegin; 1514 if (alwaysFitsInto64Bits(radix, NumDigits)) { 1515 uint64_t N = 0; 1516 for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr) 1517 if (!isDigitSeparator(*Ptr)) 1518 N = N * radix + llvm::hexDigitValue(*Ptr); 1519 1520 // This will truncate the value to Val's input width. Simply check 1521 // for overflow by comparing. 1522 Val = N; 1523 return Val.getZExtValue() != N; 1524 } 1525 1526 Val = 0; 1527 const char *Ptr = DigitsBegin; 1528 1529 llvm::APInt RadixVal(Val.getBitWidth(), radix); 1530 llvm::APInt CharVal(Val.getBitWidth(), 0); 1531 llvm::APInt OldVal = Val; 1532 1533 bool OverflowOccurred = false; 1534 while (Ptr < SuffixBegin) { 1535 if (isDigitSeparator(*Ptr)) { 1536 ++Ptr; 1537 continue; 1538 } 1539 1540 unsigned C = llvm::hexDigitValue(*Ptr++); 1541 1542 // If this letter is out of bound for this radix, reject it. 1543 assert(C < radix && "NumericLiteralParser ctor should have rejected this"); 1544 1545 CharVal = C; 1546 1547 // Add the digit to the value in the appropriate radix. If adding in digits 1548 // made the value smaller, then this overflowed. 1549 OldVal = Val; 1550 1551 // Multiply by radix, did overflow occur on the multiply? 1552 Val *= RadixVal; 1553 OverflowOccurred |= Val.udiv(RadixVal) != OldVal; 1554 1555 // Add value, did overflow occur on the value? 
1556 // (a + b) ult b <=> overflow 1557 Val += CharVal; 1558 OverflowOccurred |= Val.ult(CharVal); 1559 } 1560 return OverflowOccurred; 1561 } 1562 1563 llvm::APFloat::opStatus 1564 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result, 1565 llvm::RoundingMode RM) { 1566 using llvm::APFloat; 1567 1568 unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin); 1569 1570 llvm::SmallString<16> Buffer; 1571 StringRef Str(ThisTokBegin, n); 1572 if (Str.contains('\'')) { 1573 Buffer.reserve(n); 1574 std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer), 1575 &isDigitSeparator); 1576 Str = Buffer; 1577 } 1578 1579 auto StatusOrErr = Result.convertFromString(Str, RM); 1580 assert(StatusOrErr && "Invalid floating point representation"); 1581 return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr 1582 : APFloat::opInvalidOp; 1583 } 1584 1585 static inline bool IsExponentPart(char c, bool isHex) { 1586 if (isHex) 1587 return c == 'p' || c == 'P'; 1588 return c == 'e' || c == 'E'; 1589 } 1590 1591 bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) { 1592 assert(radix == 16 || radix == 10); 1593 1594 // Find how many digits are needed to store the whole literal. 
1595 unsigned NumDigits = SuffixBegin - DigitsBegin; 1596 if (saw_period) --NumDigits; 1597 1598 // Initial scan of the exponent if it exists 1599 bool ExpOverflowOccurred = false; 1600 bool NegativeExponent = false; 1601 const char *ExponentBegin; 1602 uint64_t Exponent = 0; 1603 int64_t BaseShift = 0; 1604 if (saw_exponent) { 1605 const char *Ptr = DigitsBegin; 1606 1607 while (!IsExponentPart(*Ptr, radix == 16)) 1608 ++Ptr; 1609 ExponentBegin = Ptr; 1610 ++Ptr; 1611 NegativeExponent = *Ptr == '-'; 1612 if (NegativeExponent) ++Ptr; 1613 1614 unsigned NumExpDigits = SuffixBegin - Ptr; 1615 if (alwaysFitsInto64Bits(radix, NumExpDigits)) { 1616 llvm::StringRef ExpStr(Ptr, NumExpDigits); 1617 llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10); 1618 Exponent = ExpInt.getZExtValue(); 1619 } else { 1620 ExpOverflowOccurred = true; 1621 } 1622 1623 if (NegativeExponent) BaseShift -= Exponent; 1624 else BaseShift += Exponent; 1625 } 1626 1627 // Number of bits needed for decimal literal is 1628 // ceil(NumDigits * log2(10)) Integral part 1629 // + Scale Fractional part 1630 // + ceil(Exponent * log2(10)) Exponent 1631 // -------------------------------------------------- 1632 // ceil((NumDigits + Exponent) * log2(10)) + Scale 1633 // 1634 // But for simplicity in handling integers, we can round up log2(10) to 4, 1635 // making: 1636 // 4 * (NumDigits + Exponent) + Scale 1637 // 1638 // Number of digits needed for hexadecimal literal is 1639 // 4 * NumDigits Integral part 1640 // + Scale Fractional part 1641 // + Exponent Exponent 1642 // -------------------------------------------------- 1643 // (4 * NumDigits) + Scale + Exponent 1644 uint64_t NumBitsNeeded; 1645 if (radix == 10) 1646 NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale; 1647 else 1648 NumBitsNeeded = 4 * NumDigits + Exponent + Scale; 1649 1650 if (NumBitsNeeded > std::numeric_limits<unsigned>::max()) 1651 ExpOverflowOccurred = true; 1652 llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, 
/*isSigned=*/false); 1653 1654 bool FoundDecimal = false; 1655 1656 int64_t FractBaseShift = 0; 1657 const char *End = saw_exponent ? ExponentBegin : SuffixBegin; 1658 for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) { 1659 if (*Ptr == '.') { 1660 FoundDecimal = true; 1661 continue; 1662 } 1663 1664 // Normal reading of an integer 1665 unsigned C = llvm::hexDigitValue(*Ptr); 1666 assert(C < radix && "NumericLiteralParser ctor should have rejected this"); 1667 1668 Val *= radix; 1669 Val += C; 1670 1671 if (FoundDecimal) 1672 // Keep track of how much we will need to adjust this value by from the 1673 // number of digits past the radix point. 1674 --FractBaseShift; 1675 } 1676 1677 // For a radix of 16, we will be multiplying by 2 instead of 16. 1678 if (radix == 16) FractBaseShift *= 4; 1679 BaseShift += FractBaseShift; 1680 1681 Val <<= Scale; 1682 1683 uint64_t Base = (radix == 16) ? 2 : 10; 1684 if (BaseShift > 0) { 1685 for (int64_t i = 0; i < BaseShift; ++i) { 1686 Val *= Base; 1687 } 1688 } else if (BaseShift < 0) { 1689 for (int64_t i = BaseShift; i < 0 && !Val.isZero(); ++i) 1690 Val = Val.udiv(Base); 1691 } 1692 1693 bool IntOverflowOccurred = false; 1694 auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth()); 1695 if (Val.getBitWidth() > StoreVal.getBitWidth()) { 1696 IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth())); 1697 StoreVal = Val.trunc(StoreVal.getBitWidth()); 1698 } else if (Val.getBitWidth() < StoreVal.getBitWidth()) { 1699 IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal); 1700 StoreVal = Val.zext(StoreVal.getBitWidth()); 1701 } else { 1702 StoreVal = Val; 1703 } 1704 1705 return IntOverflowOccurred || ExpOverflowOccurred; 1706 } 1707 1708 /// \verbatim 1709 /// user-defined-character-literal: [C++11 lex.ext] 1710 /// character-literal ud-suffix 1711 /// ud-suffix: 1712 /// identifier 1713 /// character-literal: [C++11 lex.ccon] 1714 /// ' c-char-sequence ' 1715 /// u' c-char-sequence ' 1716 /// U' 
/// Parse the spelling of a character literal token.
///
/// On return HadError tells whether the literal was malformed, IsMultiChar
/// whether it holds more than one character, and Value holds the literal's
/// value. If the token carries a ud-suffix, the suffix is expanded into
/// UDSuffixBuf and its offset from the start of the token is stored in
/// UDSuffixOffset.
///
/// \param begin start of the token spelling, including any encoding prefix.
/// \param end one past the end of the token spelling.
/// \param Loc location used for all diagnostics emitted here.
/// \param PP supplies target information, language options and diagnostics.
/// \param kind the character-constant token kind, which determines the
///        encoding prefix length and the largest permitted character.
///
/// \verbatim
///       user-defined-character-literal: [C++11 lex.ext]
///         character-literal ud-suffix
///       ud-suffix:
///         identifier
///       character-literal: [C++11 lex.ccon]
///         ' c-char-sequence '
///         u' c-char-sequence '
///         U' c-char-sequence '
///         L' c-char-sequence '
///         u8' c-char-sequence ' [C++1z lex.ccon]
///       c-char-sequence:
///         c-char
///         c-char-sequence c-char
///       c-char:
///         any member of the source character set except the single-quote ',
///           backslash \, or new-line character
///         escape-sequence
///         universal-character-name
///       escape-sequence:
///         simple-escape-sequence
///         octal-escape-sequence
///         hexadecimal-escape-sequence
///       simple-escape-sequence:
///         one of \' \" \? \\ \a \b \f \n \r \t \v
///       octal-escape-sequence:
///         \ octal-digit
///         \ octal-digit octal-digit
///         \ octal-digit octal-digit octal-digit
///       hexadecimal-escape-sequence:
///         \x hexadecimal-digit
///         hexadecimal-escape-sequence hexadecimal-digit
///       universal-character-name: [C++11 lex.charset]
///         \u hex-quad
///         \U hex-quad hex-quad
///       hex-quad:
///         hex-digit hex-digit hex-digit hex-digit
/// \endverbatim
///
CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
                                     SourceLocation Loc, Preprocessor &PP,
                                     tok::TokenKind kind) {
  // At this point we know that the character matches the regex
  // "(L|u|U|u8)?'.*'", optionally followed by a ud-suffix.
  HadError = false;

  Kind = kind;

  // Remember the start of the token; it is needed for the ud-suffix offset
  // and is passed to the escape-processing helpers.
  const char *TokBegin = begin;

  // Skip over the encoding prefix: one character for every non-ordinary
  // kind, plus a second one for u8.
  if (Kind != tok::char_constant)
    ++begin;
  if (Kind == tok::utf8_char_constant)
    ++begin;

  // Skip over the entry quote.
  if (begin[0] != '\'') {
    PP.Diag(Loc, diag::err_lexing_char);
    HadError = true;
    return;
  }

  ++begin;

  // Remove an optional ud-suffix.
  if (end[-1] != '\'') {
    const char *UDSuffixEnd = end;
    // Walk backwards until 'end' sits just past the closing quote.
    do {
      --end;
    } while (end[-1] != '\'');
    // FIXME: Don't bother with this if !tok.hasUCN().
    expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
    UDSuffixOffset = end - TokBegin;
  }

  // Trim the ending quote.
  assert(end != begin && "Invalid token lexed");
  --end;

  // FIXME: The "Value" is an uint64_t so we can handle char literals of
  // up to 64-bits.
  // FIXME: This extensively assumes that 'char' is 8-bits.
  assert(PP.getTargetInfo().getCharWidth() == 8 &&
         "Assumes char is 8 bits");
  assert(PP.getTargetInfo().getIntWidth() <= 64 &&
         (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
         "Assumes sizeof(int) on target is <= 64 and a multiple of char");
  assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
         "Assumes sizeof(wchar) on target is <= 64");

  // One output slot per source byte is an upper bound on the number of
  // characters, since every character or escape takes at least one byte.
  // NOTE(review): front() requires a non-empty buffer; presumably an empty
  // literal ('') never reaches this point - confirm against the lexer.
  SmallVector<uint32_t, 4> codepoint_buffer;
  codepoint_buffer.resize(end - begin);
  uint32_t *buffer_begin = &codepoint_buffer.front();
  uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();

  // Unicode escapes representing characters that cannot be correctly
  // represented in a single code unit are disallowed in character literals
  // by this implementation.
  // NOTE(review): the assert above accepts wchar_t widths up to 64, but the
  // shift below only makes sense for widths of at most 32 - confirm no target
  // exceeds that.
  uint32_t largest_character_for_kind;
  if (tok::wide_char_constant == Kind) {
    largest_character_for_kind =
        0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
  } else if (tok::utf8_char_constant == Kind) {
    largest_character_for_kind = 0x7F;
  } else if (tok::utf16_char_constant == Kind) {
    largest_character_for_kind = 0xFFFF;
  } else if (tok::utf32_char_constant == Kind) {
    largest_character_for_kind = 0x10FFFF;
  } else {
    largest_character_for_kind = 0x7Fu;
  }

  // Decode the literal's contents into codepoint_buffer; buffer_begin always
  // points at the next free slot.
  while (begin != end) {
    // Is this a span of non-escape characters?
    if (begin[0] != '\\') {
      // Find the end of the span: the next backslash or the end of the text.
      char const *start = begin;
      do {
        ++begin;
      } while (begin != end && *begin != '\\');

      // Saved so the conversion can be redone byte-by-byte on failure and so
      // the freshly written code points can be range-checked on success.
      char const *tmp_in_start = start;
      uint32_t *tmp_out_start = buffer_begin;
      llvm::ConversionResult res =
          llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
                             reinterpret_cast<llvm::UTF8 const *>(begin),
                             &buffer_begin, buffer_end, llvm::strictConversion);
      if (res != llvm::conversionOK) {
        // If we see bad encoding for unprefixed character literals, warn and
        // simply copy the byte values, for compatibility with gcc and
        // older versions of clang.
        bool NoErrorOnBadEncoding = isOrdinary();
        unsigned Msg = diag::err_bad_character_encoding;
        if (NoErrorOnBadEncoding)
          Msg = diag::warn_bad_character_encoding;
        PP.Diag(Loc, Msg);
        if (NoErrorOnBadEncoding) {
          // Restart from the beginning of the span and store each byte as
          // its own character.
          start = tmp_in_start;
          buffer_begin = tmp_out_start;
          for (; start != begin; ++start, ++buffer_begin)
            *buffer_begin = static_cast<uint8_t>(*start);
        } else {
          HadError = true;
        }
      } else {
        // Every decoded code point must fit the literal's kind.
        for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
          if (*tmp_out_start > largest_character_for_kind) {
            HadError = true;
            PP.Diag(Loc, diag::err_character_too_large);
          }
        }
      }

      continue;
    }
    // Is this a Universal Character Name escape?
    if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
      unsigned short UcnLen = 0;
      if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
                            FullSourceLoc(Loc, PP.getSourceManager()),
                            &PP.getDiagnostics(), PP.getLangOpts(), true)) {
        HadError = true;
      } else if (*buffer_begin > largest_character_for_kind) {
        HadError = true;
        PP.Diag(Loc, diag::err_character_too_large);
      }

      // The slot is consumed whether or not the escape was valid.
      ++buffer_begin;
      continue;
    }
    // Any other escape sequence.
    unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
    uint64_t result =
      ProcessCharEscape(TokBegin, begin, end, HadError,
                        FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
                        &PP.getDiagnostics(), PP.getLangOpts(),
                        StringLiteralEvalMethod::Evaluated);
    *buffer_begin++ = result;
  }

  unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();

  // More than one character: a warning for ordinary literals (with a
  // dedicated one for exactly four characters), an error for every other
  // kind.
  if (NumCharsSoFar > 1) {
    if (isOrdinary() && NumCharsSoFar == 4)
      PP.Diag(Loc, diag::warn_four_char_character_literal);
    else if (isOrdinary())
      PP.Diag(Loc, diag::warn_multichar_character_literal);
    else {
      PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
      HadError = true;
    }
    IsMultiChar = true;
  } else {
    IsMultiChar = false;
  }

  llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);

  // Narrow character literals act as though their value is concatenated
  // in this implementation, but warn on overflow.
  bool multi_char_too_long = false;
  if (isOrdinary() && isMultiChar()) {
    LitVal = 0;
    for (size_t i = 0; i < NumCharsSoFar; ++i) {
      // check for enough leading zeros to shift into
      multi_char_too_long |= (LitVal.countl_zero() < 8);
      LitVal <<= 8;
      LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
    }
  } else if (NumCharsSoFar > 0) {
    // otherwise just take the last character
    LitVal = buffer_begin[-1];
  }

  if (!HadError && multi_char_too_long) {
    PP.Diag(Loc, diag::warn_char_constant_too_large);
  }

  // Transfer the value from APInt to uint64_t
  Value = LitVal.getZExtValue();

  // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
  // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
  // character constants are not sign extended in this implementation:
  // '\xFF\xFF' = 65535 and '\x0\xFF' = 255, which matches GCC.
  if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
      PP.getLangOpts().CharIsSigned)
    Value = (signed char)Value;
}
/// Construct a parser for a sequence of adjacent string literal tokens.
///
/// The source manager, language options, target information and diagnostics
/// engine are taken from \p PP, the remaining state is reset to its defaults,
/// and all of the actual work is delegated to init().
///
/// \param StringToks the string literal tokens to process.
/// \param PP preprocessor supplying the context listed above.
/// \param EvalMethod stored in the EvalMethod member for later use.
///
/// \verbatim
///       string-literal: [C++0x lex.string]
///         encoding-prefix " [s-char-sequence] "
///         encoding-prefix R raw-string
///       encoding-prefix:
///         u8
///         u
///         U
///         L
///       s-char-sequence:
///         s-char
///         s-char-sequence s-char
///       s-char:
///         any member of the source character set except the double-quote ",
///           backslash \, or new-line character
///         escape-sequence
///         universal-character-name
///       raw-string:
///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
///       r-char-sequence:
///         r-char
///         r-char-sequence r-char
///       r-char:
///         any member of the source character set, except a right parenthesis )
///           followed by the initial d-char-sequence (which may be empty)
///           followed by a double quote ".
///       d-char-sequence:
///         d-char
///         d-char-sequence d-char
///       d-char:
///         any member of the basic source character set except:
///           space, the left parenthesis (, the right parenthesis ),
///           the backslash \, and the control characters representing horizontal
///           tab, vertical tab, form feed, and newline.
///       escape-sequence: [C++0x lex.ccon]
///         simple-escape-sequence
///         octal-escape-sequence
///         hexadecimal-escape-sequence
///       simple-escape-sequence:
///         one of \' \" \? \\ \a \b \f \n \r \t \v
///       octal-escape-sequence:
///         \ octal-digit
///         \ octal-digit octal-digit
///         \ octal-digit octal-digit octal-digit
///       hexadecimal-escape-sequence:
///         \x hexadecimal-digit
///         hexadecimal-escape-sequence hexadecimal-digit
///       universal-character-name:
///         \u hex-quad
///         \U hex-quad hex-quad
///       hex-quad:
///         hex-digit hex-digit hex-digit hex-digit
/// \endverbatim
///
StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
                                         Preprocessor &PP,
                                         StringLiteralEvalMethod EvalMethod)
    : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
      Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
      MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
      // ResultPtr starts at the beginning of the (still unsized) result
      // buffer.
      ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
      Pascal(false) {
  init(StringToks);
}
2004 if (StringToks.empty() || StringToks[0].getLength() < 2) 2005 return DiagnoseLexingError(SourceLocation()); 2006 2007 // Scan all of the string portions, remember the max individual token length, 2008 // computing a bound on the concatenated string length, and see whether any 2009 // piece is a wide-string. If any of the string portions is a wide-string 2010 // literal, the result is a wide-string literal [C99 6.4.5p4]. 2011 assert(!StringToks.empty() && "expected at least one token"); 2012 MaxTokenLength = StringToks[0].getLength(); 2013 assert(StringToks[0].getLength() >= 2 && "literal token is invalid!"); 2014 SizeBound = StringToks[0].getLength() - 2; // -2 for "". 2015 hadError = false; 2016 2017 // Determines the kind of string from the prefix 2018 Kind = tok::string_literal; 2019 2020 /// (C99 5.1.1.2p1). The common case is only one string fragment. 2021 for (const Token &Tok : StringToks) { 2022 if (Tok.getLength() < 2) 2023 return DiagnoseLexingError(Tok.getLocation()); 2024 2025 // The string could be shorter than this if it needs cleaning, but this is a 2026 // reasonable bound, which is all we need. 2027 assert(Tok.getLength() >= 2 && "literal token is invalid!"); 2028 SizeBound += Tok.getLength() - 2; // -2 for "". 2029 2030 // Remember maximum string piece length. 2031 if (Tok.getLength() > MaxTokenLength) 2032 MaxTokenLength = Tok.getLength(); 2033 2034 // Remember if we see any wide or utf-8/16/32 strings. 2035 // Also check for illegal concatenations. 
2036 if (isUnevaluated() && Tok.getKind() != tok::string_literal) { 2037 if (Diags) { 2038 SourceLocation PrefixEndLoc = Lexer::AdvanceToTokenCharacter( 2039 Tok.getLocation(), getEncodingPrefixLen(Tok.getKind()), SM, 2040 Features); 2041 CharSourceRange Range = 2042 CharSourceRange::getCharRange({Tok.getLocation(), PrefixEndLoc}); 2043 StringRef Prefix(SM.getCharacterData(Tok.getLocation()), 2044 getEncodingPrefixLen(Tok.getKind())); 2045 Diags->Report(Tok.getLocation(), 2046 Features.CPlusPlus26 2047 ? diag::err_unevaluated_string_prefix 2048 : diag::warn_unevaluated_string_prefix) 2049 << Prefix << Features.CPlusPlus << FixItHint::CreateRemoval(Range); 2050 } 2051 if (Features.CPlusPlus26) 2052 hadError = true; 2053 } else if (Tok.isNot(Kind) && Tok.isNot(tok::string_literal)) { 2054 if (isOrdinary()) { 2055 Kind = Tok.getKind(); 2056 } else { 2057 if (Diags) 2058 Diags->Report(Tok.getLocation(), diag::err_unsupported_string_concat); 2059 hadError = true; 2060 } 2061 } 2062 } 2063 2064 // Include space for the null terminator. 2065 ++SizeBound; 2066 2067 // TODO: K&R warning: "traditional C rejects string constant concatenation" 2068 2069 // Get the width in bytes of char/wchar_t/char16_t/char32_t 2070 CharByteWidth = getCharWidth(Kind, Target); 2071 assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple"); 2072 CharByteWidth /= 8; 2073 2074 // The output buffer size needs to be large enough to hold wide characters. 2075 // This is a worst-case assumption which basically corresponds to L"" "long". 2076 SizeBound *= CharByteWidth; 2077 2078 // Size the temporary buffer to hold the result string data. 2079 ResultBuf.resize(SizeBound); 2080 2081 // Likewise, but for each string piece. 2082 SmallString<512> TokenBuf; 2083 TokenBuf.resize(MaxTokenLength); 2084 2085 // Loop over all the strings, getting their spelling, and expanding them to 2086 // wide strings as appropriate. 2087 ResultPtr = &ResultBuf[0]; // Next byte to fill in. 
2088 2089 Pascal = false; 2090 2091 SourceLocation UDSuffixTokLoc; 2092 2093 for (unsigned i = 0, e = StringToks.size(); i != e; ++i) { 2094 const char *ThisTokBuf = &TokenBuf[0]; 2095 // Get the spelling of the token, which eliminates trigraphs, etc. We know 2096 // that ThisTokBuf points to a buffer that is big enough for the whole token 2097 // and 'spelled' tokens can only shrink. 2098 bool StringInvalid = false; 2099 unsigned ThisTokLen = 2100 Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features, 2101 &StringInvalid); 2102 if (StringInvalid) 2103 return DiagnoseLexingError(StringToks[i].getLocation()); 2104 2105 const char *ThisTokBegin = ThisTokBuf; 2106 const char *ThisTokEnd = ThisTokBuf+ThisTokLen; 2107 2108 // Remove an optional ud-suffix. 2109 if (ThisTokEnd[-1] != '"') { 2110 const char *UDSuffixEnd = ThisTokEnd; 2111 do { 2112 --ThisTokEnd; 2113 } while (ThisTokEnd[-1] != '"'); 2114 2115 StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd); 2116 2117 if (UDSuffixBuf.empty()) { 2118 if (StringToks[i].hasUCN()) 2119 expandUCNs(UDSuffixBuf, UDSuffix); 2120 else 2121 UDSuffixBuf.assign(UDSuffix); 2122 UDSuffixToken = i; 2123 UDSuffixOffset = ThisTokEnd - ThisTokBuf; 2124 UDSuffixTokLoc = StringToks[i].getLocation(); 2125 } else { 2126 SmallString<32> ExpandedUDSuffix; 2127 if (StringToks[i].hasUCN()) { 2128 expandUCNs(ExpandedUDSuffix, UDSuffix); 2129 UDSuffix = ExpandedUDSuffix; 2130 } 2131 2132 // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the 2133 // result of a concatenation involving at least one user-defined-string- 2134 // literal, all the participating user-defined-string-literals shall 2135 // have the same ud-suffix. 
2136 bool UnevaluatedStringHasUDL = isUnevaluated() && !UDSuffix.empty(); 2137 if (UDSuffixBuf != UDSuffix || UnevaluatedStringHasUDL) { 2138 if (Diags) { 2139 SourceLocation TokLoc = StringToks[i].getLocation(); 2140 if (UnevaluatedStringHasUDL) { 2141 Diags->Report(TokLoc, diag::err_unevaluated_string_udl) 2142 << SourceRange(TokLoc, TokLoc); 2143 } else { 2144 Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix) 2145 << UDSuffixBuf << UDSuffix 2146 << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc); 2147 } 2148 } 2149 hadError = true; 2150 } 2151 } 2152 } 2153 2154 // Strip the end quote. 2155 --ThisTokEnd; 2156 2157 // TODO: Input character set mapping support. 2158 2159 // Skip marker for wide or unicode strings. 2160 if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') { 2161 ++ThisTokBuf; 2162 // Skip 8 of u8 marker for utf8 strings. 2163 if (ThisTokBuf[0] == '8') 2164 ++ThisTokBuf; 2165 } 2166 2167 // Check for raw string 2168 if (ThisTokBuf[0] == 'R') { 2169 if (ThisTokBuf[1] != '"') { 2170 // The file may have come from PCH and then changed after loading the 2171 // PCH; Fail gracefully. 2172 return DiagnoseLexingError(StringToks[i].getLocation()); 2173 } 2174 ThisTokBuf += 2; // skip R" 2175 2176 // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16 2177 // characters. 
2178 constexpr unsigned MaxRawStrDelimLen = 16; 2179 2180 const char *Prefix = ThisTokBuf; 2181 while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen && 2182 ThisTokBuf[0] != '(') 2183 ++ThisTokBuf; 2184 if (ThisTokBuf[0] != '(') 2185 return DiagnoseLexingError(StringToks[i].getLocation()); 2186 ++ThisTokBuf; // skip '(' 2187 2188 // Remove same number of characters from the end 2189 ThisTokEnd -= ThisTokBuf - Prefix; 2190 if (ThisTokEnd < ThisTokBuf) 2191 return DiagnoseLexingError(StringToks[i].getLocation()); 2192 2193 // C++14 [lex.string]p4: A source-file new-line in a raw string literal 2194 // results in a new-line in the resulting execution string-literal. 2195 StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf); 2196 while (!RemainingTokenSpan.empty()) { 2197 // Split the string literal on \r\n boundaries. 2198 size_t CRLFPos = RemainingTokenSpan.find("\r\n"); 2199 StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos); 2200 StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos); 2201 2202 // Copy everything before the \r\n sequence into the string literal. 2203 if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF)) 2204 hadError = true; 2205 2206 // Point into the \n inside the \r\n sequence and operate on the 2207 // remaining portion of the literal. 2208 RemainingTokenSpan = AfterCRLF.substr(1); 2209 } 2210 } else { 2211 if (ThisTokBuf[0] != '"') { 2212 // The file may have come from PCH and then changed after loading the 2213 // PCH; Fail gracefully. 
2214 return DiagnoseLexingError(StringToks[i].getLocation()); 2215 } 2216 ++ThisTokBuf; // skip " 2217 2218 // Check if this is a pascal string 2219 if (!isUnevaluated() && Features.PascalStrings && 2220 ThisTokBuf + 1 != ThisTokEnd && ThisTokBuf[0] == '\\' && 2221 ThisTokBuf[1] == 'p') { 2222 2223 // If the \p sequence is found in the first token, we have a pascal string 2224 // Otherwise, if we already have a pascal string, ignore the first \p 2225 if (i == 0) { 2226 ++ThisTokBuf; 2227 Pascal = true; 2228 } else if (Pascal) 2229 ThisTokBuf += 2; 2230 } 2231 2232 while (ThisTokBuf != ThisTokEnd) { 2233 // Is this a span of non-escape characters? 2234 if (ThisTokBuf[0] != '\\') { 2235 const char *InStart = ThisTokBuf; 2236 do { 2237 ++ThisTokBuf; 2238 } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); 2239 2240 // Copy the character span over. 2241 if (CopyStringFragment(StringToks[i], ThisTokBegin, 2242 StringRef(InStart, ThisTokBuf - InStart))) 2243 hadError = true; 2244 continue; 2245 } 2246 // Is this a Universal Character Name escape? 2247 if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' || 2248 ThisTokBuf[1] == 'N') { 2249 EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, 2250 ResultPtr, hadError, 2251 FullSourceLoc(StringToks[i].getLocation(), SM), 2252 CharByteWidth, Diags, Features); 2253 continue; 2254 } 2255 // Otherwise, this is a non-UCN escape character. Process it. 2256 unsigned ResultChar = 2257 ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError, 2258 FullSourceLoc(StringToks[i].getLocation(), SM), 2259 CharByteWidth * 8, Diags, Features, EvalMethod); 2260 2261 if (CharByteWidth == 4) { 2262 // FIXME: Make the type of the result buffer correct instead of 2263 // using reinterpret_cast. 
2264 llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr); 2265 *ResultWidePtr = ResultChar; 2266 ResultPtr += 4; 2267 } else if (CharByteWidth == 2) { 2268 // FIXME: Make the type of the result buffer correct instead of 2269 // using reinterpret_cast. 2270 llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr); 2271 *ResultWidePtr = ResultChar & 0xFFFF; 2272 ResultPtr += 2; 2273 } else { 2274 assert(CharByteWidth == 1 && "Unexpected char width"); 2275 *ResultPtr++ = ResultChar & 0xFF; 2276 } 2277 } 2278 } 2279 } 2280 2281 assert((!Pascal || !isUnevaluated()) && 2282 "Pascal string in unevaluated context"); 2283 if (Pascal) { 2284 if (CharByteWidth == 4) { 2285 // FIXME: Make the type of the result buffer correct instead of 2286 // using reinterpret_cast. 2287 llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data()); 2288 ResultWidePtr[0] = GetNumStringChars() - 1; 2289 } else if (CharByteWidth == 2) { 2290 // FIXME: Make the type of the result buffer correct instead of 2291 // using reinterpret_cast. 2292 llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data()); 2293 ResultWidePtr[0] = GetNumStringChars() - 1; 2294 } else { 2295 assert(CharByteWidth == 1 && "Unexpected char width"); 2296 ResultBuf[0] = GetNumStringChars() - 1; 2297 } 2298 2299 // Verify that pascal strings aren't too large. 2300 if (GetStringLength() > 256) { 2301 if (Diags) 2302 Diags->Report(StringToks.front().getLocation(), 2303 diag::err_pascal_string_too_long) 2304 << SourceRange(StringToks.front().getLocation(), 2305 StringToks.back().getLocation()); 2306 hadError = true; 2307 return; 2308 } 2309 } else if (Diags) { 2310 // Complain if this string literal has too many characters. 2311 unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 
4095 : 509; 2312 2313 if (GetNumStringChars() > MaxChars) 2314 Diags->Report(StringToks.front().getLocation(), 2315 diag::ext_string_too_long) 2316 << GetNumStringChars() << MaxChars 2317 << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0) 2318 << SourceRange(StringToks.front().getLocation(), 2319 StringToks.back().getLocation()); 2320 } 2321 } 2322 2323 static const char *resyncUTF8(const char *Err, const char *End) { 2324 if (Err == End) 2325 return End; 2326 End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err); 2327 while (++Err != End && (*Err & 0xC0) == 0x80) 2328 ; 2329 return Err; 2330 } 2331 2332 /// This function copies from Fragment, which is a sequence of bytes 2333 /// within Tok's contents (which begin at TokBegin) into ResultPtr. 2334 /// Performs widening for multi-byte characters. 2335 bool StringLiteralParser::CopyStringFragment(const Token &Tok, 2336 const char *TokBegin, 2337 StringRef Fragment) { 2338 const llvm::UTF8 *ErrorPtrTmp; 2339 if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp)) 2340 return false; 2341 2342 // If we see bad encoding for unprefixed string literals, warn and 2343 // simply copy the byte values, for compatibility with gcc and older 2344 // versions of clang. 2345 bool NoErrorOnBadEncoding = isOrdinary(); 2346 if (NoErrorOnBadEncoding) { 2347 memcpy(ResultPtr, Fragment.data(), Fragment.size()); 2348 ResultPtr += Fragment.size(); 2349 } 2350 2351 if (Diags) { 2352 const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp); 2353 2354 FullSourceLoc SourceLoc(Tok.getLocation(), SM); 2355 const DiagnosticBuilder &Builder = 2356 Diag(Diags, Features, SourceLoc, TokBegin, 2357 ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()), 2358 NoErrorOnBadEncoding ? 
diag::warn_bad_string_encoding 2359 : diag::err_bad_string_encoding); 2360 2361 const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end()); 2362 StringRef NextFragment(NextStart, Fragment.end()-NextStart); 2363 2364 // Decode into a dummy buffer. 2365 SmallString<512> Dummy; 2366 Dummy.reserve(Fragment.size() * CharByteWidth); 2367 char *Ptr = Dummy.data(); 2368 2369 while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) { 2370 const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp); 2371 NextStart = resyncUTF8(ErrorPtr, Fragment.end()); 2372 Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin, 2373 ErrorPtr, NextStart); 2374 NextFragment = StringRef(NextStart, Fragment.end()-NextStart); 2375 } 2376 } 2377 return !NoErrorOnBadEncoding; 2378 } 2379 2380 void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) { 2381 hadError = true; 2382 if (Diags) 2383 Diags->Report(Loc, diag::err_lexing_string); 2384 } 2385 2386 /// getOffsetOfStringByte - This function returns the offset of the 2387 /// specified byte of the string data represented by Token. This handles 2388 /// advancing over escape sequences in the string. 2389 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, 2390 unsigned ByteNo) const { 2391 // Get the spelling of the token. 2392 SmallString<32> SpellingBuffer; 2393 SpellingBuffer.resize(Tok.getLength()); 2394 2395 bool StringInvalid = false; 2396 const char *SpellingPtr = &SpellingBuffer[0]; 2397 unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features, 2398 &StringInvalid); 2399 if (StringInvalid) 2400 return 0; 2401 2402 const char *SpellingStart = SpellingPtr; 2403 const char *SpellingEnd = SpellingPtr+TokLen; 2404 2405 // Handle UTF-8 strings just like narrow strings. 
2406 if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8') 2407 SpellingPtr += 2; 2408 2409 assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' && 2410 SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet"); 2411 2412 // For raw string literals, this is easy. 2413 if (SpellingPtr[0] == 'R') { 2414 assert(SpellingPtr[1] == '"' && "Should be a raw string literal!"); 2415 // Skip 'R"'. 2416 SpellingPtr += 2; 2417 while (*SpellingPtr != '(') { 2418 ++SpellingPtr; 2419 assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal"); 2420 } 2421 // Skip '('. 2422 ++SpellingPtr; 2423 return SpellingPtr - SpellingStart + ByteNo; 2424 } 2425 2426 // Skip over the leading quote 2427 assert(SpellingPtr[0] == '"' && "Should be a string literal!"); 2428 ++SpellingPtr; 2429 2430 // Skip over bytes until we find the offset we're looking for. 2431 while (ByteNo) { 2432 assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!"); 2433 2434 // Step over non-escapes simply. 2435 if (*SpellingPtr != '\\') { 2436 ++SpellingPtr; 2437 --ByteNo; 2438 continue; 2439 } 2440 2441 // Otherwise, this is an escape character. Advance over it. 2442 bool HadError = false; 2443 if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' || 2444 SpellingPtr[1] == 'N') { 2445 const char *EscapePtr = SpellingPtr; 2446 unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd, 2447 1, Features, HadError); 2448 if (Len > ByteNo) { 2449 // ByteNo is somewhere within the escape sequence. 2450 SpellingPtr = EscapePtr; 2451 break; 2452 } 2453 ByteNo -= Len; 2454 } else { 2455 ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError, 2456 FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8, 2457 Diags, Features, StringLiteralEvalMethod::Evaluated); 2458 --ByteNo; 2459 } 2460 assert(!HadError && "This method isn't valid on erroneous strings"); 2461 } 2462 2463 return SpellingPtr-SpellingStart; 2464 } 2465 2466 /// Determine whether a suffix is a valid ud-suffix. 
We avoid treating reserved 2467 /// suffixes as ud-suffixes, because the diagnostic experience is better if we 2468 /// treat it as an invalid suffix. 2469 bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts, 2470 StringRef Suffix) { 2471 return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) || 2472 Suffix == "sv"; 2473 } 2474