1 //===- Lexer.cpp - C Language Family Lexer --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the Lexer and Token interfaces. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "clang/Lex/Lexer.h" 14 #include "UnicodeCharSets.h" 15 #include "clang/Basic/CharInfo.h" 16 #include "clang/Basic/Diagnostic.h" 17 #include "clang/Basic/IdentifierTable.h" 18 #include "clang/Basic/LLVM.h" 19 #include "clang/Basic/LangOptions.h" 20 #include "clang/Basic/SourceLocation.h" 21 #include "clang/Basic/SourceManager.h" 22 #include "clang/Basic/TokenKinds.h" 23 #include "clang/Lex/LexDiagnostic.h" 24 #include "clang/Lex/LiteralSupport.h" 25 #include "clang/Lex/MultipleIncludeOpt.h" 26 #include "clang/Lex/Preprocessor.h" 27 #include "clang/Lex/PreprocessorOptions.h" 28 #include "clang/Lex/Token.h" 29 #include "llvm/ADT/STLExtras.h" 30 #include "llvm/ADT/StringExtras.h" 31 #include "llvm/ADT/StringRef.h" 32 #include "llvm/ADT/StringSwitch.h" 33 #include "llvm/Support/Compiler.h" 34 #include "llvm/Support/ConvertUTF.h" 35 #include "llvm/Support/MemoryBufferRef.h" 36 #include "llvm/Support/NativeFormatting.h" 37 #include "llvm/Support/Unicode.h" 38 #include "llvm/Support/UnicodeCharRanges.h" 39 #include <algorithm> 40 #include <cassert> 41 #include <cstddef> 42 #include <cstdint> 43 #include <cstring> 44 #include <limits> 45 #include <optional> 46 #include <string> 47 #include <tuple> 48 49 #ifdef __SSE4_2__ 50 #include <nmmintrin.h> 51 #endif 52 53 using namespace clang; 54 55 //===----------------------------------------------------------------------===// 56 // Token Class Implementation 57 
//===----------------------------------------------------------------------===//

/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
  // Annotation tokens carry no identifier info, so they can never match.
  if (isAnnotation())
    return false;
  if (const IdentifierInfo *II = getIdentifierInfo())
    return II->getObjCKeywordID() == objcKey;
  return false;
}

/// getObjCKeywordID - Return the ObjC keyword kind.
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
  if (isAnnotation())
    return tok::objc_not_keyword;
  const IdentifierInfo *specId = getIdentifierInfo();
  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
}

/// Determine whether the token kind starts a simple-type-specifier.
bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const {
  switch (getKind()) {
  // Annotation tokens for already-parsed types are always type specifiers.
  case tok::annot_typename:
  case tok::annot_decltype:
  case tok::annot_pack_indexing_type:
    return true;

  case tok::kw_short:
  case tok::kw_long:
  case tok::kw___int64:
  case tok::kw___int128:
  case tok::kw_signed:
  case tok::kw_unsigned:
  case tok::kw_void:
  case tok::kw_char:
  case tok::kw_int:
  case tok::kw_half:
  case tok::kw_float:
  case tok::kw_double:
  case tok::kw___bf16:
  case tok::kw__Float16:
  case tok::kw___float128:
  case tok::kw___ibm128:
  case tok::kw_wchar_t:
  case tok::kw_bool:
  case tok::kw__Bool:
  case tok::kw__Accum:
  case tok::kw__Fract:
  case tok::kw__Sat:
#define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
#include "clang/Basic/TransformTypeTraits.def"
  case tok::kw___auto_type:
  case tok::kw_char16_t:
  case tok::kw_char32_t:
  case tok::kw_typeof:
  case tok::kw_decltype:
  case tok::kw_char8_t:
    // These are keywords only in some language modes; defer to the identifier
    // table so the answer respects the active LangOptions.
    return getIdentifierInfo()->isKeyword(LangOpts);

  default:
    return false;
  }
}

//===----------------------------------------------------------------------===//
// Lexer Class Implementation
//===----------------------------------------------------------------------===//

void Lexer::anchor() {}

/// Shared initialization for all Lexer constructors. Sets up the buffer
/// pointers, skips a UTF-8 BOM if lexing starts at the beginning of the
/// buffer, and resets all per-lexer state flags to their defaults.
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;

  assert(BufEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  // Check whether we have a BOM in the beginning of the buffer. If yes - act
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
  // skip the UTF-8 BOM if it's present.
  if (BufferStart == BufferPtr) {
    // Determine the size of the BOM.
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
                           .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
                           .Default(0);

    // Skip the BOM.
    BufferPtr += BOMLength;
  }

  Is_PragmaLexer = false;
  CurrentConflictMarkerState = CMK_None;

  // Start of the file is a start of line.
  IsAtStartOfLine = true;
  IsAtPhysicalStartOfLine = true;

  HasLeadingSpace = false;
  HasLeadingEmptyMacro = false;

  // We are not after parsing a #.
  ParsingPreprocessorDirective = false;

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode. Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  // or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to not keeping comments.
  ExtendedTokenMode = 0;

  NewLinePtr = nullptr;
}

/// Lexer constructor - Create a new lexer object for the specified buffer
/// with the specified preprocessor managing the lexing process. This lexer
/// assumes that the associated file buffer and Preprocessor objects will
/// outlive it, so it doesn't take ownership of either of them.
Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
             Preprocessor &PP, bool IsFirstIncludeOfFile)
    : PreprocessorLexer(&PP, FID),
      FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
      LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
            InputFile.getBufferEnd());

  resetExtendedTokenMode();
}

/// Lexer constructor - Create a new raw lexer object. This object is only
/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd,
             bool IsFirstIncludeOfFile)
    : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
      IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode.
  LexingRawMode = true;
}

/// Lexer constructor - Create a new raw lexer object. This object is only
/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
212 Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile, 213 const SourceManager &SM, const LangOptions &langOpts, 214 bool IsFirstIncludeOfFile) 215 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(), 216 FromFile.getBufferStart(), FromFile.getBufferEnd(), 217 IsFirstIncludeOfFile) {} 218 219 void Lexer::resetExtendedTokenMode() { 220 assert(PP && "Cannot reset token mode without a preprocessor"); 221 if (LangOpts.TraditionalCPP) 222 SetKeepWhitespaceMode(true); 223 else 224 SetCommentRetentionState(PP->getCommentRetentionState()); 225 } 226 227 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 228 /// _Pragma expansion. This has a variety of magic semantics that this method 229 /// sets up. It returns a new'd Lexer that must be delete'd when done. 230 /// 231 /// On entrance to this routine, TokStartLoc is a macro location which has a 232 /// spelling loc that indicates the bytes to be lexed for the token and an 233 /// expansion location that indicates where all lexed tokens should be 234 /// "expanded from". 235 /// 236 /// TODO: It would really be nice to make _Pragma just be a wrapper around a 237 /// normal lexer that remaps tokens as they fly by. This would require making 238 /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 239 /// interface that could handle this stuff. This would pull GetMappedTokenLoc 240 /// out of the critical path of the lexer! 241 /// 242 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 243 SourceLocation ExpansionLocStart, 244 SourceLocation ExpansionLocEnd, 245 unsigned TokLen, Preprocessor &PP) { 246 SourceManager &SM = PP.getSourceManager(); 247 248 // Create the lexer as if we were going to lex the file normally. 
249 FileID SpellingFID = SM.getFileID(SpellingLoc); 250 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID); 251 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 252 253 // Now that the lexer is created, change the start/end locations so that we 254 // just lex the subsection of the file that we want. This is lexing from a 255 // scratch buffer. 256 const char *StrData = SM.getCharacterData(SpellingLoc); 257 258 L->BufferPtr = StrData; 259 L->BufferEnd = StrData+TokLen; 260 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 261 262 // Set the SourceLocation with the remapping information. This ensures that 263 // GetMappedTokenLoc will remap the tokens as they are lexed. 264 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 265 ExpansionLocStart, 266 ExpansionLocEnd, TokLen); 267 268 // Ensure that the lexer thinks it is inside a directive, so that end \n will 269 // return an EOD token. 270 L->ParsingPreprocessorDirective = true; 271 272 // This lexer really is for _Pragma. 273 L->Is_PragmaLexer = true; 274 return L; 275 } 276 277 void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) { 278 this->IsAtPhysicalStartOfLine = IsAtStartOfLine; 279 this->IsAtStartOfLine = IsAtStartOfLine; 280 assert((BufferStart + Offset) <= BufferEnd); 281 BufferPtr = BufferStart + Offset; 282 } 283 284 template <typename T> static void StringifyImpl(T &Str, char Quote) { 285 typename T::size_type i = 0, e = Str.size(); 286 while (i < e) { 287 if (Str[i] == '\\' || Str[i] == Quote) { 288 Str.insert(Str.begin() + i, '\\'); 289 i += 2; 290 ++e; 291 } else if (Str[i] == '\n' || Str[i] == '\r') { 292 // Replace '\r\n' and '\n\r' to '\\' followed by 'n'. 293 if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && 294 Str[i] != Str[i + 1]) { 295 Str[i] = '\\'; 296 Str[i + 1] = 'n'; 297 } else { 298 // Replace '\n' and '\r' to '\\' followed by 'n'. 
299 Str[i] = '\\'; 300 Str.insert(Str.begin() + i + 1, 'n'); 301 ++e; 302 } 303 i += 2; 304 } else 305 ++i; 306 } 307 } 308 309 std::string Lexer::Stringify(StringRef Str, bool Charify) { 310 std::string Result = std::string(Str); 311 char Quote = Charify ? '\'' : '"'; 312 StringifyImpl(Result, Quote); 313 return Result; 314 } 315 316 void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); } 317 318 //===----------------------------------------------------------------------===// 319 // Token Spelling 320 //===----------------------------------------------------------------------===// 321 322 /// Slow case of getSpelling. Extract the characters comprising the 323 /// spelling of this token from the provided input buffer. 324 static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, 325 const LangOptions &LangOpts, char *Spelling) { 326 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); 327 328 size_t Length = 0; 329 const char *BufEnd = BufPtr + Tok.getLength(); 330 331 if (tok::isStringLiteral(Tok.getKind())) { 332 // Munch the encoding-prefix and opening double-quote. 333 while (BufPtr < BufEnd) { 334 auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts); 335 Spelling[Length++] = CharAndSize.Char; 336 BufPtr += CharAndSize.Size; 337 338 if (Spelling[Length - 1] == '"') 339 break; 340 } 341 342 // Raw string literals need special handling; trigraph expansion and line 343 // splicing do not occur within their d-char-sequence nor within their 344 // r-char-sequence. 345 if (Length >= 2 && 346 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { 347 // Search backwards from the end of the token to find the matching closing 348 // quote. 349 const char *RawEnd = BufEnd; 350 do --RawEnd; while (*RawEnd != '"'); 351 size_t RawLength = RawEnd - BufPtr + 1; 352 353 // Everything between the quotes is included verbatim in the spelling. 
354 memcpy(Spelling + Length, BufPtr, RawLength); 355 Length += RawLength; 356 BufPtr += RawLength; 357 358 // The rest of the token is lexed normally. 359 } 360 } 361 362 while (BufPtr < BufEnd) { 363 auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts); 364 Spelling[Length++] = CharAndSize.Char; 365 BufPtr += CharAndSize.Size; 366 } 367 368 assert(Length < Tok.getLength() && 369 "NeedsCleaning flag set on token that didn't need cleaning!"); 370 return Length; 371 } 372 373 /// getSpelling() - Return the 'spelling' of this token. The spelling of a 374 /// token are the characters used to represent the token in the source file 375 /// after trigraph expansion and escaped-newline folding. In particular, this 376 /// wants to get the true, uncanonicalized, spelling of things like digraphs 377 /// UCNs, etc. 378 StringRef Lexer::getSpelling(SourceLocation loc, 379 SmallVectorImpl<char> &buffer, 380 const SourceManager &SM, 381 const LangOptions &options, 382 bool *invalid) { 383 // Break down the source location. 384 FileIDAndOffset locInfo = SM.getDecomposedLoc(loc); 385 386 // Try to the load the file buffer. 387 bool invalidTemp = false; 388 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 389 if (invalidTemp) { 390 if (invalid) *invalid = true; 391 return {}; 392 } 393 394 const char *tokenBegin = file.data() + locInfo.second; 395 396 // Lex from the start of the given location. 397 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 398 file.begin(), tokenBegin, file.end()); 399 Token token; 400 lexer.LexFromRawLexer(token); 401 402 unsigned length = token.getLength(); 403 404 // Common case: no need for cleaning. 405 if (!token.needsCleaning()) 406 return StringRef(tokenBegin, length); 407 408 // Hard case, we need to relex the characters into the string. 
409 buffer.resize(length); 410 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); 411 return StringRef(buffer.data(), buffer.size()); 412 } 413 414 /// getSpelling() - Return the 'spelling' of this token. The spelling of a 415 /// token are the characters used to represent the token in the source file 416 /// after trigraph expansion and escaped-newline folding. In particular, this 417 /// wants to get the true, uncanonicalized, spelling of things like digraphs 418 /// UCNs, etc. 419 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 420 const LangOptions &LangOpts, bool *Invalid) { 421 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 422 423 bool CharDataInvalid = false; 424 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 425 &CharDataInvalid); 426 if (Invalid) 427 *Invalid = CharDataInvalid; 428 if (CharDataInvalid) 429 return {}; 430 431 // If this token contains nothing interesting, return it directly. 432 if (!Tok.needsCleaning()) 433 return std::string(TokStart, TokStart + Tok.getLength()); 434 435 std::string Result; 436 Result.resize(Tok.getLength()); 437 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); 438 return Result; 439 } 440 441 /// getSpelling - This method is used to get the spelling of a token into a 442 /// preallocated buffer, instead of as an std::string. The caller is required 443 /// to allocate enough space for the token, which is guaranteed to be at least 444 /// Tok.getLength() bytes long. The actual length of the token is returned. 445 /// 446 /// Note that this method may do two possible things: it may either fill in 447 /// the buffer specified with characters, or it may *change the input pointer* 448 /// to point to a constant buffer with the data already in it (avoiding a 449 /// copy). The caller is not allowed to modify the returned buffer pointer 450 /// if an internal buffer is returned. 
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                            const SourceManager &SourceMgr,
                            const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  const char *TokStart = nullptr;
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
  if (Tok.is(tok::raw_identifier))
    TokStart = Tok.getRawIdentifier().data();
  else if (!Tok.hasUCN()) {
    if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
      // Just return the string from the identifier table, which is very quick.
      Buffer = II->getNameStart();
      return II->getLength();
    }
  }

  // NOTE: this can be checked even after testing for an IdentifierInfo.
  if (Tok.isLiteral())
    TokStart = Tok.getLiteralData();

  if (!TokStart) {
    // Compute the start of the token in the input lexer buffer.
    bool CharDataInvalid = false;
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    if (Invalid)
      *Invalid = CharDataInvalid;
    if (CharDataInvalid) {
      Buffer = "";
      return 0;
    }
  }

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning()) {
    Buffer = TokStart;
    return Tok.getLength();
  }

  // Otherwise, hard case, relex the characters into the string.
  return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
}

/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file. If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
/// that are part of that.
unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
                                   const SourceManager &SM,
                                   const LangOptions &LangOpts) {
  Token TheTok;
  // A length of 0 is returned on failure (e.g. invalid buffer or whitespace).
  if (getRawToken(Loc, TheTok, SM, LangOpts))
    return 0;
  return TheTok.getLength();
}

/// Relex the token at the specified location.
/// \returns true if there was a failure, false on success.
bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
                        const SourceManager &SM,
                        const LangOptions &LangOpts,
                        bool IgnoreWhiteSpace) {
  // TODO: this could be special cased for common tokens like identifiers, ')',
  // etc to make this faster, if it mattered. Just look at StrData[0] to handle
  // all obviously single-char tokens. This could use
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
  // something.

  // If this comes from a macro expansion, we really do want the macro name, not
  // the token this macro expanded to.
  Loc = SM.getExpansionLoc(Loc);
  FileIDAndOffset LocInfo = SM.getDecomposedLoc(Loc);
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return true;

  const char *StrData = Buffer.data()+LocInfo.second;

  // Unless asked otherwise, refuse to lex from the middle of whitespace.
  if (!IgnoreWhiteSpace && isWhitespace(SkipEscapedNewLines(StrData)[0]))
    return true;

  // Create a lexer starting at the beginning of this token.
  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
                 Buffer.begin(), StrData, Buffer.end());
  TheLexer.SetCommentRetentionState(true);
  TheLexer.LexFromRawLexer(Result);
  return false;
}

/// Returns the pointer that points to the beginning of line that contains
/// the given offset, or null if the offset is invalid.
static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
  const char *BufStart = Buffer.data();
  if (Offset >= Buffer.size())
    return nullptr;

  // Walk backwards until we find a newline that is not an escaped newline;
  // the logical line begins just after it.
  const char *LexStart = BufStart + Offset;
  for (; LexStart != BufStart; --LexStart) {
    if (isVerticalWhitespace(LexStart[0]) &&
        !Lexer::isNewLineEscaped(BufStart, LexStart)) {
      // LexStart should point at first character of logical line.
      ++LexStart;
      break;
    }
  }
  return LexStart;
}

/// Return the start of the token containing the file location \p Loc, or
/// \p Loc itself if it cannot be determined (invalid buffer, whitespace).
static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
                                              const SourceManager &SM,
                                              const LangOptions &LangOpts) {
  assert(Loc.isFileID());
  FileIDAndOffset LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return Loc;

  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return Loc;

  // Back up from the current location until we hit the beginning of a line
  // (or the buffer). We'll relex from that point.
  const char *StrData = Buffer.data() + LocInfo.second;
  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
  if (!LexStart || LexStart == StrData)
    return Loc;

  // Create a lexer starting at the beginning of this token.
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  // Lex tokens until we find the token that contains the source location.
  Token TheTok;
  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (TheLexer.getBufferLocation() > StrData) {
      // Lexing this token has taken the lexer past the source location we're
      // looking for. If the current token encompasses our source location,
      // return the beginning of that token.
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
        return TheTok.getLocation();

      // We ended up skipping over the source location entirely, which means
      // that it points into whitespace. We're done here.
      break;
    }
  } while (TheTok.getKind() != tok::eof);

  // We've passed our source location; just return the original source location.
  return Loc;
}

SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isFileID())
    return getBeginningOfFileToken(Loc, SM, LangOpts);

  if (!SM.isMacroArgExpansion(Loc))
    return Loc;

  // For a macro argument, find the token start at the spelling location and
  // translate the offset delta back into the macro expansion.
  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
  FileIDAndOffset FileLocInfo = SM.getDecomposedLoc(FileLoc);
  FileIDAndOffset BeginFileLocInfo = SM.getDecomposedLoc(BeginFileLoc);
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
         FileLocInfo.second >= BeginFileLocInfo.second);
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}

namespace {

enum PreambleDirectiveKind {
  PDK_Skipped,
  PDK_Unknown
};

} // namespace

PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
                                      const LangOptions &LangOpts,
                                      unsigned MaxLines) {
  // Create a lexer starting at the beginning of the file. Note that we use a
  // "fake" file source location at offset 1 so that the lexer will track our
  // position within the file.
  const SourceLocation::UIntTy StartOffset = 1;
  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  bool InPreprocessorDirective = false;
  Token TheTok;
  SourceLocation ActiveCommentLoc;

  // If a line limit was requested, translate it into a byte offset so the
  // main loop can compare token offsets cheaply.
  unsigned MaxLineOffset = 0;
  if (MaxLines) {
    const char *CurPtr = Buffer.begin();
    unsigned CurLine = 0;
    while (CurPtr != Buffer.end()) {
      char ch = *CurPtr++;
      if (ch == '\n') {
        ++CurLine;
        if (CurLine == MaxLines)
          break;
      }
    }
    if (CurPtr != Buffer.end())
      MaxLineOffset = CurPtr - Buffer.begin();
  }

  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (InPreprocessorDirective) {
      // If we've hit the end of the file, we're done.
      if (TheTok.getKind() == tok::eof) {
        break;
      }

      // If we haven't hit the end of the preprocessor directive, skip this
      // token.
      if (!TheTok.isAtStartOfLine())
        continue;

      // We've passed the end of the preprocessor directive, and will look
      // at this token again below.
      InPreprocessorDirective = false;
    }

    // Keep track of the # of lines in the preamble.
    if (TheTok.isAtStartOfLine()) {
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;

      // If we were asked to limit the number of lines in the preamble,
      // and we're about to exceed that limit, we're done.
      if (MaxLineOffset && TokOffset >= MaxLineOffset)
        break;
    }

    // Comments are okay; skip over them.
    if (TheTok.getKind() == tok::comment) {
      if (ActiveCommentLoc.isInvalid())
        ActiveCommentLoc = TheTok.getLocation();
      continue;
    }

    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
      // This is the start of a preprocessor directive.
      Token HashTok = TheTok;
      InPreprocessorDirective = true;
      ActiveCommentLoc = SourceLocation();

      // Figure out which directive this is. Since we're lexing raw tokens,
      // we don't have an identifier table available. Instead, just look at
      // the raw identifier to recognize and categorize preprocessor directives.
      TheLexer.LexFromRawLexer(TheTok);
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
        StringRef Keyword = TheTok.getRawIdentifier();
        PreambleDirectiveKind PDK
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
              .Case("include", PDK_Skipped)
              .Case("__include_macros", PDK_Skipped)
              .Case("define", PDK_Skipped)
              .Case("undef", PDK_Skipped)
              .Case("line", PDK_Skipped)
              .Case("error", PDK_Skipped)
              .Case("pragma", PDK_Skipped)
              .Case("import", PDK_Skipped)
              .Case("include_next", PDK_Skipped)
              .Case("warning", PDK_Skipped)
              .Case("ident", PDK_Skipped)
              .Case("sccs", PDK_Skipped)
              .Case("assert", PDK_Skipped)
              .Case("unassert", PDK_Skipped)
              .Case("if", PDK_Skipped)
              .Case("ifdef", PDK_Skipped)
              .Case("ifndef", PDK_Skipped)
              .Case("elif", PDK_Skipped)
              .Case("elifdef", PDK_Skipped)
              .Case("elifndef", PDK_Skipped)
              .Case("else", PDK_Skipped)
              .Case("endif", PDK_Skipped)
              .Default(PDK_Unknown);

        switch (PDK) {
        case PDK_Skipped:
          continue;

        case PDK_Unknown:
          // We don't know what this directive is; stop at the '#'.
          break;
        }
      }

      // We only end up here if we didn't recognize the preprocessor
      // directive or it was one that can't occur in the preamble at this
      // point. Roll back the current token to the location of the '#'.
      TheTok = HashTok;
    } else if (TheTok.isAtStartOfLine() &&
               TheTok.getKind() == tok::raw_identifier &&
               TheTok.getRawIdentifier() == "module" &&
               LangOpts.CPlusPlusModules) {
      // The initial global module fragment introducer "module;" is part of
      // the preamble, which runs up to the module declaration "module foo;".
      Token ModuleTok = TheTok;
      do {
        TheLexer.LexFromRawLexer(TheTok);
      } while (TheTok.getKind() == tok::comment);
      if (TheTok.getKind() != tok::semi) {
        // Not global module fragment, roll back.
        TheTok = ModuleTok;
        break;
      }
      continue;
    }

    // We hit a token that we don't recognize as being in the
    // "preprocessing only" part of the file, so we're no longer in
    // the preamble.
    break;
  } while (true);

  SourceLocation End;
  if (ActiveCommentLoc.isValid())
    End = ActiveCommentLoc; // don't truncate a decl comment.
  else
    End = TheTok.getLocation();

  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
                        TheTok.isAtStartOfLine());
}

unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) {
  // Figure out how many physical characters away the specified expansion
  // character is. This needs to take into consideration newlines and
  // trigraphs.
  bool Invalid = false;
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

  // If they request the first char of the token, we're trivially done.
  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    return 0;

  unsigned PhysOffset = 0;

  // The usual case is that tokens don't contain anything interesting. Skip
  // over the uninteresting characters. If a token only consists of simple
  // chars, this method is extremely fast.
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    if (CharNo == 0)
      return PhysOffset;
    ++TokPtr;
    --CharNo;
    ++PhysOffset;
  }

  // If we have a character that may be a trigraph or escaped newline, use a
  // lexer to parse it correctly.
  for (; CharNo; --CharNo) {
    auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
    TokPtr += CharAndSize.Size;
    PhysOffset += CharAndSize.Size;
  }

  // Final detail: if we end up on an escaped newline, we want to return the
  // location of the actual byte of the token. For example foo\<newline>bar
  // advanced by 3 should return the location of b, not of \\. One compounding
  // detail of this is that the escape may be made by a trigraph.
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;

  return PhysOffset;
}

/// Computes the source location just past the end of the
/// token at this source location.
///
/// This routine can be used to produce a source location that
/// points just past the end of the token referenced by \p Loc, and
/// is generally used when a diagnostic needs to point just after a
/// token where it expected something different that it received. If
/// the returned source location would not be meaningful (e.g., if
/// it points into a macro), this routine returns an invalid
/// source location.
///
/// \param Offset an offset from the end of the token, where the source
/// location should refer to. The default offset (0) produces a source
/// location pointing just past the end of the token; an offset of 1 produces
/// a source location pointing to the last character in the token, etc.
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isInvalid())
    return {};

  if (Loc.isMacroID()) {
    // Only a location at the very end of a macro expansion can be mapped back
    // to a meaningful file location; anything else points inside the
    // expansion and cannot be represented.
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return {}; // Points inside the macro expansion.
  }

  // Measure the token and back up by Offset characters from its end. If the
  // token is not longer than Offset, return the token start rather than
  // underflowing before the beginning of the token.
  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  if (Len > Offset)
    Len = Len - Offset;
  else
    return Loc;

  return Loc.getLocWithOffset(Len);
}

/// Returns true if the given MacroID location points at the first
/// token of the macro expansion.
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
                                      const SourceManager &SM,
                                      const LangOptions &LangOpts,
                                      SourceLocation *MacroBegin) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation expansionLoc;
  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions, this is the first.
    if (MacroBegin)
      *MacroBegin = expansionLoc;
    return true;
  }

  // Still inside an enclosing macro expansion; recurse outward.
  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
}

/// Returns true if the given MacroID location points at the last
/// token of the macro expansion.
bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
                                    const SourceManager &SM,
                                    const LangOptions &LangOpts,
                                    SourceLocation *MacroEnd) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation spellLoc = SM.getSpellingLoc(loc);
  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
  // A zero-length token cannot be measured; treat it as not-at-end.
  if (tokLen == 0)
    return false;

  SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
  SourceLocation expansionLoc;
  if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions.
    if (MacroEnd)
      *MacroEnd = expansionLoc;
    return true;
  }

  // Still inside an enclosing macro expansion; recurse outward.
  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
}

/// Convert a range whose endpoints are both file locations into a character
/// range, measuring the final token if the input is a token range. Returns
/// an invalid range if the endpoints are not in the same FileID or are
/// out of order.
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
                                             const SourceManager &SM,
                                             const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  assert(Begin.isFileID() && End.isFileID());
  if (Range.isTokenRange()) {
    // Extend the end past the final token so the result is a char range.
    End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
    if (End.isInvalid())
      return {};
  }

  // Break down the source locations.
  auto [FID, BeginOffs] = SM.getDecomposedLoc(Begin);
  if (FID.isInvalid())
    return {};

  unsigned EndOffs;
  if (!SM.isInFileID(End, FID, &EndOffs) ||
      BeginOffs > EndOffs)
    return {};

  return CharSourceRange::getCharRange(Begin, End);
}

// Assumes that `Loc` is in an expansion.
static bool isInExpansionTokenRange(const SourceLocation Loc,
                                    const SourceManager &SM) {
  return SM.getSLocEntry(SM.getFileID(Loc))
      .getExpansion()
      .isExpansionTokenRange();
}

/// Accepts a range and returns a character range with file locations, where
/// macro locations are mapped back to their expansion points when possible.
/// Returns an invalid range when no such mapping exists.
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  if (Begin.isInvalid() || End.isInvalid())
    return {};

  // Both endpoints already in a file: measure directly.
  if (Begin.isFileID() && End.isFileID())
    return makeRangeFromFileLocs(Range, SM, LangOpts);

  // Begin in a macro, End in a file: only valid if Begin is the first token
  // of its expansion, in which case we can use the expansion start.
  if (Begin.isMacroID() && End.isFileID()) {
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
      return {};
    Range.setBegin(Begin);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Begin in a file, End in a macro: symmetric to the previous case.
  if (Begin.isFileID() && End.isMacroID()) {
    if (Range.isTokenRange()) {
      if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
        return {};
      // Use the *original* end, not the expanded one in `End`.
      Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
    } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
      return {};
    Range.setEnd(End);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Both endpoints in macros: the range is usable if it covers one whole
  // expansion (first token through last token).
  assert(Begin.isMacroID() && End.isMacroID());
  SourceLocation MacroBegin, MacroEnd;
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                        &MacroEnd)) ||
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                         &MacroEnd)))) {
    Range.setBegin(MacroBegin);
    Range.setEnd(MacroEnd);
    // Use the *original* `End`, not the expanded one in `MacroEnd`.
    if (Range.isTokenRange())
      Range.setTokenRange(isInExpansionTokenRange(End, SM));
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Last resort: if both endpoints are spelled inside the same macro
  // argument, retry with the argument's spelling locations, e.g. for
  // MACRO(a b) where the range covers "a b" inside the argument.
  bool Invalid = false;
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
                                                        &Invalid);
  if (Invalid)
    return {};

  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
                                                        &Invalid);
    if (Invalid)
      return {};

    if (EndEntry.getExpansion().isMacroArgExpansion() &&
        BeginEntry.getExpansion().getExpansionLocStart() ==
            EndEntry.getExpansion().getExpansionLocStart()) {
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
      Range.setEnd(SM.getImmediateSpellingLoc(End));
      return makeFileCharRange(Range, SM, LangOpts);
    }
  }

  return {};
}

/// Returns the spelled text covered by Range, or an empty StringRef (setting
/// *Invalid when provided) if the range cannot be mapped to a file range.
StringRef Lexer::getSourceText(CharSourceRange Range,
                               const SourceManager &SM,
                               const LangOptions &LangOpts,
                               bool *Invalid) {
  Range = makeFileCharRange(Range, SM, LangOpts);
  if (Range.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Break down the source location.
  FileIDAndOffset beginInfo = SM.getDecomposedLoc(Range.getBegin());
  if (beginInfo.first.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  unsigned EndOffs;
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
      beginInfo.second > EndOffs) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (Invalid) *Invalid = true;
    return {};
  }

  if (Invalid) *Invalid = false;
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
}

/// Returns the name of the macro whose expansion immediately produced the
/// token at Loc, as spelled at the expansion point.
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");

  // Find the location of the immediate macro expansion.
  while (true) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  FileIDAndOffset ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

/// Like getImmediateMacroName, but suitable for diagnostics: returns an
/// empty StringRef for token pastes, stringization, and other scratch-space
/// spellings that do not correspond to a real macro name.
StringRef Lexer::getImmediateMacroNameForDiagnostics(
    SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
  // Walk past macro argument expansions.
  while (SM.isMacroArgExpansion(Loc))
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();

  // If the macro's spelling isn't FileID or from scratch space, then it's
  // actually a token paste or stringization (or similar) and not a macro at
  // all.
  SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
  if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
    return {};

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  FileIDAndOffset ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
  return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
}

/// Returns true if the vertical whitespace at Str is preceded (possibly
/// through horizontal whitespace) by a backslash line-continuation.
bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
  assert(isVerticalWhitespace(Str[0]));
  if (Str - 1 < BufferStart)
    return false;

  // Treat "\r\n" and "\n\r" as one newline; step over the first half.
  if ((Str[0] == '\n' && Str[-1] == '\r') ||
      (Str[0] == '\r' && Str[-1] == '\n')) {
    if (Str - 2 < BufferStart)
      return false;
    --Str;
  }
  --Str;

  // Rewind to first non-space character:
  while (Str > BufferStart && isHorizontalWhitespace(*Str))
    --Str;

  return *Str == '\\';
}

/// Returns the leading whitespace (spaces and tabs) of the line containing
/// Loc, or an empty StringRef if Loc is invalid, in a macro, or the line
/// cannot be found.
StringRef Lexer::getIndentationForLine(SourceLocation Loc,
                                       const SourceManager &SM) {
  if (Loc.isInvalid() || Loc.isMacroID())
    return {};
  FileIDAndOffset LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return {};
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return {};
  const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
  if (!Line)
    return {};
  StringRef Rest = Buffer.substr(Line - Buffer.data());
  size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
  return NumWhitespaceChars == StringRef::npos
             ? ""
             : Rest.take_front(NumWhitespaceChars);
}

//===----------------------------------------------------------------------===//
// Diagnostics forwarding code.
1177 //===----------------------------------------------------------------------===// 1178 1179 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 1180 /// lexer buffer was all expanded at a single point, perform the mapping. 1181 /// This is currently only used for _Pragma implementation, so it is the slow 1182 /// path of the hot getSourceLocation method. Do not allow it to be inlined. 1183 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 1184 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 1185 static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 1186 SourceLocation FileLoc, 1187 unsigned CharNo, unsigned TokLen) { 1188 assert(FileLoc.isMacroID() && "Must be a macro expansion"); 1189 1190 // Otherwise, we're lexing "mapped tokens". This is used for things like 1191 // _Pragma handling. Combine the expansion location of FileLoc with the 1192 // spelling location. 1193 SourceManager &SM = PP.getSourceManager(); 1194 1195 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 1196 // characters come from spelling(FileLoc)+Offset. 1197 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 1198 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 1199 1200 // Figure out the expansion loc range, which is the range covered by the 1201 // original _Pragma(...) sequence. 1202 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc); 1203 1204 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen); 1205 } 1206 1207 /// getSourceLocation - Return a source location identifier for the specified 1208 /// offset in the current file. 
1209 SourceLocation Lexer::getSourceLocation(const char *Loc, 1210 unsigned TokLen) const { 1211 assert(Loc >= BufferStart && Loc <= BufferEnd && 1212 "Location out of range for this buffer!"); 1213 1214 // In the normal case, we're just lexing from a simple file buffer, return 1215 // the file id from FileLoc with the offset specified. 1216 unsigned CharNo = Loc-BufferStart; 1217 if (FileLoc.isFileID()) 1218 return FileLoc.getLocWithOffset(CharNo); 1219 1220 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 1221 // tokens are lexed from where the _Pragma was defined. 1222 assert(PP && "This doesn't work on raw lexers"); 1223 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 1224 } 1225 1226 /// Diag - Forwarding function for diagnostics. This translate a source 1227 /// position in the current buffer into a SourceLocation object for rendering. 1228 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 1229 return PP->Diag(getSourceLocation(Loc), DiagID); 1230 } 1231 1232 //===----------------------------------------------------------------------===// 1233 // Trigraph and Escaped Newline Handling Code. 1234 //===----------------------------------------------------------------------===// 1235 1236 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 1237 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 1238 static char GetTrigraphCharForLetter(char Letter) { 1239 switch (Letter) { 1240 default: return 0; 1241 case '=': return '#'; 1242 case ')': return ']'; 1243 case '(': return '['; 1244 case '!': return '|'; 1245 case '\'': return '^'; 1246 case '>': return '}'; 1247 case '/': return '\\'; 1248 case '<': return '{'; 1249 case '-': return '~'; 1250 } 1251 } 1252 1253 /// DecodeTrigraphChar - If the specified character is a legal trigraph when 1254 /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 1255 /// return the result character. 
/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
/// return the result character. Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.
static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
  char Res = GetTrigraphCharForLetter(*CP);
  if (!Res)
    return Res;

  // Trigraphs disabled: warn that the sequence is being ignored, but lex the
  // characters as-is.
  if (!Trigraphs) {
    if (L && !L->isLexingRawMode())
      L->Diag(CP-2, diag::trigraph_ignored);
    return 0;
  }

  if (L && !L->isLexingRawMode())
    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
  return Res;
}

/// getEscapedNewLineSize - Return the size of the specified escaped newline,
/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
/// trigraph equivalent on entry to this function.
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
  unsigned Size = 0;
  // Scan over whitespace until we reach the newline (whitespace between the
  // backslash and the newline is allowed, but warned about elsewhere).
  while (isWhitespace(Ptr[Size])) {
    ++Size;

    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
      continue;

    // If this is a \r\n or \n\r, skip the other half.
    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
        Ptr[Size-1] != Ptr[Size])
      ++Size;

    return Size;
  }

  // Not an escaped newline, must be a \t or something else.
  return 0;
}

/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
/// them), skip over them and return the first non-escaped-newline found,
/// otherwise return P.
const char *Lexer::SkipEscapedNewLines(const char *P) {
  while (true) {
    const char *AfterEscape;
    if (*P == '\\') {
      AfterEscape = P+1;
    } else if (*P == '?') {
      // If not a trigraph for escape, bail out.
      if (P[1] != '?' || P[2] != '/')
        return P;
      // FIXME: Take LangOpts into account; the language might not
      // support trigraphs.
      AfterEscape = P+3;
    } else {
      return P;
    }

    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
    if (NewLineSize == 0) return P;
    P = AfterEscape+NewLineSize;
  }
}

/// Returns the next token after the one at Loc, re-lexing from the raw
/// buffer, or std::nullopt if the location cannot be mapped to a file
/// position.
std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts,
                                          bool IncludeComments) {
  if (Loc.isMacroID()) {
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return std::nullopt;
  }
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);

  // Break down the source location.
  FileIDAndOffset LocInfo = SM.getDecomposedLoc(Loc);

  // Try to load the file buffer.
  bool InvalidTemp = false;
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return std::nullopt;

  const char *TokenBegin = File.data() + LocInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
              TokenBegin, File.end());
  lexer.SetCommentRetentionState(IncludeComments);
  // Find the token.
  Token Tok;
  lexer.LexFromRawLexer(Tok);
  return Tok;
}

/// Returns the token that precedes Loc in its file, skipping comments unless
/// IncludeComments is set, or std::nullopt if there is none.
std::optional<Token> Lexer::findPreviousToken(SourceLocation Loc,
                                              const SourceManager &SM,
                                              const LangOptions &LangOpts,
                                              bool IncludeComments) {
  const auto StartOfFile = SM.getLocForStartOfFile(SM.getFileID(Loc));
  // Step back one character at a time; each candidate position is snapped to
  // the beginning of the token that contains it before re-lexing.
  while (Loc != StartOfFile) {
    Loc = Loc.getLocWithOffset(-1);
    if (Loc.isInvalid())
      return std::nullopt;

    Loc = GetBeginningOfToken(Loc, SM, LangOpts);
    Token Tok;
    if (getRawToken(Loc, Tok, SM, LangOpts))
      continue; // Not a token, go to prev location.
    if (!Tok.is(tok::comment) || IncludeComments) {
      return Tok;
    }
  }
  return std::nullopt;
}

/// Checks that the given token is the first token that occurs after the
/// given location (this excludes comments and whitespace). Returns the location
/// immediately after the specified token. If the token is not found or the
/// location is inside a macro, the returned source location will be invalid.
SourceLocation Lexer::findLocationAfterToken(
    SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
    const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
  std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
  if (!Tok || Tok->isNot(TKind))
    return {};
  SourceLocation TokenLoc = Tok->getLocation();

  // Calculate how much whitespace needs to be skipped if any.
  unsigned NumWhitespaceChars = 0;
  if (SkipTrailingWhitespaceAndNewLine) {
    const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
    unsigned char C = *TokenEnd;
    while (isHorizontalWhitespace(C)) {
      C = *(++TokenEnd);
      NumWhitespaceChars++;
    }

    // Skip \r, \n, \r\n, or \n\r
    if (C == '\n' || C == '\r') {
      char PrevC = C;
      C = *(++TokenEnd);
      NumWhitespaceChars++;
      if ((C == '\n' || C == '\r') && C != PrevC)
        NumWhitespaceChars++;
    }
  }

  return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
}

/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it. This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2.
///      If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///
/// This handles the slow/uncommon case of the getCharAndSize method. Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>. Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning. If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
                                    LangOpts.Trigraphs)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // A trigraph-spelled backslash may itself begin an escaped newline.
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}

/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
                                                 const LangOptions &LangOpts) {

  unsigned Size = 0;
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0]))
      return {'\\', Size};

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>. Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
      CharAndSize.Size += Size;
      return CharAndSize;
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return {'\\', Size};
  }

  // If this is a trigraph, process it.
  if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // A trigraph-spelled backslash may itself begin an escaped newline.
      if (C == '\\') goto Slash;
      return {C, Size};
    }
  }

  // If this is neither, return a single character.
  return {*Ptr, Size + 1u};
}

//===----------------------------------------------------------------------===//
// Helper methods for lexing.
//===----------------------------------------------------------------------===//

/// Routine that indiscriminately sets the offset into the source file.
void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
  BufferPtr = BufferStart + Offset;
  // Clamp to the end of the buffer so we never point past it.
  if (BufferPtr > BufferEnd)
    BufferPtr = BufferEnd;
  // FIXME: What exactly does the StartOfLine bit mean? There are two
  // possible meanings for the "start" of the line: the first token on the
  // unexpanded line, or the first token on the expanded line.
  IsAtStartOfLine = StartOfLine;
  IsAtPhysicalStartOfLine = StartOfLine;
}

static bool isUnicodeWhitespace(uint32_t Codepoint) {
  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
      UnicodeWhitespaceCharRanges);
  return UnicodeWhitespaceChars.contains(Codepoint);
}

/// Render a codepoint as an uppercase hex string (at least 4 digits), for
/// use in diagnostics.
static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
  llvm::SmallString<5> CharBuf;
  llvm::raw_svector_ostream CharOS(CharBuf);
  llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
  return CharBuf;
}

// To mitigate https://github.com/llvm/llvm-project/issues/54732,
// we allow "Mathematical Notation Characters" in identifiers.
// This is a proposed profile that extends the XID_Start/XID_continue
// with mathematical symbols, superscipts and subscripts digits
// found in some production software.
// https://www.unicode.org/L2/L2022/22230-math-profile.pdf
static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
                                      bool IsStart, bool &IsExtension) {
  static const llvm::sys::UnicodeCharSet MathStartChars(
      MathematicalNotationProfileIDStartRanges);
  static const llvm::sys::UnicodeCharSet MathContinueChars(
      MathematicalNotationProfileIDContinueRanges);
  // Continue-only characters are valid in non-leading positions only.
  if (MathStartChars.contains(C) ||
      (!IsStart && MathContinueChars.contains(C))) {
    IsExtension = true;
    return true;
  }
  return false;
}

/// Returns true if C may appear in a non-leading position of an identifier
/// under the active language mode. Sets IsExtension when the character is
/// only valid via the mathematical-notation extension profile.
static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
                            bool &IsExtension) {
  if (LangOpts.AsmPreprocessor) {
    return false;
  } else if (LangOpts.DollarIdents && '$' == C) {
    return true;
  } else if (LangOpts.CPlusPlus || LangOpts.C23) {
    // A non-leading codepoint must have the XID_Continue property.
    // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
    // so we need to check both tables.
    // '_' doesn't have the XID_Continue property but is allowed in C and C++.
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
    static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
    if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
      return true;
    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
                                     IsExtension);
  } else if (LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
        C11AllowedIDCharRanges);
    return C11AllowedIDChars.contains(C);
  } else {
    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    return C99AllowedIDChars.contains(C);
  }
}

/// Returns true if C may start an identifier under the active language mode.
/// Sets IsExtension when the character is only valid via the
/// mathematical-notation extension profile.
static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
                                     bool &IsExtension) {
  assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
  IsExtension = false;
  if (LangOpts.AsmPreprocessor) {
    return false;
  }
  if (LangOpts.CPlusPlus || LangOpts.C23) {
    static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
    if (XIDStartChars.contains(C))
      return true;
    return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
                                     IsExtension);
  }
  // Pre-C23 C: the character must be an allowed ID char that is additionally
  // not in the disallowed-initial set for the active standard.
  if (!isAllowedIDChar(C, LangOpts, IsExtension))
    return false;
  if (LangOpts.C11) {
    static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
        C11DisallowedInitialIDCharRanges);
    return !C11DisallowedInitialIDChars.contains(C);
  }
  static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
      C99DisallowedInitialIDCharRanges);
  return !C99DisallowedInitialIDChars.contains(C);
}

/// Emit the extension warning for a mathematical-notation-profile codepoint
/// used in an identifier.
static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
                                          CharSourceRange Range) {

  static const llvm::sys::UnicodeCharSet MathStartChars(
      MathematicalNotationProfileIDStartRanges);
  static const llvm::sys::UnicodeCharSet MathContinueChars(
      MathematicalNotationProfileIDContinueRanges);

  // The sets are only consulted by the assertion below.
  (void)MathStartChars;
  (void)MathContinueChars;
  assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
         "Unexpected mathematical notation codepoint");
  Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
      << codepointAsHexString(C) << Range;
}

static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
                                            const char *End) {
  return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
                                       L.getSourceLocation(End));
}

/// Warn when an identifier codepoint is valid in the current mode but was
/// not valid in C99, for -Wc99-compat.
static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
                                      CharSourceRange Range, bool IsFirst) {
  // Check C99 compatibility.
  if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
    enum {
      CannotAppearInIdentifier = 0,
      CannotStartIdentifier
    };

    static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
        C99AllowedIDCharRanges);
    static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
        C99DisallowedInitialIDCharRanges);
    if (!C99AllowedIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
          << Range
          << CannotAppearInIdentifier;
    } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
      Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
          << Range
          << CannotStartIdentifier;
    }
  }
}

/// After encountering UTF-8 character C and interpreting it as an identifier
/// character, check whether it's a homoglyph for a common non-identifier
/// source character that is unlikely to be an intentional identifier
/// character and warn if so.
static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
                                       CharSourceRange Range) {
  // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
  struct HomoglyphPair {
    uint32_t Character;
    char LooksLike;
    bool operator<(HomoglyphPair R) const { return Character < R.Character; }
  };
  // Sorted by codepoint; LooksLike == 0 marks invisible/zero-width characters.
  static constexpr HomoglyphPair SortedHomoglyphs[] = {
    {U'\u00ad', 0},   // SOFT HYPHEN
    {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
    {U'\u037e', ';'}, // GREEK QUESTION MARK
    {U'\u200b', 0},   // ZERO WIDTH SPACE
    {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
    {U'\u200d', 0},   // ZERO WIDTH JOINER
    {U'\u2060', 0},   // WORD JOINER
    {U'\u2061', 0},   // FUNCTION APPLICATION
    {U'\u2062', 0},   // INVISIBLE TIMES
    {U'\u2063', 0},   // INVISIBLE SEPARATOR
    {U'\u2064', 0},   // INVISIBLE PLUS
    {U'\u2212', '-'}, // MINUS SIGN
    {U'\u2215', '/'}, // DIVISION SLASH
    {U'\u2216', '\\'}, // SET MINUS
    {U'\u2217', '*'}, // ASTERISK OPERATOR
    {U'\u2223', '|'}, // DIVIDES
    {U'\u2227', '^'}, // LOGICAL AND
    {U'\u2236', ':'}, // RATIO
    {U'\u223c', '~'}, // TILDE OPERATOR
    {U'\ua789', ':'}, // MODIFIER LETTER COLON
    {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
    {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
    {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
    {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
    {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
    {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
    {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
    {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
    {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
    {U'\uff0b', '+'}, // FULLWIDTH PLUS SIGN
    {U'\uff0c', ','}, // FULLWIDTH COMMA
    {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
    {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
    {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
    {U'\uff1a', ':'}, // FULLWIDTH COLON
    {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
    {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
    {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
    {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
    {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
    {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
    {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
    {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
    {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
    {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
    {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
    {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
    {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
    {U'\uff5e', '~'}, // FULLWIDTH TILDE
    {0, 0}
  };
  // Binary search excludes the {0, 0} sentinel at the end of the table.
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    if (Homoglyph->LooksLike) {
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << codepointAsHexString(C) << LooksLikeStr;
    } else {
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << codepointAsHexString(C);
    }
  }
}

/// Emit the appropriate error for a codepoint that is not valid in an
/// identifier (or not valid in the position it appears in).
static void diagnoseInvalidUnicodeCodepointInIdentifier(
    DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
    CharSourceRange Range, bool IsFirst) {
  if (isASCII(CodePoint))
    return;

  bool IsExtension;
  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
  bool IsIDContinue =
      IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);

  // Valid in this position: nothing to report.
  if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
    return;

  bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;

  if (!IsFirst || InvalidOnlyAtStart) {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
        << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
        << FixItHint::CreateRemoval(Range);
  } else {
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
        << Range << codepointAsHexString(CodePoint)
        << FixItHint::CreateRemoval(Range);
  }
}

/// Attempt to consume a universal character name (\uXXXX or \UXXXXXXXX) as
/// an identifier-continue character, advancing CurPtr past it on success.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0) {
    return false;
  }
  bool IsExtension = false;
  if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CurPtr, UCNPtr),
          /*IsFirst=*/false);

    // We got a unicode codepoint that is neither a space nor a
    // a valid identifier part.
    // Carry on as if the codepoint was valid for recovery purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
                                    makeCharRange(*this, CurPtr, UCNPtr));

    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);
  }

  Result.setFlag(Token::HasUCN);
  // Fast path: a tight \uXXXX or \UXXXXXXXX spelling can be skipped
  // directly; otherwise re-lex char by char so trigraphs/escaped newlines
  // inside the UCN are accounted for.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}

bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
  llvm::UTF32 CodePoint;

  // If a UTF-8 codepoint appears immediately after an escaped new line,
  // CurPtr may point to the splicing \ on the preceding line,
  // so we need to skip it.
1832 unsigned FirstCodeUnitSize; 1833 getCharAndSize(CurPtr, FirstCodeUnitSize); 1834 const char *CharStart = CurPtr + FirstCodeUnitSize - 1; 1835 const char *UnicodePtr = CharStart; 1836 1837 llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence( 1838 (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd, 1839 &CodePoint, llvm::strictConversion); 1840 if (ConvResult != llvm::conversionOK) 1841 return false; 1842 1843 bool IsExtension = false; 1844 if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts, 1845 IsExtension)) { 1846 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) 1847 return false; 1848 1849 if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1850 !PP->isPreprocessedOutput()) 1851 diagnoseInvalidUnicodeCodepointInIdentifier( 1852 PP->getDiagnostics(), LangOpts, CodePoint, 1853 makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false); 1854 // We got a unicode codepoint that is neither a space nor a 1855 // a valid identifier part. Carry on as if the codepoint was 1856 // valid for recovery purposes. 1857 } else if (!isLexingRawMode()) { 1858 if (IsExtension) 1859 diagnoseExtensionInIdentifier( 1860 PP->getDiagnostics(), CodePoint, 1861 makeCharRange(*this, CharStart, UnicodePtr)); 1862 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 1863 makeCharRange(*this, CharStart, UnicodePtr), 1864 /*IsFirst=*/false); 1865 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, 1866 makeCharRange(*this, CharStart, UnicodePtr)); 1867 } 1868 1869 // Once we sucessfully parsed some UTF-8, 1870 // calling ConsumeChar ensures the NeedsCleaning flag is set on the token 1871 // being lexed, and that warnings about trailing spaces are emitted. 
1872 ConsumeChar(CurPtr, FirstCodeUnitSize, Result); 1873 CurPtr = UnicodePtr; 1874 return true; 1875 } 1876 1877 bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C, 1878 const char *CurPtr) { 1879 bool IsExtension = false; 1880 if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) { 1881 if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1882 !PP->isPreprocessedOutput()) { 1883 if (IsExtension) 1884 diagnoseExtensionInIdentifier(PP->getDiagnostics(), C, 1885 makeCharRange(*this, BufferPtr, CurPtr)); 1886 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, 1887 makeCharRange(*this, BufferPtr, CurPtr), 1888 /*IsFirst=*/true); 1889 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, 1890 makeCharRange(*this, BufferPtr, CurPtr)); 1891 } 1892 1893 MIOpt.ReadToken(); 1894 return LexIdentifierContinue(Result, CurPtr); 1895 } 1896 1897 if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1898 !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && 1899 !isUnicodeWhitespace(C)) { 1900 // Non-ASCII characters tend to creep into source code unintentionally. 1901 // Instead of letting the parser complain about the unknown token, 1902 // just drop the character. 1903 // Note that we can /only/ do this when the non-ASCII character is actually 1904 // spelled as Unicode, not written as a UCN. The standard requires that 1905 // we not throw away any possible preprocessor tokens, but there's a 1906 // loophole in the mapping of Unicode characters to basic character set 1907 // characters that allows us to map these particular characters to, say, 1908 // whitespace. 1909 diagnoseInvalidUnicodeCodepointInIdentifier( 1910 PP->getDiagnostics(), LangOpts, C, 1911 makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true); 1912 BufferPtr = CurPtr; 1913 return false; 1914 } 1915 1916 // Otherwise, we have an explicit UCN or a character that's unlikely to show 1917 // up by accident. 
1918 MIOpt.ReadToken(); 1919 FormTokenWithChars(Result, CurPtr, tok::unknown); 1920 return true; 1921 } 1922 1923 static const char * 1924 fastParseASCIIIdentifier(const char *CurPtr, 1925 [[maybe_unused]] const char *BufferEnd) { 1926 #ifdef __SSE4_2__ 1927 alignas(16) static constexpr char AsciiIdentifierRange[16] = { 1928 '_', '_', 'A', 'Z', 'a', 'z', '0', '9', 1929 }; 1930 constexpr ssize_t BytesPerRegister = 16; 1931 1932 __m128i AsciiIdentifierRangeV = 1933 _mm_load_si128((const __m128i *)AsciiIdentifierRange); 1934 1935 while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) { 1936 __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr)); 1937 1938 int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv, 1939 _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | 1940 _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY); 1941 CurPtr += Consumed; 1942 if (Consumed == BytesPerRegister) 1943 continue; 1944 return CurPtr; 1945 } 1946 #endif 1947 1948 unsigned char C = *CurPtr; 1949 while (isAsciiIdentifierContinue(C)) 1950 C = *++CurPtr; 1951 return CurPtr; 1952 } 1953 1954 bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) { 1955 // Match [_A-Za-z0-9]*, we have already matched an identifier start. 1956 1957 while (true) { 1958 1959 CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd); 1960 1961 unsigned Size; 1962 // Slow path: handle trigraph, unicode codepoints, UCNs. 1963 unsigned char C = getCharAndSize(CurPtr, Size); 1964 if (isAsciiIdentifierContinue(C)) { 1965 CurPtr = ConsumeChar(CurPtr, Size, Result); 1966 continue; 1967 } 1968 if (C == '$') { 1969 // If we hit a $ and they are not supported in identifiers, we are done. 1970 if (!LangOpts.DollarIdents) 1971 break; 1972 // Otherwise, emit a diagnostic and continue. 
1973 if (!isLexingRawMode()) 1974 Diag(CurPtr, diag::ext_dollar_in_identifier); 1975 CurPtr = ConsumeChar(CurPtr, Size, Result); 1976 continue; 1977 } 1978 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 1979 continue; 1980 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) 1981 continue; 1982 // Neither an expected Unicode codepoint nor a UCN. 1983 break; 1984 } 1985 1986 const char *IdStart = BufferPtr; 1987 FormTokenWithChars(Result, CurPtr, tok::raw_identifier); 1988 Result.setRawIdentifierData(IdStart); 1989 1990 // If we are in raw mode, return this identifier raw. There is no need to 1991 // look up identifier information or attempt to macro expand it. 1992 if (LexingRawMode) 1993 return true; 1994 1995 // Fill in Result.IdentifierInfo and update the token kind, 1996 // looking up the identifier in the identifier table. 1997 const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 1998 // Note that we have to call PP->LookUpIdentifierInfo() even for code 1999 // completion, it writes IdentifierInfo into Result, and callers rely on it. 2000 2001 // If the completion point is at the end of an identifier, we want to treat 2002 // the identifier as incomplete even if it resolves to a macro or a keyword. 2003 // This allows e.g. 'class^' to complete to 'classifier'. 2004 if (isCodeCompletionPoint(CurPtr)) { 2005 // Return the code-completion token. 2006 Result.setKind(tok::code_completion); 2007 // Skip the code-completion char and all immediate identifier characters. 2008 // This ensures we get consistent behavior when completing at any point in 2009 // an identifier (i.e. at the start, in the middle, at the end). Note that 2010 // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code 2011 // simpler. 
2012 assert(*CurPtr == 0 && "Completion character must be 0"); 2013 ++CurPtr; 2014 // Note that code completion token is not added as a separate character 2015 // when the completion point is at the end of the buffer. Therefore, we need 2016 // to check if the buffer has ended. 2017 if (CurPtr < BufferEnd) { 2018 while (isAsciiIdentifierContinue(*CurPtr)) 2019 ++CurPtr; 2020 } 2021 BufferPtr = CurPtr; 2022 return true; 2023 } 2024 2025 // Finally, now that we know we have an identifier, pass this off to the 2026 // preprocessor, which may macro expand it or something. 2027 if (II->isHandleIdentifierCase()) 2028 return PP->HandleIdentifier(Result); 2029 2030 return true; 2031 } 2032 2033 /// isHexaLiteral - Return true if Start points to a hex constant. 2034 /// in microsoft mode (where this is supposed to be several different tokens). 2035 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) { 2036 auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts); 2037 char C1 = CharAndSize1.Char; 2038 if (C1 != '0') 2039 return false; 2040 2041 auto CharAndSize2 = 2042 Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts); 2043 char C2 = CharAndSize2.Char; 2044 return (C2 == 'x' || C2 == 'X'); 2045 } 2046 2047 /// LexNumericConstant - Lex the remainder of a integer or floating point 2048 /// constant. From[-1] is the first character lexed. Return the end of the 2049 /// constant. 2050 bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 2051 unsigned Size; 2052 char C = getCharAndSize(CurPtr, Size); 2053 char PrevCh = 0; 2054 while (isPreprocessingNumberBody(C)) { 2055 CurPtr = ConsumeChar(CurPtr, Size, Result); 2056 PrevCh = C; 2057 if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) { 2058 CurPtr -= Size; 2059 break; 2060 } 2061 C = getCharAndSize(CurPtr, Size); 2062 } 2063 2064 // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 
2065 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { 2066 // If we are in Microsoft mode, don't continue if the constant is hex. 2067 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1 2068 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts)) 2069 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 2070 } 2071 2072 // If we have a hex FP constant, continue. 2073 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) { 2074 // Outside C99 and C++17, we accept hexadecimal floating point numbers as a 2075 // not-quite-conforming extension. Only do so if this looks like it's 2076 // actually meant to be a hexfloat, and not if it has a ud-suffix. 2077 bool IsHexFloat = true; 2078 if (!LangOpts.C99) { 2079 if (!isHexaLiteral(BufferPtr, LangOpts)) 2080 IsHexFloat = false; 2081 else if (!LangOpts.CPlusPlus17 && 2082 std::find(BufferPtr, CurPtr, '_') != CurPtr) 2083 IsHexFloat = false; 2084 } 2085 if (IsHexFloat) 2086 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 2087 } 2088 2089 // If we have a digit separator, continue. 2090 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) { 2091 auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts); 2092 if (isAsciiIdentifierContinue(Next)) { 2093 if (!isLexingRawMode()) 2094 Diag(CurPtr, LangOpts.CPlusPlus 2095 ? diag::warn_cxx11_compat_digit_separator 2096 : diag::warn_c23_compat_digit_separator); 2097 CurPtr = ConsumeChar(CurPtr, Size, Result); 2098 CurPtr = ConsumeChar(CurPtr, NextSize, Result); 2099 return LexNumericConstant(Result, CurPtr); 2100 } 2101 } 2102 2103 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. 
2104 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 2105 return LexNumericConstant(Result, CurPtr); 2106 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) 2107 return LexNumericConstant(Result, CurPtr); 2108 2109 // Update the location of token as well as BufferPtr. 2110 const char *TokStart = BufferPtr; 2111 FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 2112 Result.setLiteralData(TokStart); 2113 return true; 2114 } 2115 2116 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 2117 /// in C++11, or warn on a ud-suffix in C++98. 2118 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, 2119 bool IsStringLiteral) { 2120 assert(LangOpts.CPlusPlus); 2121 2122 // Maximally munch an identifier. 2123 unsigned Size; 2124 char C = getCharAndSize(CurPtr, Size); 2125 bool Consumed = false; 2126 2127 if (!isAsciiIdentifierStart(C)) { 2128 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 2129 Consumed = true; 2130 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) 2131 Consumed = true; 2132 else 2133 return CurPtr; 2134 } 2135 2136 if (!LangOpts.CPlusPlus11) { 2137 if (!isLexingRawMode()) 2138 Diag(CurPtr, 2139 C == '_' ? diag::warn_cxx11_compat_user_defined_literal 2140 : diag::warn_cxx11_compat_reserved_user_defined_literal) 2141 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 2142 return CurPtr; 2143 } 2144 2145 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix 2146 // that does not start with an underscore is ill-formed. As a conforming 2147 // extension, we treat all such suffixes as if they had whitespace before 2148 // them. We assume a suffix beginning with a UCN or UTF-8 character is more 2149 // likely to be a ud-suffix than a macro, however, and accept that. 
2150 if (!Consumed) { 2151 bool IsUDSuffix = false; 2152 if (C == '_') 2153 IsUDSuffix = true; 2154 else if (IsStringLiteral && LangOpts.CPlusPlus14) { 2155 // In C++1y, we need to look ahead a few characters to see if this is a 2156 // valid suffix for a string literal or a numeric literal (this could be 2157 // the 'operator""if' defining a numeric literal operator). 2158 const unsigned MaxStandardSuffixLength = 3; 2159 char Buffer[MaxStandardSuffixLength] = { C }; 2160 unsigned Consumed = Size; 2161 unsigned Chars = 1; 2162 while (true) { 2163 auto [Next, NextSize] = 2164 getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts); 2165 if (!isAsciiIdentifierContinue(Next)) { 2166 // End of suffix. Check whether this is on the allowed list. 2167 const StringRef CompleteSuffix(Buffer, Chars); 2168 IsUDSuffix = 2169 StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix); 2170 break; 2171 } 2172 2173 if (Chars == MaxStandardSuffixLength) 2174 // Too long: can't be a standard suffix. 2175 break; 2176 2177 Buffer[Chars++] = Next; 2178 Consumed += NextSize; 2179 } 2180 } 2181 2182 if (!IsUDSuffix) { 2183 if (!isLexingRawMode()) 2184 Diag(CurPtr, LangOpts.MSVCCompat 2185 ? diag::ext_ms_reserved_user_defined_literal 2186 : diag::ext_reserved_user_defined_literal) 2187 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 2188 return CurPtr; 2189 } 2190 2191 CurPtr = ConsumeChar(CurPtr, Size, Result); 2192 } 2193 2194 Result.setFlag(Token::HasUDSuffix); 2195 while (true) { 2196 C = getCharAndSize(CurPtr, Size); 2197 if (isAsciiIdentifierContinue(C)) { 2198 CurPtr = ConsumeChar(CurPtr, Size, Result); 2199 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { 2200 } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) { 2201 } else 2202 break; 2203 } 2204 2205 return CurPtr; 2206 } 2207 2208 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 2209 /// either " or L" or u8" or u" or U". 
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  // u8/u/U string literals get a compatibility warning in earlier standards.
  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
                                       : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  // Between the initial and final double quote characters of the raw string,
  // any transformations performed in phases 1 and 2 (trigraphs,
  // universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  // Scan the d-char-sequence delimiter (at most 16 characters).
  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) {
    if (!isLexingRawMode() &&
        llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
      const char *Pos = &CurPtr[PrefixLen];
      Diag(Pos, LangOpts.CPlusPlus26
                    ? diag::warn_cxx26_compat_raw_string_literal_character_set
                    : diag::ext_cxx26_raw_string_literal_character_set)
          << StringRef(Pos, 1);
    }
    ++PrefixLen;
  }

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else if (*PrefixEnd == '\n') {
        Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
            << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  // Scan for the terminating ')' + delimiter + '"'.
  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
            << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character. This is used for #include filenames.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters. Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (isVerticalWhitespace(C) ||               // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character. Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}

/// Drive code completion inside a #include filename: sets the completion
/// filter to the partial filename (after the last slash) and the replacement
/// range up to the closing quote/angle bracket or next slash.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
  auto Slash = PartialPath.find_last_of(SlashChars);
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote or closest slash,
  // if any.
  while (CompletionPoint < BufferEnd) {
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
    if (SlashChars.contains(Next))
      break;
  }

  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}

/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  // Unicode character constants get a compatibility warning in earlier
  // standards.
  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx14_compat_u8_character_literal
                          : diag::warn_c17_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Track the last newline seen (and the first, via NewLinePtr) so a run of
  // blank lines can be reported to the EmptylineHandler below.
  const char *lastNewLine = nullptr;
  auto setLastNewLine = [&](const char *Ptr) {
    lastNewLine = Ptr;
    if (!NewLinePtr)
      NewLinePtr = Ptr;
  };
  if (SawNewline)
    setLastNewLine(CurPtr - 1);

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    if (*CurPtr == '\n')
      setLastNewLine(CurPtr);
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;

    // More than one newline was skipped: notify the empty-line handler.
    if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
      if (auto *Handler = PP->getEmptylineHandler())
        Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
                                             getSourceLocation(lastNewLine)));
    }
  }

  BufferPtr = CurPtr;
  return false;
}

/// We have just read the // characters from input. Skip until we find the
/// newline character that terminates the comment. Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LineComment) {
    if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
      Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LineComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop: plain ASCII that is neither a
    // NUL (potential EOF / code-completion point) nor a newline.
    while (isASCII(C) && C != 0 &&   // Potentially EOF.
           C != '\n' && C != '\r') { // Newline or DOS-style newline.
      C = *++CurPtr;
      UnicodeDecodingAlreadyDiagnosed = false;
    }

    if (!isASCII(C)) {
      // Non-ASCII byte: validate the UTF-8 sequence starting here.  A length
      // of 0 means the sequence is ill-formed; diagnose it once per run of
      // undecodable bytes and resynchronize one byte at a time.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
        ++CurPtr;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length;
      }
      continue;
    }

    // Remember where this newline is; if the escaped-newline probe below reads
    // exactly one character, we rewind to here and stop at the newline.
    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline: '??/' is the trigraph for '\'.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode (save/restore the
    // flag around the call) to avoid emitting diagnostics about things like
    // trigraphs.  If we see an escaped newline, we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    // (CurPtr == BufferEnd + 1 means we advanced past the NUL that terminates
    // the buffer, i.e. we hit end of file.)
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  NewLinePtr = CurPtr++;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// If in save-comment mode, package up this Line comment in an appropriate
/// way and return it.  Always returns true: the caller must emit the token.
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment so it survives re-lexing of the macro body.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  // NOTE(review): FormTokenWithChars above already set tok::comment; this
  // restatement looks redundant — confirm before removing.
  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
/// a diagnostic if so.
/// We know that the newline is inside of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
                                                  bool Trigraphs) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Position of the first trigraph in the ending sequence.
  const char *TrigraphPos = nullptr;
  // Position of the first whitespace after a '\' in the ending sequence.
  const char *SpacePos = nullptr;

  // Walk backwards from the newline, accepting any number of chained
  // escaped-newline sequences, until we find the '*' that (after splicing)
  // forms '*/'.
  while (true) {
    // Back up off the newline.
    --CurPtr;

    // If this is a two-character newline sequence, skip the other character.
    if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
      // \n\n or \r\r -> not escaped newline.
      if (CurPtr[0] == CurPtr[1])
        return false;
      // \n\r or \r\n -> skip the newline.
      --CurPtr;
    }

    // If we have horizontal whitespace, skip over it.  We allow whitespace
    // between the slash and newline.  (NUL bytes are skipped here as if they
    // were whitespace.)
    while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
      SpacePos = CurPtr;
      --CurPtr;
    }

    // If we have a slash, this is an escaped newline.
    if (*CurPtr == '\\') {
      --CurPtr;
    } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
      // This is a trigraph encoding of a slash: '??/'.
      TrigraphPos = CurPtr - 2;
      CurPtr -= 3;
    } else {
      return false;
    }

    // If the character preceding the escaped newline is a '*', then after line
    // splicing we have a '*/' ending the comment.
    if (*CurPtr == '*')
      break;

    // Anything other than another newline (i.e. another escape chain) means
    // this was not a comment terminator.
    if (*CurPtr != '\n' && *CurPtr != '\r')
      return false;
  }

  if (TrigraphPos) {
    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (SpacePos && !L->isLexingRawMode())
    L->Diag(SpacePos, diag::backslash_newline_space);

  return true;
}

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end.  The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
                             bool &TokAtPhysicalStartOfLine) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  // C == 0 with CurPtr == BufferEnd+1 means we consumed the buffer's
  // terminating NUL: the comment is unterminated at EOF.
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  while (true) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
        if (!isASCII(C))
          goto MultiByteUTF8;
        C = *CurPtr++;
      }
      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr + 16 < BufferEnd) {
        // _mm_movemask_epi8 gathers the top bit of each byte; a nonzero mask
        // means some byte in this 16-byte chunk is non-ASCII.
        int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
        if (LLVM_UNLIKELY(Mask != 0)) {
          goto MultiByteUTF8;
        }
        // look for slashes
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                    Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80};
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr + 16 < BufferEnd) {
        if (LLVM_UNLIKELY(
                vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
          goto MultiByteUTF8;
        if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
          break;
        }
        CurPtr += 16;
      }

#else
      // Portable fallback: scan 16 bytes at a time for non-ASCII or '/'.
      while (CurPtr + 16 < BufferEnd) {
        bool HasNonASCII = false;
        for (unsigned I = 0; I < 16; ++I)
          HasNonASCII |= !isASCII(CurPtr[I]);

        if (LLVM_UNLIKELY(HasNonASCII))
          goto MultiByteUTF8;

        bool HasSlash = false;
        for (unsigned I = 0; I < 16; ++I)
          HasSlash |= CurPtr[I] == '/';
        if (HasSlash)
          break;
        CurPtr += 16;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder, warning on invalid UTF-8
    // if the corresponding warning is enabled, emitting a diagnostic only once
    // per sequence that cannot be decoded.
    while (C != '/' && C != '\0') {
      if (isASCII(C)) {
        UnicodeDecodingAlreadyDiagnosed = false;
        C = *CurPtr++;
        continue;
      }
    MultiByteUTF8:
      // CurPtr is 1 code unit past C, so to decode
      // the codepoint, we need to read from the previous position.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length - 1;
      }
      C = *CurPtr++;
    }

    if (C == '/') {
  FoundSlash:
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
                                                  LangOpts.Trigraphs)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string.  This switches the lexer out of directive mode.
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  Token Tmp;
  Tmp.startToken();

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (true) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      if (Result)
        Result->push_back(Char);
      break;
    case 0:  // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        if (isCodeCompletionPoint(CurPtr-1)) {
          PP->CodeCompleteNaturalLanguage();
          cutOffLexing();
          return;
        }

        // Nope, normal character, continue.
        if (Result)
          Result->push_back(Char);
        break;
      }
      // FALL THROUGH.
      [[fallthrough]];
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOD transition.
      Lex(Tmp);
      if (Tmp.is(tok::code_completion)) {
        if (PP)
          PP->CodeCompleteNaturalLanguage();
        Lex(Tmp);
      }
      assert(Tmp.is(tok::eod) && "Unexpected token!");

      // Finally, we're done;
      return;
    }
  }
}

/// LexEndOfFile - CurPtr points to the end of this file.
/// Handle this
/// condition, reporting diagnostics and handling other edge cases as required.
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first.  The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line".
    ParsingPreprocessorDirective = false;
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eod);

    // Restore comment saving mode, in case it was disabled for directive.
    if (PP)
      resetExtendedTokenMode();
    return true;  // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token.  Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
    PP->setRecordedPreambleConditionalStack(ConditionalStack);
    // If the preamble cuts off the end of a header guard, consider it guarded.
    // The guard is valid for the preamble content itself, and for tools the
    // most useful answer is "yes, this file has a header guard".
    if (!ConditionalStack.empty())
      MIOpt.ExitTopLevelConditional();
    ConditionalStack.clear();
  }

  // Issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error for each unterminated
  // conditional still on the stack (suppressed while code completion is
  // active in this file).
  while (!ConditionalStack.empty()) {
    if (PP->getCodeCompletionFileLoc() != FileLoc)
      PP->Diag(ConditionalStack.back().IfLoc,
               diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // Before C++11 and C2y, a file not ending with a newline was UB. Both
  // standards changed this behavior (as a DR or equivalent), but we still have
  // an opt-in diagnostic to warn about it.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
    Diag(BufferEnd, diag::warn_no_newline_eof)
        << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n");

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.
  return PP->HandleEndOfFile(Result, isPragmaLexer());
}

/// peekNextPPToken - Return std::nullopt if there are no more tokens in the
/// buffer controlled by this lexer, otherwise return the next unexpanded
/// token.
std::optional<Token> Lexer::peekNextPPToken() {
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");

  if (isDependencyDirectivesLexer()) {
    if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
      return std::nullopt;
    Token Result;
    (void)convertDependencyDirectiveToken(
        DepDirectives.front().Tokens[NextDepDirectiveTokenIndex], Result);
    return Result;
  }

  // Switch to 'skipping' mode.  This will ensure that we can lex a token
  // without emitting diagnostics, disables macro expansion, and will cause EOF
  // to return an EOF token instead of popping the include stack.
  LexingRawMode = true;

  // Save state that can be changed while lexing so that we can restore it.
  const char *TmpBufferPtr = BufferPtr;
  bool inPPDirectiveMode = ParsingPreprocessorDirective;
  bool atStartOfLine = IsAtStartOfLine;
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  bool leadingSpace = HasLeadingSpace;

  Token Tok;
  Lex(Tok);

  // Restore state that may have changed.
  BufferPtr = TmpBufferPtr;
  ParsingPreprocessorDirective = inPPDirectiveMode;
  HasLeadingSpace = leadingSpace;
  IsAtStartOfLine = atStartOfLine;
  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
  // Restore the lexer back to non-skipping mode.
  LexingRawMode = false;

  if (Tok.is(tok::eof))
    return std::nullopt;
  return Tok;
}

/// Find the end of a version control conflict marker.
static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
                                   ConflictMarkerKind CMK) {
  // Perforce markers end with "<<<<\n"; diff3/git markers end with ">>>>>>>".
  const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
  size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
  // Skip TermLen bytes so the start marker itself is never matched.
  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
  size_t Pos = RestOfBuffer.find(Terminator);
  while (Pos != StringRef::npos) {
    // Must occur at start of line.
    if (Pos == 0 ||
        (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
      RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
      Pos = RestOfBuffer.find(Terminator);
      continue;
    }
    return RestOfBuffer.data()+Pos;
  }
  return nullptr;
}

/// IsStartOfConflictMarker - If the specified pointer is the start of a version
/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
/// and recover nicely.  This returns true if it is a conflict marker and false
/// if not.
bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // Check to see if we have <<<<<<< or >>>>.
  if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
      !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (CurrentConflictMarkerState || isLexingRawMode())
    return false;

  ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;

  // Check to see if there is an ending marker somewhere in the buffer at the
  // start of a line to terminate this conflict marker.
  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
    // We found a match.  We are really in a conflict marker.
    // Diagnose this, and ignore to the end of line.
    Diag(CurPtr, diag::err_conflict_marker);
    CurrentConflictMarkerState = Kind;

    // Skip ahead to the end of line.  We know this exists because the
    // end-of-conflict marker starts with \r or \n.
    while (*CurPtr != '\r' && *CurPtr != '\n') {
      assert(CurPtr != BufferEnd && "Didn't find end of line");
      ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // No end of conflict marker found.
  return false;
}

/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
/// is the end of a conflict marker.  Handle it by ignoring up until the end of
/// the line.  This returns true if it is a conflict marker and false if not.
bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (!CurrentConflictMarkerState || isLexingRawMode())
    return false;

  // Check to see if we have the marker (4 characters in a row).
  for (unsigned i = 1; i != 4; ++i)
    if (CurPtr[i] != CurPtr[0])
      return false;

  // If we do have it, search for the end of the conflict marker.  This could
  // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
  // be the end of conflict marker.
  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
                                        CurrentConflictMarkerState)) {
    CurPtr = End;

    // Skip ahead to the end of line.
    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
      ++CurPtr;

    BufferPtr = CurPtr;

    // No longer in the conflict marker.
    CurrentConflictMarkerState = CMK_None;
    return true;
  }

  return false;
}

/// Scan forward for the "#>" that closes an editor placeholder.  Returns a
/// pointer one past the '>' on success, or nullptr if no terminator exists
/// before the end of the buffer.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  BufferEnd -= 1; // Scan until the second last character.
  for (; CurPtr != BufferEnd; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}

/// Lex an editor placeholder of the form '<#...#>' as a single raw-identifier
/// token.  Returns false if placeholders are not being lexed (or there is no
/// terminator), in which case the caller lexes the '<' normally.
bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
  if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
    return false;
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
  if (!End)
    return false;
  const char *Start = CurPtr - 1;
  if (!LangOpts.AllowEditorPlaceholders)
    Diag(Start, diag::err_placeholder_in_source);
  Result.startToken();
  FormTokenWithChars(Result, End, tok::raw_identifier);
  Result.setRawIdentifierData(Start);
  PP->LookUpIdentifierInfo(Result);
  Result.setFlag(Token::IsEditorPlaceholder);
  BufferPtr = End;
  return true;
}

/// Return true if CurPtr is exactly at the code-completion point registered
/// with the preprocessor.
bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
  if (PP && PP->isCodeCompletionEnabled()) {
    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
    return Loc == PP->getCodeCompletionLoc();
  }

  return false;
}

void Lexer::DiagnoseDelimitedOrNamedEscapeSequence(SourceLocation Loc,
                                                   bool Named,
                                                   const LangOptions &Opts,
                                                   DiagnosticsEngine &Diags) {
  // Pick the most specific diagnostic for the active language mode.
  unsigned DiagId;
  if (Opts.CPlusPlus23)
    DiagId = diag::warn_cxx23_delimited_escape_sequence;
  else if (Opts.C2y && !Named)
    DiagId = diag::warn_c2y_delimited_escape_sequence;
  else
    DiagId = diag::ext_delimited_escape_sequence;

  // The trailing arguments are only used by the extension warning; either this
  // is a C2y extension or a C++23 extension, unless it's a named escape
  // sequence in C, then it's a Clang extension.
  unsigned Ext;
  if (!Opts.CPlusPlus)
    Ext = Named ?
2 /* Clang extension */ : 1 /* C2y extension */; 3407 else 3408 Ext = 0; // C++23 extension 3409 3410 Diags.Report(Loc, DiagId) << Named << Ext; 3411 } 3412 3413 std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr, 3414 const char *SlashLoc, 3415 Token *Result) { 3416 unsigned CharSize; 3417 char Kind = getCharAndSize(StartPtr, CharSize); 3418 assert((Kind == 'u' || Kind == 'U') && "expected a UCN"); 3419 3420 unsigned NumHexDigits; 3421 if (Kind == 'u') 3422 NumHexDigits = 4; 3423 else if (Kind == 'U') 3424 NumHexDigits = 8; 3425 3426 bool Delimited = false; 3427 bool FoundEndDelimiter = false; 3428 unsigned Count = 0; 3429 bool Diagnose = Result && !isLexingRawMode(); 3430 3431 if (!LangOpts.CPlusPlus && !LangOpts.C99) { 3432 if (Diagnose) 3433 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); 3434 return std::nullopt; 3435 } 3436 3437 const char *CurPtr = StartPtr + CharSize; 3438 const char *KindLoc = &CurPtr[-1]; 3439 3440 uint32_t CodePoint = 0; 3441 while (Count != NumHexDigits || Delimited) { 3442 char C = getCharAndSize(CurPtr, CharSize); 3443 if (!Delimited && Count == 0 && C == '{') { 3444 Delimited = true; 3445 CurPtr += CharSize; 3446 continue; 3447 } 3448 3449 if (Delimited && C == '}') { 3450 CurPtr += CharSize; 3451 FoundEndDelimiter = true; 3452 break; 3453 } 3454 3455 unsigned Value = llvm::hexDigitValue(C); 3456 if (Value == std::numeric_limits<unsigned>::max()) { 3457 if (!Delimited) 3458 break; 3459 if (Diagnose) 3460 Diag(SlashLoc, diag::warn_delimited_ucn_incomplete) 3461 << StringRef(KindLoc, 1); 3462 return std::nullopt; 3463 } 3464 3465 if (CodePoint & 0xF000'0000) { 3466 if (Diagnose) 3467 Diag(KindLoc, diag::err_escape_too_large) << 0; 3468 return std::nullopt; 3469 } 3470 3471 CodePoint <<= 4; 3472 CodePoint |= Value; 3473 CurPtr += CharSize; 3474 Count++; 3475 } 3476 3477 if (Count == 0) { 3478 if (Diagnose) 3479 Diag(SlashLoc, FoundEndDelimiter ? 
diag::warn_delimited_ucn_empty 3480 : diag::warn_ucn_escape_no_digits) 3481 << StringRef(KindLoc, 1); 3482 return std::nullopt; 3483 } 3484 3485 if (Delimited && Kind == 'U') { 3486 if (Diagnose) 3487 Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1); 3488 return std::nullopt; 3489 } 3490 3491 if (!Delimited && Count != NumHexDigits) { 3492 if (Diagnose) { 3493 Diag(SlashLoc, diag::warn_ucn_escape_incomplete); 3494 // If the user wrote \U1234, suggest a fixit to \u. 3495 if (Count == 4 && NumHexDigits == 8) { 3496 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); 3497 Diag(KindLoc, diag::note_ucn_four_not_eight) 3498 << FixItHint::CreateReplacement(URange, "u"); 3499 } 3500 } 3501 return std::nullopt; 3502 } 3503 3504 if (Delimited && PP) 3505 DiagnoseDelimitedOrNamedEscapeSequence(getSourceLocation(SlashLoc), false, 3506 PP->getLangOpts(), 3507 PP->getDiagnostics()); 3508 3509 if (Result) { 3510 Result->setFlag(Token::HasUCN); 3511 // If the UCN contains either a trigraph or a line splicing, 3512 // we need to call getAndAdvanceChar again to set the appropriate flags 3513 // on Result. 3514 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 
                                                         2 : 0))
      StartPtr = CurPtr;
    else
      // Re-lex the escape so trigraph/line-splice flags get set on Result.
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return CodePoint;
}

/// Try to read a named universal character name of the form \N{NAME}.
/// \p StartPtr points just past the backslash and is advanced past the escape
/// on success; \p SlashLoc is the location of the backslash used for
/// diagnostics; \p Result, if non-null, receives the HasUCN flag. Returns the
/// code point named by NAME, or std::nullopt if the escape is malformed or
/// the name is unknown.
std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
                                               const char *SlashLoc,
                                               Token *Result) {
  unsigned CharSize;
  // Only diagnose when producing a real token and not lexing raw.
  bool Diagnose = Result && !isLexingRawMode();

  char C = getCharAndSize(StartPtr, CharSize);
  assert(C == 'N' && "expected \\N{...}");

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  C = getCharAndSize(CurPtr, CharSize);
  if (C != '{') {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
    return std::nullopt;
  }
  CurPtr += CharSize;
  const char *StartName = CurPtr;
  bool FoundEndDelimiter = false;
  llvm::SmallVector<char, 30> Buffer;
  // Collect the character name up to the closing '}'; stop early at a
  // newline (unterminated escape) or NUL.
  while (C) {
    C = getCharAndSize(CurPtr, CharSize);
    CurPtr += CharSize;
    if (C == '}') {
      FoundEndDelimiter = true;
      break;
    }

    if (isVerticalWhitespace(C))
      break;
    Buffer.push_back(C);
  }

  if (!FoundEndDelimiter || Buffer.empty()) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_delimited_ucn_incomplete)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  StringRef Name(Buffer.data(), Buffer.size());
  // First try an exact Unicode name match; fall back to loose matching only
  // to drive the fixit/diagnostic below.
  std::optional<char32_t> Match =
      llvm::sys::unicode::nameToCodepointStrict(Name);
  std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
  if (!Match) {
    LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
    if (Diagnose) {
      Diag(StartName, diag::err_invalid_ucn_name)
          << StringRef(Buffer.data(), Buffer.size())
          << makeCharRange(*this, StartName, CurPtr - CharSize);
      if (LooseMatch) {
        Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
            << FixItHint::CreateReplacement(
                   makeCharRange(*this, StartName, CurPtr - CharSize),
                   LooseMatch->Name);
      }
    }
    // We do not offer misspelled character names suggestions here
    // as the set of what would be a valid suggestion depends on context,
    // and we should not make invalid suggestions.
  }

  if (Diagnose && Match)
    DiagnoseDelimitedOrNamedEscapeSequence(getSourceLocation(SlashLoc), true,
                                           PP->getLangOpts(),
                                           PP->getDiagnostics());

  // If no diagnostic has been emitted yet, likely because we are doing a
  // tentative lexing, we do not want to recover here to make sure the token
  // will not be incorrectly considered valid. This function will be called
  // again and a diagnostic emitted then.
  if (LooseMatch && Diagnose)
    Match = LooseMatch->CodePoint;

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    // Fast path: the escape occupied exactly name-size + "N{}" bytes, so no
    // trigraphs/line splices were involved.
    if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
}

/// Try to read a universal character name (\uXXXX, \UXXXXXXXX, \u{...} or
/// \N{...}) starting just past the backslash at \p StartPtr, and validate the
/// resulting code point against the language's restrictions. Returns the code
/// point, or 0 on failure (0 is never a valid UCN result here).
uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
                           Token *Result) {

  unsigned CharSize;
  std::optional<uint32_t> CodePointOpt;
  char Kind = getCharAndSize(StartPtr, CharSize);
  if (Kind == 'u' || Kind == 'U')
    CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
  else if (Kind == 'N')
    CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);

  if (!CodePointOpt)
    return 0;

  uint32_t CodePoint = *CodePointOpt;

  // Don't apply C family restrictions to UCNs in assembly mode
  if (LangOpts.AsmPreprocessor)
    return CodePoint;

  // C23 6.4.3p2: A universal character name shall not designate a code point
  // where the hexadecimal value is:
  // - in the range D800 through DFFF inclusive; or
  // - greater than 10FFFF.
  // A universal-character-name outside the c-char-sequence of a character
  // constant, or the s-char-sequence of a string-literal shall not designate
  // a control character or a character in the basic character set.

  // C++11 [lex.charset]p2: If the hexadecimal value for a
  // universal-character-name corresponds to a surrogate code point (in the
  // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
  // if the hexadecimal value for a universal-character-name outside the
  // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
  // string literal corresponds to a control character (in either of the
  // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
  // basic source character set, the program is ill-formed.
  if (CodePoint < 0xA0) {
    // We don't use isLexingRawMode() here because we need to warn about bad
    // UCNs even when skipping preprocessing tokens in a #if block.
    if (Result && PP) {
      if (CodePoint < 0x20 || CodePoint >= 0x7F)
        Diag(BufferPtr, diag::err_ucn_control_character);
      else {
        char C = static_cast<char>(CodePoint);
        Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
      }
    }

    return 0;
  } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
    // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
    // We don't use isLexingRawMode() here because we need to diagnose bad
    // UCNs even when skipping preprocessing tokens in a #if block.
    if (Result && PP) {
      if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
        Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
      else
        Diag(BufferPtr, diag::err_ucn_escape_invalid);
    }
    return 0;
  }

  return CodePoint;
}

/// If \p C is a Unicode (non-ASCII) whitespace character, diagnose it as an
/// extension and mark \p Result as having leading space. Returns true when
/// the character was whitespace and has been handled.
bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
                                   const char *CurPtr) {
  if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
      isUnicodeWhitespace(C)) {
    Diag(BufferPtr, diag::ext_unicode_whitespace)
        << makeCharRange(*this, BufferPtr, CurPtr);

    Result.setFlag(Token::LeadingSpace);
    return true;
  }
  return false;
}

/// Copy the start-of-line / leading-space / leading-empty-macro flags from a
/// previously-lexed token back into the lexer's pending-token state.
void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
  IsAtStartOfLine = Result.isAtStartOfLine();
  HasLeadingSpace = Result.hasLeadingSpace();
  HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
  // Note that this doesn't affect IsAtPhysicalStartOfLine.
}

/// Lex a token from the current buffer position into \p Result. Returns true
/// if a token was produced, false if the lexer state changed and the caller
/// should lex again (e.g. after a preprocessor directive).
bool Lexer::Lex(Token &Result) {
  assert(!isDependencyDirectivesLexer());

  // Start a new token.
  Result.startToken();

  // Set up misc whitespace flags for LexTokenInternal.
  // Transfer the pending whitespace/line-start flags onto the token, then
  // clear them — they describe only this next token.
  if (IsAtStartOfLine) {
    Result.setFlag(Token::StartOfLine);
    IsAtStartOfLine = false;
  }

  if (HasLeadingSpace) {
    Result.setFlag(Token::LeadingSpace);
    HasLeadingSpace = false;
  }

  if (HasLeadingEmptyMacro) {
    Result.setFlag(Token::LeadingEmptyMacro);
    HasLeadingEmptyMacro = false;
  }

  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  IsAtPhysicalStartOfLine = false;
  bool isRawLex = isLexingRawMode();
  (void) isRawLex;
  bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
  // (After the LexTokenInternal call, the lexer might be destroyed.)
  assert((returnedToken || !isRawLex) && "Raw lex must succeed");
  return returnedToken;
}

/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface.  It assumes
/// that the Flags of result have been cleared before calling this.
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
LexStart:
  assert(!Result.needsCleaning() && "Result needs cleaning");
  assert(!Result.hasPtrData() && "Result has not been reset");

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if (isHorizontalWhitespace(*CurPtr)) {
    do {
      ++CurPtr;
    } while (isHorizontalWhitespace(*CurPtr));

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped.  The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  // Track the last newline we saw; any non-newline character invalidates it.
  if (!isVerticalWhitespace(Char))
    NewLinePtr = nullptr;

  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26:  // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
      return LexEndOfFile(Result, CurPtr-1);
    }

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\r':
    // Treat CRLF as a single newline by consuming the '\n' too.
    if (CurPtr[0] == '\n')
      (void)getAndAdvanceChar(CurPtr, Result);
    [[fallthrough]];
  case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
      NewLinePtr = CurPtr - 1;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  // Identifier (e.g., uber), or
  // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
  // UTF-8 or UTF-16 string literal (C11/C++11).
  case 'u':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.RawStringLiterals &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.RawStringLiterals) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && LangOpts.RawStringLiterals &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.RawStringLiterals) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.RawStringLiterals && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.
    [[fallthrough]];

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifierContinue(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr,
                            ParsingFilename ? tok::header_name
                                            : tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // A '.' followed by a digit starts a floating constant (e.g. ".5").
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {      // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {   // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {   // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // Line comment.
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment.  There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program.  For example, In "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with Line comments would lex it as
      // "foo".  Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      // However, we never do this if we are just preprocessing.
      bool TreatAsComment =
          LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

      if (TreatAsComment) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                            TokAtPhysicalStartOfLine))
          return true; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line).  Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                           TokAtPhysicalStartOfLine))
        return true; // There is a token to return.

      // We only saw whitespace, so just try again with this lexer.
      // (We manually eliminate the tail call to avoid recursion.)
      goto LexNextToken;
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace;                             // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;                          // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else {                                         // '%:' -> '#'
        // We parsed a # character.  If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive.  Callback to
        // the preprocessor to handle it.
        // TODO: -fpreprocessed mode??
        if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '>') {
        if (LangOpts.CPlusPlus20) {
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                               SizeTmp2, Result);
          Kind = tok::spaceship;
          break;
        }
        // Suggest adding a space between the '<=' and the '>' to avoid a
        // change in semantics if this turns up in C++ <=17 mode.
        if (LangOpts.CPlusPlus && !isLexingRawMode()) {
          Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
              << FixItHint::CreateInsertion(
                     getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
        }
      }
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++0x [lex.pptoken]p3:
        //  Otherwise, if the next three characters are <:: and the subsequent
        //  character is neither : nor >, the < is treated as a preprocessor
        //  token by itself and not as the first character of the alternative
        //  token <:.
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
               lexEditorPlaceholder(Result, CurPtr)) {
      return true;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else {
      if (LangOpts.OpenCL && Char == '^')
        Diag(CurPtr, diag::err_opencl_logical_exclusive_or);
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character.  If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive.  Callback to
      // the preprocessor to handle it.
      // TODO: -fpreprocessed mode??
      if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    if (!LangOpts.AsmPreprocessor) {
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
            return true; // KeepWhitespaceMode

          // We only saw whitespace, so just try again with this lexer.
          // (We manually eliminate the tail call to avoid recursion.)
          goto LexNextToken;
        }

        return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
      }
    }

    Kind = tok::unknown;
    break;

  default: {
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    llvm::UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    llvm::ConversionResult Status =
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
                                  (const llvm::UTF8 *)BufferEnd,
                                  &CodePoint,
                                  llvm::strictConversion);
    if (Status == llvm::conversionOK) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }
      return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
    }

    if (isLexingRawMode() || ParsingPreprocessorDirective ||
        PP->isPreprocessedOutput()) {
      ++CurPtr;
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just diagnose the invalid UTF-8, then drop the character.
    Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    // We're pretending the character didn't exist, so just try again with
    // this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  }
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
4505 FormTokenWithChars(Result, CurPtr, Kind); 4506 return true; 4507 4508 HandleDirective: 4509 // We parsed a # character and it's the start of a preprocessing directive. 4510 4511 FormTokenWithChars(Result, CurPtr, tok::hash); 4512 PP->HandleDirective(Result); 4513 4514 if (PP->hadModuleLoaderFatalFailure()) 4515 // With a fatal failure in the module loader, we abort parsing. 4516 return true; 4517 4518 // We parsed the directive; lex a token with the new state. 4519 return false; 4520 4521 LexNextToken: 4522 Result.clearFlag(Token::NeedsCleaning); 4523 goto LexStart; 4524 } 4525 4526 const char *Lexer::convertDependencyDirectiveToken( 4527 const dependency_directives_scan::Token &DDTok, Token &Result) { 4528 const char *TokPtr = BufferStart + DDTok.Offset; 4529 Result.startToken(); 4530 Result.setLocation(getSourceLocation(TokPtr)); 4531 Result.setKind(DDTok.Kind); 4532 Result.setFlag((Token::TokenFlags)DDTok.Flags); 4533 Result.setLength(DDTok.Length); 4534 BufferPtr = TokPtr + DDTok.Length; 4535 return TokPtr; 4536 } 4537 4538 bool Lexer::LexDependencyDirectiveToken(Token &Result) { 4539 assert(isDependencyDirectivesLexer()); 4540 4541 using namespace dependency_directives_scan; 4542 4543 if (BufferPtr == BufferEnd) 4544 return LexEndOfFile(Result, BufferPtr); 4545 4546 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) { 4547 if (DepDirectives.front().Kind == pp_eof) 4548 return LexEndOfFile(Result, BufferEnd); 4549 if (DepDirectives.front().Kind == tokens_present_before_eof) 4550 MIOpt.ReadToken(); 4551 NextDepDirectiveTokenIndex = 0; 4552 DepDirectives = DepDirectives.drop_front(); 4553 } 4554 4555 const dependency_directives_scan::Token &DDTok = 4556 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++]; 4557 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) { 4558 // Read something other than a preprocessor directive hash. 
4559 MIOpt.ReadToken(); 4560 } 4561 4562 if (ParsingFilename && DDTok.is(tok::less)) { 4563 BufferPtr = BufferStart + DDTok.Offset; 4564 LexAngledStringLiteral(Result, BufferPtr + 1); 4565 if (Result.isNot(tok::header_name)) 4566 return true; 4567 // Advance the index of lexed tokens. 4568 while (true) { 4569 const dependency_directives_scan::Token &NextTok = 4570 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex]; 4571 if (BufferStart + NextTok.Offset >= BufferPtr) 4572 break; 4573 ++NextDepDirectiveTokenIndex; 4574 } 4575 return true; 4576 } 4577 4578 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result); 4579 4580 if (Result.is(tok::hash) && Result.isAtStartOfLine()) { 4581 PP->HandleDirective(Result); 4582 if (PP->hadModuleLoaderFatalFailure()) 4583 // With a fatal failure in the module loader, we abort parsing. 4584 return true; 4585 return false; 4586 } 4587 if (Result.is(tok::raw_identifier)) { 4588 Result.setRawIdentifierData(TokPtr); 4589 if (!isLexingRawMode()) { 4590 const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 4591 if (II->isHandleIdentifierCase()) 4592 return PP->HandleIdentifier(Result); 4593 } 4594 return true; 4595 } 4596 if (Result.isLiteral()) { 4597 Result.setLiteralData(TokPtr); 4598 return true; 4599 } 4600 if (Result.is(tok::colon)) { 4601 // Convert consecutive colons to 'tok::coloncolon'. 
4602 if (*BufferPtr == ':') { 4603 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( 4604 tok::colon)); 4605 ++NextDepDirectiveTokenIndex; 4606 Result.setKind(tok::coloncolon); 4607 } 4608 return true; 4609 } 4610 if (Result.is(tok::eod)) 4611 ParsingPreprocessorDirective = false; 4612 4613 return true; 4614 } 4615 4616 bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) { 4617 assert(isDependencyDirectivesLexer()); 4618 4619 using namespace dependency_directives_scan; 4620 4621 bool Stop = false; 4622 unsigned NestedIfs = 0; 4623 do { 4624 DepDirectives = DepDirectives.drop_front(); 4625 switch (DepDirectives.front().Kind) { 4626 case pp_none: 4627 llvm_unreachable("unexpected 'pp_none'"); 4628 case pp_include: 4629 case pp___include_macros: 4630 case pp_define: 4631 case pp_undef: 4632 case pp_import: 4633 case pp_pragma_import: 4634 case pp_pragma_once: 4635 case pp_pragma_push_macro: 4636 case pp_pragma_pop_macro: 4637 case pp_pragma_include_alias: 4638 case pp_pragma_system_header: 4639 case pp_include_next: 4640 case decl_at_import: 4641 case cxx_module_decl: 4642 case cxx_import_decl: 4643 case cxx_export_module_decl: 4644 case cxx_export_import_decl: 4645 case tokens_present_before_eof: 4646 break; 4647 case pp_if: 4648 case pp_ifdef: 4649 case pp_ifndef: 4650 ++NestedIfs; 4651 break; 4652 case pp_elif: 4653 case pp_elifdef: 4654 case pp_elifndef: 4655 case pp_else: 4656 if (!NestedIfs) { 4657 Stop = true; 4658 } 4659 break; 4660 case pp_endif: 4661 if (!NestedIfs) { 4662 Stop = true; 4663 } else { 4664 --NestedIfs; 4665 } 4666 break; 4667 case pp_eof: 4668 NextDepDirectiveTokenIndex = 0; 4669 return LexEndOfFile(Result, BufferEnd); 4670 } 4671 } while (!Stop); 4672 4673 const dependency_directives_scan::Token &DDTok = 4674 DepDirectives.front().Tokens.front(); 4675 assert(DDTok.is(tok::hash)); 4676 NextDepDirectiveTokenIndex = 1; 4677 4678 convertDependencyDirectiveToken(DDTok, Result); 4679 return false; 4680 } 4681