1 //===- Lexer.cpp - C Language Family Lexer --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the Lexer and Token interfaces. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "clang/Lex/Lexer.h" 14 #include "UnicodeCharSets.h" 15 #include "clang/Basic/CharInfo.h" 16 #include "clang/Basic/Diagnostic.h" 17 #include "clang/Basic/IdentifierTable.h" 18 #include "clang/Basic/LLVM.h" 19 #include "clang/Basic/LangOptions.h" 20 #include "clang/Basic/SourceLocation.h" 21 #include "clang/Basic/SourceManager.h" 22 #include "clang/Basic/TokenKinds.h" 23 #include "clang/Lex/LexDiagnostic.h" 24 #include "clang/Lex/LiteralSupport.h" 25 #include "clang/Lex/MultipleIncludeOpt.h" 26 #include "clang/Lex/Preprocessor.h" 27 #include "clang/Lex/PreprocessorOptions.h" 28 #include "clang/Lex/Token.h" 29 #include "llvm/ADT/STLExtras.h" 30 #include "llvm/ADT/StringExtras.h" 31 #include "llvm/ADT/StringRef.h" 32 #include "llvm/ADT/StringSwitch.h" 33 #include "llvm/Support/Compiler.h" 34 #include "llvm/Support/ConvertUTF.h" 35 #include "llvm/Support/MathExtras.h" 36 #include "llvm/Support/MemoryBufferRef.h" 37 #include "llvm/Support/NativeFormatting.h" 38 #include "llvm/Support/Unicode.h" 39 #include "llvm/Support/UnicodeCharRanges.h" 40 #include <algorithm> 41 #include <cassert> 42 #include <cstddef> 43 #include <cstdint> 44 #include <cstring> 45 #include <optional> 46 #include <string> 47 #include <tuple> 48 #include <utility> 49 50 #ifdef __SSE4_2__ 51 #include <nmmintrin.h> 52 #endif 53 54 using namespace clang; 55 56 //===----------------------------------------------------------------------===// 57 // Token Class 
Implementation 58 //===----------------------------------------------------------------------===// 59 60 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 61 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 62 if (isAnnotation()) 63 return false; 64 if (const IdentifierInfo *II = getIdentifierInfo()) 65 return II->getObjCKeywordID() == objcKey; 66 return false; 67 } 68 69 /// getObjCKeywordID - Return the ObjC keyword kind. 70 tok::ObjCKeywordKind Token::getObjCKeywordID() const { 71 if (isAnnotation()) 72 return tok::objc_not_keyword; 73 const IdentifierInfo *specId = getIdentifierInfo(); 74 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 75 } 76 77 //===----------------------------------------------------------------------===// 78 // Lexer Class Implementation 79 //===----------------------------------------------------------------------===// 80 81 void Lexer::anchor() {} 82 83 void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 84 const char *BufEnd) { 85 BufferStart = BufStart; 86 BufferPtr = BufPtr; 87 BufferEnd = BufEnd; 88 89 assert(BufEnd[0] == 0 && 90 "We assume that the input buffer has a null character at the end" 91 " to simplify lexing!"); 92 93 // Check whether we have a BOM in the beginning of the buffer. If yes - act 94 // accordingly. Right now we support only UTF-8 with and without BOM, so, just 95 // skip the UTF-8 BOM if it's present. 96 if (BufferStart == BufferPtr) { 97 // Determine the size of the BOM. 98 StringRef Buf(BufferStart, BufferEnd - BufferStart); 99 size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 100 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 101 .Default(0); 102 103 // Skip the BOM. 104 BufferPtr += BOMLength; 105 } 106 107 Is_PragmaLexer = false; 108 CurrentConflictMarkerState = CMK_None; 109 110 // Start of the file is a start of line. 
111 IsAtStartOfLine = true; 112 IsAtPhysicalStartOfLine = true; 113 114 HasLeadingSpace = false; 115 HasLeadingEmptyMacro = false; 116 117 // We are not after parsing a #. 118 ParsingPreprocessorDirective = false; 119 120 // We are not after parsing #include. 121 ParsingFilename = false; 122 123 // We are not in raw mode. Raw mode disables diagnostics and interpretation 124 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 125 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 126 // or otherwise skipping over tokens. 127 LexingRawMode = false; 128 129 // Default to not keeping comments. 130 ExtendedTokenMode = 0; 131 132 NewLinePtr = nullptr; 133 } 134 135 /// Lexer constructor - Create a new lexer object for the specified buffer 136 /// with the specified preprocessor managing the lexing process. This lexer 137 /// assumes that the associated file buffer and Preprocessor objects will 138 /// outlive it, so it doesn't take ownership of either of them. 139 Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, 140 Preprocessor &PP, bool IsFirstIncludeOfFile) 141 : PreprocessorLexer(&PP, FID), 142 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 143 LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment), 144 IsFirstTimeLexingFile(IsFirstIncludeOfFile) { 145 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(), 146 InputFile.getBufferEnd()); 147 148 resetExtendedTokenMode(); 149 } 150 151 /// Lexer constructor - Create a new raw lexer object. This object is only 152 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 153 /// range will outlive it, so it doesn't take ownership of it. 
154 Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, 155 const char *BufStart, const char *BufPtr, const char *BufEnd, 156 bool IsFirstIncludeOfFile) 157 : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment), 158 IsFirstTimeLexingFile(IsFirstIncludeOfFile) { 159 InitLexer(BufStart, BufPtr, BufEnd); 160 161 // We *are* in raw mode. 162 LexingRawMode = true; 163 } 164 165 /// Lexer constructor - Create a new raw lexer object. This object is only 166 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 167 /// range will outlive it, so it doesn't take ownership of it. 168 Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile, 169 const SourceManager &SM, const LangOptions &langOpts, 170 bool IsFirstIncludeOfFile) 171 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(), 172 FromFile.getBufferStart(), FromFile.getBufferEnd(), 173 IsFirstIncludeOfFile) {} 174 175 void Lexer::resetExtendedTokenMode() { 176 assert(PP && "Cannot reset token mode without a preprocessor"); 177 if (LangOpts.TraditionalCPP) 178 SetKeepWhitespaceMode(true); 179 else 180 SetCommentRetentionState(PP->getCommentRetentionState()); 181 } 182 183 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 184 /// _Pragma expansion. This has a variety of magic semantics that this method 185 /// sets up. It returns a new'd Lexer that must be delete'd when done. 186 /// 187 /// On entrance to this routine, TokStartLoc is a macro location which has a 188 /// spelling loc that indicates the bytes to be lexed for the token and an 189 /// expansion location that indicates where all lexed tokens should be 190 /// "expanded from". 191 /// 192 /// TODO: It would really be nice to make _Pragma just be a wrapper around a 193 /// normal lexer that remaps tokens as they fly by. This would require making 194 /// Preprocessor::Lex virtual. 
Given that, we could just dump in a magic lexer 195 /// interface that could handle this stuff. This would pull GetMappedTokenLoc 196 /// out of the critical path of the lexer! 197 /// 198 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 199 SourceLocation ExpansionLocStart, 200 SourceLocation ExpansionLocEnd, 201 unsigned TokLen, Preprocessor &PP) { 202 SourceManager &SM = PP.getSourceManager(); 203 204 // Create the lexer as if we were going to lex the file normally. 205 FileID SpellingFID = SM.getFileID(SpellingLoc); 206 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID); 207 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 208 209 // Now that the lexer is created, change the start/end locations so that we 210 // just lex the subsection of the file that we want. This is lexing from a 211 // scratch buffer. 212 const char *StrData = SM.getCharacterData(SpellingLoc); 213 214 L->BufferPtr = StrData; 215 L->BufferEnd = StrData+TokLen; 216 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 217 218 // Set the SourceLocation with the remapping information. This ensures that 219 // GetMappedTokenLoc will remap the tokens as they are lexed. 220 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 221 ExpansionLocStart, 222 ExpansionLocEnd, TokLen); 223 224 // Ensure that the lexer thinks it is inside a directive, so that end \n will 225 // return an EOD token. 226 L->ParsingPreprocessorDirective = true; 227 228 // This lexer really is for _Pragma. 
229 L->Is_PragmaLexer = true; 230 return L; 231 } 232 233 void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) { 234 this->IsAtPhysicalStartOfLine = IsAtStartOfLine; 235 this->IsAtStartOfLine = IsAtStartOfLine; 236 assert((BufferStart + Offset) <= BufferEnd); 237 BufferPtr = BufferStart + Offset; 238 } 239 240 template <typename T> static void StringifyImpl(T &Str, char Quote) { 241 typename T::size_type i = 0, e = Str.size(); 242 while (i < e) { 243 if (Str[i] == '\\' || Str[i] == Quote) { 244 Str.insert(Str.begin() + i, '\\'); 245 i += 2; 246 ++e; 247 } else if (Str[i] == '\n' || Str[i] == '\r') { 248 // Replace '\r\n' and '\n\r' to '\\' followed by 'n'. 249 if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && 250 Str[i] != Str[i + 1]) { 251 Str[i] = '\\'; 252 Str[i + 1] = 'n'; 253 } else { 254 // Replace '\n' and '\r' to '\\' followed by 'n'. 255 Str[i] = '\\'; 256 Str.insert(Str.begin() + i + 1, 'n'); 257 ++e; 258 } 259 i += 2; 260 } else 261 ++i; 262 } 263 } 264 265 std::string Lexer::Stringify(StringRef Str, bool Charify) { 266 std::string Result = std::string(Str); 267 char Quote = Charify ? '\'' : '"'; 268 StringifyImpl(Result, Quote); 269 return Result; 270 } 271 272 void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); } 273 274 //===----------------------------------------------------------------------===// 275 // Token Spelling 276 //===----------------------------------------------------------------------===// 277 278 /// Slow case of getSpelling. Extract the characters comprising the 279 /// spelling of this token from the provided input buffer. 
280 static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, 281 const LangOptions &LangOpts, char *Spelling) { 282 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); 283 284 size_t Length = 0; 285 const char *BufEnd = BufPtr + Tok.getLength(); 286 287 if (tok::isStringLiteral(Tok.getKind())) { 288 // Munch the encoding-prefix and opening double-quote. 289 while (BufPtr < BufEnd) { 290 auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts); 291 Spelling[Length++] = CharAndSize.Char; 292 BufPtr += CharAndSize.Size; 293 294 if (Spelling[Length - 1] == '"') 295 break; 296 } 297 298 // Raw string literals need special handling; trigraph expansion and line 299 // splicing do not occur within their d-char-sequence nor within their 300 // r-char-sequence. 301 if (Length >= 2 && 302 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { 303 // Search backwards from the end of the token to find the matching closing 304 // quote. 305 const char *RawEnd = BufEnd; 306 do --RawEnd; while (*RawEnd != '"'); 307 size_t RawLength = RawEnd - BufPtr + 1; 308 309 // Everything between the quotes is included verbatim in the spelling. 310 memcpy(Spelling + Length, BufPtr, RawLength); 311 Length += RawLength; 312 BufPtr += RawLength; 313 314 // The rest of the token is lexed normally. 315 } 316 } 317 318 while (BufPtr < BufEnd) { 319 auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts); 320 Spelling[Length++] = CharAndSize.Char; 321 BufPtr += CharAndSize.Size; 322 } 323 324 assert(Length < Tok.getLength() && 325 "NeedsCleaning flag set on token that didn't need cleaning!"); 326 return Length; 327 } 328 329 /// getSpelling() - Return the 'spelling' of this token. The spelling of a 330 /// token are the characters used to represent the token in the source file 331 /// after trigraph expansion and escaped-newline folding. 
In particular, this 332 /// wants to get the true, uncanonicalized, spelling of things like digraphs 333 /// UCNs, etc. 334 StringRef Lexer::getSpelling(SourceLocation loc, 335 SmallVectorImpl<char> &buffer, 336 const SourceManager &SM, 337 const LangOptions &options, 338 bool *invalid) { 339 // Break down the source location. 340 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 341 342 // Try to the load the file buffer. 343 bool invalidTemp = false; 344 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 345 if (invalidTemp) { 346 if (invalid) *invalid = true; 347 return {}; 348 } 349 350 const char *tokenBegin = file.data() + locInfo.second; 351 352 // Lex from the start of the given location. 353 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 354 file.begin(), tokenBegin, file.end()); 355 Token token; 356 lexer.LexFromRawLexer(token); 357 358 unsigned length = token.getLength(); 359 360 // Common case: no need for cleaning. 361 if (!token.needsCleaning()) 362 return StringRef(tokenBegin, length); 363 364 // Hard case, we need to relex the characters into the string. 365 buffer.resize(length); 366 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); 367 return StringRef(buffer.data(), buffer.size()); 368 } 369 370 /// getSpelling() - Return the 'spelling' of this token. The spelling of a 371 /// token are the characters used to represent the token in the source file 372 /// after trigraph expansion and escaped-newline folding. In particular, this 373 /// wants to get the true, uncanonicalized, spelling of things like digraphs 374 /// UCNs, etc. 
375 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 376 const LangOptions &LangOpts, bool *Invalid) { 377 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 378 379 bool CharDataInvalid = false; 380 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 381 &CharDataInvalid); 382 if (Invalid) 383 *Invalid = CharDataInvalid; 384 if (CharDataInvalid) 385 return {}; 386 387 // If this token contains nothing interesting, return it directly. 388 if (!Tok.needsCleaning()) 389 return std::string(TokStart, TokStart + Tok.getLength()); 390 391 std::string Result; 392 Result.resize(Tok.getLength()); 393 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); 394 return Result; 395 } 396 397 /// getSpelling - This method is used to get the spelling of a token into a 398 /// preallocated buffer, instead of as an std::string. The caller is required 399 /// to allocate enough space for the token, which is guaranteed to be at least 400 /// Tok.getLength() bytes long. The actual length of the token is returned. 401 /// 402 /// Note that this method may do two possible things: it may either fill in 403 /// the buffer specified with characters, or it may *change the input pointer* 404 /// to point to a constant buffer with the data already in it (avoiding a 405 /// copy). The caller is not allowed to modify the returned buffer pointer 406 /// if an internal buffer is returned. 407 unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 408 const SourceManager &SourceMgr, 409 const LangOptions &LangOpts, bool *Invalid) { 410 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 411 412 const char *TokStart = nullptr; 413 // NOTE: this has to be checked *before* testing for an IdentifierInfo. 
414 if (Tok.is(tok::raw_identifier)) 415 TokStart = Tok.getRawIdentifier().data(); 416 else if (!Tok.hasUCN()) { 417 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 418 // Just return the string from the identifier table, which is very quick. 419 Buffer = II->getNameStart(); 420 return II->getLength(); 421 } 422 } 423 424 // NOTE: this can be checked even after testing for an IdentifierInfo. 425 if (Tok.isLiteral()) 426 TokStart = Tok.getLiteralData(); 427 428 if (!TokStart) { 429 // Compute the start of the token in the input lexer buffer. 430 bool CharDataInvalid = false; 431 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 432 if (Invalid) 433 *Invalid = CharDataInvalid; 434 if (CharDataInvalid) { 435 Buffer = ""; 436 return 0; 437 } 438 } 439 440 // If this token contains nothing interesting, return it directly. 441 if (!Tok.needsCleaning()) { 442 Buffer = TokStart; 443 return Tok.getLength(); 444 } 445 446 // Otherwise, hard case, relex the characters into the string. 447 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); 448 } 449 450 /// MeasureTokenLength - Relex the token at the specified location and return 451 /// its length in bytes in the input file. If the token needs cleaning (e.g. 452 /// includes a trigraph or an escaped newline) then this count includes bytes 453 /// that are part of that. 454 unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 455 const SourceManager &SM, 456 const LangOptions &LangOpts) { 457 Token TheTok; 458 if (getRawToken(Loc, TheTok, SM, LangOpts)) 459 return 0; 460 return TheTok.getLength(); 461 } 462 463 /// Relex the token at the specified location. 464 /// \returns true if there was a failure, false on success. 
465 bool Lexer::getRawToken(SourceLocation Loc, Token &Result, 466 const SourceManager &SM, 467 const LangOptions &LangOpts, 468 bool IgnoreWhiteSpace) { 469 // TODO: this could be special cased for common tokens like identifiers, ')', 470 // etc to make this faster, if it mattered. Just look at StrData[0] to handle 471 // all obviously single-char tokens. This could use 472 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 473 // something. 474 475 // If this comes from a macro expansion, we really do want the macro name, not 476 // the token this macro expanded to. 477 Loc = SM.getExpansionLoc(Loc); 478 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 479 bool Invalid = false; 480 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 481 if (Invalid) 482 return true; 483 484 const char *StrData = Buffer.data()+LocInfo.second; 485 486 if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) 487 return true; 488 489 // Create a lexer starting at the beginning of this token. 490 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 491 Buffer.begin(), StrData, Buffer.end()); 492 TheLexer.SetCommentRetentionState(true); 493 TheLexer.LexFromRawLexer(Result); 494 return false; 495 } 496 497 /// Returns the pointer that points to the beginning of line that contains 498 /// the given offset, or null if the offset if invalid. 499 static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) { 500 const char *BufStart = Buffer.data(); 501 if (Offset >= Buffer.size()) 502 return nullptr; 503 504 const char *LexStart = BufStart + Offset; 505 for (; LexStart != BufStart; --LexStart) { 506 if (isVerticalWhitespace(LexStart[0]) && 507 !Lexer::isNewLineEscaped(BufStart, LexStart)) { 508 // LexStart should point at first character of logical line. 
509 ++LexStart; 510 break; 511 } 512 } 513 return LexStart; 514 } 515 516 static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 517 const SourceManager &SM, 518 const LangOptions &LangOpts) { 519 assert(Loc.isFileID()); 520 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 521 if (LocInfo.first.isInvalid()) 522 return Loc; 523 524 bool Invalid = false; 525 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 526 if (Invalid) 527 return Loc; 528 529 // Back up from the current location until we hit the beginning of a line 530 // (or the buffer). We'll relex from that point. 531 const char *StrData = Buffer.data() + LocInfo.second; 532 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second); 533 if (!LexStart || LexStart == StrData) 534 return Loc; 535 536 // Create a lexer starting at the beginning of this token. 537 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 538 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart, 539 Buffer.end()); 540 TheLexer.SetCommentRetentionState(true); 541 542 // Lex tokens until we find the token that contains the source location. 543 Token TheTok; 544 do { 545 TheLexer.LexFromRawLexer(TheTok); 546 547 if (TheLexer.getBufferLocation() > StrData) { 548 // Lexing this token has taken the lexer past the source location we're 549 // looking for. If the current token encompasses our source location, 550 // return the beginning of that token. 551 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 552 return TheTok.getLocation(); 553 554 // We ended up skipping over the source location entirely, which means 555 // that it points into whitespace. We're done here. 556 break; 557 } 558 } while (TheTok.getKind() != tok::eof); 559 560 // We've passed our source location; just return the original source location. 
561 return Loc; 562 } 563 564 SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 565 const SourceManager &SM, 566 const LangOptions &LangOpts) { 567 if (Loc.isFileID()) 568 return getBeginningOfFileToken(Loc, SM, LangOpts); 569 570 if (!SM.isMacroArgExpansion(Loc)) 571 return Loc; 572 573 SourceLocation FileLoc = SM.getSpellingLoc(Loc); 574 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 575 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 576 std::pair<FileID, unsigned> BeginFileLocInfo = 577 SM.getDecomposedLoc(BeginFileLoc); 578 assert(FileLocInfo.first == BeginFileLocInfo.first && 579 FileLocInfo.second >= BeginFileLocInfo.second); 580 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); 581 } 582 583 namespace { 584 585 enum PreambleDirectiveKind { 586 PDK_Skipped, 587 PDK_Unknown 588 }; 589 590 } // namespace 591 592 PreambleBounds Lexer::ComputePreamble(StringRef Buffer, 593 const LangOptions &LangOpts, 594 unsigned MaxLines) { 595 // Create a lexer starting at the beginning of the file. Note that we use a 596 // "fake" file source location at offset 1 so that the lexer will track our 597 // position within the file. 
598 const SourceLocation::UIntTy StartOffset = 1; 599 SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset); 600 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(), 601 Buffer.end()); 602 TheLexer.SetCommentRetentionState(true); 603 604 bool InPreprocessorDirective = false; 605 Token TheTok; 606 SourceLocation ActiveCommentLoc; 607 608 unsigned MaxLineOffset = 0; 609 if (MaxLines) { 610 const char *CurPtr = Buffer.begin(); 611 unsigned CurLine = 0; 612 while (CurPtr != Buffer.end()) { 613 char ch = *CurPtr++; 614 if (ch == '\n') { 615 ++CurLine; 616 if (CurLine == MaxLines) 617 break; 618 } 619 } 620 if (CurPtr != Buffer.end()) 621 MaxLineOffset = CurPtr - Buffer.begin(); 622 } 623 624 do { 625 TheLexer.LexFromRawLexer(TheTok); 626 627 if (InPreprocessorDirective) { 628 // If we've hit the end of the file, we're done. 629 if (TheTok.getKind() == tok::eof) { 630 break; 631 } 632 633 // If we haven't hit the end of the preprocessor directive, skip this 634 // token. 635 if (!TheTok.isAtStartOfLine()) 636 continue; 637 638 // We've passed the end of the preprocessor directive, and will look 639 // at this token again below. 640 InPreprocessorDirective = false; 641 } 642 643 // Keep track of the # of lines in the preamble. 644 if (TheTok.isAtStartOfLine()) { 645 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 646 647 // If we were asked to limit the number of lines in the preamble, 648 // and we're about to exceed that limit, we're done. 649 if (MaxLineOffset && TokOffset >= MaxLineOffset) 650 break; 651 } 652 653 // Comments are okay; skip over them. 654 if (TheTok.getKind() == tok::comment) { 655 if (ActiveCommentLoc.isInvalid()) 656 ActiveCommentLoc = TheTok.getLocation(); 657 continue; 658 } 659 660 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 661 // This is the start of a preprocessor directive. 
662 Token HashTok = TheTok; 663 InPreprocessorDirective = true; 664 ActiveCommentLoc = SourceLocation(); 665 666 // Figure out which directive this is. Since we're lexing raw tokens, 667 // we don't have an identifier table available. Instead, just look at 668 // the raw identifier to recognize and categorize preprocessor directives. 669 TheLexer.LexFromRawLexer(TheTok); 670 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 671 StringRef Keyword = TheTok.getRawIdentifier(); 672 PreambleDirectiveKind PDK 673 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 674 .Case("include", PDK_Skipped) 675 .Case("__include_macros", PDK_Skipped) 676 .Case("define", PDK_Skipped) 677 .Case("undef", PDK_Skipped) 678 .Case("line", PDK_Skipped) 679 .Case("error", PDK_Skipped) 680 .Case("pragma", PDK_Skipped) 681 .Case("import", PDK_Skipped) 682 .Case("include_next", PDK_Skipped) 683 .Case("warning", PDK_Skipped) 684 .Case("ident", PDK_Skipped) 685 .Case("sccs", PDK_Skipped) 686 .Case("assert", PDK_Skipped) 687 .Case("unassert", PDK_Skipped) 688 .Case("if", PDK_Skipped) 689 .Case("ifdef", PDK_Skipped) 690 .Case("ifndef", PDK_Skipped) 691 .Case("elif", PDK_Skipped) 692 .Case("elifdef", PDK_Skipped) 693 .Case("elifndef", PDK_Skipped) 694 .Case("else", PDK_Skipped) 695 .Case("endif", PDK_Skipped) 696 .Default(PDK_Unknown); 697 698 switch (PDK) { 699 case PDK_Skipped: 700 continue; 701 702 case PDK_Unknown: 703 // We don't know what this directive is; stop at the '#'. 704 break; 705 } 706 } 707 708 // We only end up here if we didn't recognize the preprocessor 709 // directive or it was one that can't occur in the preamble at this 710 // point. Roll back the current token to the location of the '#'. 
711 TheTok = HashTok; 712 } else if (TheTok.isAtStartOfLine() && 713 TheTok.getKind() == tok::raw_identifier && 714 TheTok.getRawIdentifier() == "module" && 715 LangOpts.CPlusPlusModules) { 716 // The initial global module fragment introducer "module;" is part of 717 // the preamble, which runs up to the module declaration "module foo;". 718 Token ModuleTok = TheTok; 719 do { 720 TheLexer.LexFromRawLexer(TheTok); 721 } while (TheTok.getKind() == tok::comment); 722 if (TheTok.getKind() != tok::semi) { 723 // Not global module fragment, roll back. 724 TheTok = ModuleTok; 725 break; 726 } 727 continue; 728 } 729 730 // We hit a token that we don't recognize as being in the 731 // "preprocessing only" part of the file, so we're no longer in 732 // the preamble. 733 break; 734 } while (true); 735 736 SourceLocation End; 737 if (ActiveCommentLoc.isValid()) 738 End = ActiveCommentLoc; // don't truncate a decl comment. 739 else 740 End = TheTok.getLocation(); 741 742 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(), 743 TheTok.isAtStartOfLine()); 744 } 745 746 unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, 747 const SourceManager &SM, 748 const LangOptions &LangOpts) { 749 // Figure out how many physical characters away the specified expansion 750 // character is. This needs to take into consideration newlines and 751 // trigraphs. 752 bool Invalid = false; 753 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 754 755 // If they request the first char of the token, we're trivially done. 756 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 757 return 0; 758 759 unsigned PhysOffset = 0; 760 761 // The usual case is that tokens don't contain anything interesting. Skip 762 // over the uninteresting characters. If a token only consists of simple 763 // chars, this method is extremely fast. 
764 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 765 if (CharNo == 0) 766 return PhysOffset; 767 ++TokPtr; 768 --CharNo; 769 ++PhysOffset; 770 } 771 772 // If we have a character that may be a trigraph or escaped newline, use a 773 // lexer to parse it correctly. 774 for (; CharNo; --CharNo) { 775 auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts); 776 TokPtr += CharAndSize.Size; 777 PhysOffset += CharAndSize.Size; 778 } 779 780 // Final detail: if we end up on an escaped newline, we want to return the 781 // location of the actual byte of the token. For example foo\<newline>bar 782 // advanced by 3 should return the location of b, not of \\. One compounding 783 // detail of this is that the escape may be made by a trigraph. 784 if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 785 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 786 787 return PhysOffset; 788 } 789 790 /// Computes the source location just past the end of the 791 /// token at this source location. 792 /// 793 /// This routine can be used to produce a source location that 794 /// points just past the end of the token referenced by \p Loc, and 795 /// is generally used when a diagnostic needs to point just after a 796 /// token where it expected something different that it received. If 797 /// the returned source location would not be meaningful (e.g., if 798 /// it points into a macro), this routine returns an invalid 799 /// source location. 800 /// 801 /// \param Offset an offset from the end of the token, where the source 802 /// location should refer to. The default offset (0) produces a source 803 /// location pointing just past the end of the token; an offset of 1 produces 804 /// a source location pointing to the last character in the token, etc. 
805 SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 806 const SourceManager &SM, 807 const LangOptions &LangOpts) { 808 if (Loc.isInvalid()) 809 return {}; 810 811 if (Loc.isMacroID()) { 812 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 813 return {}; // Points inside the macro expansion. 814 } 815 816 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 817 if (Len > Offset) 818 Len = Len - Offset; 819 else 820 return Loc; 821 822 return Loc.getLocWithOffset(Len); 823 } 824 825 /// Returns true if the given MacroID location points at the first 826 /// token of the macro expansion. 827 bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 828 const SourceManager &SM, 829 const LangOptions &LangOpts, 830 SourceLocation *MacroBegin) { 831 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 832 833 SourceLocation expansionLoc; 834 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc)) 835 return false; 836 837 if (expansionLoc.isFileID()) { 838 // No other macro expansions, this is the first. 839 if (MacroBegin) 840 *MacroBegin = expansionLoc; 841 return true; 842 } 843 844 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); 845 } 846 847 /// Returns true if the given MacroID location points at the last 848 /// token of the macro expansion. 
849 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 850 const SourceManager &SM, 851 const LangOptions &LangOpts, 852 SourceLocation *MacroEnd) { 853 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 854 855 SourceLocation spellLoc = SM.getSpellingLoc(loc); 856 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 857 if (tokLen == 0) 858 return false; 859 860 SourceLocation afterLoc = loc.getLocWithOffset(tokLen); 861 SourceLocation expansionLoc; 862 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) 863 return false; 864 865 if (expansionLoc.isFileID()) { 866 // No other macro expansions. 867 if (MacroEnd) 868 *MacroEnd = expansionLoc; 869 return true; 870 } 871 872 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 873 } 874 875 static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 876 const SourceManager &SM, 877 const LangOptions &LangOpts) { 878 SourceLocation Begin = Range.getBegin(); 879 SourceLocation End = Range.getEnd(); 880 assert(Begin.isFileID() && End.isFileID()); 881 if (Range.isTokenRange()) { 882 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 883 if (End.isInvalid()) 884 return {}; 885 } 886 887 // Break down the source locations. 888 FileID FID; 889 unsigned BeginOffs; 890 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 891 if (FID.isInvalid()) 892 return {}; 893 894 unsigned EndOffs; 895 if (!SM.isInFileID(End, FID, &EndOffs) || 896 BeginOffs > EndOffs) 897 return {}; 898 899 return CharSourceRange::getCharRange(Begin, End); 900 } 901 902 // Assumes that `Loc` is in an expansion. 
903 static bool isInExpansionTokenRange(const SourceLocation Loc, 904 const SourceManager &SM) { 905 return SM.getSLocEntry(SM.getFileID(Loc)) 906 .getExpansion() 907 .isExpansionTokenRange(); 908 } 909 910 CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, 911 const SourceManager &SM, 912 const LangOptions &LangOpts) { 913 SourceLocation Begin = Range.getBegin(); 914 SourceLocation End = Range.getEnd(); 915 if (Begin.isInvalid() || End.isInvalid()) 916 return {}; 917 918 if (Begin.isFileID() && End.isFileID()) 919 return makeRangeFromFileLocs(Range, SM, LangOpts); 920 921 if (Begin.isMacroID() && End.isFileID()) { 922 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) 923 return {}; 924 Range.setBegin(Begin); 925 return makeRangeFromFileLocs(Range, SM, LangOpts); 926 } 927 928 if (Begin.isFileID() && End.isMacroID()) { 929 if (Range.isTokenRange()) { 930 if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End)) 931 return {}; 932 // Use the *original* end, not the expanded one in `End`. 933 Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM)); 934 } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End)) 935 return {}; 936 Range.setEnd(End); 937 return makeRangeFromFileLocs(Range, SM, LangOpts); 938 } 939 940 assert(Begin.isMacroID() && End.isMacroID()); 941 SourceLocation MacroBegin, MacroEnd; 942 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) && 943 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts, 944 &MacroEnd)) || 945 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts, 946 &MacroEnd)))) { 947 Range.setBegin(MacroBegin); 948 Range.setEnd(MacroEnd); 949 // Use the *original* `End`, not the expanded one in `MacroEnd`. 
950 if (Range.isTokenRange()) 951 Range.setTokenRange(isInExpansionTokenRange(End, SM)); 952 return makeRangeFromFileLocs(Range, SM, LangOpts); 953 } 954 955 bool Invalid = false; 956 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin), 957 &Invalid); 958 if (Invalid) 959 return {}; 960 961 if (BeginEntry.getExpansion().isMacroArgExpansion()) { 962 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End), 963 &Invalid); 964 if (Invalid) 965 return {}; 966 967 if (EndEntry.getExpansion().isMacroArgExpansion() && 968 BeginEntry.getExpansion().getExpansionLocStart() == 969 EndEntry.getExpansion().getExpansionLocStart()) { 970 Range.setBegin(SM.getImmediateSpellingLoc(Begin)); 971 Range.setEnd(SM.getImmediateSpellingLoc(End)); 972 return makeFileCharRange(Range, SM, LangOpts); 973 } 974 } 975 976 return {}; 977 } 978 979 StringRef Lexer::getSourceText(CharSourceRange Range, 980 const SourceManager &SM, 981 const LangOptions &LangOpts, 982 bool *Invalid) { 983 Range = makeFileCharRange(Range, SM, LangOpts); 984 if (Range.isInvalid()) { 985 if (Invalid) *Invalid = true; 986 return {}; 987 } 988 989 // Break down the source location. 990 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin()); 991 if (beginInfo.first.isInvalid()) { 992 if (Invalid) *Invalid = true; 993 return {}; 994 } 995 996 unsigned EndOffs; 997 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) || 998 beginInfo.second > EndOffs) { 999 if (Invalid) *Invalid = true; 1000 return {}; 1001 } 1002 1003 // Try to the load the file buffer. 
1004 bool invalidTemp = false; 1005 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp); 1006 if (invalidTemp) { 1007 if (Invalid) *Invalid = true; 1008 return {}; 1009 } 1010 1011 if (Invalid) *Invalid = false; 1012 return file.substr(beginInfo.second, EndOffs - beginInfo.second); 1013 } 1014 1015 StringRef Lexer::getImmediateMacroName(SourceLocation Loc, 1016 const SourceManager &SM, 1017 const LangOptions &LangOpts) { 1018 assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 1019 1020 // Find the location of the immediate macro expansion. 1021 while (true) { 1022 FileID FID = SM.getFileID(Loc); 1023 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); 1024 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); 1025 Loc = Expansion.getExpansionLocStart(); 1026 if (!Expansion.isMacroArgExpansion()) 1027 break; 1028 1029 // For macro arguments we need to check that the argument did not come 1030 // from an inner macro, e.g: "MAC1( MAC2(foo) )" 1031 1032 // Loc points to the argument id of the macro definition, move to the 1033 // macro expansion. 1034 Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 1035 SourceLocation SpellLoc = Expansion.getSpellingLoc(); 1036 if (SpellLoc.isFileID()) 1037 break; // No inner macro. 1038 1039 // If spelling location resides in the same FileID as macro expansion 1040 // location, it means there is no inner macro. 1041 FileID MacroFID = SM.getFileID(Loc); 1042 if (SM.isInFileID(SpellLoc, MacroFID)) 1043 break; 1044 1045 // Argument came from inner macro. 1046 Loc = SpellLoc; 1047 } 1048 1049 // Find the spelling location of the start of the non-argument expansion 1050 // range. This is where the macro name was spelled in order to begin 1051 // expanding this macro. 1052 Loc = SM.getSpellingLoc(Loc); 1053 1054 // Dig out the buffer where the macro name was spelled and the extents of the 1055 // name so that we can render it into the expansion note. 
1056 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 1057 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 1058 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 1059 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 1060 } 1061 1062 StringRef Lexer::getImmediateMacroNameForDiagnostics( 1063 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { 1064 assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 1065 // Walk past macro argument expansions. 1066 while (SM.isMacroArgExpansion(Loc)) 1067 Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 1068 1069 // If the macro's spelling isn't FileID or from scratch space, then it's 1070 // actually a token paste or stringization (or similar) and not a macro at 1071 // all. 1072 SourceLocation SpellLoc = SM.getSpellingLoc(Loc); 1073 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc)) 1074 return {}; 1075 1076 // Find the spelling location of the start of the non-argument expansion 1077 // range. This is where the macro name was spelled in order to begin 1078 // expanding this macro. 1079 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin()); 1080 1081 // Dig out the buffer where the macro name was spelled and the extents of the 1082 // name so that we can render it into the expansion note. 
1083 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 1084 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 1085 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 1086 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 1087 } 1088 1089 bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) { 1090 return isAsciiIdentifierContinue(c, LangOpts.DollarIdents); 1091 } 1092 1093 bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { 1094 assert(isVerticalWhitespace(Str[0])); 1095 if (Str - 1 < BufferStart) 1096 return false; 1097 1098 if ((Str[0] == '\n' && Str[-1] == '\r') || 1099 (Str[0] == '\r' && Str[-1] == '\n')) { 1100 if (Str - 2 < BufferStart) 1101 return false; 1102 --Str; 1103 } 1104 --Str; 1105 1106 // Rewind to first non-space character: 1107 while (Str > BufferStart && isHorizontalWhitespace(*Str)) 1108 --Str; 1109 1110 return *Str == '\\'; 1111 } 1112 1113 StringRef Lexer::getIndentationForLine(SourceLocation Loc, 1114 const SourceManager &SM) { 1115 if (Loc.isInvalid() || Loc.isMacroID()) 1116 return {}; 1117 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 1118 if (LocInfo.first.isInvalid()) 1119 return {}; 1120 bool Invalid = false; 1121 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 1122 if (Invalid) 1123 return {}; 1124 const char *Line = findBeginningOfLine(Buffer, LocInfo.second); 1125 if (!Line) 1126 return {}; 1127 StringRef Rest = Buffer.substr(Line - Buffer.data()); 1128 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t"); 1129 return NumWhitespaceChars == StringRef::npos 1130 ? "" 1131 : Rest.take_front(NumWhitespaceChars); 1132 } 1133 1134 //===----------------------------------------------------------------------===// 1135 // Diagnostics forwarding code. 
1136 //===----------------------------------------------------------------------===// 1137 1138 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 1139 /// lexer buffer was all expanded at a single point, perform the mapping. 1140 /// This is currently only used for _Pragma implementation, so it is the slow 1141 /// path of the hot getSourceLocation method. Do not allow it to be inlined. 1142 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 1143 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 1144 static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 1145 SourceLocation FileLoc, 1146 unsigned CharNo, unsigned TokLen) { 1147 assert(FileLoc.isMacroID() && "Must be a macro expansion"); 1148 1149 // Otherwise, we're lexing "mapped tokens". This is used for things like 1150 // _Pragma handling. Combine the expansion location of FileLoc with the 1151 // spelling location. 1152 SourceManager &SM = PP.getSourceManager(); 1153 1154 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 1155 // characters come from spelling(FileLoc)+Offset. 1156 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 1157 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 1158 1159 // Figure out the expansion loc range, which is the range covered by the 1160 // original _Pragma(...) sequence. 1161 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc); 1162 1163 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen); 1164 } 1165 1166 /// getSourceLocation - Return a source location identifier for the specified 1167 /// offset in the current file. 
1168 SourceLocation Lexer::getSourceLocation(const char *Loc, 1169 unsigned TokLen) const { 1170 assert(Loc >= BufferStart && Loc <= BufferEnd && 1171 "Location out of range for this buffer!"); 1172 1173 // In the normal case, we're just lexing from a simple file buffer, return 1174 // the file id from FileLoc with the offset specified. 1175 unsigned CharNo = Loc-BufferStart; 1176 if (FileLoc.isFileID()) 1177 return FileLoc.getLocWithOffset(CharNo); 1178 1179 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 1180 // tokens are lexed from where the _Pragma was defined. 1181 assert(PP && "This doesn't work on raw lexers"); 1182 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 1183 } 1184 1185 /// Diag - Forwarding function for diagnostics. This translate a source 1186 /// position in the current buffer into a SourceLocation object for rendering. 1187 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 1188 return PP->Diag(getSourceLocation(Loc), DiagID); 1189 } 1190 1191 //===----------------------------------------------------------------------===// 1192 // Trigraph and Escaped Newline Handling Code. 1193 //===----------------------------------------------------------------------===// 1194 1195 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 1196 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 1197 static char GetTrigraphCharForLetter(char Letter) { 1198 switch (Letter) { 1199 default: return 0; 1200 case '=': return '#'; 1201 case ')': return ']'; 1202 case '(': return '['; 1203 case '!': return '|'; 1204 case '\'': return '^'; 1205 case '>': return '}'; 1206 case '/': return '\\'; 1207 case '<': return '{'; 1208 case '-': return '~'; 1209 } 1210 } 1211 1212 /// DecodeTrigraphChar - If the specified character is a legal trigraph when 1213 /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 1214 /// return the result character. 
Finally, emit a warning about trigraph use 1215 /// whether trigraphs are enabled or not. 1216 static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) { 1217 char Res = GetTrigraphCharForLetter(*CP); 1218 if (!Res) 1219 return Res; 1220 1221 if (!Trigraphs) { 1222 if (L && !L->isLexingRawMode()) 1223 L->Diag(CP-2, diag::trigraph_ignored); 1224 return 0; 1225 } 1226 1227 if (L && !L->isLexingRawMode()) 1228 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 1229 return Res; 1230 } 1231 1232 /// getEscapedNewLineSize - Return the size of the specified escaped newline, 1233 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 1234 /// trigraph equivalent on entry to this function. 1235 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 1236 unsigned Size = 0; 1237 while (isWhitespace(Ptr[Size])) { 1238 ++Size; 1239 1240 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 1241 continue; 1242 1243 // If this is a \r\n or \n\r, skip the other half. 1244 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 1245 Ptr[Size-1] != Ptr[Size]) 1246 ++Size; 1247 1248 return Size; 1249 } 1250 1251 // Not an escaped newline, must be a \t or something else. 1252 return 0; 1253 } 1254 1255 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 1256 /// them), skip over them and return the first non-escaped-newline found, 1257 /// otherwise return P. 1258 const char *Lexer::SkipEscapedNewLines(const char *P) { 1259 while (true) { 1260 const char *AfterEscape; 1261 if (*P == '\\') { 1262 AfterEscape = P+1; 1263 } else if (*P == '?') { 1264 // If not a trigraph for escape, bail out. 1265 if (P[1] != '?' || P[2] != '/') 1266 return P; 1267 // FIXME: Take LangOpts into account; the language might not 1268 // support trigraphs. 
1269 AfterEscape = P+3; 1270 } else { 1271 return P; 1272 } 1273 1274 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 1275 if (NewLineSize == 0) return P; 1276 P = AfterEscape+NewLineSize; 1277 } 1278 } 1279 1280 std::optional<Token> Lexer::findNextToken(SourceLocation Loc, 1281 const SourceManager &SM, 1282 const LangOptions &LangOpts) { 1283 if (Loc.isMacroID()) { 1284 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 1285 return std::nullopt; 1286 } 1287 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); 1288 1289 // Break down the source location. 1290 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 1291 1292 // Try to load the file buffer. 1293 bool InvalidTemp = false; 1294 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); 1295 if (InvalidTemp) 1296 return std::nullopt; 1297 1298 const char *TokenBegin = File.data() + LocInfo.second; 1299 1300 // Lex from the start of the given location. 1301 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), 1302 TokenBegin, File.end()); 1303 // Find the token. 1304 Token Tok; 1305 lexer.LexFromRawLexer(Tok); 1306 return Tok; 1307 } 1308 1309 /// Checks that the given token is the first token that occurs after the 1310 /// given location (this excludes comments and whitespace). Returns the location 1311 /// immediately after the specified token. If the token is not found or the 1312 /// location is inside a macro, the returned source location will be invalid. 1313 SourceLocation Lexer::findLocationAfterToken( 1314 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, 1315 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { 1316 std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts); 1317 if (!Tok || Tok->isNot(TKind)) 1318 return {}; 1319 SourceLocation TokenLoc = Tok->getLocation(); 1320 1321 // Calculate how much whitespace needs to be skipped if any. 
1322 unsigned NumWhitespaceChars = 0; 1323 if (SkipTrailingWhitespaceAndNewLine) { 1324 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); 1325 unsigned char C = *TokenEnd; 1326 while (isHorizontalWhitespace(C)) { 1327 C = *(++TokenEnd); 1328 NumWhitespaceChars++; 1329 } 1330 1331 // Skip \r, \n, \r\n, or \n\r 1332 if (C == '\n' || C == '\r') { 1333 char PrevC = C; 1334 C = *(++TokenEnd); 1335 NumWhitespaceChars++; 1336 if ((C == '\n' || C == '\r') && C != PrevC) 1337 NumWhitespaceChars++; 1338 } 1339 } 1340 1341 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); 1342 } 1343 1344 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 1345 /// get its size, and return it. This is tricky in several cases: 1346 /// 1. If currently at the start of a trigraph, we warn about the trigraph, 1347 /// then either return the trigraph (skipping 3 chars) or the '?', 1348 /// depending on whether trigraphs are enabled or not. 1349 /// 2. If this is an escaped newline (potentially with whitespace between 1350 /// the backslash and newline), implicitly skip the newline and return 1351 /// the char after it. 1352 /// 1353 /// This handles the slow/uncommon case of the getCharAndSize method. Here we 1354 /// know that we can accumulate into Size, and that we have already incremented 1355 /// Ptr by Size bytes. 1356 /// 1357 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 1358 /// be updated to match. 1359 Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) { 1360 unsigned Size = 0; 1361 // If we have a slash, look for an escaped newline. 1362 if (Ptr[0] == '\\') { 1363 ++Size; 1364 ++Ptr; 1365 Slash: 1366 // Common case, backslash-char where the char is not whitespace. 1367 if (!isWhitespace(Ptr[0])) 1368 return {'\\', Size}; 1369 1370 // See if we have optional whitespace characters between the slash and 1371 // newline. 
1372 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1373 // Remember that this token needs to be cleaned. 1374 if (Tok) Tok->setFlag(Token::NeedsCleaning); 1375 1376 // Warn if there was whitespace between the backslash and newline. 1377 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 1378 Diag(Ptr, diag::backslash_newline_space); 1379 1380 // Found backslash<whitespace><newline>. Parse the char after it. 1381 Size += EscapedNewLineSize; 1382 Ptr += EscapedNewLineSize; 1383 1384 // Use slow version to accumulate a correct size field. 1385 auto CharAndSize = getCharAndSizeSlow(Ptr, Tok); 1386 CharAndSize.Size += Size; 1387 return CharAndSize; 1388 } 1389 1390 // Otherwise, this is not an escaped newline, just return the slash. 1391 return {'\\', Size}; 1392 } 1393 1394 // If this is a trigraph, process it. 1395 if (Ptr[0] == '?' && Ptr[1] == '?') { 1396 // If this is actually a legal trigraph (not something like "??x"), emit 1397 // a trigraph warning. If so, and if trigraphs are enabled, return it. 1398 if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr, 1399 LangOpts.Trigraphs)) { 1400 // Remember that this token needs to be cleaned. 1401 if (Tok) Tok->setFlag(Token::NeedsCleaning); 1402 1403 Ptr += 3; 1404 Size += 3; 1405 if (C == '\\') goto Slash; 1406 return {C, Size}; 1407 } 1408 } 1409 1410 // If this is neither, return a single character. 1411 return {*Ptr, Size + 1u}; 1412 } 1413 1414 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 1415 /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 1416 /// and that we have already incremented Ptr by Size bytes. 1417 /// 1418 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should 1419 /// be updated to match. 1420 Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, 1421 const LangOptions &LangOpts) { 1422 1423 unsigned Size = 0; 1424 // If we have a slash, look for an escaped newline. 
1425 if (Ptr[0] == '\\') { 1426 ++Size; 1427 ++Ptr; 1428 Slash: 1429 // Common case, backslash-char where the char is not whitespace. 1430 if (!isWhitespace(Ptr[0])) 1431 return {'\\', Size}; 1432 1433 // See if we have optional whitespace characters followed by a newline. 1434 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1435 // Found backslash<whitespace><newline>. Parse the char after it. 1436 Size += EscapedNewLineSize; 1437 Ptr += EscapedNewLineSize; 1438 1439 // Use slow version to accumulate a correct size field. 1440 auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts); 1441 CharAndSize.Size += Size; 1442 return CharAndSize; 1443 } 1444 1445 // Otherwise, this is not an escaped newline, just return the slash. 1446 return {'\\', Size}; 1447 } 1448 1449 // If this is a trigraph, process it. 1450 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 1451 // If this is actually a legal trigraph (not something like "??x"), return 1452 // it. 1453 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 1454 Ptr += 3; 1455 Size += 3; 1456 if (C == '\\') goto Slash; 1457 return {C, Size}; 1458 } 1459 } 1460 1461 // If this is neither, return a single character. 1462 return {*Ptr, Size + 1u}; 1463 } 1464 1465 //===----------------------------------------------------------------------===// 1466 // Helper methods for lexing. 1467 //===----------------------------------------------------------------------===// 1468 1469 /// Routine that indiscriminately sets the offset into the source file. 1470 void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { 1471 BufferPtr = BufferStart + Offset; 1472 if (BufferPtr > BufferEnd) 1473 BufferPtr = BufferEnd; 1474 // FIXME: What exactly does the StartOfLine bit mean? There are two 1475 // possible meanings for the "start" of the line: the first token on the 1476 // unexpanded line, or the first token on the expanded line. 
1477 IsAtStartOfLine = StartOfLine; 1478 IsAtPhysicalStartOfLine = StartOfLine; 1479 } 1480 1481 static bool isUnicodeWhitespace(uint32_t Codepoint) { 1482 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( 1483 UnicodeWhitespaceCharRanges); 1484 return UnicodeWhitespaceChars.contains(Codepoint); 1485 } 1486 1487 static llvm::SmallString<5> codepointAsHexString(uint32_t C) { 1488 llvm::SmallString<5> CharBuf; 1489 llvm::raw_svector_ostream CharOS(CharBuf); 1490 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); 1491 return CharBuf; 1492 } 1493 1494 // To mitigate https://github.com/llvm/llvm-project/issues/54732, 1495 // we allow "Mathematical Notation Characters" in identifiers. 1496 // This is a proposed profile that extends the XID_Start/XID_continue 1497 // with mathematical symbols, superscipts and subscripts digits 1498 // found in some production software. 1499 // https://www.unicode.org/L2/L2022/22230-math-profile.pdf 1500 static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, 1501 bool IsStart, bool &IsExtension) { 1502 static const llvm::sys::UnicodeCharSet MathStartChars( 1503 MathematicalNotationProfileIDStartRanges); 1504 static const llvm::sys::UnicodeCharSet MathContinueChars( 1505 MathematicalNotationProfileIDContinueRanges); 1506 if (MathStartChars.contains(C) || 1507 (!IsStart && MathContinueChars.contains(C))) { 1508 IsExtension = true; 1509 return true; 1510 } 1511 return false; 1512 } 1513 1514 static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, 1515 bool &IsExtension) { 1516 if (LangOpts.AsmPreprocessor) { 1517 return false; 1518 } else if (LangOpts.DollarIdents && '$' == C) { 1519 return true; 1520 } else if (LangOpts.CPlusPlus || LangOpts.C23) { 1521 // A non-leading codepoint must have the XID_Continue property. 1522 // XIDContinueRanges doesn't contains characters also in XIDStartRanges, 1523 // so we need to check both tables. 
1524 // '_' doesn't have the XID_Continue property but is allowed in C and C++. 1525 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1526 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges); 1527 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C)) 1528 return true; 1529 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false, 1530 IsExtension); 1531 } else if (LangOpts.C11) { 1532 static const llvm::sys::UnicodeCharSet C11AllowedIDChars( 1533 C11AllowedIDCharRanges); 1534 return C11AllowedIDChars.contains(C); 1535 } else { 1536 static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 1537 C99AllowedIDCharRanges); 1538 return C99AllowedIDChars.contains(C); 1539 } 1540 } 1541 1542 static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, 1543 bool &IsExtension) { 1544 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint"); 1545 IsExtension = false; 1546 if (LangOpts.AsmPreprocessor) { 1547 return false; 1548 } 1549 if (LangOpts.CPlusPlus || LangOpts.C23) { 1550 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1551 if (XIDStartChars.contains(C)) 1552 return true; 1553 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true, 1554 IsExtension); 1555 } 1556 if (!isAllowedIDChar(C, LangOpts, IsExtension)) 1557 return false; 1558 if (LangOpts.C11) { 1559 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( 1560 C11DisallowedInitialIDCharRanges); 1561 return !C11DisallowedInitialIDChars.contains(C); 1562 } 1563 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 1564 C99DisallowedInitialIDCharRanges); 1565 return !C99DisallowedInitialIDChars.contains(C); 1566 } 1567 1568 static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, 1569 CharSourceRange Range) { 1570 1571 static const llvm::sys::UnicodeCharSet MathStartChars( 1572 MathematicalNotationProfileIDStartRanges); 1573 
static const llvm::sys::UnicodeCharSet MathContinueChars( 1574 MathematicalNotationProfileIDContinueRanges); 1575 1576 (void)MathStartChars; 1577 (void)MathContinueChars; 1578 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) && 1579 "Unexpected mathematical notation codepoint"); 1580 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation) 1581 << codepointAsHexString(C) << Range; 1582 } 1583 1584 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, 1585 const char *End) { 1586 return CharSourceRange::getCharRange(L.getSourceLocation(Begin), 1587 L.getSourceLocation(End)); 1588 } 1589 1590 static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, 1591 CharSourceRange Range, bool IsFirst) { 1592 // Check C99 compatibility. 1593 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) { 1594 enum { 1595 CannotAppearInIdentifier = 0, 1596 CannotStartIdentifier 1597 }; 1598 1599 static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 1600 C99AllowedIDCharRanges); 1601 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 1602 C99DisallowedInitialIDCharRanges); 1603 if (!C99AllowedIDChars.contains(C)) { 1604 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 1605 << Range 1606 << CannotAppearInIdentifier; 1607 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) { 1608 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 1609 << Range 1610 << CannotStartIdentifier; 1611 } 1612 } 1613 } 1614 1615 /// After encountering UTF-8 character C and interpreting it as an identifier 1616 /// character, check whether it's a homoglyph for a common non-identifier 1617 /// source character that is unlikely to be an intentional identifier 1618 /// character and warn if so. 
1619 static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, 1620 CharSourceRange Range) { 1621 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 1622 struct HomoglyphPair { 1623 uint32_t Character; 1624 char LooksLike; 1625 bool operator<(HomoglyphPair R) const { return Character < R.Character; } 1626 }; 1627 static constexpr HomoglyphPair SortedHomoglyphs[] = { 1628 {U'\u00ad', 0}, // SOFT HYPHEN 1629 {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK 1630 {U'\u037e', ';'}, // GREEK QUESTION MARK 1631 {U'\u200b', 0}, // ZERO WIDTH SPACE 1632 {U'\u200c', 0}, // ZERO WIDTH NON-JOINER 1633 {U'\u200d', 0}, // ZERO WIDTH JOINER 1634 {U'\u2060', 0}, // WORD JOINER 1635 {U'\u2061', 0}, // FUNCTION APPLICATION 1636 {U'\u2062', 0}, // INVISIBLE TIMES 1637 {U'\u2063', 0}, // INVISIBLE SEPARATOR 1638 {U'\u2064', 0}, // INVISIBLE PLUS 1639 {U'\u2212', '-'}, // MINUS SIGN 1640 {U'\u2215', '/'}, // DIVISION SLASH 1641 {U'\u2216', '\\'}, // SET MINUS 1642 {U'\u2217', '*'}, // ASTERISK OPERATOR 1643 {U'\u2223', '|'}, // DIVIDES 1644 {U'\u2227', '^'}, // LOGICAL AND 1645 {U'\u2236', ':'}, // RATIO 1646 {U'\u223c', '~'}, // TILDE OPERATOR 1647 {U'\ua789', ':'}, // MODIFIER LETTER COLON 1648 {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE 1649 {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK 1650 {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN 1651 {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN 1652 {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN 1653 {U'\uff06', '&'}, // FULLWIDTH AMPERSAND 1654 {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS 1655 {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS 1656 {U'\uff0a', '*'}, // FULLWIDTH ASTERISK 1657 {U'\uff0b', '+'}, // FULLWIDTH ASTERISK 1658 {U'\uff0c', ','}, // FULLWIDTH COMMA 1659 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS 1660 {U'\uff0e', '.'}, // FULLWIDTH FULL STOP 1661 {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS 1662 {U'\uff1a', ':'}, // FULLWIDTH COLON 1663 {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON 1664 
{U'\uff1c', '<'},  // FULLWIDTH LESS-THAN SIGN
      {U'\uff1d', '='},  // FULLWIDTH EQUALS SIGN
      {U'\uff1e', '>'},  // FULLWIDTH GREATER-THAN SIGN
      {U'\uff1f', '?'},  // FULLWIDTH QUESTION MARK
      {U'\uff20', '@'},  // FULLWIDTH COMMERCIAL AT
      {U'\uff3b', '['},  // FULLWIDTH LEFT SQUARE BRACKET
      {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
      {U'\uff3d', ']'},  // FULLWIDTH RIGHT SQUARE BRACKET
      {U'\uff3e', '^'},  // FULLWIDTH CIRCUMFLEX ACCENT
      {U'\uff5b', '{'},  // FULLWIDTH LEFT CURLY BRACKET
      {U'\uff5c', '|'},  // FULLWIDTH VERTICAL LINE
      {U'\uff5d', '}'},  // FULLWIDTH RIGHT CURLY BRACKET
      {U'\uff5e', '~'},  // FULLWIDTH TILDE
      {0, 0}
  };
  auto Homoglyph =
      std::lower_bound(std::begin(SortedHomoglyphs),
                       std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
  if (Homoglyph->Character == C) {
    if (Homoglyph->LooksLike) {
      const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
          << Range << codepointAsHexString(C) << LooksLikeStr;
    } else {
      Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
          << Range << codepointAsHexString(C);
    }
  }
}

/// Emit an error for a non-ASCII code point that is not permitted at its
/// position in an identifier.
///
/// Nothing is reported for ASCII code points, nor when the code point is
/// acceptable where it appears (an identifier-start character when \p IsFirst
/// is true, an identifier-continue character otherwise). Every diagnostic
/// emitted here carries a fix-it that removes \p Range.
///
/// \param Diags     engine the error is reported to.
/// \param LangOpts  language options forwarded to the identifier predicates.
/// \param CodePoint the code point under consideration.
/// \param Range     source range of the character; used for the location,
///                  the highlighted range and the removal fix-it.
/// \param IsFirst   true when \p CodePoint would be the first character of
///                  the identifier.
static void diagnoseInvalidUnicodeCodepointInIdentifier(
    DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
    CharSourceRange Range, bool IsFirst) {
  if (isASCII(CodePoint))
    return;

  // IsExtension is only here to satisfy the predicates' signatures; its value
  // is never read in this function.
  bool IsExtension;
  bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
  bool IsIDContinue =
      IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);

  // The character is valid where it appears: nothing to diagnose.
  if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
    return;

  // True when the character would have been fine later in the identifier and
  // is only invalid because it comes first.
  bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;

  if (!IsFirst || InvalidOnlyAtStart) {
    // NOTE(review): the int argument presumably selects between the "not
    // allowed in an identifier" and "not allowed at the start of an
    // identifier" wordings of the diagnostic - confirm against the .td file.
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
        << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
        << FixItHint::CreateRemoval(Range);
  } else {
    // First character, and not valid anywhere in an identifier.
    Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
        << Range << codepointAsHexString(CodePoint)
        << FixItHint::CreateRemoval(Range);
  }
}

/// Try to consume a universal-character-name (UCN) as part of the identifier
/// (or identifier-like suffix) currently being lexed.
///
/// \param CurPtr points at the character that introduces the UCN. This
///               function assigns to it only on the success path, where it
///               ends up equal to the end of the UCN.
/// \param Size   size of the character at \p CurPtr, as reported by
///               getCharAndSize in the callers.
/// \param Result token being lexed; gets the HasUCN flag on success.
/// \returns false if tryReadUCN yields 0, or if the UCN names an ASCII or
///          Unicode-whitespace code point that is not allowed in an
///          identifier. Returns true otherwise; other disallowed code points
///          are diagnosed (outside raw mode) and then accepted for recovery.
bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                                    Token &Result) {
  const char *UCNPtr = CurPtr + Size;
  uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
  if (CodePoint == 0) {
    return false;
  }
  bool IsExtension = false;
  if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CurPtr, UCNPtr),
          /*IsFirst=*/false);

    // We got a Unicode codepoint that is neither a space nor a valid
    // identifier part.
    // Carry on as if the codepoint was valid for recovery purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
                                    makeCharRange(*this, CurPtr, UCNPtr));

    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UCNPtr),
                              /*IsFirst=*/false);
  }

  Result.setFlag(Token::HasUCN);
  // A UCN that occupies exactly 6 bytes and starts with "\u", or exactly 10
  // bytes and starts with "\U", is skipped in one step. Any other spelling is
  // walked one character at a time through getAndAdvanceChar.
  // NOTE(review): presumably the longer spellings contain trigraphs or
  // escaped newlines that getAndAdvanceChar must record on Result - confirm.
  if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
      (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
    CurPtr = UCNPtr;
  else
    while (CurPtr != UCNPtr)
      (void)getAndAdvanceChar(CurPtr, Result);
  return true;
}

/// Try to consume one UTF-8 encoded character as part of the identifier (or
/// identifier-like suffix) currently being lexed.
///
/// \param CurPtr points at the candidate character; assigned only on the
///               success path, where it becomes the end of the decoded
///               sequence.
/// \param Result token being lexed; passed to ConsumeChar on success.
/// \returns false if the bytes do not decode as strict UTF-8, or if the
///          decoded code point is ASCII or Unicode whitespace that is not
///          allowed in an identifier. Returns true otherwise; other
///          disallowed code points are diagnosed (outside raw mode) and then
///          accepted for recovery.
bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
  llvm::UTF32 CodePoint;

  // If a UTF-8 codepoint appears immediately after an escaped new line,
  // CurPtr may point to the splicing \ on the preceding line,
  // so we need to skip it.
  unsigned FirstCodeUnitSize;
  getCharAndSize(CurPtr, FirstCodeUnitSize);
  // The last byte covered by FirstCodeUnitSize is the first code unit itself.
  const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
  const char *UnicodePtr = CharStart;

  // On success UnicodePtr is left pointing past the decoded sequence (it is
  // used below as the end of the character's range).
  llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
      (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
      &CodePoint, llvm::strictConversion);
  if (ConvResult != llvm::conversionOK)
    return false;

  bool IsExtension = false;
  if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
                       IsExtension)) {
    if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
      return false;

    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput())
      diagnoseInvalidUnicodeCodepointInIdentifier(
          PP->getDiagnostics(), LangOpts, CodePoint,
          makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
    // We got a Unicode codepoint that is neither a space nor a valid
    // identifier part. Carry on as if the codepoint was valid for recovery
    // purposes.
  } else if (!isLexingRawMode()) {
    if (IsExtension)
      diagnoseExtensionInIdentifier(
          PP->getDiagnostics(), CodePoint,
          makeCharRange(*this, CharStart, UnicodePtr));
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CharStart, UnicodePtr),
                              /*IsFirst=*/false);
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CharStart, UnicodePtr));
  }

  // Once we successfully parsed some UTF-8,
  // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
  // being lexed, and that warnings about trailing spaces are emitted.
  ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
  CurPtr = UnicodePtr;
  return true;
}

/// Handle a code point \p C, spanning [BufferPtr, CurPtr), that appears where
/// a token may start and is not handled as a plain ASCII character.
///
/// - If \p C may start an identifier, optional diagnostics are emitted and
///   lexing continues in LexIdentifierContinue.
/// - Otherwise, outside raw mode / preprocessor directives / preprocessed
///   output, a non-ASCII, non-whitespace character written directly in the
///   source is diagnosed and dropped: BufferPtr is moved to \p CurPtr and
///   false is returned without forming a token.
/// - Otherwise a tok::unknown token is formed and true is returned.
bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
                                      const char *CurPtr) {
  bool IsExtension = false;
  if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
    if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
        !PP->isPreprocessedOutput()) {
      if (IsExtension)
        diagnoseExtensionInIdentifier(PP->getDiagnostics(), C,
                                      makeCharRange(*this, BufferPtr, CurPtr));
      maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
                                makeCharRange(*this, BufferPtr, CurPtr),
                                /*IsFirst=*/true);
      maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
                                 makeCharRange(*this, BufferPtr, CurPtr));
    }

    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);
  }

  // !isASCII(*BufferPtr) distinguishes a character spelled directly in the
  // source from one written as a UCN (which starts with an ASCII backslash).
  if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
      !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
      !isUnicodeWhitespace(C)) {
    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just drop the character.
    // Note that we can /only/ do this when the non-ASCII character is actually
    // spelled as Unicode, not written as a UCN. The standard requires that
    // we not throw away any possible preprocessor tokens, but there's a
    // loophole in the mapping of Unicode characters to basic character set
    // characters that allows us to map these particular characters to, say,
    // whitespace.
    diagnoseInvalidUnicodeCodepointInIdentifier(
        PP->getDiagnostics(), LangOpts, C,
        makeCharRange(*this, BufferPtr, CurPtr), /*IsFirst=*/true);
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, we have an explicit UCN or a character that's unlikely to show
  // up by accident.
  MIOpt.ReadToken();
  FormTokenWithChars(Result, CurPtr, tok::unknown);
  return true;
}

/// Advance \p CurPtr over a run of plain ASCII identifier-continue characters
/// and return the position of the first character that is not one.
///
/// When built with SSE4.2 the run is scanned 16 bytes at a time for as long
/// as at least 16 bytes remain before \p BufferEnd; the scalar loop handles
/// whatever is left. \p BufferEnd is unused in builds without SSE4.2. The
/// scalar loop has no explicit bound and stops at the first byte for which
/// isAsciiIdentifierContinue is false.
static const char *
fastParseASCIIIdentifier(const char *CurPtr,
                         [[maybe_unused]] const char *BufferEnd) {
#ifdef __SSE4_2__
  // Pairs of inclusive range bounds for _SIDD_CMP_RANGES: "_", A-Z, a-z, 0-9.
  // The remaining eight bytes are zero-initialized.
  alignas(16) static constexpr char AsciiIdentifierRange[16] = {
      '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
  };
  constexpr ssize_t BytesPerRegister = 16;

  __m128i AsciiIdentifierRangeV =
      _mm_load_si128((const __m128i *)AsciiIdentifierRange);

  while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
    __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));

    // With negative polarity and least-significant indexing, the result is
    // the index of the first byte that is outside every range, or 16 when all
    // 16 bytes are identifier characters.
    int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
                                _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES |
                                    _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY);
    CurPtr += Consumed;
    if (Consumed == BytesPerRegister)
      continue;
    return CurPtr;
  }
#endif

  unsigned char C = *CurPtr;
  while (isAsciiIdentifierContinue(C))
    C = *++CurPtr;
  return CurPtr;
}

/// Lex the rest of an identifier whose start has already been matched, form
/// the token, and (outside raw mode) resolve it through the preprocessor.
///
/// \param Result token to fill in.
/// \param CurPtr position just past the identifier characters matched so far.
/// \returns true when the token in \p Result is a raw identifier (raw mode),
///          a code-completion token, or an identifier that needs no further
///          handling; otherwise whatever PP->HandleIdentifier returns.
bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched an identifier start.

  while (true) {

    // Fast path: skip over plain ASCII identifier characters in bulk.
    CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);

    unsigned Size;
    // Slow path: handle trigraph, unicode codepoints, UCNs.
    unsigned char C = getCharAndSize(CurPtr, Size);
    if (isAsciiIdentifierContinue(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents)
        break;
      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      continue;
    }
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      continue;
    if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      continue;
    // Neither an expected Unicode codepoint nor a UCN.
    break;
  }

  // Capture the start before forming the token; the same order is used by the
  // other token-forming routines in this file.
  const char *IdStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
  Result.setRawIdentifierData(IdStart);

  // If we are in raw mode, return this identifier raw.  There is no need to
  // look up identifier information or attempt to macro expand it.
  if (LexingRawMode)
    return true;

  // Fill in Result.IdentifierInfo and update the token kind,
  // looking up the identifier in the identifier table.
  const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
  // Note that we have to call PP->LookUpIdentifierInfo() even for code
  // completion, it writes IdentifierInfo into Result, and callers rely on it.

  // If the completion point is at the end of an identifier, we want to treat
  // the identifier as incomplete even if it resolves to a macro or a keyword.
  // This allows e.g. 'class^' to complete to 'classifier'.
  if (isCodeCompletionPoint(CurPtr)) {
    // Return the code-completion token.
    Result.setKind(tok::code_completion);
    // Skip the code-completion char and all immediate identifier characters.
    // This ensures we get consistent behavior when completing at any point in
    // an identifier (i.e. at the start, in the middle, at the end). Note that
    // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
    // simpler.
    assert(*CurPtr == 0 && "Completion character must be 0");
    ++CurPtr;
    // Note that code completion token is not added as a separate character
    // when the completion point is at the end of the buffer. Therefore, we need
    // to check if the buffer has ended.
    if (CurPtr < BufferEnd) {
      while (isAsciiIdentifierContinue(*CurPtr))
        ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // Finally, now that we know we have an identifier, pass this off to the
  // preprocessor, which may macro expand it or something.
  if (II->isHandleIdentifierCase())
    return PP->HandleIdentifier(Result);

  return true;
}

/// isHexaLiteral - Return true if Start points to a hex constant, i.e. the
/// first two characters (as decoded by getCharAndSizeNoWarn) are '0' followed
/// by 'x' or 'X'. This is used in Microsoft mode (where such a constant is
/// supposed to be several different tokens) and for hex-float detection.
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
  auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
  char C1 = CharAndSize1.Char;
  if (C1 != '0')
    return false;

  auto CharAndSize2 =
      Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
  char C2 = CharAndSize2.Char;
  return (C2 == 'x' || C2 == 'X');
}

/// LexNumericConstant - Lex the remainder of an integer or floating point
/// constant and form a tok::numeric_constant token in \p Result.
///
/// \param CurPtr position just past the part of the constant lexed so far.
/// \returns true once the token has been formed. The function calls itself
///          to continue after an exponent sign, a digit separator, a UCN or
///          a UTF-8 character.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    // In HLSL, un-consume a '.' that is directly followed by 'x' or 'r' and
    // stop.
    // NOTE(review): presumably so that swizzles such as `1.xx` / `1.rr` are
    // not swallowed into the numeric constant - confirm.
    if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
      CurPtr -= Size;
      break;
    }
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (!LangOpts.CPlusPlus17 &&
               std::find(BufferPtr, CurPtr, '_') != CurPtr)
        // An underscore in what has been lexed so far is taken as the start
        // of a ud-suffix.
        IsHexFloat = false;
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a digit separator, continue.
  if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
    // The quote only counts as a separator when an identifier-continue
    // character follows it.
    auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
    if (isAsciiIdentifierContinue(Next)) {
      if (!isLexingRawMode())
        Diag(CurPtr, LangOpts.CPlusPlus
                         ? diag::warn_cxx11_compat_digit_separator
                         : diag::warn_c23_compat_digit_separator);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      CurPtr = ConsumeChar(CurPtr, NextSize, Result);
      return LexNumericConstant(Result, CurPtr);
    }
  }

  // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
  if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
    return LexNumericConstant(Result, CurPtr);
  if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
    return LexNumericConstant(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.
///
/// \param Result          token being lexed; gets HasUDSuffix when a suffix
///                        is accepted.
/// \param CurPtr          position just past the literal's closing quote.
/// \param IsStringLiteral true for string literals (enables the C++14
///                        standard-suffix lookahead), false for character
///                        constants.
/// \returns the position after the suffix, or the incoming position when
///          nothing is treated as a suffix.
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
                               bool IsStringLiteral) {
  assert(LangOpts.CPlusPlus);

  // Maximally munch an identifier.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  // Set when the first suffix character was a UCN or UTF-8 character that has
  // already been consumed by one of the tryConsume* helpers.
  bool Consumed = false;

  if (!isAsciiIdentifierStart(C)) {
    if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
      Consumed = true;
    else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
      Consumed = true;
    else
      return CurPtr;
  }

  if (!LangOpts.CPlusPlus11) {
    // Before C++11 there are no ud-suffixes: only warn about the
    // compatibility issue and suggest inserting a space.
    if (!isLexingRawMode())
      Diag(CurPtr,
           C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                    : diag::warn_cxx11_compat_reserved_user_defined_literal)
        << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
    return CurPtr;
  }

  // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
  // that does not start with an underscore is ill-formed. As a conforming
  // extension, we treat all such suffixes as if they had whitespace before
  // them. We assume a suffix beginning with a UCN or UTF-8 character is more
  // likely to be a ud-suffix than a macro, however, and accept that.
  if (!Consumed) {
    bool IsUDSuffix = false;
    if (C == '_')
      IsUDSuffix = true;
    else if (IsStringLiteral && LangOpts.CPlusPlus14) {
      // In C++1y, we need to look ahead a few characters to see if this is a
      // valid suffix for a string literal or a numeric literal (this could be
      // the 'operator""if' defining a numeric literal operator).
      const unsigned MaxStandardSuffixLength = 3;
      char Buffer[MaxStandardSuffixLength] = { C };
      // Note: this byte offset shadows the outer bool named Consumed.
      unsigned Consumed = Size;
      unsigned Chars = 1;
      while (true) {
        auto [Next, NextSize] =
            getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
        if (!isAsciiIdentifierContinue(Next)) {
          // End of suffix. Check whether this is on the allowed list.
          const StringRef CompleteSuffix(Buffer, Chars);
          IsUDSuffix =
              StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
          break;
        }

        if (Chars == MaxStandardSuffixLength)
          // Too long: can't be a standard suffix.
          break;

        Buffer[Chars++] = Next;
        Consumed += NextSize;
      }
    }

    if (!IsUDSuffix) {
      if (!isLexingRawMode())
        Diag(CurPtr, LangOpts.MSVCCompat
                         ? diag::ext_ms_reserved_user_defined_literal
                         : diag::ext_reserved_user_defined_literal)
            << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    // Accept the first suffix character.
    CurPtr = ConsumeChar(CurPtr, Size, Result);
  }

  Result.setFlag(Token::HasUDSuffix);
  // Consume the rest of the suffix. The two empty branches rely on the
  // tryConsume* calls in their conditions to advance CurPtr.
  while (true) {
    C = getCharAndSize(CurPtr, Size);
    if (isAsciiIdentifierContinue(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
    } else
      break;
  }

  return CurPtr;
}

/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L" or u8" or u" or U".
bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  // Start of the literal's contents; handed to include-file completion below.
  const char *AfterQuote = CurPtr;
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
                                       : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // Unterminated literal: return an unknown token that stops just before
      // the newline / end of buffer.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // A nul before the end of the buffer is either the code-completion
      // marker or a literal nul character embedded in the string.
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
///
/// Always returns true. The token formed in \p Result has kind \p Kind on
/// success, or tok::unknown when the delimiter is invalid or the literal is
/// unterminated.
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  // Measure the delimiter, stopping after at most 16 characters.
  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        // Do not step past the terminating nul of the buffer.
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
///
/// Always returns true. The token is tok::header_name on success, tok::less
/// (covering only the '<') when the name is unterminated, or tok::unknown
/// when a code-completion point is hit.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (isVerticalWhitespace(C) ||               // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character.  Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}

/// Run code completion for a partially typed header name.
///
/// \param PathStart       first character of the path (just after the opening
///                        '"' or '<' in the callers).
/// \param CompletionPoint position of the code-completion marker.
/// \param IsAngled        true for <...> names, false for "..." names; picks
///                        the closing character searched for below.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  // Backslash is treated as a path separator only under MSVCCompat.
  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
  auto Slash = PartialPath.find_last_of(SlashChars);
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote or closest slash,
  // if any.
  while (CompletionPoint < BufferEnd) {
    // Look one character ahead; stop in front of a nul or a line ending, and
    // stop on (i.e. include) the closing delimiter or a slash.
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
    if (SlashChars.contains(Next))
      break;
  }

  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}

/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
///
/// Always returns true. The token has kind \p Kind on success, or
/// tok::unknown for an empty or unterminated constant and when a
/// code-completion point is hit.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // Empty character constant ('').
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
/// Otherwise it returns false after recording the LeadingSpace / StartOfLine
/// flags on \p Result; \p TokAtPhysicalStartOfLine is set to true when a
/// newline was skipped.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Track the last newline seen in this run; the member NewLinePtr is only
  // filled in when it is still null.
  const char *lastNewLine = nullptr;
  auto setLastNewLine = [&](const char *Ptr) {
    lastNewLine = Ptr;
    if (!NewLinePtr)
      NewLinePtr = Ptr;
  };
  if (SawNewline)
    setLastNewLine(CurPtr - 1);

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    if (*CurPtr == '\n')
      setLastNewLine(CurPtr);
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;

    // Two different newline positions mean the run of whitespace spanned more
    // than one newline; report that range to the empty-line handler, if any.
    if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
      if (auto *Handler = PP->getEmptylineHandler())
        Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
                                             getSourceLocation(lastNewLine)));
    }
  }

  BufferPtr = CurPtr;
  return false;
}

/// We have just read the // characters from input.  Skip until we find the
/// newline character that terminates the comment.  Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LineComment) {
    if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
      Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LineComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (isASCII(C) && C != 0 &&   // Potentially EOF.
           C != '\n' && C != '\r') { // Newline or DOS-style newline.
      C = *++CurPtr;
      UnicodeDecodingAlreadyDiagnosed = false;
    }

    if (!isASCII(C)) {
      // Non-ASCII byte: step over a whole UTF-8 sequence, or over a single
      // byte (with at most one warning per ill-formed run) when the sequence
      // size is reported as 0.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
        ++CurPtr;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length;
      }
      continue;
    }

    // Remember where the newline (or nul) is; CurPtr may be moved backwards
    // below to re-read an escape sequence.
    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    // The character just read ends the comment (newline or end of buffer):
    // back up so CurPtr points at it.
    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  NewLinePtr = CurPtr++;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// If in save-comment mode, package up this Line comment in an appropriate
/// way and return it.
///
/// Always returns true. Inside a preprocessor directive (and outside raw
/// mode) the token's spelling is rewritten from "//..." to "/*...*/".
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
/// a diagnostic if so.
We know that the newline is inside of a block comment. 2701 static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, 2702 bool Trigraphs) { 2703 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); 2704 2705 // Position of the first trigraph in the ending sequence. 2706 const char *TrigraphPos = nullptr; 2707 // Position of the first whitespace after a '\' in the ending sequence. 2708 const char *SpacePos = nullptr; 2709 2710 while (true) { 2711 // Back up off the newline. 2712 --CurPtr; 2713 2714 // If this is a two-character newline sequence, skip the other character. 2715 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') { 2716 // \n\n or \r\r -> not escaped newline. 2717 if (CurPtr[0] == CurPtr[1]) 2718 return false; 2719 // \n\r or \r\n -> skip the newline. 2720 --CurPtr; 2721 } 2722 2723 // If we have horizontal whitespace, skip over it. We allow whitespace 2724 // between the slash and newline. 2725 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 2726 SpacePos = CurPtr; 2727 --CurPtr; 2728 } 2729 2730 // If we have a slash, this is an escaped newline. 2731 if (*CurPtr == '\\') { 2732 --CurPtr; 2733 } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') { 2734 // This is a trigraph encoding of a slash. 2735 TrigraphPos = CurPtr - 2; 2736 CurPtr -= 3; 2737 } else { 2738 return false; 2739 } 2740 2741 // If the character preceding the escaped newline is a '*', then after line 2742 // splicing we have a '*/' ending the comment. 2743 if (*CurPtr == '*') 2744 break; 2745 2746 if (*CurPtr != '\n' && *CurPtr != '\r') 2747 return false; 2748 } 2749 2750 if (TrigraphPos) { 2751 // If no trigraphs are enabled, warn that we ignored this trigraph and 2752 // ignore this * character. 
2753 if (!Trigraphs) { 2754 if (!L->isLexingRawMode()) 2755 L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment); 2756 return false; 2757 } 2758 if (!L->isLexingRawMode()) 2759 L->Diag(TrigraphPos, diag::trigraph_ends_block_comment); 2760 } 2761 2762 // Warn about having an escaped newline between the */ characters. 2763 if (!L->isLexingRawMode()) 2764 L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end); 2765 2766 // If there was space between the backslash and newline, warn about it. 2767 if (SpacePos && !L->isLexingRawMode()) 2768 L->Diag(SpacePos, diag::backslash_newline_space); 2769 2770 return true; 2771 } 2772 2773 #ifdef __SSE2__ 2774 #include <emmintrin.h> 2775 #elif __ALTIVEC__ 2776 #include <altivec.h> 2777 #undef bool 2778 #endif 2779 2780 /// We have just read from input the / and * characters that started a comment. 2781 /// Read until we find the * and / characters that terminate the comment. 2782 /// Note that we don't bother decoding trigraphs or escaped newlines in block 2783 /// comments, because they cannot cause the comment to end. The only thing 2784 /// that can happen is the comment could end with an escaped newline between 2785 /// the terminating * and /. 2786 /// 2787 /// If we're in KeepCommentMode or any CommentHandler has inserted 2788 /// some tokens, this will store the first token and return true. 2789 bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, 2790 bool &TokAtPhysicalStartOfLine) { 2791 // Scan one character past where we should, looking for a '/' character. Once 2792 // we find it, check to see if it was preceded by a *. This common 2793 // optimization helps people who like to put a lot of * characters in their 2794 // comments. 2795 2796 // The first character we get with newlines and trigraphs skipped to handle 2797 // the degenerate /*/ case below correctly if the * has an escaped newline 2798 // after it. 
2799 unsigned CharSize; 2800 unsigned char C = getCharAndSize(CurPtr, CharSize); 2801 CurPtr += CharSize; 2802 if (C == 0 && CurPtr == BufferEnd+1) { 2803 if (!isLexingRawMode()) 2804 Diag(BufferPtr, diag::err_unterminated_block_comment); 2805 --CurPtr; 2806 2807 // KeepWhitespaceMode should return this broken comment as a token. Since 2808 // it isn't a well formed comment, just return it as an 'unknown' token. 2809 if (isKeepWhitespaceMode()) { 2810 FormTokenWithChars(Result, CurPtr, tok::unknown); 2811 return true; 2812 } 2813 2814 BufferPtr = CurPtr; 2815 return false; 2816 } 2817 2818 // Check to see if the first character after the '/*' is another /. If so, 2819 // then this slash does not end the block comment, it is part of it. 2820 if (C == '/') 2821 C = *CurPtr++; 2822 2823 // C++23 [lex.phases] p1 2824 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a 2825 // diagnostic only once per entire ill-formed subsequence to avoid 2826 // emiting to many diagnostics (see http://unicode.org/review/pr-121.html). 2827 bool UnicodeDecodingAlreadyDiagnosed = false; 2828 2829 while (true) { 2830 // Skip over all non-interesting characters until we find end of buffer or a 2831 // (probably ending) '/' character. 2832 if (CurPtr + 24 < BufferEnd && 2833 // If there is a code-completion point avoid the fast scan because it 2834 // doesn't check for '\0'. 2835 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { 2836 // While not aligned to a 16-byte boundary. 
2837 while (C != '/' && (intptr_t)CurPtr % 16 != 0) { 2838 if (!isASCII(C)) 2839 goto MultiByteUTF8; 2840 C = *CurPtr++; 2841 } 2842 if (C == '/') goto FoundSlash; 2843 2844 #ifdef __SSE2__ 2845 __m128i Slashes = _mm_set1_epi8('/'); 2846 while (CurPtr + 16 < BufferEnd) { 2847 int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr); 2848 if (LLVM_UNLIKELY(Mask != 0)) { 2849 goto MultiByteUTF8; 2850 } 2851 // look for slashes 2852 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr, 2853 Slashes)); 2854 if (cmp != 0) { 2855 // Adjust the pointer to point directly after the first slash. It's 2856 // not necessary to set C here, it will be overwritten at the end of 2857 // the outer loop. 2858 CurPtr += llvm::countr_zero<unsigned>(cmp) + 1; 2859 goto FoundSlash; 2860 } 2861 CurPtr += 16; 2862 } 2863 #elif __ALTIVEC__ 2864 __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2865 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2866 0x80, 0x80, 0x80, 0x80}; 2867 __vector unsigned char Slashes = { 2868 '/', '/', '/', '/', '/', '/', '/', '/', 2869 '/', '/', '/', '/', '/', '/', '/', '/' 2870 }; 2871 while (CurPtr + 16 < BufferEnd) { 2872 if (LLVM_UNLIKELY( 2873 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF))) 2874 goto MultiByteUTF8; 2875 if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) { 2876 break; 2877 } 2878 CurPtr += 16; 2879 } 2880 2881 #else 2882 while (CurPtr + 16 < BufferEnd) { 2883 bool HasNonASCII = false; 2884 for (unsigned I = 0; I < 16; ++I) 2885 HasNonASCII |= !isASCII(CurPtr[I]); 2886 2887 if (LLVM_UNLIKELY(HasNonASCII)) 2888 goto MultiByteUTF8; 2889 2890 bool HasSlash = false; 2891 for (unsigned I = 0; I < 16; ++I) 2892 HasSlash |= CurPtr[I] == '/'; 2893 if (HasSlash) 2894 break; 2895 CurPtr += 16; 2896 } 2897 #endif 2898 2899 // It has to be one of the bytes scanned, increment to it and read one. 
2900 C = *CurPtr++; 2901 } 2902 2903 // Loop to scan the remainder, warning on invalid UTF-8 2904 // if the corresponding warning is enabled, emitting a diagnostic only once 2905 // per sequence that cannot be decoded. 2906 while (C != '/' && C != '\0') { 2907 if (isASCII(C)) { 2908 UnicodeDecodingAlreadyDiagnosed = false; 2909 C = *CurPtr++; 2910 continue; 2911 } 2912 MultiByteUTF8: 2913 // CurPtr is 1 code unit past C, so to decode 2914 // the codepoint, we need to read from the previous position. 2915 unsigned Length = llvm::getUTF8SequenceSize( 2916 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd); 2917 if (Length == 0) { 2918 if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode()) 2919 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment); 2920 UnicodeDecodingAlreadyDiagnosed = true; 2921 } else { 2922 UnicodeDecodingAlreadyDiagnosed = false; 2923 CurPtr += Length - 1; 2924 } 2925 C = *CurPtr++; 2926 } 2927 2928 if (C == '/') { 2929 FoundSlash: 2930 if (CurPtr[-2] == '*') // We found the final */. We're done! 2931 break; 2932 2933 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 2934 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this, 2935 LangOpts.Trigraphs)) { 2936 // We found the final */, though it had an escaped newline between the 2937 // * and /. We're done! 2938 break; 2939 } 2940 } 2941 if (CurPtr[0] == '*' && CurPtr[1] != '/') { 2942 // If this is a /* inside of the comment, emit a warning. Don't do this 2943 // if this is a /*/, which will end the comment. This misses cases with 2944 // embedded escaped newlines, but oh well. 2945 if (!isLexingRawMode()) 2946 Diag(CurPtr-1, diag::warn_nested_block_comment); 2947 } 2948 } else if (C == 0 && CurPtr == BufferEnd+1) { 2949 if (!isLexingRawMode()) 2950 Diag(BufferPtr, diag::err_unterminated_block_comment); 2951 // Note: the user probably forgot a */. 
We could continue immediately 2952 // after the /*, but this would involve lexing a lot of what really is the 2953 // comment, which surely would confuse the parser. 2954 --CurPtr; 2955 2956 // KeepWhitespaceMode should return this broken comment as a token. Since 2957 // it isn't a well formed comment, just return it as an 'unknown' token. 2958 if (isKeepWhitespaceMode()) { 2959 FormTokenWithChars(Result, CurPtr, tok::unknown); 2960 return true; 2961 } 2962 2963 BufferPtr = CurPtr; 2964 return false; 2965 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 2966 PP->CodeCompleteNaturalLanguage(); 2967 cutOffLexing(); 2968 return false; 2969 } 2970 2971 C = *CurPtr++; 2972 } 2973 2974 // Notify comment handlers about the comment unless we're in a #if 0 block. 2975 if (PP && !isLexingRawMode() && 2976 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 2977 getSourceLocation(CurPtr)))) { 2978 BufferPtr = CurPtr; 2979 return true; // A token has to be returned. 2980 } 2981 2982 // If we are returning comments as tokens, return this comment as a token. 2983 if (inKeepCommentMode()) { 2984 FormTokenWithChars(Result, CurPtr, tok::comment); 2985 return true; 2986 } 2987 2988 // It is common for the tokens immediately after a /**/ comment to be 2989 // whitespace. Instead of going through the big switch, handle it 2990 // efficiently now. This is safe even in KeepWhitespaceMode because we would 2991 // have already returned above with the comment as a token. 2992 if (isHorizontalWhitespace(*CurPtr)) { 2993 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine); 2994 return false; 2995 } 2996 2997 // Otherwise, just return so that the next character will be lexed as a token. 
2998 BufferPtr = CurPtr; 2999 Result.setFlag(Token::LeadingSpace); 3000 return false; 3001 } 3002 3003 //===----------------------------------------------------------------------===// 3004 // Primary Lexing Entry Points 3005 //===----------------------------------------------------------------------===// 3006 3007 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 3008 /// uninterpreted string. This switches the lexer out of directive mode. 3009 void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) { 3010 assert(ParsingPreprocessorDirective && ParsingFilename == false && 3011 "Must be in a preprocessing directive!"); 3012 Token Tmp; 3013 Tmp.startToken(); 3014 3015 // CurPtr - Cache BufferPtr in an automatic variable. 3016 const char *CurPtr = BufferPtr; 3017 while (true) { 3018 char Char = getAndAdvanceChar(CurPtr, Tmp); 3019 switch (Char) { 3020 default: 3021 if (Result) 3022 Result->push_back(Char); 3023 break; 3024 case 0: // Null. 3025 // Found end of file? 3026 if (CurPtr-1 != BufferEnd) { 3027 if (isCodeCompletionPoint(CurPtr-1)) { 3028 PP->CodeCompleteNaturalLanguage(); 3029 cutOffLexing(); 3030 return; 3031 } 3032 3033 // Nope, normal character, continue. 3034 if (Result) 3035 Result->push_back(Char); 3036 break; 3037 } 3038 // FALL THROUGH. 3039 [[fallthrough]]; 3040 case '\r': 3041 case '\n': 3042 // Okay, we found the end of the line. First, back up past the \0, \r, \n. 3043 assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 3044 BufferPtr = CurPtr-1; 3045 3046 // Next, lex the character, which should handle the EOD transition. 3047 Lex(Tmp); 3048 if (Tmp.is(tok::code_completion)) { 3049 if (PP) 3050 PP->CodeCompleteNaturalLanguage(); 3051 Lex(Tmp); 3052 } 3053 assert(Tmp.is(tok::eod) && "Unexpected token!"); 3054 3055 // Finally, we're done; 3056 return; 3057 } 3058 } 3059 } 3060 3061 /// LexEndOfFile - CurPtr points to the end of this file. 
Handle this 3062 /// condition, reporting diagnostics and handling other edge cases as required. 3063 /// This returns true if Result contains a token, false if PP.Lex should be 3064 /// called again. 3065 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 3066 // If we hit the end of the file while parsing a preprocessor directive, 3067 // end the preprocessor directive first. The next token returned will 3068 // then be the end of file. 3069 if (ParsingPreprocessorDirective) { 3070 // Done parsing the "line". 3071 ParsingPreprocessorDirective = false; 3072 // Update the location of token as well as BufferPtr. 3073 FormTokenWithChars(Result, CurPtr, tok::eod); 3074 3075 // Restore comment saving mode, in case it was disabled for directive. 3076 if (PP) 3077 resetExtendedTokenMode(); 3078 return true; // Have a token. 3079 } 3080 3081 // If we are in raw mode, return this event as an EOF token. Let the caller 3082 // that put us in raw mode handle the event. 3083 if (isLexingRawMode()) { 3084 Result.startToken(); 3085 BufferPtr = BufferEnd; 3086 FormTokenWithChars(Result, BufferEnd, tok::eof); 3087 return true; 3088 } 3089 3090 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { 3091 PP->setRecordedPreambleConditionalStack(ConditionalStack); 3092 // If the preamble cuts off the end of a header guard, consider it guarded. 3093 // The guard is valid for the preamble content itself, and for tools the 3094 // most useful answer is "yes, this file has a header guard". 3095 if (!ConditionalStack.empty()) 3096 MIOpt.ExitTopLevelConditional(); 3097 ConditionalStack.clear(); 3098 } 3099 3100 // Issue diagnostics for unterminated #if and missing newline. 3101 3102 // If we are in a #if directive, emit an error. 
3103 while (!ConditionalStack.empty()) { 3104 if (PP->getCodeCompletionFileLoc() != FileLoc) 3105 PP->Diag(ConditionalStack.back().IfLoc, 3106 diag::err_pp_unterminated_conditional); 3107 ConditionalStack.pop_back(); 3108 } 3109 3110 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 3111 // a pedwarn. 3112 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { 3113 DiagnosticsEngine &Diags = PP->getDiagnostics(); 3114 SourceLocation EndLoc = getSourceLocation(BufferEnd); 3115 unsigned DiagID; 3116 3117 if (LangOpts.CPlusPlus11) { 3118 // C++11 [lex.phases] 2.2 p2 3119 // Prefer the C++98 pedantic compatibility warning over the generic, 3120 // non-extension, user-requested "missing newline at EOF" warning. 3121 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) { 3122 DiagID = diag::warn_cxx98_compat_no_newline_eof; 3123 } else { 3124 DiagID = diag::warn_no_newline_eof; 3125 } 3126 } else { 3127 DiagID = diag::ext_no_newline_eof; 3128 } 3129 3130 Diag(BufferEnd, DiagID) 3131 << FixItHint::CreateInsertion(EndLoc, "\n"); 3132 } 3133 3134 BufferPtr = CurPtr; 3135 3136 // Finally, let the preprocessor handle this. 3137 return PP->HandleEndOfFile(Result, isPragmaLexer()); 3138 } 3139 3140 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 3141 /// the specified lexer will return a tok::l_paren token, 0 if it is something 3142 /// else and 2 if there are no more tokens in the buffer controlled by the 3143 /// lexer. 3144 unsigned Lexer::isNextPPTokenLParen() { 3145 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 3146 3147 if (isDependencyDirectivesLexer()) { 3148 if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) 3149 return 2; 3150 return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( 3151 tok::l_paren); 3152 } 3153 3154 // Switch to 'skipping' mode. 
This will ensure that we can lex a token 3155 // without emitting diagnostics, disables macro expansion, and will cause EOF 3156 // to return an EOF token instead of popping the include stack. 3157 LexingRawMode = true; 3158 3159 // Save state that can be changed while lexing so that we can restore it. 3160 const char *TmpBufferPtr = BufferPtr; 3161 bool inPPDirectiveMode = ParsingPreprocessorDirective; 3162 bool atStartOfLine = IsAtStartOfLine; 3163 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 3164 bool leadingSpace = HasLeadingSpace; 3165 3166 Token Tok; 3167 Lex(Tok); 3168 3169 // Restore state that may have changed. 3170 BufferPtr = TmpBufferPtr; 3171 ParsingPreprocessorDirective = inPPDirectiveMode; 3172 HasLeadingSpace = leadingSpace; 3173 IsAtStartOfLine = atStartOfLine; 3174 IsAtPhysicalStartOfLine = atPhysicalStartOfLine; 3175 3176 // Restore the lexer back to non-skipping mode. 3177 LexingRawMode = false; 3178 3179 if (Tok.is(tok::eof)) 3180 return 2; 3181 return Tok.is(tok::l_paren); 3182 } 3183 3184 /// Find the end of a version control conflict marker. 3185 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 3186 ConflictMarkerKind CMK) { 3187 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 3188 size_t TermLen = CMK == CMK_Perforce ? 5 : 7; 3189 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen); 3190 size_t Pos = RestOfBuffer.find(Terminator); 3191 while (Pos != StringRef::npos) { 3192 // Must occur at start of line. 
3193 if (Pos == 0 || 3194 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) { 3195 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 3196 Pos = RestOfBuffer.find(Terminator); 3197 continue; 3198 } 3199 return RestOfBuffer.data()+Pos; 3200 } 3201 return nullptr; 3202 } 3203 3204 /// IsStartOfConflictMarker - If the specified pointer is the start of a version 3205 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 3206 /// and recover nicely. This returns true if it is a conflict marker and false 3207 /// if not. 3208 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 3209 // Only a conflict marker if it starts at the beginning of a line. 3210 if (CurPtr != BufferStart && 3211 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 3212 return false; 3213 3214 // Check to see if we have <<<<<<< or >>>>. 3215 if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") && 3216 !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> ")) 3217 return false; 3218 3219 // If we have a situation where we don't care about conflict markers, ignore 3220 // it. 3221 if (CurrentConflictMarkerState || isLexingRawMode()) 3222 return false; 3223 3224 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 3225 3226 // Check to see if there is an ending marker somewhere in the buffer at the 3227 // start of a line to terminate this conflict marker. 3228 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 3229 // We found a match. We are really in a conflict marker. 3230 // Diagnose this, and ignore to the end of line. 3231 Diag(CurPtr, diag::err_conflict_marker); 3232 CurrentConflictMarkerState = Kind; 3233 3234 // Skip ahead to the end of line. We know this exists because the 3235 // end-of-conflict marker starts with \r or \n. 
3236 while (*CurPtr != '\r' && *CurPtr != '\n') { 3237 assert(CurPtr != BufferEnd && "Didn't find end of line"); 3238 ++CurPtr; 3239 } 3240 BufferPtr = CurPtr; 3241 return true; 3242 } 3243 3244 // No end of conflict marker found. 3245 return false; 3246 } 3247 3248 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 3249 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 3250 /// is the end of a conflict marker. Handle it by ignoring up until the end of 3251 /// the line. This returns true if it is a conflict marker and false if not. 3252 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 3253 // Only a conflict marker if it starts at the beginning of a line. 3254 if (CurPtr != BufferStart && 3255 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 3256 return false; 3257 3258 // If we have a situation where we don't care about conflict markers, ignore 3259 // it. 3260 if (!CurrentConflictMarkerState || isLexingRawMode()) 3261 return false; 3262 3263 // Check to see if we have the marker (4 characters in a row). 3264 for (unsigned i = 1; i != 4; ++i) 3265 if (CurPtr[i] != CurPtr[0]) 3266 return false; 3267 3268 // If we do have it, search for the end of the conflict marker. This could 3269 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 3270 // be the end of conflict marker. 3271 if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 3272 CurrentConflictMarkerState)) { 3273 CurPtr = End; 3274 3275 // Skip ahead to the end of line. 3276 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 3277 ++CurPtr; 3278 3279 BufferPtr = CurPtr; 3280 3281 // No longer in the conflict marker. 
3282 CurrentConflictMarkerState = CMK_None; 3283 return true; 3284 } 3285 3286 return false; 3287 } 3288 3289 static const char *findPlaceholderEnd(const char *CurPtr, 3290 const char *BufferEnd) { 3291 if (CurPtr == BufferEnd) 3292 return nullptr; 3293 BufferEnd -= 1; // Scan until the second last character. 3294 for (; CurPtr != BufferEnd; ++CurPtr) { 3295 if (CurPtr[0] == '#' && CurPtr[1] == '>') 3296 return CurPtr + 2; 3297 } 3298 return nullptr; 3299 } 3300 3301 bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) { 3302 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!"); 3303 if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode) 3304 return false; 3305 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd); 3306 if (!End) 3307 return false; 3308 const char *Start = CurPtr - 1; 3309 if (!LangOpts.AllowEditorPlaceholders) 3310 Diag(Start, diag::err_placeholder_in_source); 3311 Result.startToken(); 3312 FormTokenWithChars(Result, End, tok::raw_identifier); 3313 Result.setRawIdentifierData(Start); 3314 PP->LookUpIdentifierInfo(Result); 3315 Result.setFlag(Token::IsEditorPlaceholder); 3316 BufferPtr = End; 3317 return true; 3318 } 3319 3320 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 3321 if (PP && PP->isCodeCompletionEnabled()) { 3322 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 3323 return Loc == PP->getCodeCompletionLoc(); 3324 } 3325 3326 return false; 3327 } 3328 3329 std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr, 3330 const char *SlashLoc, 3331 Token *Result) { 3332 unsigned CharSize; 3333 char Kind = getCharAndSize(StartPtr, CharSize); 3334 assert((Kind == 'u' || Kind == 'U') && "expected a UCN"); 3335 3336 unsigned NumHexDigits; 3337 if (Kind == 'u') 3338 NumHexDigits = 4; 3339 else if (Kind == 'U') 3340 NumHexDigits = 8; 3341 3342 bool Delimited = false; 3343 bool FoundEndDelimiter = false; 3344 unsigned Count = 0; 3345 bool 
Diagnose = Result && !isLexingRawMode(); 3346 3347 if (!LangOpts.CPlusPlus && !LangOpts.C99) { 3348 if (Diagnose) 3349 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); 3350 return std::nullopt; 3351 } 3352 3353 const char *CurPtr = StartPtr + CharSize; 3354 const char *KindLoc = &CurPtr[-1]; 3355 3356 uint32_t CodePoint = 0; 3357 while (Count != NumHexDigits || Delimited) { 3358 char C = getCharAndSize(CurPtr, CharSize); 3359 if (!Delimited && Count == 0 && C == '{') { 3360 Delimited = true; 3361 CurPtr += CharSize; 3362 continue; 3363 } 3364 3365 if (Delimited && C == '}') { 3366 CurPtr += CharSize; 3367 FoundEndDelimiter = true; 3368 break; 3369 } 3370 3371 unsigned Value = llvm::hexDigitValue(C); 3372 if (Value == -1U) { 3373 if (!Delimited) 3374 break; 3375 if (Diagnose) 3376 Diag(SlashLoc, diag::warn_delimited_ucn_incomplete) 3377 << StringRef(KindLoc, 1); 3378 return std::nullopt; 3379 } 3380 3381 if (CodePoint & 0xF000'0000) { 3382 if (Diagnose) 3383 Diag(KindLoc, diag::err_escape_too_large) << 0; 3384 return std::nullopt; 3385 } 3386 3387 CodePoint <<= 4; 3388 CodePoint |= Value; 3389 CurPtr += CharSize; 3390 Count++; 3391 } 3392 3393 if (Count == 0) { 3394 if (Diagnose) 3395 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty 3396 : diag::warn_ucn_escape_no_digits) 3397 << StringRef(KindLoc, 1); 3398 return std::nullopt; 3399 } 3400 3401 if (Delimited && Kind == 'U') { 3402 if (Diagnose) 3403 Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1); 3404 return std::nullopt; 3405 } 3406 3407 if (!Delimited && Count != NumHexDigits) { 3408 if (Diagnose) { 3409 Diag(SlashLoc, diag::warn_ucn_escape_incomplete); 3410 // If the user wrote \U1234, suggest a fixit to \u. 
3411 if (Count == 4 && NumHexDigits == 8) { 3412 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); 3413 Diag(KindLoc, diag::note_ucn_four_not_eight) 3414 << FixItHint::CreateReplacement(URange, "u"); 3415 } 3416 } 3417 return std::nullopt; 3418 } 3419 3420 if (Delimited && PP) { 3421 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23 3422 ? diag::warn_cxx23_delimited_escape_sequence 3423 : diag::ext_delimited_escape_sequence) 3424 << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0); 3425 } 3426 3427 if (Result) { 3428 Result->setFlag(Token::HasUCN); 3429 // If the UCN contains either a trigraph or a line splicing, 3430 // we need to call getAndAdvanceChar again to set the appropriate flags 3431 // on Result. 3432 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0))) 3433 StartPtr = CurPtr; 3434 else 3435 while (StartPtr != CurPtr) 3436 (void)getAndAdvanceChar(StartPtr, *Result); 3437 } else { 3438 StartPtr = CurPtr; 3439 } 3440 return CodePoint; 3441 } 3442 3443 std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr, 3444 const char *SlashLoc, 3445 Token *Result) { 3446 unsigned CharSize; 3447 bool Diagnose = Result && !isLexingRawMode(); 3448 3449 char C = getCharAndSize(StartPtr, CharSize); 3450 assert(C == 'N' && "expected \\N{...}"); 3451 3452 const char *CurPtr = StartPtr + CharSize; 3453 const char *KindLoc = &CurPtr[-1]; 3454 3455 C = getCharAndSize(CurPtr, CharSize); 3456 if (C != '{') { 3457 if (Diagnose) 3458 Diag(SlashLoc, diag::warn_ucn_escape_incomplete); 3459 return std::nullopt; 3460 } 3461 CurPtr += CharSize; 3462 const char *StartName = CurPtr; 3463 bool FoundEndDelimiter = false; 3464 llvm::SmallVector<char, 30> Buffer; 3465 while (C) { 3466 C = getCharAndSize(CurPtr, CharSize); 3467 CurPtr += CharSize; 3468 if (C == '}') { 3469 FoundEndDelimiter = true; 3470 break; 3471 } 3472 3473 if (isVerticalWhitespace(C)) 3474 break; 3475 Buffer.push_back(C); 3476 } 3477 3478 if (!FoundEndDelimiter || 
Buffer.empty()) { 3479 if (Diagnose) 3480 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty 3481 : diag::warn_delimited_ucn_incomplete) 3482 << StringRef(KindLoc, 1); 3483 return std::nullopt; 3484 } 3485 3486 StringRef Name(Buffer.data(), Buffer.size()); 3487 std::optional<char32_t> Match = 3488 llvm::sys::unicode::nameToCodepointStrict(Name); 3489 std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch; 3490 if (!Match) { 3491 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name); 3492 if (Diagnose) { 3493 Diag(StartName, diag::err_invalid_ucn_name) 3494 << StringRef(Buffer.data(), Buffer.size()) 3495 << makeCharRange(*this, StartName, CurPtr - CharSize); 3496 if (LooseMatch) { 3497 Diag(StartName, diag::note_invalid_ucn_name_loose_matching) 3498 << FixItHint::CreateReplacement( 3499 makeCharRange(*this, StartName, CurPtr - CharSize), 3500 LooseMatch->Name); 3501 } 3502 } 3503 // We do not offer misspelled character names suggestions here 3504 // as the set of what would be a valid suggestion depends on context, 3505 // and we should not make invalid suggestions. 3506 } 3507 3508 if (Diagnose && Match) 3509 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23 3510 ? diag::warn_cxx23_delimited_escape_sequence 3511 : diag::ext_delimited_escape_sequence) 3512 << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0); 3513 3514 // If no diagnostic has been emitted yet, likely because we are doing a 3515 // tentative lexing, we do not want to recover here to make sure the token 3516 // will not be incorrectly considered valid. This function will be called 3517 // again and a diagnostic emitted then. 3518 if (LooseMatch && Diagnose) 3519 Match = LooseMatch->CodePoint; 3520 3521 if (Result) { 3522 Result->setFlag(Token::HasUCN); 3523 // If the UCN contains either a trigraph or a line splicing, 3524 // we need to call getAndAdvanceChar again to set the appropriate flags 3525 // on Result. 
3526 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3)) 3527 StartPtr = CurPtr; 3528 else 3529 while (StartPtr != CurPtr) 3530 (void)getAndAdvanceChar(StartPtr, *Result); 3531 } else { 3532 StartPtr = CurPtr; 3533 } 3534 return Match ? std::optional<uint32_t>(*Match) : std::nullopt; 3535 } 3536 3537 uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, 3538 Token *Result) { 3539 3540 unsigned CharSize; 3541 std::optional<uint32_t> CodePointOpt; 3542 char Kind = getCharAndSize(StartPtr, CharSize); 3543 if (Kind == 'u' || Kind == 'U') 3544 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result); 3545 else if (Kind == 'N') 3546 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result); 3547 3548 if (!CodePointOpt) 3549 return 0; 3550 3551 uint32_t CodePoint = *CodePointOpt; 3552 3553 // Don't apply C family restrictions to UCNs in assembly mode 3554 if (LangOpts.AsmPreprocessor) 3555 return CodePoint; 3556 3557 // C23 6.4.3p2: A universal character name shall not designate a code point 3558 // where the hexadecimal value is: 3559 // - in the range D800 through DFFF inclusive; or 3560 // - greater than 10FFFF. 3561 // A universal-character-name outside the c-char-sequence of a character 3562 // constant, or the s-char-sequence of a string-literal shall not designate 3563 // a control character or a character in the basic character set. 3564 3565 // C++11 [lex.charset]p2: If the hexadecimal value for a 3566 // universal-character-name corresponds to a surrogate code point (in the 3567 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, 3568 // if the hexadecimal value for a universal-character-name outside the 3569 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or 3570 // string literal corresponds to a control character (in either of the 3571 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the 3572 // basic source character set, the program is ill-formed. 
3573 if (CodePoint < 0xA0) { 3574 // We don't use isLexingRawMode() here because we need to warn about bad 3575 // UCNs even when skipping preprocessing tokens in a #if block. 3576 if (Result && PP) { 3577 if (CodePoint < 0x20 || CodePoint >= 0x7F) 3578 Diag(BufferPtr, diag::err_ucn_control_character); 3579 else { 3580 char C = static_cast<char>(CodePoint); 3581 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 3582 } 3583 } 3584 3585 return 0; 3586 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { 3587 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 3588 // We don't use isLexingRawMode() here because we need to diagnose bad 3589 // UCNs even when skipping preprocessing tokens in a #if block. 3590 if (Result && PP) { 3591 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11) 3592 Diag(BufferPtr, diag::warn_ucn_escape_surrogate); 3593 else 3594 Diag(BufferPtr, diag::err_ucn_escape_invalid); 3595 } 3596 return 0; 3597 } 3598 3599 return CodePoint; 3600 } 3601 3602 bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, 3603 const char *CurPtr) { 3604 if (!isLexingRawMode() && !PP->isPreprocessedOutput() && 3605 isUnicodeWhitespace(C)) { 3606 Diag(BufferPtr, diag::ext_unicode_whitespace) 3607 << makeCharRange(*this, BufferPtr, CurPtr); 3608 3609 Result.setFlag(Token::LeadingSpace); 3610 return true; 3611 } 3612 return false; 3613 } 3614 3615 void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { 3616 IsAtStartOfLine = Result.isAtStartOfLine(); 3617 HasLeadingSpace = Result.hasLeadingSpace(); 3618 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); 3619 // Note that this doesn't affect IsAtPhysicalStartOfLine. 3620 } 3621 3622 bool Lexer::Lex(Token &Result) { 3623 assert(!isDependencyDirectivesLexer()); 3624 3625 // Start a new token. 3626 Result.startToken(); 3627 3628 // Set up misc whitespace flags for LexTokenInternal. 
3629 if (IsAtStartOfLine) { 3630 Result.setFlag(Token::StartOfLine); 3631 IsAtStartOfLine = false; 3632 } 3633 3634 if (HasLeadingSpace) { 3635 Result.setFlag(Token::LeadingSpace); 3636 HasLeadingSpace = false; 3637 } 3638 3639 if (HasLeadingEmptyMacro) { 3640 Result.setFlag(Token::LeadingEmptyMacro); 3641 HasLeadingEmptyMacro = false; 3642 } 3643 3644 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 3645 IsAtPhysicalStartOfLine = false; 3646 bool isRawLex = isLexingRawMode(); 3647 (void) isRawLex; 3648 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine); 3649 // (After the LexTokenInternal call, the lexer might be destroyed.) 3650 assert((returnedToken || !isRawLex) && "Raw lex must succeed"); 3651 return returnedToken; 3652 } 3653 3654 /// LexTokenInternal - This implements a simple C family lexer. It is an 3655 /// extremely performance critical piece of code. This assumes that the buffer 3656 /// has a null character at the end of the file. This returns a preprocessing 3657 /// token, not a normal token, as such, it is an internal interface. It assumes 3658 /// that the Flags of result have been cleared before calling this. 3659 bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { 3660 LexStart: 3661 assert(!Result.needsCleaning() && "Result needs cleaning"); 3662 assert(!Result.hasPtrData() && "Result has not been reset"); 3663 3664 // CurPtr - Cache BufferPtr in an automatic variable. 3665 const char *CurPtr = BufferPtr; 3666 3667 // Small amounts of horizontal whitespace is very common between tokens. 3668 if (isHorizontalWhitespace(*CurPtr)) { 3669 do { 3670 ++CurPtr; 3671 } while (isHorizontalWhitespace(*CurPtr)); 3672 3673 // If we are keeping whitespace and other tokens, just return what we just 3674 // skipped. The next lexer invocation will return the token after the 3675 // whitespace. 
if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    // Not keeping whitespace: commit the skipped run and remember that the
    // upcoming token was preceded by space.
    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  if (!isVerticalWhitespace(Char))
    NewLinePtr = nullptr;

  // Dispatch on the first character. Cases either return directly (delegating
  // to a specialised lexing routine), jump to LexNextToken to restart, or set
  // Kind and break so the token is formed after the switch.
  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    // An embedded NUL that is neither EOF nor the completion point: diagnose
    // it (outside raw mode) and then treat it like whitespace.
    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26:  // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
      return LexEndOfFile(Result, CurPtr-1);
    }

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\r':
    // Consume the '\n' of a "\r\n" pair so it is handled as one newline.
    if (CurPtr[0] == '\n')
      (void)getAndAdvanceChar(CurPtr, Result);
    [[fallthrough]];
  case '\n':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
      NewLinePtr = CurPtr - 1;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true; // KeepWhitespaceMode

  SkipIgnoredUnits:
    // Re-read the cached pointer from BufferPtr.
    // NOTE(review): this relies on the skip helpers above having advanced
    // BufferPtr past what they consumed - confirm against their definitions.
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  // Identifier (e.g., uber), or
  // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
  // UTF-8 or UTF-16 string literal (C11/C++11).
  case 'u':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      // Peek only; characters are consumed (via ConsumeChar) solely on the
      // branch that actually recognises a literal prefix.
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        // UTF-8 character constant (only in C++17 / C23 and later)
        if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.CPlusPlus11) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifierContinue(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.CPlusPlus11 && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.
    [[fallthrough]];

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    // (When reached by falling through from 'L' this is a second call.)
    MIOpt.ReadToken();
    return LexIdentifierContinue(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifierContinue(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr,
                            ParsingFilename ? tok::header_name
                                            : tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      // NOTE(review): advances by SizeTmp directly rather than through
      // ConsumeChar as the other punctuator branches do.
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {      // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {   // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {   // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // Line comment.
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment.  There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with Line comments would lex it as
      // "foo".  Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      // However, we never do this if we are just preprocessing.
      bool TreatAsComment =
          LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

      if (TreatAsComment) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                            TokAtPhysicalStartOfLine))
          return true; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line).  Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                           TokAtPhysicalStartOfLine))
        return true; // There is a token to return.

      // We only saw whitespace, so just try again with this lexer.
      // (We manually eliminate the tail call to avoid recursion.)
      goto LexNextToken;
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace;                             // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;                          // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else {                                         // '%:' -> '#'
        // We parsed a # character.  If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive.  Callback to
        // the preprocessor to handle it.
        // TODO: -fpreprocessed mode??
        if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      // In an #include-style context '<' opens an angled header name.
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '>') {
        if (LangOpts.CPlusPlus20) {
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
          CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                               SizeTmp2, Result);
          Kind = tok::spaceship;
          break;
        }
        // Suggest adding a space between the '<=' and the '>' to avoid a
        // change in semantics if this turns up in C++ <=17 mode.
        if (LangOpts.CPlusPlus && !isLexingRawMode()) {
          Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
            << FixItHint::CreateInsertion(
                   getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
        }
      }
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++0x [lex.pptoken]p3:
        //  Otherwise, if the next three characters are <:: and the subsequent
        //  character is neither : nor >, the < is treated as a preprocessor
        //  token by itself and not as the first character of the alternative
        //  token <:.
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
               lexEditorPlaceholder(Result, CurPtr)) {
      return true;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else if (LangOpts.OpenCL && Char == '^') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretcaret;
    } else {
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character.  If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive.  Callback to
      // the preprocessor to handle it.
      // TODO: -fpreprocessed mode??
      if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && LangOpts.ObjC)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    if (!LangOpts.AsmPreprocessor) {
      // A zero result from tryReadUCN means "no usable UCN", in which case the
      // backslash falls through to tok::unknown below.
      if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
        if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
          if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
            return true; // KeepWhitespaceMode

          // We only saw whitespace, so just try again with this lexer.
          // (We manually eliminate the tail call to avoid recursion.)
          goto LexNextToken;
        }

        return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
      }
    }

    Kind = tok::unknown;
    break;

  default: {
    // Any remaining ASCII character has no meaning to the lexer.
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII lead byte: decode one UTF-8 sequence starting at the byte that
    // was just consumed.
    llvm::UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    llvm::ConversionResult Status =
        llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
                                  (const llvm::UTF8 *)BufferEnd,
                                  &CodePoint,
                                  llvm::strictConversion);
    if (Status == llvm::conversionOK) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }
      return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
    }

    // Invalid UTF-8. In these modes the offending byte is kept as a
    // one-character unknown token instead of being diagnosed and dropped.
    if (isLexingRawMode() || ParsingPreprocessorDirective ||
        PP->isPreprocessedOutput()) {
      ++CurPtr;
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just diagnose the invalid UTF-8, then drop the character.
    Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    // We're pretending the character didn't exist, so just try again with
    // this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  }
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
4425 FormTokenWithChars(Result, CurPtr, Kind); 4426 return true; 4427 4428 HandleDirective: 4429 // We parsed a # character and it's the start of a preprocessing directive. 4430 4431 FormTokenWithChars(Result, CurPtr, tok::hash); 4432 PP->HandleDirective(Result); 4433 4434 if (PP->hadModuleLoaderFatalFailure()) 4435 // With a fatal failure in the module loader, we abort parsing. 4436 return true; 4437 4438 // We parsed the directive; lex a token with the new state. 4439 return false; 4440 4441 LexNextToken: 4442 Result.clearFlag(Token::NeedsCleaning); 4443 goto LexStart; 4444 } 4445 4446 const char *Lexer::convertDependencyDirectiveToken( 4447 const dependency_directives_scan::Token &DDTok, Token &Result) { 4448 const char *TokPtr = BufferStart + DDTok.Offset; 4449 Result.startToken(); 4450 Result.setLocation(getSourceLocation(TokPtr)); 4451 Result.setKind(DDTok.Kind); 4452 Result.setFlag((Token::TokenFlags)DDTok.Flags); 4453 Result.setLength(DDTok.Length); 4454 BufferPtr = TokPtr + DDTok.Length; 4455 return TokPtr; 4456 } 4457 4458 bool Lexer::LexDependencyDirectiveToken(Token &Result) { 4459 assert(isDependencyDirectivesLexer()); 4460 4461 using namespace dependency_directives_scan; 4462 4463 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) { 4464 if (DepDirectives.front().Kind == pp_eof) 4465 return LexEndOfFile(Result, BufferEnd); 4466 if (DepDirectives.front().Kind == tokens_present_before_eof) 4467 MIOpt.ReadToken(); 4468 NextDepDirectiveTokenIndex = 0; 4469 DepDirectives = DepDirectives.drop_front(); 4470 } 4471 4472 const dependency_directives_scan::Token &DDTok = 4473 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++]; 4474 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) { 4475 // Read something other than a preprocessor directive hash. 
4476 MIOpt.ReadToken(); 4477 } 4478 4479 if (ParsingFilename && DDTok.is(tok::less)) { 4480 BufferPtr = BufferStart + DDTok.Offset; 4481 LexAngledStringLiteral(Result, BufferPtr + 1); 4482 if (Result.isNot(tok::header_name)) 4483 return true; 4484 // Advance the index of lexed tokens. 4485 while (true) { 4486 const dependency_directives_scan::Token &NextTok = 4487 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex]; 4488 if (BufferStart + NextTok.Offset >= BufferPtr) 4489 break; 4490 ++NextDepDirectiveTokenIndex; 4491 } 4492 return true; 4493 } 4494 4495 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result); 4496 4497 if (Result.is(tok::hash) && Result.isAtStartOfLine()) { 4498 PP->HandleDirective(Result); 4499 return false; 4500 } 4501 if (Result.is(tok::raw_identifier)) { 4502 Result.setRawIdentifierData(TokPtr); 4503 if (!isLexingRawMode()) { 4504 const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 4505 if (II->isHandleIdentifierCase()) 4506 return PP->HandleIdentifier(Result); 4507 } 4508 return true; 4509 } 4510 if (Result.isLiteral()) { 4511 Result.setLiteralData(TokPtr); 4512 return true; 4513 } 4514 if (Result.is(tok::colon)) { 4515 // Convert consecutive colons to 'tok::coloncolon'. 
4516 if (*BufferPtr == ':') { 4517 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( 4518 tok::colon)); 4519 ++NextDepDirectiveTokenIndex; 4520 Result.setKind(tok::coloncolon); 4521 } 4522 return true; 4523 } 4524 if (Result.is(tok::eod)) 4525 ParsingPreprocessorDirective = false; 4526 4527 return true; 4528 } 4529 4530 bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) { 4531 assert(isDependencyDirectivesLexer()); 4532 4533 using namespace dependency_directives_scan; 4534 4535 bool Stop = false; 4536 unsigned NestedIfs = 0; 4537 do { 4538 DepDirectives = DepDirectives.drop_front(); 4539 switch (DepDirectives.front().Kind) { 4540 case pp_none: 4541 llvm_unreachable("unexpected 'pp_none'"); 4542 case pp_include: 4543 case pp___include_macros: 4544 case pp_define: 4545 case pp_undef: 4546 case pp_import: 4547 case pp_pragma_import: 4548 case pp_pragma_once: 4549 case pp_pragma_push_macro: 4550 case pp_pragma_pop_macro: 4551 case pp_pragma_include_alias: 4552 case pp_pragma_system_header: 4553 case pp_include_next: 4554 case decl_at_import: 4555 case cxx_module_decl: 4556 case cxx_import_decl: 4557 case cxx_export_module_decl: 4558 case cxx_export_import_decl: 4559 case tokens_present_before_eof: 4560 break; 4561 case pp_if: 4562 case pp_ifdef: 4563 case pp_ifndef: 4564 ++NestedIfs; 4565 break; 4566 case pp_elif: 4567 case pp_elifdef: 4568 case pp_elifndef: 4569 case pp_else: 4570 if (!NestedIfs) { 4571 Stop = true; 4572 } 4573 break; 4574 case pp_endif: 4575 if (!NestedIfs) { 4576 Stop = true; 4577 } else { 4578 --NestedIfs; 4579 } 4580 break; 4581 case pp_eof: 4582 NextDepDirectiveTokenIndex = 0; 4583 return LexEndOfFile(Result, BufferEnd); 4584 } 4585 } while (!Stop); 4586 4587 const dependency_directives_scan::Token &DDTok = 4588 DepDirectives.front().Tokens.front(); 4589 assert(DDTok.is(tok::hash)); 4590 NextDepDirectiveTokenIndex = 1; 4591 4592 convertDependencyDirectiveToken(DDTok, Result); 4593 return false; 4594 } 4595