1 //===- Lexer.cpp - C Language Family Lexer --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the Lexer and Token interfaces. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "clang/Lex/Lexer.h" 14 #include "UnicodeCharSets.h" 15 #include "clang/Basic/CharInfo.h" 16 #include "clang/Basic/Diagnostic.h" 17 #include "clang/Basic/IdentifierTable.h" 18 #include "clang/Basic/LLVM.h" 19 #include "clang/Basic/LangOptions.h" 20 #include "clang/Basic/SourceLocation.h" 21 #include "clang/Basic/SourceManager.h" 22 #include "clang/Basic/TokenKinds.h" 23 #include "clang/Lex/LexDiagnostic.h" 24 #include "clang/Lex/LiteralSupport.h" 25 #include "clang/Lex/MultipleIncludeOpt.h" 26 #include "clang/Lex/Preprocessor.h" 27 #include "clang/Lex/PreprocessorOptions.h" 28 #include "clang/Lex/Token.h" 29 #include "llvm/ADT/STLExtras.h" 30 #include "llvm/ADT/StringExtras.h" 31 #include "llvm/ADT/StringRef.h" 32 #include "llvm/ADT/StringSwitch.h" 33 #include "llvm/Support/Compiler.h" 34 #include "llvm/Support/ConvertUTF.h" 35 #include "llvm/Support/MathExtras.h" 36 #include "llvm/Support/MemoryBufferRef.h" 37 #include "llvm/Support/NativeFormatting.h" 38 #include "llvm/Support/Unicode.h" 39 #include "llvm/Support/UnicodeCharRanges.h" 40 #include <algorithm> 41 #include <cassert> 42 #include <cstddef> 43 #include <cstdint> 44 #include <cstring> 45 #include <optional> 46 #include <string> 47 #include <tuple> 48 #include <utility> 49 50 #ifdef __SSE4_2__ 51 #include <nmmintrin.h> 52 #endif 53 54 using namespace clang; 55 56 //===----------------------------------------------------------------------===// 57 // Token Class 
Implementation 58 //===----------------------------------------------------------------------===// 59 60 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 61 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 62 if (isAnnotation()) 63 return false; 64 if (const IdentifierInfo *II = getIdentifierInfo()) 65 return II->getObjCKeywordID() == objcKey; 66 return false; 67 } 68 69 /// getObjCKeywordID - Return the ObjC keyword kind. 70 tok::ObjCKeywordKind Token::getObjCKeywordID() const { 71 if (isAnnotation()) 72 return tok::objc_not_keyword; 73 const IdentifierInfo *specId = getIdentifierInfo(); 74 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 75 } 76 77 /// Determine whether the token kind starts a simple-type-specifier. 78 bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const { 79 switch (getKind()) { 80 case tok::annot_typename: 81 case tok::annot_decltype: 82 case tok::annot_pack_indexing_type: 83 return true; 84 85 case tok::kw_short: 86 case tok::kw_long: 87 case tok::kw___int64: 88 case tok::kw___int128: 89 case tok::kw_signed: 90 case tok::kw_unsigned: 91 case tok::kw_void: 92 case tok::kw_char: 93 case tok::kw_int: 94 case tok::kw_half: 95 case tok::kw_float: 96 case tok::kw_double: 97 case tok::kw___bf16: 98 case tok::kw__Float16: 99 case tok::kw___float128: 100 case tok::kw___ibm128: 101 case tok::kw_wchar_t: 102 case tok::kw_bool: 103 case tok::kw__Bool: 104 case tok::kw__Accum: 105 case tok::kw__Fract: 106 case tok::kw__Sat: 107 #define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait: 108 #include "clang/Basic/TransformTypeTraits.def" 109 case tok::kw___auto_type: 110 case tok::kw_char16_t: 111 case tok::kw_char32_t: 112 case tok::kw_typeof: 113 case tok::kw_decltype: 114 case tok::kw_char8_t: 115 return getIdentifierInfo()->isKeyword(LangOpts); 116 117 default: 118 return false; 119 } 120 } 121 122 
//===----------------------------------------------------------------------===// 123 // Lexer Class Implementation 124 //===----------------------------------------------------------------------===// 125 126 void Lexer::anchor() {} 127 128 void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 129 const char *BufEnd) { 130 BufferStart = BufStart; 131 BufferPtr = BufPtr; 132 BufferEnd = BufEnd; 133 134 assert(BufEnd[0] == 0 && 135 "We assume that the input buffer has a null character at the end" 136 " to simplify lexing!"); 137 138 // Check whether we have a BOM in the beginning of the buffer. If yes - act 139 // accordingly. Right now we support only UTF-8 with and without BOM, so, just 140 // skip the UTF-8 BOM if it's present. 141 if (BufferStart == BufferPtr) { 142 // Determine the size of the BOM. 143 StringRef Buf(BufferStart, BufferEnd - BufferStart); 144 size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 145 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 146 .Default(0); 147 148 // Skip the BOM. 149 BufferPtr += BOMLength; 150 } 151 152 Is_PragmaLexer = false; 153 CurrentConflictMarkerState = CMK_None; 154 155 // Start of the file is a start of line. 156 IsAtStartOfLine = true; 157 IsAtPhysicalStartOfLine = true; 158 159 HasLeadingSpace = false; 160 HasLeadingEmptyMacro = false; 161 162 // We are not after parsing a #. 163 ParsingPreprocessorDirective = false; 164 165 // We are not after parsing #include. 166 ParsingFilename = false; 167 168 // We are not in raw mode. Raw mode disables diagnostics and interpretation 169 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 170 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 171 // or otherwise skipping over tokens. 172 LexingRawMode = false; 173 174 // Default to not keeping comments. 
175 ExtendedTokenMode = 0; 176 177 NewLinePtr = nullptr; 178 } 179 180 /// Lexer constructor - Create a new lexer object for the specified buffer 181 /// with the specified preprocessor managing the lexing process. This lexer 182 /// assumes that the associated file buffer and Preprocessor objects will 183 /// outlive it, so it doesn't take ownership of either of them. 184 Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, 185 Preprocessor &PP, bool IsFirstIncludeOfFile) 186 : PreprocessorLexer(&PP, FID), 187 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 188 LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment), 189 IsFirstTimeLexingFile(IsFirstIncludeOfFile) { 190 InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(), 191 InputFile.getBufferEnd()); 192 193 resetExtendedTokenMode(); 194 } 195 196 /// Lexer constructor - Create a new raw lexer object. This object is only 197 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 198 /// range will outlive it, so it doesn't take ownership of it. 199 Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, 200 const char *BufStart, const char *BufPtr, const char *BufEnd, 201 bool IsFirstIncludeOfFile) 202 : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment), 203 IsFirstTimeLexingFile(IsFirstIncludeOfFile) { 204 InitLexer(BufStart, BufPtr, BufEnd); 205 206 // We *are* in raw mode. 207 LexingRawMode = true; 208 } 209 210 /// Lexer constructor - Create a new raw lexer object. This object is only 211 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 212 /// range will outlive it, so it doesn't take ownership of it. 
213 Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile, 214 const SourceManager &SM, const LangOptions &langOpts, 215 bool IsFirstIncludeOfFile) 216 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(), 217 FromFile.getBufferStart(), FromFile.getBufferEnd(), 218 IsFirstIncludeOfFile) {} 219 220 void Lexer::resetExtendedTokenMode() { 221 assert(PP && "Cannot reset token mode without a preprocessor"); 222 if (LangOpts.TraditionalCPP) 223 SetKeepWhitespaceMode(true); 224 else 225 SetCommentRetentionState(PP->getCommentRetentionState()); 226 } 227 228 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 229 /// _Pragma expansion. This has a variety of magic semantics that this method 230 /// sets up. It returns a new'd Lexer that must be delete'd when done. 231 /// 232 /// On entrance to this routine, TokStartLoc is a macro location which has a 233 /// spelling loc that indicates the bytes to be lexed for the token and an 234 /// expansion location that indicates where all lexed tokens should be 235 /// "expanded from". 236 /// 237 /// TODO: It would really be nice to make _Pragma just be a wrapper around a 238 /// normal lexer that remaps tokens as they fly by. This would require making 239 /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 240 /// interface that could handle this stuff. This would pull GetMappedTokenLoc 241 /// out of the critical path of the lexer! 242 /// 243 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 244 SourceLocation ExpansionLocStart, 245 SourceLocation ExpansionLocEnd, 246 unsigned TokLen, Preprocessor &PP) { 247 SourceManager &SM = PP.getSourceManager(); 248 249 // Create the lexer as if we were going to lex the file normally. 
250 FileID SpellingFID = SM.getFileID(SpellingLoc); 251 llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID); 252 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 253 254 // Now that the lexer is created, change the start/end locations so that we 255 // just lex the subsection of the file that we want. This is lexing from a 256 // scratch buffer. 257 const char *StrData = SM.getCharacterData(SpellingLoc); 258 259 L->BufferPtr = StrData; 260 L->BufferEnd = StrData+TokLen; 261 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 262 263 // Set the SourceLocation with the remapping information. This ensures that 264 // GetMappedTokenLoc will remap the tokens as they are lexed. 265 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 266 ExpansionLocStart, 267 ExpansionLocEnd, TokLen); 268 269 // Ensure that the lexer thinks it is inside a directive, so that end \n will 270 // return an EOD token. 271 L->ParsingPreprocessorDirective = true; 272 273 // This lexer really is for _Pragma. 274 L->Is_PragmaLexer = true; 275 return L; 276 } 277 278 void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) { 279 this->IsAtPhysicalStartOfLine = IsAtStartOfLine; 280 this->IsAtStartOfLine = IsAtStartOfLine; 281 assert((BufferStart + Offset) <= BufferEnd); 282 BufferPtr = BufferStart + Offset; 283 } 284 285 template <typename T> static void StringifyImpl(T &Str, char Quote) { 286 typename T::size_type i = 0, e = Str.size(); 287 while (i < e) { 288 if (Str[i] == '\\' || Str[i] == Quote) { 289 Str.insert(Str.begin() + i, '\\'); 290 i += 2; 291 ++e; 292 } else if (Str[i] == '\n' || Str[i] == '\r') { 293 // Replace '\r\n' and '\n\r' to '\\' followed by 'n'. 294 if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && 295 Str[i] != Str[i + 1]) { 296 Str[i] = '\\'; 297 Str[i + 1] = 'n'; 298 } else { 299 // Replace '\n' and '\r' to '\\' followed by 'n'. 
300 Str[i] = '\\'; 301 Str.insert(Str.begin() + i + 1, 'n'); 302 ++e; 303 } 304 i += 2; 305 } else 306 ++i; 307 } 308 } 309 310 std::string Lexer::Stringify(StringRef Str, bool Charify) { 311 std::string Result = std::string(Str); 312 char Quote = Charify ? '\'' : '"'; 313 StringifyImpl(Result, Quote); 314 return Result; 315 } 316 317 void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); } 318 319 //===----------------------------------------------------------------------===// 320 // Token Spelling 321 //===----------------------------------------------------------------------===// 322 323 /// Slow case of getSpelling. Extract the characters comprising the 324 /// spelling of this token from the provided input buffer. 325 static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, 326 const LangOptions &LangOpts, char *Spelling) { 327 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); 328 329 size_t Length = 0; 330 const char *BufEnd = BufPtr + Tok.getLength(); 331 332 if (tok::isStringLiteral(Tok.getKind())) { 333 // Munch the encoding-prefix and opening double-quote. 334 while (BufPtr < BufEnd) { 335 auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts); 336 Spelling[Length++] = CharAndSize.Char; 337 BufPtr += CharAndSize.Size; 338 339 if (Spelling[Length - 1] == '"') 340 break; 341 } 342 343 // Raw string literals need special handling; trigraph expansion and line 344 // splicing do not occur within their d-char-sequence nor within their 345 // r-char-sequence. 346 if (Length >= 2 && 347 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { 348 // Search backwards from the end of the token to find the matching closing 349 // quote. 350 const char *RawEnd = BufEnd; 351 do --RawEnd; while (*RawEnd != '"'); 352 size_t RawLength = RawEnd - BufPtr + 1; 353 354 // Everything between the quotes is included verbatim in the spelling. 
355 memcpy(Spelling + Length, BufPtr, RawLength); 356 Length += RawLength; 357 BufPtr += RawLength; 358 359 // The rest of the token is lexed normally. 360 } 361 } 362 363 while (BufPtr < BufEnd) { 364 auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts); 365 Spelling[Length++] = CharAndSize.Char; 366 BufPtr += CharAndSize.Size; 367 } 368 369 assert(Length < Tok.getLength() && 370 "NeedsCleaning flag set on token that didn't need cleaning!"); 371 return Length; 372 } 373 374 /// getSpelling() - Return the 'spelling' of this token. The spelling of a 375 /// token are the characters used to represent the token in the source file 376 /// after trigraph expansion and escaped-newline folding. In particular, this 377 /// wants to get the true, uncanonicalized, spelling of things like digraphs 378 /// UCNs, etc. 379 StringRef Lexer::getSpelling(SourceLocation loc, 380 SmallVectorImpl<char> &buffer, 381 const SourceManager &SM, 382 const LangOptions &options, 383 bool *invalid) { 384 // Break down the source location. 385 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 386 387 // Try to the load the file buffer. 388 bool invalidTemp = false; 389 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 390 if (invalidTemp) { 391 if (invalid) *invalid = true; 392 return {}; 393 } 394 395 const char *tokenBegin = file.data() + locInfo.second; 396 397 // Lex from the start of the given location. 398 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 399 file.begin(), tokenBegin, file.end()); 400 Token token; 401 lexer.LexFromRawLexer(token); 402 403 unsigned length = token.getLength(); 404 405 // Common case: no need for cleaning. 406 if (!token.needsCleaning()) 407 return StringRef(tokenBegin, length); 408 409 // Hard case, we need to relex the characters into the string. 
410 buffer.resize(length); 411 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); 412 return StringRef(buffer.data(), buffer.size()); 413 } 414 415 /// getSpelling() - Return the 'spelling' of this token. The spelling of a 416 /// token are the characters used to represent the token in the source file 417 /// after trigraph expansion and escaped-newline folding. In particular, this 418 /// wants to get the true, uncanonicalized, spelling of things like digraphs 419 /// UCNs, etc. 420 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 421 const LangOptions &LangOpts, bool *Invalid) { 422 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 423 424 bool CharDataInvalid = false; 425 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 426 &CharDataInvalid); 427 if (Invalid) 428 *Invalid = CharDataInvalid; 429 if (CharDataInvalid) 430 return {}; 431 432 // If this token contains nothing interesting, return it directly. 433 if (!Tok.needsCleaning()) 434 return std::string(TokStart, TokStart + Tok.getLength()); 435 436 std::string Result; 437 Result.resize(Tok.getLength()); 438 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); 439 return Result; 440 } 441 442 /// getSpelling - This method is used to get the spelling of a token into a 443 /// preallocated buffer, instead of as an std::string. The caller is required 444 /// to allocate enough space for the token, which is guaranteed to be at least 445 /// Tok.getLength() bytes long. The actual length of the token is returned. 446 /// 447 /// Note that this method may do two possible things: it may either fill in 448 /// the buffer specified with characters, or it may *change the input pointer* 449 /// to point to a constant buffer with the data already in it (avoiding a 450 /// copy). The caller is not allowed to modify the returned buffer pointer 451 /// if an internal buffer is returned. 
452 unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 453 const SourceManager &SourceMgr, 454 const LangOptions &LangOpts, bool *Invalid) { 455 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 456 457 const char *TokStart = nullptr; 458 // NOTE: this has to be checked *before* testing for an IdentifierInfo. 459 if (Tok.is(tok::raw_identifier)) 460 TokStart = Tok.getRawIdentifier().data(); 461 else if (!Tok.hasUCN()) { 462 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 463 // Just return the string from the identifier table, which is very quick. 464 Buffer = II->getNameStart(); 465 return II->getLength(); 466 } 467 } 468 469 // NOTE: this can be checked even after testing for an IdentifierInfo. 470 if (Tok.isLiteral()) 471 TokStart = Tok.getLiteralData(); 472 473 if (!TokStart) { 474 // Compute the start of the token in the input lexer buffer. 475 bool CharDataInvalid = false; 476 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 477 if (Invalid) 478 *Invalid = CharDataInvalid; 479 if (CharDataInvalid) { 480 Buffer = ""; 481 return 0; 482 } 483 } 484 485 // If this token contains nothing interesting, return it directly. 486 if (!Tok.needsCleaning()) { 487 Buffer = TokStart; 488 return Tok.getLength(); 489 } 490 491 // Otherwise, hard case, relex the characters into the string. 492 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); 493 } 494 495 /// MeasureTokenLength - Relex the token at the specified location and return 496 /// its length in bytes in the input file. If the token needs cleaning (e.g. 497 /// includes a trigraph or an escaped newline) then this count includes bytes 498 /// that are part of that. 
499 unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 500 const SourceManager &SM, 501 const LangOptions &LangOpts) { 502 Token TheTok; 503 if (getRawToken(Loc, TheTok, SM, LangOpts)) 504 return 0; 505 return TheTok.getLength(); 506 } 507 508 /// Relex the token at the specified location. 509 /// \returns true if there was a failure, false on success. 510 bool Lexer::getRawToken(SourceLocation Loc, Token &Result, 511 const SourceManager &SM, 512 const LangOptions &LangOpts, 513 bool IgnoreWhiteSpace) { 514 // TODO: this could be special cased for common tokens like identifiers, ')', 515 // etc to make this faster, if it mattered. Just look at StrData[0] to handle 516 // all obviously single-char tokens. This could use 517 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 518 // something. 519 520 // If this comes from a macro expansion, we really do want the macro name, not 521 // the token this macro expanded to. 522 Loc = SM.getExpansionLoc(Loc); 523 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 524 bool Invalid = false; 525 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 526 if (Invalid) 527 return true; 528 529 const char *StrData = Buffer.data()+LocInfo.second; 530 531 if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) 532 return true; 533 534 // Create a lexer starting at the beginning of this token. 535 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 536 Buffer.begin(), StrData, Buffer.end()); 537 TheLexer.SetCommentRetentionState(true); 538 TheLexer.LexFromRawLexer(Result); 539 return false; 540 } 541 542 /// Returns the pointer that points to the beginning of line that contains 543 /// the given offset, or null if the offset if invalid. 
544 static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) { 545 const char *BufStart = Buffer.data(); 546 if (Offset >= Buffer.size()) 547 return nullptr; 548 549 const char *LexStart = BufStart + Offset; 550 for (; LexStart != BufStart; --LexStart) { 551 if (isVerticalWhitespace(LexStart[0]) && 552 !Lexer::isNewLineEscaped(BufStart, LexStart)) { 553 // LexStart should point at first character of logical line. 554 ++LexStart; 555 break; 556 } 557 } 558 return LexStart; 559 } 560 561 static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 562 const SourceManager &SM, 563 const LangOptions &LangOpts) { 564 assert(Loc.isFileID()); 565 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 566 if (LocInfo.first.isInvalid()) 567 return Loc; 568 569 bool Invalid = false; 570 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 571 if (Invalid) 572 return Loc; 573 574 // Back up from the current location until we hit the beginning of a line 575 // (or the buffer). We'll relex from that point. 576 const char *StrData = Buffer.data() + LocInfo.second; 577 const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second); 578 if (!LexStart || LexStart == StrData) 579 return Loc; 580 581 // Create a lexer starting at the beginning of this token. 582 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 583 Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart, 584 Buffer.end()); 585 TheLexer.SetCommentRetentionState(true); 586 587 // Lex tokens until we find the token that contains the source location. 588 Token TheTok; 589 do { 590 TheLexer.LexFromRawLexer(TheTok); 591 592 if (TheLexer.getBufferLocation() > StrData) { 593 // Lexing this token has taken the lexer past the source location we're 594 // looking for. If the current token encompasses our source location, 595 // return the beginning of that token. 
596 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 597 return TheTok.getLocation(); 598 599 // We ended up skipping over the source location entirely, which means 600 // that it points into whitespace. We're done here. 601 break; 602 } 603 } while (TheTok.getKind() != tok::eof); 604 605 // We've passed our source location; just return the original source location. 606 return Loc; 607 } 608 609 SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 610 const SourceManager &SM, 611 const LangOptions &LangOpts) { 612 if (Loc.isFileID()) 613 return getBeginningOfFileToken(Loc, SM, LangOpts); 614 615 if (!SM.isMacroArgExpansion(Loc)) 616 return Loc; 617 618 SourceLocation FileLoc = SM.getSpellingLoc(Loc); 619 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 620 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 621 std::pair<FileID, unsigned> BeginFileLocInfo = 622 SM.getDecomposedLoc(BeginFileLoc); 623 assert(FileLocInfo.first == BeginFileLocInfo.first && 624 FileLocInfo.second >= BeginFileLocInfo.second); 625 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); 626 } 627 628 namespace { 629 630 enum PreambleDirectiveKind { 631 PDK_Skipped, 632 PDK_Unknown 633 }; 634 635 } // namespace 636 637 PreambleBounds Lexer::ComputePreamble(StringRef Buffer, 638 const LangOptions &LangOpts, 639 unsigned MaxLines) { 640 // Create a lexer starting at the beginning of the file. Note that we use a 641 // "fake" file source location at offset 1 so that the lexer will track our 642 // position within the file. 
643 const SourceLocation::UIntTy StartOffset = 1; 644 SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset); 645 Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(), 646 Buffer.end()); 647 TheLexer.SetCommentRetentionState(true); 648 649 bool InPreprocessorDirective = false; 650 Token TheTok; 651 SourceLocation ActiveCommentLoc; 652 653 unsigned MaxLineOffset = 0; 654 if (MaxLines) { 655 const char *CurPtr = Buffer.begin(); 656 unsigned CurLine = 0; 657 while (CurPtr != Buffer.end()) { 658 char ch = *CurPtr++; 659 if (ch == '\n') { 660 ++CurLine; 661 if (CurLine == MaxLines) 662 break; 663 } 664 } 665 if (CurPtr != Buffer.end()) 666 MaxLineOffset = CurPtr - Buffer.begin(); 667 } 668 669 do { 670 TheLexer.LexFromRawLexer(TheTok); 671 672 if (InPreprocessorDirective) { 673 // If we've hit the end of the file, we're done. 674 if (TheTok.getKind() == tok::eof) { 675 break; 676 } 677 678 // If we haven't hit the end of the preprocessor directive, skip this 679 // token. 680 if (!TheTok.isAtStartOfLine()) 681 continue; 682 683 // We've passed the end of the preprocessor directive, and will look 684 // at this token again below. 685 InPreprocessorDirective = false; 686 } 687 688 // Keep track of the # of lines in the preamble. 689 if (TheTok.isAtStartOfLine()) { 690 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 691 692 // If we were asked to limit the number of lines in the preamble, 693 // and we're about to exceed that limit, we're done. 694 if (MaxLineOffset && TokOffset >= MaxLineOffset) 695 break; 696 } 697 698 // Comments are okay; skip over them. 699 if (TheTok.getKind() == tok::comment) { 700 if (ActiveCommentLoc.isInvalid()) 701 ActiveCommentLoc = TheTok.getLocation(); 702 continue; 703 } 704 705 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 706 // This is the start of a preprocessor directive. 
707 Token HashTok = TheTok; 708 InPreprocessorDirective = true; 709 ActiveCommentLoc = SourceLocation(); 710 711 // Figure out which directive this is. Since we're lexing raw tokens, 712 // we don't have an identifier table available. Instead, just look at 713 // the raw identifier to recognize and categorize preprocessor directives. 714 TheLexer.LexFromRawLexer(TheTok); 715 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 716 StringRef Keyword = TheTok.getRawIdentifier(); 717 PreambleDirectiveKind PDK 718 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 719 .Case("include", PDK_Skipped) 720 .Case("__include_macros", PDK_Skipped) 721 .Case("define", PDK_Skipped) 722 .Case("undef", PDK_Skipped) 723 .Case("line", PDK_Skipped) 724 .Case("error", PDK_Skipped) 725 .Case("pragma", PDK_Skipped) 726 .Case("import", PDK_Skipped) 727 .Case("include_next", PDK_Skipped) 728 .Case("warning", PDK_Skipped) 729 .Case("ident", PDK_Skipped) 730 .Case("sccs", PDK_Skipped) 731 .Case("assert", PDK_Skipped) 732 .Case("unassert", PDK_Skipped) 733 .Case("if", PDK_Skipped) 734 .Case("ifdef", PDK_Skipped) 735 .Case("ifndef", PDK_Skipped) 736 .Case("elif", PDK_Skipped) 737 .Case("elifdef", PDK_Skipped) 738 .Case("elifndef", PDK_Skipped) 739 .Case("else", PDK_Skipped) 740 .Case("endif", PDK_Skipped) 741 .Default(PDK_Unknown); 742 743 switch (PDK) { 744 case PDK_Skipped: 745 continue; 746 747 case PDK_Unknown: 748 // We don't know what this directive is; stop at the '#'. 749 break; 750 } 751 } 752 753 // We only end up here if we didn't recognize the preprocessor 754 // directive or it was one that can't occur in the preamble at this 755 // point. Roll back the current token to the location of the '#'. 
756 TheTok = HashTok; 757 } else if (TheTok.isAtStartOfLine() && 758 TheTok.getKind() == tok::raw_identifier && 759 TheTok.getRawIdentifier() == "module" && 760 LangOpts.CPlusPlusModules) { 761 // The initial global module fragment introducer "module;" is part of 762 // the preamble, which runs up to the module declaration "module foo;". 763 Token ModuleTok = TheTok; 764 do { 765 TheLexer.LexFromRawLexer(TheTok); 766 } while (TheTok.getKind() == tok::comment); 767 if (TheTok.getKind() != tok::semi) { 768 // Not global module fragment, roll back. 769 TheTok = ModuleTok; 770 break; 771 } 772 continue; 773 } 774 775 // We hit a token that we don't recognize as being in the 776 // "preprocessing only" part of the file, so we're no longer in 777 // the preamble. 778 break; 779 } while (true); 780 781 SourceLocation End; 782 if (ActiveCommentLoc.isValid()) 783 End = ActiveCommentLoc; // don't truncate a decl comment. 784 else 785 End = TheTok.getLocation(); 786 787 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(), 788 TheTok.isAtStartOfLine()); 789 } 790 791 unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, 792 const SourceManager &SM, 793 const LangOptions &LangOpts) { 794 // Figure out how many physical characters away the specified expansion 795 // character is. This needs to take into consideration newlines and 796 // trigraphs. 797 bool Invalid = false; 798 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 799 800 // If they request the first char of the token, we're trivially done. 801 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 802 return 0; 803 804 unsigned PhysOffset = 0; 805 806 // The usual case is that tokens don't contain anything interesting. Skip 807 // over the uninteresting characters. If a token only consists of simple 808 // chars, this method is extremely fast. 
809 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 810 if (CharNo == 0) 811 return PhysOffset; 812 ++TokPtr; 813 --CharNo; 814 ++PhysOffset; 815 } 816 817 // If we have a character that may be a trigraph or escaped newline, use a 818 // lexer to parse it correctly. 819 for (; CharNo; --CharNo) { 820 auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts); 821 TokPtr += CharAndSize.Size; 822 PhysOffset += CharAndSize.Size; 823 } 824 825 // Final detail: if we end up on an escaped newline, we want to return the 826 // location of the actual byte of the token. For example foo\<newline>bar 827 // advanced by 3 should return the location of b, not of \\. One compounding 828 // detail of this is that the escape may be made by a trigraph. 829 if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 830 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 831 832 return PhysOffset; 833 } 834 835 /// Computes the source location just past the end of the 836 /// token at this source location. 837 /// 838 /// This routine can be used to produce a source location that 839 /// points just past the end of the token referenced by \p Loc, and 840 /// is generally used when a diagnostic needs to point just after a 841 /// token where it expected something different that it received. If 842 /// the returned source location would not be meaningful (e.g., if 843 /// it points into a macro), this routine returns an invalid 844 /// source location. 845 /// 846 /// \param Offset an offset from the end of the token, where the source 847 /// location should refer to. The default offset (0) produces a source 848 /// location pointing just past the end of the token; an offset of 1 produces 849 /// a source location pointing to the last character in the token, etc. 
850 SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 851 const SourceManager &SM, 852 const LangOptions &LangOpts) { 853 if (Loc.isInvalid()) 854 return {}; 855 856 if (Loc.isMacroID()) { 857 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 858 return {}; // Points inside the macro expansion. 859 } 860 861 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 862 if (Len > Offset) 863 Len = Len - Offset; 864 else 865 return Loc; 866 867 return Loc.getLocWithOffset(Len); 868 } 869 870 /// Returns true if the given MacroID location points at the first 871 /// token of the macro expansion. 872 bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 873 const SourceManager &SM, 874 const LangOptions &LangOpts, 875 SourceLocation *MacroBegin) { 876 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 877 878 SourceLocation expansionLoc; 879 if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc)) 880 return false; 881 882 if (expansionLoc.isFileID()) { 883 // No other macro expansions, this is the first. 884 if (MacroBegin) 885 *MacroBegin = expansionLoc; 886 return true; 887 } 888 889 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); 890 } 891 892 /// Returns true if the given MacroID location points at the last 893 /// token of the macro expansion. 
894 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 895 const SourceManager &SM, 896 const LangOptions &LangOpts, 897 SourceLocation *MacroEnd) { 898 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 899 900 SourceLocation spellLoc = SM.getSpellingLoc(loc); 901 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 902 if (tokLen == 0) 903 return false; 904 905 SourceLocation afterLoc = loc.getLocWithOffset(tokLen); 906 SourceLocation expansionLoc; 907 if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) 908 return false; 909 910 if (expansionLoc.isFileID()) { 911 // No other macro expansions. 912 if (MacroEnd) 913 *MacroEnd = expansionLoc; 914 return true; 915 } 916 917 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 918 } 919 920 static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 921 const SourceManager &SM, 922 const LangOptions &LangOpts) { 923 SourceLocation Begin = Range.getBegin(); 924 SourceLocation End = Range.getEnd(); 925 assert(Begin.isFileID() && End.isFileID()); 926 if (Range.isTokenRange()) { 927 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 928 if (End.isInvalid()) 929 return {}; 930 } 931 932 // Break down the source locations. 933 FileID FID; 934 unsigned BeginOffs; 935 std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 936 if (FID.isInvalid()) 937 return {}; 938 939 unsigned EndOffs; 940 if (!SM.isInFileID(End, FID, &EndOffs) || 941 BeginOffs > EndOffs) 942 return {}; 943 944 return CharSourceRange::getCharRange(Begin, End); 945 } 946 947 // Assumes that `Loc` is in an expansion. 
948 static bool isInExpansionTokenRange(const SourceLocation Loc, 949 const SourceManager &SM) { 950 return SM.getSLocEntry(SM.getFileID(Loc)) 951 .getExpansion() 952 .isExpansionTokenRange(); 953 } 954 955 CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, 956 const SourceManager &SM, 957 const LangOptions &LangOpts) { 958 SourceLocation Begin = Range.getBegin(); 959 SourceLocation End = Range.getEnd(); 960 if (Begin.isInvalid() || End.isInvalid()) 961 return {}; 962 963 if (Begin.isFileID() && End.isFileID()) 964 return makeRangeFromFileLocs(Range, SM, LangOpts); 965 966 if (Begin.isMacroID() && End.isFileID()) { 967 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) 968 return {}; 969 Range.setBegin(Begin); 970 return makeRangeFromFileLocs(Range, SM, LangOpts); 971 } 972 973 if (Begin.isFileID() && End.isMacroID()) { 974 if (Range.isTokenRange()) { 975 if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End)) 976 return {}; 977 // Use the *original* end, not the expanded one in `End`. 978 Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM)); 979 } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End)) 980 return {}; 981 Range.setEnd(End); 982 return makeRangeFromFileLocs(Range, SM, LangOpts); 983 } 984 985 assert(Begin.isMacroID() && End.isMacroID()); 986 SourceLocation MacroBegin, MacroEnd; 987 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) && 988 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts, 989 &MacroEnd)) || 990 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts, 991 &MacroEnd)))) { 992 Range.setBegin(MacroBegin); 993 Range.setEnd(MacroEnd); 994 // Use the *original* `End`, not the expanded one in `MacroEnd`. 
995 if (Range.isTokenRange()) 996 Range.setTokenRange(isInExpansionTokenRange(End, SM)); 997 return makeRangeFromFileLocs(Range, SM, LangOpts); 998 } 999 1000 bool Invalid = false; 1001 const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin), 1002 &Invalid); 1003 if (Invalid) 1004 return {}; 1005 1006 if (BeginEntry.getExpansion().isMacroArgExpansion()) { 1007 const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End), 1008 &Invalid); 1009 if (Invalid) 1010 return {}; 1011 1012 if (EndEntry.getExpansion().isMacroArgExpansion() && 1013 BeginEntry.getExpansion().getExpansionLocStart() == 1014 EndEntry.getExpansion().getExpansionLocStart()) { 1015 Range.setBegin(SM.getImmediateSpellingLoc(Begin)); 1016 Range.setEnd(SM.getImmediateSpellingLoc(End)); 1017 return makeFileCharRange(Range, SM, LangOpts); 1018 } 1019 } 1020 1021 return {}; 1022 } 1023 1024 StringRef Lexer::getSourceText(CharSourceRange Range, 1025 const SourceManager &SM, 1026 const LangOptions &LangOpts, 1027 bool *Invalid) { 1028 Range = makeFileCharRange(Range, SM, LangOpts); 1029 if (Range.isInvalid()) { 1030 if (Invalid) *Invalid = true; 1031 return {}; 1032 } 1033 1034 // Break down the source location. 1035 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin()); 1036 if (beginInfo.first.isInvalid()) { 1037 if (Invalid) *Invalid = true; 1038 return {}; 1039 } 1040 1041 unsigned EndOffs; 1042 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) || 1043 beginInfo.second > EndOffs) { 1044 if (Invalid) *Invalid = true; 1045 return {}; 1046 } 1047 1048 // Try to the load the file buffer. 
1049 bool invalidTemp = false; 1050 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp); 1051 if (invalidTemp) { 1052 if (Invalid) *Invalid = true; 1053 return {}; 1054 } 1055 1056 if (Invalid) *Invalid = false; 1057 return file.substr(beginInfo.second, EndOffs - beginInfo.second); 1058 } 1059 1060 StringRef Lexer::getImmediateMacroName(SourceLocation Loc, 1061 const SourceManager &SM, 1062 const LangOptions &LangOpts) { 1063 assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 1064 1065 // Find the location of the immediate macro expansion. 1066 while (true) { 1067 FileID FID = SM.getFileID(Loc); 1068 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); 1069 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); 1070 Loc = Expansion.getExpansionLocStart(); 1071 if (!Expansion.isMacroArgExpansion()) 1072 break; 1073 1074 // For macro arguments we need to check that the argument did not come 1075 // from an inner macro, e.g: "MAC1( MAC2(foo) )" 1076 1077 // Loc points to the argument id of the macro definition, move to the 1078 // macro expansion. 1079 Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 1080 SourceLocation SpellLoc = Expansion.getSpellingLoc(); 1081 if (SpellLoc.isFileID()) 1082 break; // No inner macro. 1083 1084 // If spelling location resides in the same FileID as macro expansion 1085 // location, it means there is no inner macro. 1086 FileID MacroFID = SM.getFileID(Loc); 1087 if (SM.isInFileID(SpellLoc, MacroFID)) 1088 break; 1089 1090 // Argument came from inner macro. 1091 Loc = SpellLoc; 1092 } 1093 1094 // Find the spelling location of the start of the non-argument expansion 1095 // range. This is where the macro name was spelled in order to begin 1096 // expanding this macro. 1097 Loc = SM.getSpellingLoc(Loc); 1098 1099 // Dig out the buffer where the macro name was spelled and the extents of the 1100 // name so that we can render it into the expansion note. 
1101 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 1102 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 1103 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 1104 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 1105 } 1106 1107 StringRef Lexer::getImmediateMacroNameForDiagnostics( 1108 SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { 1109 assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 1110 // Walk past macro argument expansions. 1111 while (SM.isMacroArgExpansion(Loc)) 1112 Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 1113 1114 // If the macro's spelling isn't FileID or from scratch space, then it's 1115 // actually a token paste or stringization (or similar) and not a macro at 1116 // all. 1117 SourceLocation SpellLoc = SM.getSpellingLoc(Loc); 1118 if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc)) 1119 return {}; 1120 1121 // Find the spelling location of the start of the non-argument expansion 1122 // range. This is where the macro name was spelled in order to begin 1123 // expanding this macro. 1124 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin()); 1125 1126 // Dig out the buffer where the macro name was spelled and the extents of the 1127 // name so that we can render it into the expansion note. 
1128 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 1129 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 1130 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 1131 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 1132 } 1133 1134 bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) { 1135 return isAsciiIdentifierContinue(c, LangOpts.DollarIdents); 1136 } 1137 1138 bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { 1139 assert(isVerticalWhitespace(Str[0])); 1140 if (Str - 1 < BufferStart) 1141 return false; 1142 1143 if ((Str[0] == '\n' && Str[-1] == '\r') || 1144 (Str[0] == '\r' && Str[-1] == '\n')) { 1145 if (Str - 2 < BufferStart) 1146 return false; 1147 --Str; 1148 } 1149 --Str; 1150 1151 // Rewind to first non-space character: 1152 while (Str > BufferStart && isHorizontalWhitespace(*Str)) 1153 --Str; 1154 1155 return *Str == '\\'; 1156 } 1157 1158 StringRef Lexer::getIndentationForLine(SourceLocation Loc, 1159 const SourceManager &SM) { 1160 if (Loc.isInvalid() || Loc.isMacroID()) 1161 return {}; 1162 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 1163 if (LocInfo.first.isInvalid()) 1164 return {}; 1165 bool Invalid = false; 1166 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 1167 if (Invalid) 1168 return {}; 1169 const char *Line = findBeginningOfLine(Buffer, LocInfo.second); 1170 if (!Line) 1171 return {}; 1172 StringRef Rest = Buffer.substr(Line - Buffer.data()); 1173 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t"); 1174 return NumWhitespaceChars == StringRef::npos 1175 ? "" 1176 : Rest.take_front(NumWhitespaceChars); 1177 } 1178 1179 //===----------------------------------------------------------------------===// 1180 // Diagnostics forwarding code. 
1181 //===----------------------------------------------------------------------===// 1182 1183 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 1184 /// lexer buffer was all expanded at a single point, perform the mapping. 1185 /// This is currently only used for _Pragma implementation, so it is the slow 1186 /// path of the hot getSourceLocation method. Do not allow it to be inlined. 1187 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 1188 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 1189 static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 1190 SourceLocation FileLoc, 1191 unsigned CharNo, unsigned TokLen) { 1192 assert(FileLoc.isMacroID() && "Must be a macro expansion"); 1193 1194 // Otherwise, we're lexing "mapped tokens". This is used for things like 1195 // _Pragma handling. Combine the expansion location of FileLoc with the 1196 // spelling location. 1197 SourceManager &SM = PP.getSourceManager(); 1198 1199 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 1200 // characters come from spelling(FileLoc)+Offset. 1201 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 1202 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 1203 1204 // Figure out the expansion loc range, which is the range covered by the 1205 // original _Pragma(...) sequence. 1206 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc); 1207 1208 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen); 1209 } 1210 1211 /// getSourceLocation - Return a source location identifier for the specified 1212 /// offset in the current file. 
1213 SourceLocation Lexer::getSourceLocation(const char *Loc, 1214 unsigned TokLen) const { 1215 assert(Loc >= BufferStart && Loc <= BufferEnd && 1216 "Location out of range for this buffer!"); 1217 1218 // In the normal case, we're just lexing from a simple file buffer, return 1219 // the file id from FileLoc with the offset specified. 1220 unsigned CharNo = Loc-BufferStart; 1221 if (FileLoc.isFileID()) 1222 return FileLoc.getLocWithOffset(CharNo); 1223 1224 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 1225 // tokens are lexed from where the _Pragma was defined. 1226 assert(PP && "This doesn't work on raw lexers"); 1227 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 1228 } 1229 1230 /// Diag - Forwarding function for diagnostics. This translate a source 1231 /// position in the current buffer into a SourceLocation object for rendering. 1232 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 1233 return PP->Diag(getSourceLocation(Loc), DiagID); 1234 } 1235 1236 //===----------------------------------------------------------------------===// 1237 // Trigraph and Escaped Newline Handling Code. 1238 //===----------------------------------------------------------------------===// 1239 1240 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 1241 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 1242 static char GetTrigraphCharForLetter(char Letter) { 1243 switch (Letter) { 1244 default: return 0; 1245 case '=': return '#'; 1246 case ')': return ']'; 1247 case '(': return '['; 1248 case '!': return '|'; 1249 case '\'': return '^'; 1250 case '>': return '}'; 1251 case '/': return '\\'; 1252 case '<': return '{'; 1253 case '-': return '~'; 1254 } 1255 } 1256 1257 /// DecodeTrigraphChar - If the specified character is a legal trigraph when 1258 /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 1259 /// return the result character. 
Finally, emit a warning about trigraph use 1260 /// whether trigraphs are enabled or not. 1261 static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) { 1262 char Res = GetTrigraphCharForLetter(*CP); 1263 if (!Res) 1264 return Res; 1265 1266 if (!Trigraphs) { 1267 if (L && !L->isLexingRawMode()) 1268 L->Diag(CP-2, diag::trigraph_ignored); 1269 return 0; 1270 } 1271 1272 if (L && !L->isLexingRawMode()) 1273 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 1274 return Res; 1275 } 1276 1277 /// getEscapedNewLineSize - Return the size of the specified escaped newline, 1278 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 1279 /// trigraph equivalent on entry to this function. 1280 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 1281 unsigned Size = 0; 1282 while (isWhitespace(Ptr[Size])) { 1283 ++Size; 1284 1285 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 1286 continue; 1287 1288 // If this is a \r\n or \n\r, skip the other half. 1289 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 1290 Ptr[Size-1] != Ptr[Size]) 1291 ++Size; 1292 1293 return Size; 1294 } 1295 1296 // Not an escaped newline, must be a \t or something else. 1297 return 0; 1298 } 1299 1300 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 1301 /// them), skip over them and return the first non-escaped-newline found, 1302 /// otherwise return P. 1303 const char *Lexer::SkipEscapedNewLines(const char *P) { 1304 while (true) { 1305 const char *AfterEscape; 1306 if (*P == '\\') { 1307 AfterEscape = P+1; 1308 } else if (*P == '?') { 1309 // If not a trigraph for escape, bail out. 1310 if (P[1] != '?' || P[2] != '/') 1311 return P; 1312 // FIXME: Take LangOpts into account; the language might not 1313 // support trigraphs. 
1314 AfterEscape = P+3; 1315 } else { 1316 return P; 1317 } 1318 1319 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 1320 if (NewLineSize == 0) return P; 1321 P = AfterEscape+NewLineSize; 1322 } 1323 } 1324 1325 std::optional<Token> Lexer::findNextToken(SourceLocation Loc, 1326 const SourceManager &SM, 1327 const LangOptions &LangOpts) { 1328 if (Loc.isMacroID()) { 1329 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 1330 return std::nullopt; 1331 } 1332 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); 1333 1334 // Break down the source location. 1335 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 1336 1337 // Try to load the file buffer. 1338 bool InvalidTemp = false; 1339 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); 1340 if (InvalidTemp) 1341 return std::nullopt; 1342 1343 const char *TokenBegin = File.data() + LocInfo.second; 1344 1345 // Lex from the start of the given location. 1346 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), 1347 TokenBegin, File.end()); 1348 // Find the token. 1349 Token Tok; 1350 lexer.LexFromRawLexer(Tok); 1351 return Tok; 1352 } 1353 1354 /// Checks that the given token is the first token that occurs after the 1355 /// given location (this excludes comments and whitespace). Returns the location 1356 /// immediately after the specified token. If the token is not found or the 1357 /// location is inside a macro, the returned source location will be invalid. 1358 SourceLocation Lexer::findLocationAfterToken( 1359 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, 1360 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { 1361 std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts); 1362 if (!Tok || Tok->isNot(TKind)) 1363 return {}; 1364 SourceLocation TokenLoc = Tok->getLocation(); 1365 1366 // Calculate how much whitespace needs to be skipped if any. 
1367 unsigned NumWhitespaceChars = 0; 1368 if (SkipTrailingWhitespaceAndNewLine) { 1369 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); 1370 unsigned char C = *TokenEnd; 1371 while (isHorizontalWhitespace(C)) { 1372 C = *(++TokenEnd); 1373 NumWhitespaceChars++; 1374 } 1375 1376 // Skip \r, \n, \r\n, or \n\r 1377 if (C == '\n' || C == '\r') { 1378 char PrevC = C; 1379 C = *(++TokenEnd); 1380 NumWhitespaceChars++; 1381 if ((C == '\n' || C == '\r') && C != PrevC) 1382 NumWhitespaceChars++; 1383 } 1384 } 1385 1386 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); 1387 } 1388 1389 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 1390 /// get its size, and return it. This is tricky in several cases: 1391 /// 1. If currently at the start of a trigraph, we warn about the trigraph, 1392 /// then either return the trigraph (skipping 3 chars) or the '?', 1393 /// depending on whether trigraphs are enabled or not. 1394 /// 2. If this is an escaped newline (potentially with whitespace between 1395 /// the backslash and newline), implicitly skip the newline and return 1396 /// the char after it. 1397 /// 1398 /// This handles the slow/uncommon case of the getCharAndSize method. Here we 1399 /// know that we can accumulate into Size, and that we have already incremented 1400 /// Ptr by Size bytes. 1401 /// 1402 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 1403 /// be updated to match. 1404 Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) { 1405 unsigned Size = 0; 1406 // If we have a slash, look for an escaped newline. 1407 if (Ptr[0] == '\\') { 1408 ++Size; 1409 ++Ptr; 1410 Slash: 1411 // Common case, backslash-char where the char is not whitespace. 1412 if (!isWhitespace(Ptr[0])) 1413 return {'\\', Size}; 1414 1415 // See if we have optional whitespace characters between the slash and 1416 // newline. 
1417 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1418 // Remember that this token needs to be cleaned. 1419 if (Tok) Tok->setFlag(Token::NeedsCleaning); 1420 1421 // Warn if there was whitespace between the backslash and newline. 1422 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 1423 Diag(Ptr, diag::backslash_newline_space); 1424 1425 // Found backslash<whitespace><newline>. Parse the char after it. 1426 Size += EscapedNewLineSize; 1427 Ptr += EscapedNewLineSize; 1428 1429 // Use slow version to accumulate a correct size field. 1430 auto CharAndSize = getCharAndSizeSlow(Ptr, Tok); 1431 CharAndSize.Size += Size; 1432 return CharAndSize; 1433 } 1434 1435 // Otherwise, this is not an escaped newline, just return the slash. 1436 return {'\\', Size}; 1437 } 1438 1439 // If this is a trigraph, process it. 1440 if (Ptr[0] == '?' && Ptr[1] == '?') { 1441 // If this is actually a legal trigraph (not something like "??x"), emit 1442 // a trigraph warning. If so, and if trigraphs are enabled, return it. 1443 if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr, 1444 LangOpts.Trigraphs)) { 1445 // Remember that this token needs to be cleaned. 1446 if (Tok) Tok->setFlag(Token::NeedsCleaning); 1447 1448 Ptr += 3; 1449 Size += 3; 1450 if (C == '\\') goto Slash; 1451 return {C, Size}; 1452 } 1453 } 1454 1455 // If this is neither, return a single character. 1456 return {*Ptr, Size + 1u}; 1457 } 1458 1459 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 1460 /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 1461 /// and that we have already incremented Ptr by Size bytes. 1462 /// 1463 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should 1464 /// be updated to match. 1465 Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, 1466 const LangOptions &LangOpts) { 1467 1468 unsigned Size = 0; 1469 // If we have a slash, look for an escaped newline. 
1470 if (Ptr[0] == '\\') { 1471 ++Size; 1472 ++Ptr; 1473 Slash: 1474 // Common case, backslash-char where the char is not whitespace. 1475 if (!isWhitespace(Ptr[0])) 1476 return {'\\', Size}; 1477 1478 // See if we have optional whitespace characters followed by a newline. 1479 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1480 // Found backslash<whitespace><newline>. Parse the char after it. 1481 Size += EscapedNewLineSize; 1482 Ptr += EscapedNewLineSize; 1483 1484 // Use slow version to accumulate a correct size field. 1485 auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts); 1486 CharAndSize.Size += Size; 1487 return CharAndSize; 1488 } 1489 1490 // Otherwise, this is not an escaped newline, just return the slash. 1491 return {'\\', Size}; 1492 } 1493 1494 // If this is a trigraph, process it. 1495 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 1496 // If this is actually a legal trigraph (not something like "??x"), return 1497 // it. 1498 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 1499 Ptr += 3; 1500 Size += 3; 1501 if (C == '\\') goto Slash; 1502 return {C, Size}; 1503 } 1504 } 1505 1506 // If this is neither, return a single character. 1507 return {*Ptr, Size + 1u}; 1508 } 1509 1510 //===----------------------------------------------------------------------===// 1511 // Helper methods for lexing. 1512 //===----------------------------------------------------------------------===// 1513 1514 /// Routine that indiscriminately sets the offset into the source file. 1515 void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { 1516 BufferPtr = BufferStart + Offset; 1517 if (BufferPtr > BufferEnd) 1518 BufferPtr = BufferEnd; 1519 // FIXME: What exactly does the StartOfLine bit mean? There are two 1520 // possible meanings for the "start" of the line: the first token on the 1521 // unexpanded line, or the first token on the expanded line. 
1522 IsAtStartOfLine = StartOfLine; 1523 IsAtPhysicalStartOfLine = StartOfLine; 1524 } 1525 1526 static bool isUnicodeWhitespace(uint32_t Codepoint) { 1527 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( 1528 UnicodeWhitespaceCharRanges); 1529 return UnicodeWhitespaceChars.contains(Codepoint); 1530 } 1531 1532 static llvm::SmallString<5> codepointAsHexString(uint32_t C) { 1533 llvm::SmallString<5> CharBuf; 1534 llvm::raw_svector_ostream CharOS(CharBuf); 1535 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); 1536 return CharBuf; 1537 } 1538 1539 // To mitigate https://github.com/llvm/llvm-project/issues/54732, 1540 // we allow "Mathematical Notation Characters" in identifiers. 1541 // This is a proposed profile that extends the XID_Start/XID_continue 1542 // with mathematical symbols, superscipts and subscripts digits 1543 // found in some production software. 1544 // https://www.unicode.org/L2/L2022/22230-math-profile.pdf 1545 static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, 1546 bool IsStart, bool &IsExtension) { 1547 static const llvm::sys::UnicodeCharSet MathStartChars( 1548 MathematicalNotationProfileIDStartRanges); 1549 static const llvm::sys::UnicodeCharSet MathContinueChars( 1550 MathematicalNotationProfileIDContinueRanges); 1551 if (MathStartChars.contains(C) || 1552 (!IsStart && MathContinueChars.contains(C))) { 1553 IsExtension = true; 1554 return true; 1555 } 1556 return false; 1557 } 1558 1559 static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, 1560 bool &IsExtension) { 1561 if (LangOpts.AsmPreprocessor) { 1562 return false; 1563 } else if (LangOpts.DollarIdents && '$' == C) { 1564 return true; 1565 } else if (LangOpts.CPlusPlus || LangOpts.C23) { 1566 // A non-leading codepoint must have the XID_Continue property. 1567 // XIDContinueRanges doesn't contains characters also in XIDStartRanges, 1568 // so we need to check both tables. 
1569 // '_' doesn't have the XID_Continue property but is allowed in C and C++. 1570 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1571 static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges); 1572 if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C)) 1573 return true; 1574 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false, 1575 IsExtension); 1576 } else if (LangOpts.C11) { 1577 static const llvm::sys::UnicodeCharSet C11AllowedIDChars( 1578 C11AllowedIDCharRanges); 1579 return C11AllowedIDChars.contains(C); 1580 } else { 1581 static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 1582 C99AllowedIDCharRanges); 1583 return C99AllowedIDChars.contains(C); 1584 } 1585 } 1586 1587 static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, 1588 bool &IsExtension) { 1589 assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint"); 1590 IsExtension = false; 1591 if (LangOpts.AsmPreprocessor) { 1592 return false; 1593 } 1594 if (LangOpts.CPlusPlus || LangOpts.C23) { 1595 static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1596 if (XIDStartChars.contains(C)) 1597 return true; 1598 return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true, 1599 IsExtension); 1600 } 1601 if (!isAllowedIDChar(C, LangOpts, IsExtension)) 1602 return false; 1603 if (LangOpts.C11) { 1604 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( 1605 C11DisallowedInitialIDCharRanges); 1606 return !C11DisallowedInitialIDChars.contains(C); 1607 } 1608 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 1609 C99DisallowedInitialIDCharRanges); 1610 return !C99DisallowedInitialIDChars.contains(C); 1611 } 1612 1613 static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, 1614 CharSourceRange Range) { 1615 1616 static const llvm::sys::UnicodeCharSet MathStartChars( 1617 MathematicalNotationProfileIDStartRanges); 1618 
static const llvm::sys::UnicodeCharSet MathContinueChars( 1619 MathematicalNotationProfileIDContinueRanges); 1620 1621 (void)MathStartChars; 1622 (void)MathContinueChars; 1623 assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) && 1624 "Unexpected mathematical notation codepoint"); 1625 Diags.Report(Range.getBegin(), diag::ext_mathematical_notation) 1626 << codepointAsHexString(C) << Range; 1627 } 1628 1629 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, 1630 const char *End) { 1631 return CharSourceRange::getCharRange(L.getSourceLocation(Begin), 1632 L.getSourceLocation(End)); 1633 } 1634 1635 static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, 1636 CharSourceRange Range, bool IsFirst) { 1637 // Check C99 compatibility. 1638 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) { 1639 enum { 1640 CannotAppearInIdentifier = 0, 1641 CannotStartIdentifier 1642 }; 1643 1644 static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 1645 C99AllowedIDCharRanges); 1646 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 1647 C99DisallowedInitialIDCharRanges); 1648 if (!C99AllowedIDChars.contains(C)) { 1649 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 1650 << Range 1651 << CannotAppearInIdentifier; 1652 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) { 1653 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 1654 << Range 1655 << CannotStartIdentifier; 1656 } 1657 } 1658 } 1659 1660 /// After encountering UTF-8 character C and interpreting it as an identifier 1661 /// character, check whether it's a homoglyph for a common non-identifier 1662 /// source character that is unlikely to be an intentional identifier 1663 /// character and warn if so. 
1664 static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, 1665 CharSourceRange Range) { 1666 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 1667 struct HomoglyphPair { 1668 uint32_t Character; 1669 char LooksLike; 1670 bool operator<(HomoglyphPair R) const { return Character < R.Character; } 1671 }; 1672 static constexpr HomoglyphPair SortedHomoglyphs[] = { 1673 {U'\u00ad', 0}, // SOFT HYPHEN 1674 {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK 1675 {U'\u037e', ';'}, // GREEK QUESTION MARK 1676 {U'\u200b', 0}, // ZERO WIDTH SPACE 1677 {U'\u200c', 0}, // ZERO WIDTH NON-JOINER 1678 {U'\u200d', 0}, // ZERO WIDTH JOINER 1679 {U'\u2060', 0}, // WORD JOINER 1680 {U'\u2061', 0}, // FUNCTION APPLICATION 1681 {U'\u2062', 0}, // INVISIBLE TIMES 1682 {U'\u2063', 0}, // INVISIBLE SEPARATOR 1683 {U'\u2064', 0}, // INVISIBLE PLUS 1684 {U'\u2212', '-'}, // MINUS SIGN 1685 {U'\u2215', '/'}, // DIVISION SLASH 1686 {U'\u2216', '\\'}, // SET MINUS 1687 {U'\u2217', '*'}, // ASTERISK OPERATOR 1688 {U'\u2223', '|'}, // DIVIDES 1689 {U'\u2227', '^'}, // LOGICAL AND 1690 {U'\u2236', ':'}, // RATIO 1691 {U'\u223c', '~'}, // TILDE OPERATOR 1692 {U'\ua789', ':'}, // MODIFIER LETTER COLON 1693 {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE 1694 {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK 1695 {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN 1696 {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN 1697 {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN 1698 {U'\uff06', '&'}, // FULLWIDTH AMPERSAND 1699 {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS 1700 {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS 1701 {U'\uff0a', '*'}, // FULLWIDTH ASTERISK 1702 {U'\uff0b', '+'}, // FULLWIDTH ASTERISK 1703 {U'\uff0c', ','}, // FULLWIDTH COMMA 1704 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS 1705 {U'\uff0e', '.'}, // FULLWIDTH FULL STOP 1706 {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS 1707 {U'\uff1a', ':'}, // FULLWIDTH COLON 1708 {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON 1709 
{U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN 1710 {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN 1711 {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN 1712 {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK 1713 {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT 1714 {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET 1715 {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS 1716 {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET 1717 {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT 1718 {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET 1719 {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE 1720 {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET 1721 {U'\uff5e', '~'}, // FULLWIDTH TILDE 1722 {0, 0} 1723 }; 1724 auto Homoglyph = 1725 std::lower_bound(std::begin(SortedHomoglyphs), 1726 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'}); 1727 if (Homoglyph->Character == C) { 1728 if (Homoglyph->LooksLike) { 1729 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; 1730 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) 1731 << Range << codepointAsHexString(C) << LooksLikeStr; 1732 } else { 1733 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width) 1734 << Range << codepointAsHexString(C); 1735 } 1736 } 1737 } 1738 1739 static void diagnoseInvalidUnicodeCodepointInIdentifier( 1740 DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, 1741 CharSourceRange Range, bool IsFirst) { 1742 if (isASCII(CodePoint)) 1743 return; 1744 1745 bool IsExtension; 1746 bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension); 1747 bool IsIDContinue = 1748 IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension); 1749 1750 if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue)) 1751 return; 1752 1753 bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue; 1754 1755 if (!IsFirst || InvalidOnlyAtStart) { 1756 Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier) 1757 << Range << 
codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart) 1758 << FixItHint::CreateRemoval(Range); 1759 } else { 1760 Diags.Report(Range.getBegin(), diag::err_character_not_allowed) 1761 << Range << codepointAsHexString(CodePoint) 1762 << FixItHint::CreateRemoval(Range); 1763 } 1764 } 1765 1766 bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, 1767 Token &Result) { 1768 const char *UCNPtr = CurPtr + Size; 1769 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr); 1770 if (CodePoint == 0) { 1771 return false; 1772 } 1773 bool IsExtension = false; 1774 if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) { 1775 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) 1776 return false; 1777 if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1778 !PP->isPreprocessedOutput()) 1779 diagnoseInvalidUnicodeCodepointInIdentifier( 1780 PP->getDiagnostics(), LangOpts, CodePoint, 1781 makeCharRange(*this, CurPtr, UCNPtr), 1782 /*IsFirst=*/false); 1783 1784 // We got a unicode codepoint that is neither a space nor a 1785 // a valid identifier part. 1786 // Carry on as if the codepoint was valid for recovery purposes. 
1787 } else if (!isLexingRawMode()) { 1788 if (IsExtension) 1789 diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint, 1790 makeCharRange(*this, CurPtr, UCNPtr)); 1791 1792 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 1793 makeCharRange(*this, CurPtr, UCNPtr), 1794 /*IsFirst=*/false); 1795 } 1796 1797 Result.setFlag(Token::HasUCN); 1798 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || 1799 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U')) 1800 CurPtr = UCNPtr; 1801 else 1802 while (CurPtr != UCNPtr) 1803 (void)getAndAdvanceChar(CurPtr, Result); 1804 return true; 1805 } 1806 1807 bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) { 1808 llvm::UTF32 CodePoint; 1809 1810 // If a UTF-8 codepoint appears immediately after an escaped new line, 1811 // CurPtr may point to the splicing \ on the preceding line, 1812 // so we need to skip it. 1813 unsigned FirstCodeUnitSize; 1814 getCharAndSize(CurPtr, FirstCodeUnitSize); 1815 const char *CharStart = CurPtr + FirstCodeUnitSize - 1; 1816 const char *UnicodePtr = CharStart; 1817 1818 llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence( 1819 (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd, 1820 &CodePoint, llvm::strictConversion); 1821 if (ConvResult != llvm::conversionOK) 1822 return false; 1823 1824 bool IsExtension = false; 1825 if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts, 1826 IsExtension)) { 1827 if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) 1828 return false; 1829 1830 if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1831 !PP->isPreprocessedOutput()) 1832 diagnoseInvalidUnicodeCodepointInIdentifier( 1833 PP->getDiagnostics(), LangOpts, CodePoint, 1834 makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false); 1835 // We got a unicode codepoint that is neither a space nor a 1836 // a valid identifier part. Carry on as if the codepoint was 1837 // valid for recovery purposes. 
1838 } else if (!isLexingRawMode()) { 1839 if (IsExtension) 1840 diagnoseExtensionInIdentifier( 1841 PP->getDiagnostics(), CodePoint, 1842 makeCharRange(*this, CharStart, UnicodePtr)); 1843 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 1844 makeCharRange(*this, CharStart, UnicodePtr), 1845 /*IsFirst=*/false); 1846 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, 1847 makeCharRange(*this, CharStart, UnicodePtr)); 1848 } 1849 1850 // Once we sucessfully parsed some UTF-8, 1851 // calling ConsumeChar ensures the NeedsCleaning flag is set on the token 1852 // being lexed, and that warnings about trailing spaces are emitted. 1853 ConsumeChar(CurPtr, FirstCodeUnitSize, Result); 1854 CurPtr = UnicodePtr; 1855 return true; 1856 } 1857 1858 bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C, 1859 const char *CurPtr) { 1860 bool IsExtension = false; 1861 if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) { 1862 if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1863 !PP->isPreprocessedOutput()) { 1864 if (IsExtension) 1865 diagnoseExtensionInIdentifier(PP->getDiagnostics(), C, 1866 makeCharRange(*this, BufferPtr, CurPtr)); 1867 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, 1868 makeCharRange(*this, BufferPtr, CurPtr), 1869 /*IsFirst=*/true); 1870 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, 1871 makeCharRange(*this, BufferPtr, CurPtr)); 1872 } 1873 1874 MIOpt.ReadToken(); 1875 return LexIdentifierContinue(Result, CurPtr); 1876 } 1877 1878 if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1879 !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && 1880 !isUnicodeWhitespace(C)) { 1881 // Non-ASCII characters tend to creep into source code unintentionally. 1882 // Instead of letting the parser complain about the unknown token, 1883 // just drop the character. 1884 // Note that we can /only/ do this when the non-ASCII character is actually 1885 // spelled as Unicode, not written as a UCN. 
The standard requires that 1886 // we not throw away any possible preprocessor tokens, but there's a 1887 // loophole in the mapping of Unicode characters to basic character set 1888 // characters that allows us to map these particular characters to, say, 1889 // whitespace. 1890 diagnoseInvalidUnicodeCodepointInIdentifier( 1891 PP->getDiagnostics(), LangOpts, C, 1892 makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true); 1893 BufferPtr = CurPtr; 1894 return false; 1895 } 1896 1897 // Otherwise, we have an explicit UCN or a character that's unlikely to show 1898 // up by accident. 1899 MIOpt.ReadToken(); 1900 FormTokenWithChars(Result, CurPtr, tok::unknown); 1901 return true; 1902 } 1903 1904 static const char * 1905 fastParseASCIIIdentifier(const char *CurPtr, 1906 [[maybe_unused]] const char *BufferEnd) { 1907 #ifdef __SSE4_2__ 1908 alignas(16) static constexpr char AsciiIdentifierRange[16] = { 1909 '_', '_', 'A', 'Z', 'a', 'z', '0', '9', 1910 }; 1911 constexpr ssize_t BytesPerRegister = 16; 1912 1913 __m128i AsciiIdentifierRangeV = 1914 _mm_load_si128((const __m128i *)AsciiIdentifierRange); 1915 1916 while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) { 1917 __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr)); 1918 1919 int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv, 1920 _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | 1921 _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY); 1922 CurPtr += Consumed; 1923 if (Consumed == BytesPerRegister) 1924 continue; 1925 return CurPtr; 1926 } 1927 #endif 1928 1929 unsigned char C = *CurPtr; 1930 while (isAsciiIdentifierContinue(C)) 1931 C = *++CurPtr; 1932 return CurPtr; 1933 } 1934 1935 bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) { 1936 // Match [_A-Za-z0-9]*, we have already matched an identifier start. 1937 1938 while (true) { 1939 1940 CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd); 1941 1942 unsigned Size; 1943 // Slow path: handle trigraph, unicode codepoints, UCNs. 
1944 unsigned char C = getCharAndSize(CurPtr, Size); 1945 if (isAsciiIdentifierContinue(C)) { 1946 CurPtr = ConsumeChar(CurPtr, Size, Result); 1947 continue; 1948 } 1949 if (C == '$') { 1950 // If we hit a $ and they are not supported in identifiers, we are done. 1951 if (!LangOpts.DollarIdents) 1952 break; 1953 // Otherwise, emit a diagnostic and continue. 1954 if (!isLexingRawMode()) 1955 Diag(CurPtr, diag::ext_dollar_in_identifier); 1956 CurPtr = ConsumeChar(CurPtr, Size, Result); 1957 continue; 1958 } 1959 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 1960 continue; 1961 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) 1962 continue; 1963 // Neither an expected Unicode codepoint nor a UCN. 1964 break; 1965 } 1966 1967 const char *IdStart = BufferPtr; 1968 FormTokenWithChars(Result, CurPtr, tok::raw_identifier); 1969 Result.setRawIdentifierData(IdStart); 1970 1971 // If we are in raw mode, return this identifier raw. There is no need to 1972 // look up identifier information or attempt to macro expand it. 1973 if (LexingRawMode) 1974 return true; 1975 1976 // Fill in Result.IdentifierInfo and update the token kind, 1977 // looking up the identifier in the identifier table. 1978 const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 1979 // Note that we have to call PP->LookUpIdentifierInfo() even for code 1980 // completion, it writes IdentifierInfo into Result, and callers rely on it. 1981 1982 // If the completion point is at the end of an identifier, we want to treat 1983 // the identifier as incomplete even if it resolves to a macro or a keyword. 1984 // This allows e.g. 'class^' to complete to 'classifier'. 1985 if (isCodeCompletionPoint(CurPtr)) { 1986 // Return the code-completion token. 1987 Result.setKind(tok::code_completion); 1988 // Skip the code-completion char and all immediate identifier characters. 1989 // This ensures we get consistent behavior when completing at any point in 1990 // an identifier (i.e. 
at the start, in the middle, at the end). Note that 1991 // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code 1992 // simpler. 1993 assert(*CurPtr == 0 && "Completion character must be 0"); 1994 ++CurPtr; 1995 // Note that code completion token is not added as a separate character 1996 // when the completion point is at the end of the buffer. Therefore, we need 1997 // to check if the buffer has ended. 1998 if (CurPtr < BufferEnd) { 1999 while (isAsciiIdentifierContinue(*CurPtr)) 2000 ++CurPtr; 2001 } 2002 BufferPtr = CurPtr; 2003 return true; 2004 } 2005 2006 // Finally, now that we know we have an identifier, pass this off to the 2007 // preprocessor, which may macro expand it or something. 2008 if (II->isHandleIdentifierCase()) 2009 return PP->HandleIdentifier(Result); 2010 2011 return true; 2012 } 2013 2014 /// isHexaLiteral - Return true if Start points to a hex constant. 2015 /// in microsoft mode (where this is supposed to be several different tokens). 2016 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) { 2017 auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts); 2018 char C1 = CharAndSize1.Char; 2019 if (C1 != '0') 2020 return false; 2021 2022 auto CharAndSize2 = 2023 Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts); 2024 char C2 = CharAndSize2.Char; 2025 return (C2 == 'x' || C2 == 'X'); 2026 } 2027 2028 /// LexNumericConstant - Lex the remainder of a integer or floating point 2029 /// constant. From[-1] is the first character lexed. Return the end of the 2030 /// constant. 2031 bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 2032 unsigned Size; 2033 char C = getCharAndSize(CurPtr, Size); 2034 char PrevCh = 0; 2035 while (isPreprocessingNumberBody(C)) { 2036 CurPtr = ConsumeChar(CurPtr, Size, Result); 2037 PrevCh = C; 2038 if (LangOpts.HLSL && C == '.' 
&& (*CurPtr == 'x' || *CurPtr == 'r')) { 2039 CurPtr -= Size; 2040 break; 2041 } 2042 C = getCharAndSize(CurPtr, Size); 2043 } 2044 2045 // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 2046 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { 2047 // If we are in Microsoft mode, don't continue if the constant is hex. 2048 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1 2049 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts)) 2050 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 2051 } 2052 2053 // If we have a hex FP constant, continue. 2054 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) { 2055 // Outside C99 and C++17, we accept hexadecimal floating point numbers as a 2056 // not-quite-conforming extension. Only do so if this looks like it's 2057 // actually meant to be a hexfloat, and not if it has a ud-suffix. 2058 bool IsHexFloat = true; 2059 if (!LangOpts.C99) { 2060 if (!isHexaLiteral(BufferPtr, LangOpts)) 2061 IsHexFloat = false; 2062 else if (!LangOpts.CPlusPlus17 && 2063 std::find(BufferPtr, CurPtr, '_') != CurPtr) 2064 IsHexFloat = false; 2065 } 2066 if (IsHexFloat) 2067 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 2068 } 2069 2070 // If we have a digit separator, continue. 2071 if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) { 2072 auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts); 2073 if (isAsciiIdentifierContinue(Next)) { 2074 if (!isLexingRawMode()) 2075 Diag(CurPtr, LangOpts.CPlusPlus 2076 ? diag::warn_cxx11_compat_digit_separator 2077 : diag::warn_c23_compat_digit_separator); 2078 CurPtr = ConsumeChar(CurPtr, Size, Result); 2079 CurPtr = ConsumeChar(CurPtr, NextSize, Result); 2080 return LexNumericConstant(Result, CurPtr); 2081 } 2082 } 2083 2084 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. 
2085 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 2086 return LexNumericConstant(Result, CurPtr); 2087 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) 2088 return LexNumericConstant(Result, CurPtr); 2089 2090 // Update the location of token as well as BufferPtr. 2091 const char *TokStart = BufferPtr; 2092 FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 2093 Result.setLiteralData(TokStart); 2094 return true; 2095 } 2096 2097 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 2098 /// in C++11, or warn on a ud-suffix in C++98. 2099 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, 2100 bool IsStringLiteral) { 2101 assert(LangOpts.CPlusPlus); 2102 2103 // Maximally munch an identifier. 2104 unsigned Size; 2105 char C = getCharAndSize(CurPtr, Size); 2106 bool Consumed = false; 2107 2108 if (!isAsciiIdentifierStart(C)) { 2109 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 2110 Consumed = true; 2111 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) 2112 Consumed = true; 2113 else 2114 return CurPtr; 2115 } 2116 2117 if (!LangOpts.CPlusPlus11) { 2118 if (!isLexingRawMode()) 2119 Diag(CurPtr, 2120 C == '_' ? diag::warn_cxx11_compat_user_defined_literal 2121 : diag::warn_cxx11_compat_reserved_user_defined_literal) 2122 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 2123 return CurPtr; 2124 } 2125 2126 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix 2127 // that does not start with an underscore is ill-formed. As a conforming 2128 // extension, we treat all such suffixes as if they had whitespace before 2129 // them. We assume a suffix beginning with a UCN or UTF-8 character is more 2130 // likely to be a ud-suffix than a macro, however, and accept that. 
2131 if (!Consumed) { 2132 bool IsUDSuffix = false; 2133 if (C == '_') 2134 IsUDSuffix = true; 2135 else if (IsStringLiteral && LangOpts.CPlusPlus14) { 2136 // In C++1y, we need to look ahead a few characters to see if this is a 2137 // valid suffix for a string literal or a numeric literal (this could be 2138 // the 'operator""if' defining a numeric literal operator). 2139 const unsigned MaxStandardSuffixLength = 3; 2140 char Buffer[MaxStandardSuffixLength] = { C }; 2141 unsigned Consumed = Size; 2142 unsigned Chars = 1; 2143 while (true) { 2144 auto [Next, NextSize] = 2145 getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts); 2146 if (!isAsciiIdentifierContinue(Next)) { 2147 // End of suffix. Check whether this is on the allowed list. 2148 const StringRef CompleteSuffix(Buffer, Chars); 2149 IsUDSuffix = 2150 StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix); 2151 break; 2152 } 2153 2154 if (Chars == MaxStandardSuffixLength) 2155 // Too long: can't be a standard suffix. 2156 break; 2157 2158 Buffer[Chars++] = Next; 2159 Consumed += NextSize; 2160 } 2161 } 2162 2163 if (!IsUDSuffix) { 2164 if (!isLexingRawMode()) 2165 Diag(CurPtr, LangOpts.MSVCCompat 2166 ? diag::ext_ms_reserved_user_defined_literal 2167 : diag::ext_reserved_user_defined_literal) 2168 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 2169 return CurPtr; 2170 } 2171 2172 CurPtr = ConsumeChar(CurPtr, Size, Result); 2173 } 2174 2175 Result.setFlag(Token::HasUDSuffix); 2176 while (true) { 2177 C = getCharAndSize(CurPtr, Size); 2178 if (isAsciiIdentifierContinue(C)) { 2179 CurPtr = ConsumeChar(CurPtr, Size, Result); 2180 } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { 2181 } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) { 2182 } else 2183 break; 2184 } 2185 2186 return CurPtr; 2187 } 2188 2189 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 2190 /// either " or L" or u8" or u" or U". 
2191 bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 2192 tok::TokenKind Kind) { 2193 const char *AfterQuote = CurPtr; 2194 // Does this string contain the \0 character? 2195 const char *NulCharacter = nullptr; 2196 2197 if (!isLexingRawMode() && 2198 (Kind == tok::utf8_string_literal || 2199 Kind == tok::utf16_string_literal || 2200 Kind == tok::utf32_string_literal)) 2201 Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal 2202 : diag::warn_c99_compat_unicode_literal); 2203 2204 char C = getAndAdvanceChar(CurPtr, Result); 2205 while (C != '"') { 2206 // Skip escaped characters. Escaped newlines will already be processed by 2207 // getAndAdvanceChar. 2208 if (C == '\\') 2209 C = getAndAdvanceChar(CurPtr, Result); 2210 2211 if (C == '\n' || C == '\r' || // Newline. 2212 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 2213 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 2214 Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1; 2215 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 2216 return true; 2217 } 2218 2219 if (C == 0) { 2220 if (isCodeCompletionPoint(CurPtr-1)) { 2221 if (ParsingFilename) 2222 codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false); 2223 else 2224 PP->CodeCompleteNaturalLanguage(); 2225 FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 2226 cutOffLexing(); 2227 return true; 2228 } 2229 2230 NulCharacter = CurPtr-1; 2231 } 2232 C = getAndAdvanceChar(CurPtr, Result); 2233 } 2234 2235 // If we are in C++11, lex the optional ud-suffix. 2236 if (LangOpts.CPlusPlus) 2237 CurPtr = LexUDSuffix(Result, CurPtr, true); 2238 2239 // If a nul character existed in the string, warn about it. 2240 if (NulCharacter && !isLexingRawMode()) 2241 Diag(NulCharacter, diag::null_in_char_or_string) << 1; 2242 2243 // Update the location of the token as well as the BufferPtr instance var. 
2244 const char *TokStart = BufferPtr; 2245 FormTokenWithChars(Result, CurPtr, Kind); 2246 Result.setLiteralData(TokStart); 2247 return true; 2248 } 2249 2250 /// LexRawStringLiteral - Lex the remainder of a raw string literal, after 2251 /// having lexed R", LR", u8R", uR", or UR". 2252 bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, 2253 tok::TokenKind Kind) { 2254 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: 2255 // Between the initial and final double quote characters of the raw string, 2256 // any transformations performed in phases 1 and 2 (trigraphs, 2257 // universal-character-names, and line splicing) are reverted. 2258 2259 if (!isLexingRawMode()) 2260 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal); 2261 2262 unsigned PrefixLen = 0; 2263 2264 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) { 2265 if (!isLexingRawMode() && 2266 llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) { 2267 const char *Pos = &CurPtr[PrefixLen]; 2268 Diag(Pos, LangOpts.CPlusPlus26 2269 ? diag::warn_cxx26_compat_raw_string_literal_character_set 2270 : diag::ext_cxx26_raw_string_literal_character_set) 2271 << StringRef(Pos, 1); 2272 } 2273 ++PrefixLen; 2274 } 2275 2276 // If the last character was not a '(', then we didn't lex a valid delimiter. 2277 if (CurPtr[PrefixLen] != '(') { 2278 if (!isLexingRawMode()) { 2279 const char *PrefixEnd = &CurPtr[PrefixLen]; 2280 if (PrefixLen == 16) { 2281 Diag(PrefixEnd, diag::err_raw_delim_too_long); 2282 } else if (*PrefixEnd == '\n') { 2283 Diag(PrefixEnd, diag::err_invalid_newline_raw_delim); 2284 } else { 2285 Diag(PrefixEnd, diag::err_invalid_char_raw_delim) 2286 << StringRef(PrefixEnd, 1); 2287 } 2288 } 2289 2290 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately, 2291 // it's possible the '"' was intended to be part of the raw string, but 2292 // there's not much we can do about that. 
2293 while (true) { 2294 char C = *CurPtr++; 2295 2296 if (C == '"') 2297 break; 2298 if (C == 0 && CurPtr-1 == BufferEnd) { 2299 --CurPtr; 2300 break; 2301 } 2302 } 2303 2304 FormTokenWithChars(Result, CurPtr, tok::unknown); 2305 return true; 2306 } 2307 2308 // Save prefix and move CurPtr past it 2309 const char *Prefix = CurPtr; 2310 CurPtr += PrefixLen + 1; // skip over prefix and '(' 2311 2312 while (true) { 2313 char C = *CurPtr++; 2314 2315 if (C == ')') { 2316 // Check for prefix match and closing quote. 2317 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') { 2318 CurPtr += PrefixLen + 1; // skip over prefix and '"' 2319 break; 2320 } 2321 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. 2322 if (!isLexingRawMode()) 2323 Diag(BufferPtr, diag::err_unterminated_raw_string) 2324 << StringRef(Prefix, PrefixLen); 2325 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 2326 return true; 2327 } 2328 } 2329 2330 // If we are in C++11, lex the optional ud-suffix. 2331 if (LangOpts.CPlusPlus) 2332 CurPtr = LexUDSuffix(Result, CurPtr, true); 2333 2334 // Update the location of token as well as BufferPtr. 2335 const char *TokStart = BufferPtr; 2336 FormTokenWithChars(Result, CurPtr, Kind); 2337 Result.setLiteralData(TokStart); 2338 return true; 2339 } 2340 2341 /// LexAngledStringLiteral - Lex the remainder of an angled string literal, 2342 /// after having lexed the '<' character. This is used for #include filenames. 2343 bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 2344 // Does this string contain the \0 character? 2345 const char *NulCharacter = nullptr; 2346 const char *AfterLessPos = CurPtr; 2347 char C = getAndAdvanceChar(CurPtr, Result); 2348 while (C != '>') { 2349 // Skip escaped characters. Escaped newlines will already be processed by 2350 // getAndAdvanceChar. 2351 if (C == '\\') 2352 C = getAndAdvanceChar(CurPtr, Result); 2353 2354 if (isVerticalWhitespace(C) || // Newline. 
(C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character. Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}

/// Report a code-completion request for a partially typed header name.
///
/// \param PathStart       First character of the path text (the caller visible
///                        above passes AfterLessPos).
/// \param CompletionPoint Position of the completion point in the buffer (the
///                        caller visible above passes the address of the NUL
///                        byte it just read).
/// \param IsAngled        Selects '>' rather than '"' as the closing delimiter
///                        and is forwarded to the preprocessor.
///
/// The text before the last path separator is handed to the preprocessor as the
/// directory; the text after it becomes the completion filter and the start of
/// the token range to be replaced.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  // '\\' counts as a path separator only under MSVCCompat.
  llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
  auto Slash = PartialPath.find_last_of(SlashChars);
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote or closest slash,
  // if any.
  while (CompletionPoint < BufferEnd) {
    // Peek at the next character: stop without advancing at a NUL or line
    // ending; otherwise advance onto it and stop there if it is the closing
    // delimiter or a path separator.
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
    if (SlashChars.contains(Next))
      break;
  }

  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}

/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
///
/// \param Result Token to fill in.
/// \param CurPtr Position just past the opening quote.
/// \param Kind   Token kind to form when the constant is well terminated.
/// \returns true on every path. Result is formed as \p Kind on success, and as
///          tok::unknown for an empty or unterminated constant or when lexing
///          is cut off at a code-completion point.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, LangOpts.CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // '' - an empty character constant.
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||            // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      // An embedded nul that is neither end of file nor the completion point;
      // remember it so it can be diagnosed once the constant is complete.
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // In C++ mode (the check is LangOpts.CPlusPlus), lex the optional ud-suffix.
  if (LangOpts.CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
/// Otherwise it returns false; it also returns false, with BufferPtr left at
/// the newline, when a vertical whitespace character is reached while parsing
/// a preprocessor directive.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Remember the most recent newline seen; also record it in NewLinePtr if
  // that is not already set.
  const char *lastNewLine = nullptr;
  auto setLastNewLine = [&](const char *Ptr) {
    lastNewLine = Ptr;
    if (!NewLinePtr)
      NewLinePtr = Ptr;
  };
  if (SawNewline)
    setLastNewLine(CurPtr - 1);

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    if (*CurPtr == '\n')
      setLastNewLine(CurPtr);
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;

    // NewLinePtr and lastNewLine are distinct newlines: hand the span from
    // just after the former up to the latter to the empty-line handler, if one
    // is installed.
    if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
      if (auto *Handler = PP->getEmptylineHandler())
        Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
                                             getSourceLocation(lastNewLine)));
    }
  }

  BufferPtr = CurPtr;
  return false;
}

/// We have just read the // characters from input. Skip until we find the
/// newline character that terminates the comment. Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LineComment) {
    if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
      Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LineComment = true;
  }

  // Scan over the body of the comment. The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them. As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (isASCII(C) && C != 0 &&   // Potentially EOF.
           C != '\n' && C != '\r') { // Newline or DOS-style newline.
      C = *++CurPtr;
      UnicodeDecodingAlreadyDiagnosed = false;
    }

    if (!isASCII(C)) {
      // Non-ASCII byte: step over a whole UTF-8 sequence when it is well
      // formed, otherwise diagnose (once per ill-formed run) and step over a
      // single byte.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
        ++CurPtr;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length;
      }
      continue;
    }

    // Here C is a nul, '\n' or '\r'. Remember where the fast scan stopped.
    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
    // properly decode the character. Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs. If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment. Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    // A newline or the end of the buffer terminates the comment; back up so
    // CurPtr points at the terminating character.
    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline. Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character. We don't care if this is a \n\r or
  // \r\n sequence. This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness. Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  NewLinePtr = CurPtr++;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// If in save-comment mode, package up this Line comment in an appropriate
/// way and return it.
///
/// \param Result Token that receives the comment.
/// \param CurPtr End of the comment text; the token is formed up to here.
/// \returns true on every path.
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*'; // Change prefix to "/*".
  Spelling += "*/";  // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence. Issue
/// a diagnostic if so.
/// We know that the newline is inside of a block comment.
///
/// \param CurPtr    Points at the newline character to examine (asserted to be
///                  '\n' or '\r'); the caller in SkipBlockComment passes the
///                  character just before the '/' it found.
/// \param L         Lexer used to query raw mode and to emit diagnostics.
/// \param Trigraphs Whether trigraphs are enabled. When a ??/ escape was
///                  involved and this is false, the function returns false.
/// \returns true if, walking backwards over one or more escaped newlines, a
///          '*' is found in front of them.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
                                                  bool Trigraphs) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Position of the first trigraph in the ending sequence.
  const char *TrigraphPos = nullptr;
  // Position of the first whitespace after a '\' in the ending sequence.
  const char *SpacePos = nullptr;

  while (true) {
    // Back up off the newline.
    --CurPtr;

    // If this is a two-character newline sequence, skip the other character.
    if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
      // \n\n or \r\r -> not escaped newline.
      if (CurPtr[0] == CurPtr[1])
        return false;
      // \n\r or \r\n -> skip the newline.
      --CurPtr;
    }

    // If we have horizontal whitespace, skip over it. We allow whitespace
    // between the slash and newline.
    while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
      SpacePos = CurPtr;
      --CurPtr;
    }

    // If we have a slash, this is an escaped newline.
    if (*CurPtr == '\\') {
      --CurPtr;
    } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
      // This is a trigraph encoding of a slash.
      TrigraphPos = CurPtr - 2;
      CurPtr -= 3;
    } else {
      return false;
    }

    // If the character preceding the escaped newline is a '*', then after line
    // splicing we have a '*/' ending the comment.
    if (*CurPtr == '*')
      break;

    // Otherwise keep going only if another newline precedes this escape.
    if (*CurPtr != '\n' && *CurPtr != '\r')
      return false;
  }

  if (TrigraphPos) {
    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (SpacePos && !L->isLexingRawMode())
    L->Diag(SpacePos, diag::backslash_newline_space);

  return true;
}

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end. The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
/// An unterminated comment is diagnosed; it yields a tok::unknown token (and
/// true) in KeepWhitespaceMode and false otherwise.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
                             bool &TokAtPhysicalStartOfLine) {
  // Scan one character past where we should, looking for a '/' character. Once
  // we find it, check to see if it was preceded by a *. This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token. Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /. If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  while (true) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
        if (!isASCII(C))
          goto MultiByteUTF8;
        C = *CurPtr++;
      }
      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr + 16 < BufferEnd) {
        // The mask collects the top bit of each of the 16 bytes, so a nonzero
        // mask means at least one byte is non-ASCII.
        int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
        if (LLVM_UNLIKELY(Mask != 0)) {
          goto MultiByteUTF8;
        }
        // look for slashes
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                    Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
                                        0x80, 0x80, 0x80, 0x80};
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr + 16 < BufferEnd) {
        if (LLVM_UNLIKELY(
                vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
          goto MultiByteUTF8;
        if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
          break;
        }
        CurPtr += 16;
      }

#else
      // Portable fallback: examine 16 bytes per iteration.
      while (CurPtr + 16 < BufferEnd) {
        bool HasNonASCII = false;
        for (unsigned I = 0; I < 16; ++I)
          HasNonASCII |= !isASCII(CurPtr[I]);

        if (LLVM_UNLIKELY(HasNonASCII))
          goto MultiByteUTF8;

        bool HasSlash = false;
        for (unsigned I = 0; I < 16; ++I)
          HasSlash |= CurPtr[I] == '/';
        if (HasSlash)
          break;
        CurPtr += 16;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder, warning on invalid UTF-8
    // if the corresponding warning is enabled, emitting a diagnostic only once
    // per sequence that cannot be decoded.
    while (C != '/' && C != '\0') {
      if (isASCII(C)) {
        UnicodeDecodingAlreadyDiagnosed = false;
        C = *CurPtr++;
        continue;
      }
    MultiByteUTF8:
      // CurPtr is 1 code unit past C, so to decode
      // the codepoint, we need to read from the previous position.
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length - 1;
      }
      C = *CurPtr++;
    }

    if (C == '/') {
    FoundSlash:
      if (CurPtr[-2] == '*') // We found the final */. We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
                                                  LangOpts.Trigraphs)) {
          // We found the final */, though it had an escaped newline between the
          // * and /. We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning. Don't do this
        // if this is a /*/, which will end the comment. This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */. We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token. Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace. Instead of going through the big switch, handle it
  // efficiently now. This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string. This switches the lexer out of directive mode.
///
/// \param Result If non-null, receives every character read before the line
///               terminator; the terminating \0, \r or \n is not appended.
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  Token Tmp;
  Tmp.startToken();

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (true) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      if (Result)
        Result->push_back(Char);
      break;
    case 0: // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        if (isCodeCompletionPoint(CurPtr-1)) {
          PP->CodeCompleteNaturalLanguage();
          cutOffLexing();
          return;
        }

        // Nope, normal character, continue.
        if (Result)
          Result->push_back(Char);
        break;
      }
      // FALL THROUGH.
      [[fallthrough]];
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOD transition.
      Lex(Tmp);
      if (Tmp.is(tok::code_completion)) {
        if (PP)
          PP->CodeCompleteNaturalLanguage();
        Lex(Tmp);
      }
      assert(Tmp.is(tok::eod) && "Unexpected token!");

      // Finally, we're done.
      return;
    }
  }
}

/// LexEndOfFile - CurPtr points to the end of this file.
/// Handle this
/// condition, reporting diagnostics and handling other edge cases as required.
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first. The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line".
    ParsingPreprocessorDirective = false;
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eod);

    // Restore comment saving mode, in case it was disabled for directive.
    if (PP)
      resetExtendedTokenMode();
    return true; // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token. Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
    PP->setRecordedPreambleConditionalStack(ConditionalStack);
    // If the preamble cuts off the end of a header guard, consider it guarded.
    // The guard is valid for the preamble content itself, and for tools the
    // most useful answer is "yes, this file has a header guard".
    if (!ConditionalStack.empty())
      MIOpt.ExitTopLevelConditional();
    ConditionalStack.clear();
  }

  // Issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error.
  while (!ConditionalStack.empty()) {
    // The diagnostic is skipped when this file is the code-completion file.
    if (PP->getCodeCompletionFileLoc() != FileLoc)
      PP->Diag(ConditionalStack.back().IfLoc,
               diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
    DiagnosticsEngine &Diags = PP->getDiagnostics();
    SourceLocation EndLoc = getSourceLocation(BufferEnd);
    unsigned DiagID;

    if (LangOpts.CPlusPlus11) {
      // C++11 [lex.phases] 2.2 p2
      // Prefer the C++98 pedantic compatibility warning over the generic,
      // non-extension, user-requested "missing newline at EOF" warning.
      if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
        DiagID = diag::warn_cxx98_compat_no_newline_eof;
      } else {
        DiagID = diag::warn_no_newline_eof;
      }
    } else {
      DiagID = diag::ext_no_newline_eof;
    }

    Diag(BufferEnd, DiagID)
      << FixItHint::CreateInsertion(EndLoc, "\n");
  }

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.
  return PP->HandleEndOfFile(Result, isPragmaLexer());
}

/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
/// the specified lexer will return a tok::l_paren token, 0 if it is something
/// else and 2 if there are no more tokens in the buffer controlled by the
/// lexer.
unsigned Lexer::isNextPPTokenLParen() {
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");

  if (isDependencyDirectivesLexer()) {
    // Answer from the pre-scanned directive tokens instead of lexing.
    if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
      return 2;
    return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
        tok::l_paren);
  }

  // Switch to 'skipping' mode. This will ensure that we can lex a token
  // without emitting diagnostics, disables macro expansion, and will cause EOF
  // to return an EOF token instead of popping the include stack.
  LexingRawMode = true;

  // Save state that can be changed while lexing so that we can restore it.
  const char *TmpBufferPtr = BufferPtr;
  bool inPPDirectiveMode = ParsingPreprocessorDirective;
  bool atStartOfLine = IsAtStartOfLine;
  bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
  bool leadingSpace = HasLeadingSpace;

  Token Tok;
  Lex(Tok);

  // Restore state that may have changed.
  BufferPtr = TmpBufferPtr;
  ParsingPreprocessorDirective = inPPDirectiveMode;
  HasLeadingSpace = leadingSpace;
  IsAtStartOfLine = atStartOfLine;
  IsAtPhysicalStartOfLine = atPhysicalStartOfLine;

  // Restore the lexer back to non-skipping mode.
  LexingRawMode = false;

  if (Tok.is(tok::eof))
    return 2;
  return Tok.is(tok::l_paren);
}

/// Find the end of a version control conflict marker.
///
/// The search starts past the first terminator-length characters at \p CurPtr
/// and only accepts a terminator that directly follows a '\r' or '\n'.
/// \returns a pointer to the terminator, or nullptr if none is found.
static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
                                   ConflictMarkerKind CMK) {
  const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
  size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
  auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
  size_t Pos = RestOfBuffer.find(Terminator);
  while (Pos != StringRef::npos) {
    // Must occur at start of line.
    if (Pos == 0 ||
        (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
      // Not preceded by a newline: keep searching after this occurrence.
      RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
      Pos = RestOfBuffer.find(Terminator);
      continue;
    }
    return RestOfBuffer.data()+Pos;
  }
  return nullptr;
}

/// IsStartOfConflictMarker - If the specified pointer is the start of a version
/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
/// and recover nicely. This returns true if it is a conflict marker and false
/// if not.
bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // Check to see if we have <<<<<<< or >>>>.
  if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
      !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (CurrentConflictMarkerState || isLexingRawMode())
    return false;

  // A marker starting with '<' is the normal kind; otherwise it is the
  // Perforce kind.
  ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;

  // Check to see if there is an ending marker somewhere in the buffer at the
  // start of a line to terminate this conflict marker.
  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
    // We found a match. We are really in a conflict marker.
    // Diagnose this, and ignore to the end of line.
    Diag(CurPtr, diag::err_conflict_marker);
    CurrentConflictMarkerState = Kind;

    // Skip ahead to the end of line. We know this exists because the
    // end-of-conflict marker starts with \r or \n.
    while (*CurPtr != '\r' && *CurPtr != '\n') {
      assert(CurPtr != BufferEnd && "Didn't find end of line");
      ++CurPtr;
    }
    BufferPtr = CurPtr;
    return true;
  }

  // No end of conflict marker found.
  return false;
}

/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
/// is the end of a conflict marker. Handle it by ignoring up until the end of
/// the line. This returns true if it is a conflict marker and false if not.
bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
  // Only a conflict marker if it starts at the beginning of a line.
  if (CurPtr != BufferStart &&
      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
    return false;

  // If we have a situation where we don't care about conflict markers, ignore
  // it.
  if (!CurrentConflictMarkerState || isLexingRawMode())
    return false;

  // Check to see if we have the marker (4 characters in a row).
  for (unsigned i = 1; i != 4; ++i)
    if (CurPtr[i] != CurPtr[0])
      return false;

  // If we do have it, search for the end of the conflict marker. This could
  // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
  // be the end of conflict marker.
  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
                                        CurrentConflictMarkerState)) {
    CurPtr = End;

    // Skip ahead to the end of line.
    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
      ++CurPtr;

    BufferPtr = CurPtr;

    // No longer in the conflict marker.
    CurrentConflictMarkerState = CMK_None;
    return true;
  }

  return false;
}

/// Scan [CurPtr, BufferEnd) for the two-character sequence "#>".
///
/// \returns the position just past the "#>", or nullptr if the range is empty
///          or the sequence does not occur in it.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;
  BufferEnd -= 1; // Scan until the second last character.
  for (; CurPtr != BufferEnd; ++CurPtr) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
  }
  return nullptr;
}

/// Try to lex an editor placeholder whose "<#" opener ends at \p CurPtr
/// (CurPtr[-1] is '<' and CurPtr[0] is '#').
///
/// Returns false without forming a token when there is no preprocessor, when
/// LexEditorPlaceholders is off, when lexing in raw mode, or when no closing
/// "#>" is found. Otherwise forms a raw_identifier token spanning the whole
/// placeholder, looks up its identifier info, flags it as IsEditorPlaceholder
/// and returns true. A diagnostic is emitted if placeholders are not allowed by
/// the language options.
bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
  assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
  if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
    return false;
  const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
  if (!End)
    return false;
  const char *Start = CurPtr - 1;
  if (!LangOpts.AllowEditorPlaceholders)
    Diag(Start, diag::err_placeholder_in_source);
  Result.startToken();
  FormTokenWithChars(Result, End, tok::raw_identifier);
  Result.setRawIdentifierData(Start);
  PP->LookUpIdentifierInfo(Result);
  Result.setFlag(Token::IsEditorPlaceholder);
  BufferPtr = End;
  return true;
}

/// Return true if there is a preprocessor with code completion enabled and
/// \p CurPtr corresponds to its code-completion location; false otherwise.
bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
  if (PP && PP->isCodeCompletionEnabled()) {
    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
    return Loc == PP->getCodeCompletionLoc();
  }

  return false;
}

std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
                                                 const char *SlashLoc,
                                                 Token *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);
  assert((Kind == 'u' || Kind == 'U') && "expected a UCN");

  unsigned NumHexDigits;
  if (Kind == 'u')
    NumHexDigits = 4;
  else if (Kind == 'U')
    NumHexDigits = 8;

  bool Delimited = false;
  bool FoundEndDelimiter = false;
  unsigned Count = 0;
  bool Diagnose = Result && !isLexingRawMode();

  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
    return std::nullopt;
  }

  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  uint32_t CodePoint = 0;
  while (Count != NumHexDigits || Delimited) {
    char C = getCharAndSize(CurPtr, CharSize);
    if (!Delimited && Count == 0 && C == '{') {
      Delimited = true;
      CurPtr += CharSize;
      continue;
    }

    if (Delimited && C == '}') {
      CurPtr += CharSize;
      FoundEndDelimiter = true;
      break;
    }

    unsigned Value = llvm::hexDigitValue(C);
    if (Value == -1U) {
      if (!Delimited)
        break;
      if (Diagnose)
        Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
            << StringRef(KindLoc, 1);
      return std::nullopt;
    }

    if (CodePoint & 0xF000'0000) {
      if (Diagnose)
        Diag(KindLoc, diag::err_escape_too_large) << 0;
      return std::nullopt;
    }

    CodePoint <<= 4;
    CodePoint |= Value;
    CurPtr += CharSize;
    Count++;
  }

  if (Count == 0) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_ucn_escape_no_digits)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  if (Delimited && Kind == 'U') {
    if (Diagnose)
      Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  if (!Delimited && Count != NumHexDigits) {
    if (Diagnose) {
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
      // If the user wrote \U1234, suggest a fixit to \u.
3467 if (Count == 4 && NumHexDigits == 8) { 3468 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); 3469 Diag(KindLoc, diag::note_ucn_four_not_eight) 3470 << FixItHint::CreateReplacement(URange, "u"); 3471 } 3472 } 3473 return std::nullopt; 3474 } 3475 3476 if (Delimited && PP) { 3477 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23 3478 ? diag::warn_cxx23_delimited_escape_sequence 3479 : diag::ext_delimited_escape_sequence) 3480 << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0); 3481 } 3482 3483 if (Result) { 3484 Result->setFlag(Token::HasUCN); 3485 // If the UCN contains either a trigraph or a line splicing, 3486 // we need to call getAndAdvanceChar again to set the appropriate flags 3487 // on Result. 3488 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0))) 3489 StartPtr = CurPtr; 3490 else 3491 while (StartPtr != CurPtr) 3492 (void)getAndAdvanceChar(StartPtr, *Result); 3493 } else { 3494 StartPtr = CurPtr; 3495 } 3496 return CodePoint; 3497 } 3498 3499 std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr, 3500 const char *SlashLoc, 3501 Token *Result) { 3502 unsigned CharSize; 3503 bool Diagnose = Result && !isLexingRawMode(); 3504 3505 char C = getCharAndSize(StartPtr, CharSize); 3506 assert(C == 'N' && "expected \\N{...}"); 3507 3508 const char *CurPtr = StartPtr + CharSize; 3509 const char *KindLoc = &CurPtr[-1]; 3510 3511 C = getCharAndSize(CurPtr, CharSize); 3512 if (C != '{') { 3513 if (Diagnose) 3514 Diag(SlashLoc, diag::warn_ucn_escape_incomplete); 3515 return std::nullopt; 3516 } 3517 CurPtr += CharSize; 3518 const char *StartName = CurPtr; 3519 bool FoundEndDelimiter = false; 3520 llvm::SmallVector<char, 30> Buffer; 3521 while (C) { 3522 C = getCharAndSize(CurPtr, CharSize); 3523 CurPtr += CharSize; 3524 if (C == '}') { 3525 FoundEndDelimiter = true; 3526 break; 3527 } 3528 3529 if (isVerticalWhitespace(C)) 3530 break; 3531 Buffer.push_back(C); 3532 } 3533 3534 if (!FoundEndDelimiter || 
Buffer.empty()) { 3535 if (Diagnose) 3536 Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty 3537 : diag::warn_delimited_ucn_incomplete) 3538 << StringRef(KindLoc, 1); 3539 return std::nullopt; 3540 } 3541 3542 StringRef Name(Buffer.data(), Buffer.size()); 3543 std::optional<char32_t> Match = 3544 llvm::sys::unicode::nameToCodepointStrict(Name); 3545 std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch; 3546 if (!Match) { 3547 LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name); 3548 if (Diagnose) { 3549 Diag(StartName, diag::err_invalid_ucn_name) 3550 << StringRef(Buffer.data(), Buffer.size()) 3551 << makeCharRange(*this, StartName, CurPtr - CharSize); 3552 if (LooseMatch) { 3553 Diag(StartName, diag::note_invalid_ucn_name_loose_matching) 3554 << FixItHint::CreateReplacement( 3555 makeCharRange(*this, StartName, CurPtr - CharSize), 3556 LooseMatch->Name); 3557 } 3558 } 3559 // We do not offer misspelled character names suggestions here 3560 // as the set of what would be a valid suggestion depends on context, 3561 // and we should not make invalid suggestions. 3562 } 3563 3564 if (Diagnose && Match) 3565 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23 3566 ? diag::warn_cxx23_delimited_escape_sequence 3567 : diag::ext_delimited_escape_sequence) 3568 << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0); 3569 3570 // If no diagnostic has been emitted yet, likely because we are doing a 3571 // tentative lexing, we do not want to recover here to make sure the token 3572 // will not be incorrectly considered valid. This function will be called 3573 // again and a diagnostic emitted then. 3574 if (LooseMatch && Diagnose) 3575 Match = LooseMatch->CodePoint; 3576 3577 if (Result) { 3578 Result->setFlag(Token::HasUCN); 3579 // If the UCN contains either a trigraph or a line splicing, 3580 // we need to call getAndAdvanceChar again to set the appropriate flags 3581 // on Result. 
3582 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3)) 3583 StartPtr = CurPtr; 3584 else 3585 while (StartPtr != CurPtr) 3586 (void)getAndAdvanceChar(StartPtr, *Result); 3587 } else { 3588 StartPtr = CurPtr; 3589 } 3590 return Match ? std::optional<uint32_t>(*Match) : std::nullopt; 3591 } 3592 3593 uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, 3594 Token *Result) { 3595 3596 unsigned CharSize; 3597 std::optional<uint32_t> CodePointOpt; 3598 char Kind = getCharAndSize(StartPtr, CharSize); 3599 if (Kind == 'u' || Kind == 'U') 3600 CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result); 3601 else if (Kind == 'N') 3602 CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result); 3603 3604 if (!CodePointOpt) 3605 return 0; 3606 3607 uint32_t CodePoint = *CodePointOpt; 3608 3609 // Don't apply C family restrictions to UCNs in assembly mode 3610 if (LangOpts.AsmPreprocessor) 3611 return CodePoint; 3612 3613 // C23 6.4.3p2: A universal character name shall not designate a code point 3614 // where the hexadecimal value is: 3615 // - in the range D800 through DFFF inclusive; or 3616 // - greater than 10FFFF. 3617 // A universal-character-name outside the c-char-sequence of a character 3618 // constant, or the s-char-sequence of a string-literal shall not designate 3619 // a control character or a character in the basic character set. 3620 3621 // C++11 [lex.charset]p2: If the hexadecimal value for a 3622 // universal-character-name corresponds to a surrogate code point (in the 3623 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, 3624 // if the hexadecimal value for a universal-character-name outside the 3625 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or 3626 // string literal corresponds to a control character (in either of the 3627 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the 3628 // basic source character set, the program is ill-formed. 
3629 if (CodePoint < 0xA0) { 3630 // We don't use isLexingRawMode() here because we need to warn about bad 3631 // UCNs even when skipping preprocessing tokens in a #if block. 3632 if (Result && PP) { 3633 if (CodePoint < 0x20 || CodePoint >= 0x7F) 3634 Diag(BufferPtr, diag::err_ucn_control_character); 3635 else { 3636 char C = static_cast<char>(CodePoint); 3637 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 3638 } 3639 } 3640 3641 return 0; 3642 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { 3643 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 3644 // We don't use isLexingRawMode() here because we need to diagnose bad 3645 // UCNs even when skipping preprocessing tokens in a #if block. 3646 if (Result && PP) { 3647 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11) 3648 Diag(BufferPtr, diag::warn_ucn_escape_surrogate); 3649 else 3650 Diag(BufferPtr, diag::err_ucn_escape_invalid); 3651 } 3652 return 0; 3653 } 3654 3655 return CodePoint; 3656 } 3657 3658 bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, 3659 const char *CurPtr) { 3660 if (!isLexingRawMode() && !PP->isPreprocessedOutput() && 3661 isUnicodeWhitespace(C)) { 3662 Diag(BufferPtr, diag::ext_unicode_whitespace) 3663 << makeCharRange(*this, BufferPtr, CurPtr); 3664 3665 Result.setFlag(Token::LeadingSpace); 3666 return true; 3667 } 3668 return false; 3669 } 3670 3671 void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { 3672 IsAtStartOfLine = Result.isAtStartOfLine(); 3673 HasLeadingSpace = Result.hasLeadingSpace(); 3674 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); 3675 // Note that this doesn't affect IsAtPhysicalStartOfLine. 3676 } 3677 3678 bool Lexer::Lex(Token &Result) { 3679 assert(!isDependencyDirectivesLexer()); 3680 3681 // Start a new token. 3682 Result.startToken(); 3683 3684 // Set up misc whitespace flags for LexTokenInternal. 
3685 if (IsAtStartOfLine) { 3686 Result.setFlag(Token::StartOfLine); 3687 IsAtStartOfLine = false; 3688 } 3689 3690 if (HasLeadingSpace) { 3691 Result.setFlag(Token::LeadingSpace); 3692 HasLeadingSpace = false; 3693 } 3694 3695 if (HasLeadingEmptyMacro) { 3696 Result.setFlag(Token::LeadingEmptyMacro); 3697 HasLeadingEmptyMacro = false; 3698 } 3699 3700 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 3701 IsAtPhysicalStartOfLine = false; 3702 bool isRawLex = isLexingRawMode(); 3703 (void) isRawLex; 3704 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine); 3705 // (After the LexTokenInternal call, the lexer might be destroyed.) 3706 assert((returnedToken || !isRawLex) && "Raw lex must succeed"); 3707 return returnedToken; 3708 } 3709 3710 /// LexTokenInternal - This implements a simple C family lexer. It is an 3711 /// extremely performance critical piece of code. This assumes that the buffer 3712 /// has a null character at the end of the file. This returns a preprocessing 3713 /// token, not a normal token, as such, it is an internal interface. It assumes 3714 /// that the Flags of result have been cleared before calling this. 3715 bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { 3716 LexStart: 3717 assert(!Result.needsCleaning() && "Result needs cleaning"); 3718 assert(!Result.hasPtrData() && "Result has not been reset"); 3719 3720 // CurPtr - Cache BufferPtr in an automatic variable. 3721 const char *CurPtr = BufferPtr; 3722 3723 // Small amounts of horizontal whitespace is very common between tokens. 3724 if (isHorizontalWhitespace(*CurPtr)) { 3725 do { 3726 ++CurPtr; 3727 } while (isHorizontalWhitespace(*CurPtr)); 3728 3729 // If we are keeping whitespace and other tokens, just return what we just 3730 // skipped. The next lexer invocation will return the token after the 3731 // whitespace. 
3732 if (isKeepWhitespaceMode()) { 3733 FormTokenWithChars(Result, CurPtr, tok::unknown); 3734 // FIXME: The next token will not have LeadingSpace set. 3735 return true; 3736 } 3737 3738 BufferPtr = CurPtr; 3739 Result.setFlag(Token::LeadingSpace); 3740 } 3741 3742 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 3743 3744 // Read a character, advancing over it. 3745 char Char = getAndAdvanceChar(CurPtr, Result); 3746 tok::TokenKind Kind; 3747 3748 if (!isVerticalWhitespace(Char)) 3749 NewLinePtr = nullptr; 3750 3751 switch (Char) { 3752 case 0: // Null. 3753 // Found end of file? 3754 if (CurPtr-1 == BufferEnd) 3755 return LexEndOfFile(Result, CurPtr-1); 3756 3757 // Check if we are performing code completion. 3758 if (isCodeCompletionPoint(CurPtr-1)) { 3759 // Return the code-completion token. 3760 Result.startToken(); 3761 FormTokenWithChars(Result, CurPtr, tok::code_completion); 3762 return true; 3763 } 3764 3765 if (!isLexingRawMode()) 3766 Diag(CurPtr-1, diag::null_in_file); 3767 Result.setFlag(Token::LeadingSpace); 3768 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3769 return true; // KeepWhitespaceMode 3770 3771 // We know the lexer hasn't changed, so just try again with this lexer. 3772 // (We manually eliminate the tail call to avoid recursion.) 3773 goto LexNextToken; 3774 3775 case 26: // DOS & CP/M EOF: "^Z". 3776 // If we're in Microsoft extensions mode, treat this as end of file. 3777 if (LangOpts.MicrosoftExt) { 3778 if (!isLexingRawMode()) 3779 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft); 3780 return LexEndOfFile(Result, CurPtr-1); 3781 } 3782 3783 // If Microsoft extensions are disabled, this is just random garbage. 
3784 Kind = tok::unknown; 3785 break; 3786 3787 case '\r': 3788 if (CurPtr[0] == '\n') 3789 (void)getAndAdvanceChar(CurPtr, Result); 3790 [[fallthrough]]; 3791 case '\n': 3792 // If we are inside a preprocessor directive and we see the end of line, 3793 // we know we are done with the directive, so return an EOD token. 3794 if (ParsingPreprocessorDirective) { 3795 // Done parsing the "line". 3796 ParsingPreprocessorDirective = false; 3797 3798 // Restore comment saving mode, in case it was disabled for directive. 3799 if (PP) 3800 resetExtendedTokenMode(); 3801 3802 // Since we consumed a newline, we are back at the start of a line. 3803 IsAtStartOfLine = true; 3804 IsAtPhysicalStartOfLine = true; 3805 NewLinePtr = CurPtr - 1; 3806 3807 Kind = tok::eod; 3808 break; 3809 } 3810 3811 // No leading whitespace seen so far. 3812 Result.clearFlag(Token::LeadingSpace); 3813 3814 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3815 return true; // KeepWhitespaceMode 3816 3817 // We only saw whitespace, so just try again with this lexer. 3818 // (We manually eliminate the tail call to avoid recursion.) 3819 goto LexNextToken; 3820 case ' ': 3821 case '\t': 3822 case '\f': 3823 case '\v': 3824 SkipHorizontalWhitespace: 3825 Result.setFlag(Token::LeadingSpace); 3826 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3827 return true; // KeepWhitespaceMode 3828 3829 SkipIgnoredUnits: 3830 CurPtr = BufferPtr; 3831 3832 // If the next token is obviously a // or /* */ comment, skip it efficiently 3833 // too (without going through the big switch stmt). 3834 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 3835 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) { 3836 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 3837 return true; // There is a token to return. 
3838 goto SkipIgnoredUnits; 3839 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 3840 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 3841 return true; // There is a token to return. 3842 goto SkipIgnoredUnits; 3843 } else if (isHorizontalWhitespace(*CurPtr)) { 3844 goto SkipHorizontalWhitespace; 3845 } 3846 // We only saw whitespace, so just try again with this lexer. 3847 // (We manually eliminate the tail call to avoid recursion.) 3848 goto LexNextToken; 3849 3850 // C99 6.4.4.1: Integer Constants. 3851 // C99 6.4.4.2: Floating Constants. 3852 case '0': case '1': case '2': case '3': case '4': 3853 case '5': case '6': case '7': case '8': case '9': 3854 // Notify MIOpt that we read a non-whitespace/non-comment token. 3855 MIOpt.ReadToken(); 3856 return LexNumericConstant(Result, CurPtr); 3857 3858 // Identifier (e.g., uber), or 3859 // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or 3860 // UTF-8 or UTF-16 string literal (C11/C++11). 3861 case 'u': 3862 // Notify MIOpt that we read a non-whitespace/non-comment token. 
3863 MIOpt.ReadToken(); 3864 3865 if (LangOpts.CPlusPlus11 || LangOpts.C11) { 3866 Char = getCharAndSize(CurPtr, SizeTmp); 3867 3868 // UTF-16 string literal 3869 if (Char == '"') 3870 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3871 tok::utf16_string_literal); 3872 3873 // UTF-16 character constant 3874 if (Char == '\'') 3875 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3876 tok::utf16_char_constant); 3877 3878 // UTF-16 raw string literal 3879 if (Char == 'R' && LangOpts.RawStringLiterals && 3880 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3881 return LexRawStringLiteral(Result, 3882 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3883 SizeTmp2, Result), 3884 tok::utf16_string_literal); 3885 3886 if (Char == '8') { 3887 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 3888 3889 // UTF-8 string literal 3890 if (Char2 == '"') 3891 return LexStringLiteral(Result, 3892 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3893 SizeTmp2, Result), 3894 tok::utf8_string_literal); 3895 if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23)) 3896 return LexCharConstant( 3897 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3898 SizeTmp2, Result), 3899 tok::utf8_char_constant); 3900 3901 if (Char2 == 'R' && LangOpts.RawStringLiterals) { 3902 unsigned SizeTmp3; 3903 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 3904 // UTF-8 raw string literal 3905 if (Char3 == '"') { 3906 return LexRawStringLiteral(Result, 3907 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3908 SizeTmp2, Result), 3909 SizeTmp3, Result), 3910 tok::utf8_string_literal); 3911 } 3912 } 3913 } 3914 } 3915 3916 // treat u like the start of an identifier. 3917 return LexIdentifierContinue(Result, CurPtr); 3918 3919 case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal 3920 // Notify MIOpt that we read a non-whitespace/non-comment token. 
3921 MIOpt.ReadToken(); 3922 3923 if (LangOpts.CPlusPlus11 || LangOpts.C11) { 3924 Char = getCharAndSize(CurPtr, SizeTmp); 3925 3926 // UTF-32 string literal 3927 if (Char == '"') 3928 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3929 tok::utf32_string_literal); 3930 3931 // UTF-32 character constant 3932 if (Char == '\'') 3933 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3934 tok::utf32_char_constant); 3935 3936 // UTF-32 raw string literal 3937 if (Char == 'R' && LangOpts.RawStringLiterals && 3938 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3939 return LexRawStringLiteral(Result, 3940 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3941 SizeTmp2, Result), 3942 tok::utf32_string_literal); 3943 } 3944 3945 // treat U like the start of an identifier. 3946 return LexIdentifierContinue(Result, CurPtr); 3947 3948 case 'R': // Identifier or C++0x raw string literal 3949 // Notify MIOpt that we read a non-whitespace/non-comment token. 3950 MIOpt.ReadToken(); 3951 3952 if (LangOpts.RawStringLiterals) { 3953 Char = getCharAndSize(CurPtr, SizeTmp); 3954 3955 if (Char == '"') 3956 return LexRawStringLiteral(Result, 3957 ConsumeChar(CurPtr, SizeTmp, Result), 3958 tok::string_literal); 3959 } 3960 3961 // treat R like the start of an identifier. 3962 return LexIdentifierContinue(Result, CurPtr); 3963 3964 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 3965 // Notify MIOpt that we read a non-whitespace/non-comment token. 3966 MIOpt.ReadToken(); 3967 Char = getCharAndSize(CurPtr, SizeTmp); 3968 3969 // Wide string literal. 3970 if (Char == '"') 3971 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3972 tok::wide_string_literal); 3973 3974 // Wide raw string literal. 
3975 if (LangOpts.RawStringLiterals && Char == 'R' && 3976 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3977 return LexRawStringLiteral(Result, 3978 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3979 SizeTmp2, Result), 3980 tok::wide_string_literal); 3981 3982 // Wide character constant. 3983 if (Char == '\'') 3984 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3985 tok::wide_char_constant); 3986 // FALL THROUGH, treating L like the start of an identifier. 3987 [[fallthrough]]; 3988 3989 // C99 6.4.2: Identifiers. 3990 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 3991 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 3992 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 3993 case 'V': case 'W': case 'X': case 'Y': case 'Z': 3994 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 3995 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 3996 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 3997 case 'v': case 'w': case 'x': case 'y': case 'z': 3998 case '_': 3999 // Notify MIOpt that we read a non-whitespace/non-comment token. 4000 MIOpt.ReadToken(); 4001 return LexIdentifierContinue(Result, CurPtr); 4002 4003 case '$': // $ in identifiers. 4004 if (LangOpts.DollarIdents) { 4005 if (!isLexingRawMode()) 4006 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 4007 // Notify MIOpt that we read a non-whitespace/non-comment token. 4008 MIOpt.ReadToken(); 4009 return LexIdentifierContinue(Result, CurPtr); 4010 } 4011 4012 Kind = tok::unknown; 4013 break; 4014 4015 // C99 6.4.4: Character Constants. 4016 case '\'': 4017 // Notify MIOpt that we read a non-whitespace/non-comment token. 4018 MIOpt.ReadToken(); 4019 return LexCharConstant(Result, CurPtr, tok::char_constant); 4020 4021 // C99 6.4.5: String Literals. 4022 case '"': 4023 // Notify MIOpt that we read a non-whitespace/non-comment token. 
4024 MIOpt.ReadToken(); 4025 return LexStringLiteral(Result, CurPtr, 4026 ParsingFilename ? tok::header_name 4027 : tok::string_literal); 4028 4029 // C99 6.4.6: Punctuators. 4030 case '?': 4031 Kind = tok::question; 4032 break; 4033 case '[': 4034 Kind = tok::l_square; 4035 break; 4036 case ']': 4037 Kind = tok::r_square; 4038 break; 4039 case '(': 4040 Kind = tok::l_paren; 4041 break; 4042 case ')': 4043 Kind = tok::r_paren; 4044 break; 4045 case '{': 4046 Kind = tok::l_brace; 4047 break; 4048 case '}': 4049 Kind = tok::r_brace; 4050 break; 4051 case '.': 4052 Char = getCharAndSize(CurPtr, SizeTmp); 4053 if (Char >= '0' && Char <= '9') { 4054 // Notify MIOpt that we read a non-whitespace/non-comment token. 4055 MIOpt.ReadToken(); 4056 4057 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 4058 } else if (LangOpts.CPlusPlus && Char == '*') { 4059 Kind = tok::periodstar; 4060 CurPtr += SizeTmp; 4061 } else if (Char == '.' && 4062 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 4063 Kind = tok::ellipsis; 4064 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4065 SizeTmp2, Result); 4066 } else { 4067 Kind = tok::period; 4068 } 4069 break; 4070 case '&': 4071 Char = getCharAndSize(CurPtr, SizeTmp); 4072 if (Char == '&') { 4073 Kind = tok::ampamp; 4074 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4075 } else if (Char == '=') { 4076 Kind = tok::ampequal; 4077 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4078 } else { 4079 Kind = tok::amp; 4080 } 4081 break; 4082 case '*': 4083 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 4084 Kind = tok::starequal; 4085 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4086 } else { 4087 Kind = tok::star; 4088 } 4089 break; 4090 case '+': 4091 Char = getCharAndSize(CurPtr, SizeTmp); 4092 if (Char == '+') { 4093 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4094 Kind = tok::plusplus; 4095 } else if (Char == '=') { 4096 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4097 Kind = tok::plusequal; 4098 } 
else { 4099 Kind = tok::plus; 4100 } 4101 break; 4102 case '-': 4103 Char = getCharAndSize(CurPtr, SizeTmp); 4104 if (Char == '-') { // -- 4105 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4106 Kind = tok::minusminus; 4107 } else if (Char == '>' && LangOpts.CPlusPlus && 4108 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 4109 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4110 SizeTmp2, Result); 4111 Kind = tok::arrowstar; 4112 } else if (Char == '>') { // -> 4113 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4114 Kind = tok::arrow; 4115 } else if (Char == '=') { // -= 4116 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4117 Kind = tok::minusequal; 4118 } else { 4119 Kind = tok::minus; 4120 } 4121 break; 4122 case '~': 4123 Kind = tok::tilde; 4124 break; 4125 case '!': 4126 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 4127 Kind = tok::exclaimequal; 4128 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4129 } else { 4130 Kind = tok::exclaim; 4131 } 4132 break; 4133 case '/': 4134 // 6.4.9: Comments 4135 Char = getCharAndSize(CurPtr, SizeTmp); 4136 if (Char == '/') { // Line comment. 4137 // Even if Line comments are disabled (e.g. in C89 mode), we generally 4138 // want to lex this as a comment. There is one problem with this though, 4139 // that in one particular corner case, this can change the behavior of the 4140 // resultant program. For example, In "foo //**/ bar", C89 would lex 4141 // this as "foo / bar" and languages with Line comments would lex it as 4142 // "foo". Check to see if the character after the second slash is a '*'. 4143 // If so, we will lex that as a "/" instead of the start of a comment. 4144 // However, we never do this if we are just preprocessing. 
4145 bool TreatAsComment = 4146 LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP); 4147 if (!TreatAsComment) 4148 if (!(PP && PP->isPreprocessedOutput())) 4149 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*'; 4150 4151 if (TreatAsComment) { 4152 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 4153 TokAtPhysicalStartOfLine)) 4154 return true; // There is a token to return. 4155 4156 // It is common for the tokens immediately after a // comment to be 4157 // whitespace (indentation for the next line). Instead of going through 4158 // the big switch, handle it efficiently now. 4159 goto SkipIgnoredUnits; 4160 } 4161 } 4162 4163 if (Char == '*') { // /**/ comment. 4164 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 4165 TokAtPhysicalStartOfLine)) 4166 return true; // There is a token to return. 4167 4168 // We only saw whitespace, so just try again with this lexer. 4169 // (We manually eliminate the tail call to avoid recursion.) 
4170 goto LexNextToken; 4171 } 4172 4173 if (Char == '=') { 4174 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4175 Kind = tok::slashequal; 4176 } else { 4177 Kind = tok::slash; 4178 } 4179 break; 4180 case '%': 4181 Char = getCharAndSize(CurPtr, SizeTmp); 4182 if (Char == '=') { 4183 Kind = tok::percentequal; 4184 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4185 } else if (LangOpts.Digraphs && Char == '>') { 4186 Kind = tok::r_brace; // '%>' -> '}' 4187 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4188 } else if (LangOpts.Digraphs && Char == ':') { 4189 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4190 Char = getCharAndSize(CurPtr, SizeTmp); 4191 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 4192 Kind = tok::hashhash; // '%:%:' -> '##' 4193 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4194 SizeTmp2, Result); 4195 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize 4196 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4197 if (!isLexingRawMode()) 4198 Diag(BufferPtr, diag::ext_charize_microsoft); 4199 Kind = tok::hashat; 4200 } else { // '%:' -> '#' 4201 // We parsed a # character. If this occurs at the start of the line, 4202 // it's actually the start of a preprocessing directive. Callback to 4203 // the preprocessor to handle it. 4204 // TODO: -fpreprocessed mode?? 
4205 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 4206 goto HandleDirective; 4207 4208 Kind = tok::hash; 4209 } 4210 } else { 4211 Kind = tok::percent; 4212 } 4213 break; 4214 case '<': 4215 Char = getCharAndSize(CurPtr, SizeTmp); 4216 if (ParsingFilename) { 4217 return LexAngledStringLiteral(Result, CurPtr); 4218 } else if (Char == '<') { 4219 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 4220 if (After == '=') { 4221 Kind = tok::lesslessequal; 4222 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4223 SizeTmp2, Result); 4224 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 4225 // If this is actually a '<<<<<<<' version control conflict marker, 4226 // recognize it as such and recover nicely. 4227 goto LexNextToken; 4228 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 4229 // If this is '<<<<' and we're in a Perforce-style conflict marker, 4230 // ignore it. 4231 goto LexNextToken; 4232 } else if (LangOpts.CUDA && After == '<') { 4233 Kind = tok::lesslessless; 4234 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4235 SizeTmp2, Result); 4236 } else { 4237 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4238 Kind = tok::lessless; 4239 } 4240 } else if (Char == '=') { 4241 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 4242 if (After == '>') { 4243 if (LangOpts.CPlusPlus20) { 4244 if (!isLexingRawMode()) 4245 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship); 4246 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4247 SizeTmp2, Result); 4248 Kind = tok::spaceship; 4249 break; 4250 } 4251 // Suggest adding a space between the '<=' and the '>' to avoid a 4252 // change in semantics if this turns up in C++ <=17 mode. 
4253 if (LangOpts.CPlusPlus && !isLexingRawMode()) { 4254 Diag(BufferPtr, diag::warn_cxx20_compat_spaceship) 4255 << FixItHint::CreateInsertion( 4256 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " "); 4257 } 4258 } 4259 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4260 Kind = tok::lessequal; 4261 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' 4262 if (LangOpts.CPlusPlus11 && 4263 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 4264 // C++0x [lex.pptoken]p3: 4265 // Otherwise, if the next three characters are <:: and the subsequent 4266 // character is neither : nor >, the < is treated as a preprocessor 4267 // token by itself and not as the first character of the alternative 4268 // token <:. 4269 unsigned SizeTmp3; 4270 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 4271 if (After != ':' && After != '>') { 4272 Kind = tok::less; 4273 if (!isLexingRawMode()) 4274 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 4275 break; 4276 } 4277 } 4278 4279 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4280 Kind = tok::l_square; 4281 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' 4282 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4283 Kind = tok::l_brace; 4284 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 && 4285 lexEditorPlaceholder(Result, CurPtr)) { 4286 return true; 4287 } else { 4288 Kind = tok::less; 4289 } 4290 break; 4291 case '>': 4292 Char = getCharAndSize(CurPtr, SizeTmp); 4293 if (Char == '=') { 4294 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4295 Kind = tok::greaterequal; 4296 } else if (Char == '>') { 4297 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 4298 if (After == '=') { 4299 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4300 SizeTmp2, Result); 4301 Kind = tok::greatergreaterequal; 4302 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 4303 // If this is actually a '>>>>' conflict marker, recognize it as such 4304 // and recover 
nicely. 4305 goto LexNextToken; 4306 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 4307 // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 4308 goto LexNextToken; 4309 } else if (LangOpts.CUDA && After == '>') { 4310 Kind = tok::greatergreatergreater; 4311 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 4312 SizeTmp2, Result); 4313 } else { 4314 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4315 Kind = tok::greatergreater; 4316 } 4317 } else { 4318 Kind = tok::greater; 4319 } 4320 break; 4321 case '^': 4322 Char = getCharAndSize(CurPtr, SizeTmp); 4323 if (Char == '=') { 4324 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4325 Kind = tok::caretequal; 4326 } else if (LangOpts.OpenCL && Char == '^') { 4327 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4328 Kind = tok::caretcaret; 4329 } else { 4330 Kind = tok::caret; 4331 } 4332 break; 4333 case '|': 4334 Char = getCharAndSize(CurPtr, SizeTmp); 4335 if (Char == '=') { 4336 Kind = tok::pipeequal; 4337 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4338 } else if (Char == '|') { 4339 // If this is '|||||||' and we're in a conflict marker, ignore it. 4340 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 4341 goto LexNextToken; 4342 Kind = tok::pipepipe; 4343 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4344 } else { 4345 Kind = tok::pipe; 4346 } 4347 break; 4348 case ':': 4349 Char = getCharAndSize(CurPtr, SizeTmp); 4350 if (LangOpts.Digraphs && Char == '>') { 4351 Kind = tok::r_square; // ':>' -> ']' 4352 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4353 } else if (Char == ':') { 4354 Kind = tok::coloncolon; 4355 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4356 } else { 4357 Kind = tok::colon; 4358 } 4359 break; 4360 case ';': 4361 Kind = tok::semi; 4362 break; 4363 case '=': 4364 Char = getCharAndSize(CurPtr, SizeTmp); 4365 if (Char == '=') { 4366 // If this is '====' and we're in a conflict marker, ignore it. 
4367 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 4368 goto LexNextToken; 4369 4370 Kind = tok::equalequal; 4371 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4372 } else { 4373 Kind = tok::equal; 4374 } 4375 break; 4376 case ',': 4377 Kind = tok::comma; 4378 break; 4379 case '#': 4380 Char = getCharAndSize(CurPtr, SizeTmp); 4381 if (Char == '#') { 4382 Kind = tok::hashhash; 4383 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4384 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize 4385 Kind = tok::hashat; 4386 if (!isLexingRawMode()) 4387 Diag(BufferPtr, diag::ext_charize_microsoft); 4388 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 4389 } else { 4390 // We parsed a # character. If this occurs at the start of the line, 4391 // it's actually the start of a preprocessing directive. Callback to 4392 // the preprocessor to handle it. 4393 // TODO: -fpreprocessed mode?? 4394 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 4395 goto HandleDirective; 4396 4397 Kind = tok::hash; 4398 } 4399 break; 4400 4401 case '@': 4402 // Objective C support. 4403 if (CurPtr[-1] == '@' && LangOpts.ObjC) 4404 Kind = tok::at; 4405 else 4406 Kind = tok::unknown; 4407 break; 4408 4409 // UCNs (C99 6.4.3, C++11 [lex.charset]p2) 4410 case '\\': 4411 if (!LangOpts.AsmPreprocessor) { 4412 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) { 4413 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 4414 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 4415 return true; // KeepWhitespaceMode 4416 4417 // We only saw whitespace, so just try again with this lexer. 4418 // (We manually eliminate the tail call to avoid recursion.) 
4419 goto LexNextToken; 4420 } 4421 4422 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); 4423 } 4424 } 4425 4426 Kind = tok::unknown; 4427 break; 4428 4429 default: { 4430 if (isASCII(Char)) { 4431 Kind = tok::unknown; 4432 break; 4433 } 4434 4435 llvm::UTF32 CodePoint; 4436 4437 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to 4438 // an escaped newline. 4439 --CurPtr; 4440 llvm::ConversionResult Status = 4441 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, 4442 (const llvm::UTF8 *)BufferEnd, 4443 &CodePoint, 4444 llvm::strictConversion); 4445 if (Status == llvm::conversionOK) { 4446 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 4447 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 4448 return true; // KeepWhitespaceMode 4449 4450 // We only saw whitespace, so just try again with this lexer. 4451 // (We manually eliminate the tail call to avoid recursion.) 4452 goto LexNextToken; 4453 } 4454 return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); 4455 } 4456 4457 if (isLexingRawMode() || ParsingPreprocessorDirective || 4458 PP->isPreprocessedOutput()) { 4459 ++CurPtr; 4460 Kind = tok::unknown; 4461 break; 4462 } 4463 4464 // Non-ASCII characters tend to creep into source code unintentionally. 4465 // Instead of letting the parser complain about the unknown token, 4466 // just diagnose the invalid UTF-8, then drop the character. 4467 Diag(CurPtr, diag::err_invalid_utf8); 4468 4469 BufferPtr = CurPtr+1; 4470 // We're pretending the character didn't exist, so just try again with 4471 // this lexer. 4472 // (We manually eliminate the tail call to avoid recursion.) 4473 goto LexNextToken; 4474 } 4475 } 4476 4477 // Notify MIOpt that we read a non-whitespace/non-comment token. 4478 MIOpt.ReadToken(); 4479 4480 // Update the location of token as well as BufferPtr. 
4481 FormTokenWithChars(Result, CurPtr, Kind); 4482 return true; 4483 4484 HandleDirective: 4485 // We parsed a # character and it's the start of a preprocessing directive. 4486 4487 FormTokenWithChars(Result, CurPtr, tok::hash); 4488 PP->HandleDirective(Result); 4489 4490 if (PP->hadModuleLoaderFatalFailure()) 4491 // With a fatal failure in the module loader, we abort parsing. 4492 return true; 4493 4494 // We parsed the directive; lex a token with the new state. 4495 return false; 4496 4497 LexNextToken: 4498 Result.clearFlag(Token::NeedsCleaning); 4499 goto LexStart; 4500 } 4501 4502 const char *Lexer::convertDependencyDirectiveToken( 4503 const dependency_directives_scan::Token &DDTok, Token &Result) { 4504 const char *TokPtr = BufferStart + DDTok.Offset; 4505 Result.startToken(); 4506 Result.setLocation(getSourceLocation(TokPtr)); 4507 Result.setKind(DDTok.Kind); 4508 Result.setFlag((Token::TokenFlags)DDTok.Flags); 4509 Result.setLength(DDTok.Length); 4510 BufferPtr = TokPtr + DDTok.Length; 4511 return TokPtr; 4512 } 4513 4514 bool Lexer::LexDependencyDirectiveToken(Token &Result) { 4515 assert(isDependencyDirectivesLexer()); 4516 4517 using namespace dependency_directives_scan; 4518 4519 while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) { 4520 if (DepDirectives.front().Kind == pp_eof) 4521 return LexEndOfFile(Result, BufferEnd); 4522 if (DepDirectives.front().Kind == tokens_present_before_eof) 4523 MIOpt.ReadToken(); 4524 NextDepDirectiveTokenIndex = 0; 4525 DepDirectives = DepDirectives.drop_front(); 4526 } 4527 4528 const dependency_directives_scan::Token &DDTok = 4529 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++]; 4530 if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) { 4531 // Read something other than a preprocessor directive hash. 
4532 MIOpt.ReadToken(); 4533 } 4534 4535 if (ParsingFilename && DDTok.is(tok::less)) { 4536 BufferPtr = BufferStart + DDTok.Offset; 4537 LexAngledStringLiteral(Result, BufferPtr + 1); 4538 if (Result.isNot(tok::header_name)) 4539 return true; 4540 // Advance the index of lexed tokens. 4541 while (true) { 4542 const dependency_directives_scan::Token &NextTok = 4543 DepDirectives.front().Tokens[NextDepDirectiveTokenIndex]; 4544 if (BufferStart + NextTok.Offset >= BufferPtr) 4545 break; 4546 ++NextDepDirectiveTokenIndex; 4547 } 4548 return true; 4549 } 4550 4551 const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result); 4552 4553 if (Result.is(tok::hash) && Result.isAtStartOfLine()) { 4554 PP->HandleDirective(Result); 4555 return false; 4556 } 4557 if (Result.is(tok::raw_identifier)) { 4558 Result.setRawIdentifierData(TokPtr); 4559 if (!isLexingRawMode()) { 4560 const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 4561 if (II->isHandleIdentifierCase()) 4562 return PP->HandleIdentifier(Result); 4563 } 4564 return true; 4565 } 4566 if (Result.isLiteral()) { 4567 Result.setLiteralData(TokPtr); 4568 return true; 4569 } 4570 if (Result.is(tok::colon)) { 4571 // Convert consecutive colons to 'tok::coloncolon'. 
4572 if (*BufferPtr == ':') { 4573 assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( 4574 tok::colon)); 4575 ++NextDepDirectiveTokenIndex; 4576 Result.setKind(tok::coloncolon); 4577 } 4578 return true; 4579 } 4580 if (Result.is(tok::eod)) 4581 ParsingPreprocessorDirective = false; 4582 4583 return true; 4584 } 4585 4586 bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) { 4587 assert(isDependencyDirectivesLexer()); 4588 4589 using namespace dependency_directives_scan; 4590 4591 bool Stop = false; 4592 unsigned NestedIfs = 0; 4593 do { 4594 DepDirectives = DepDirectives.drop_front(); 4595 switch (DepDirectives.front().Kind) { 4596 case pp_none: 4597 llvm_unreachable("unexpected 'pp_none'"); 4598 case pp_include: 4599 case pp___include_macros: 4600 case pp_define: 4601 case pp_undef: 4602 case pp_import: 4603 case pp_pragma_import: 4604 case pp_pragma_once: 4605 case pp_pragma_push_macro: 4606 case pp_pragma_pop_macro: 4607 case pp_pragma_include_alias: 4608 case pp_pragma_system_header: 4609 case pp_include_next: 4610 case decl_at_import: 4611 case cxx_module_decl: 4612 case cxx_import_decl: 4613 case cxx_export_module_decl: 4614 case cxx_export_import_decl: 4615 case tokens_present_before_eof: 4616 break; 4617 case pp_if: 4618 case pp_ifdef: 4619 case pp_ifndef: 4620 ++NestedIfs; 4621 break; 4622 case pp_elif: 4623 case pp_elifdef: 4624 case pp_elifndef: 4625 case pp_else: 4626 if (!NestedIfs) { 4627 Stop = true; 4628 } 4629 break; 4630 case pp_endif: 4631 if (!NestedIfs) { 4632 Stop = true; 4633 } else { 4634 --NestedIfs; 4635 } 4636 break; 4637 case pp_eof: 4638 NextDepDirectiveTokenIndex = 0; 4639 return LexEndOfFile(Result, BufferEnd); 4640 } 4641 } while (!Stop); 4642 4643 const dependency_directives_scan::Token &DDTok = 4644 DepDirectives.front().Tokens.front(); 4645 assert(DDTok.is(tok::hash)); 4646 NextDepDirectiveTokenIndex = 1; 4647 4648 convertDependencyDirectiveToken(DDTok, Result); 4649 return false; 4650 } 4651