1 //===- Lexer.cpp - C Language Family Lexer --------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This file implements the Lexer and Token interfaces. 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "clang/Lex/Lexer.h" 14 #include "UnicodeCharSets.h" 15 #include "clang/Basic/CharInfo.h" 16 #include "clang/Basic/IdentifierTable.h" 17 #include "clang/Basic/LangOptions.h" 18 #include "clang/Basic/SourceLocation.h" 19 #include "clang/Basic/SourceManager.h" 20 #include "clang/Basic/TokenKinds.h" 21 #include "clang/Lex/LexDiagnostic.h" 22 #include "clang/Lex/LiteralSupport.h" 23 #include "clang/Lex/MultipleIncludeOpt.h" 24 #include "clang/Lex/Preprocessor.h" 25 #include "clang/Lex/PreprocessorOptions.h" 26 #include "clang/Lex/Token.h" 27 #include "clang/Basic/Diagnostic.h" 28 #include "clang/Basic/LLVM.h" 29 #include "clang/Basic/TokenKinds.h" 30 #include "llvm/ADT/None.h" 31 #include "llvm/ADT/Optional.h" 32 #include "llvm/ADT/StringExtras.h" 33 #include "llvm/ADT/StringSwitch.h" 34 #include "llvm/ADT/StringRef.h" 35 #include "llvm/Support/Compiler.h" 36 #include "llvm/Support/ConvertUTF.h" 37 #include "llvm/Support/MathExtras.h" 38 #include "llvm/Support/MemoryBuffer.h" 39 #include "llvm/Support/NativeFormatting.h" 40 #include "llvm/Support/UnicodeCharRanges.h" 41 #include <algorithm> 42 #include <cassert> 43 #include <cstddef> 44 #include <cstdint> 45 #include <cstring> 46 #include <string> 47 #include <tuple> 48 #include <utility> 49 50 using namespace clang; 51 52 //===----------------------------------------------------------------------===// 53 // Token Class Implementation 54 
//===----------------------------------------------------------------------===// 55 56 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 57 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 58 if (isAnnotation()) 59 return false; 60 if (IdentifierInfo *II = getIdentifierInfo()) 61 return II->getObjCKeywordID() == objcKey; 62 return false; 63 } 64 65 /// getObjCKeywordID - Return the ObjC keyword kind. 66 tok::ObjCKeywordKind Token::getObjCKeywordID() const { 67 if (isAnnotation()) 68 return tok::objc_not_keyword; 69 IdentifierInfo *specId = getIdentifierInfo(); 70 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 71 } 72 73 //===----------------------------------------------------------------------===// 74 // Lexer Class Implementation 75 //===----------------------------------------------------------------------===// 76 77 void Lexer::anchor() {} 78 79 void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 80 const char *BufEnd) { 81 BufferStart = BufStart; 82 BufferPtr = BufPtr; 83 BufferEnd = BufEnd; 84 85 assert(BufEnd[0] == 0 && 86 "We assume that the input buffer has a null character at the end" 87 " to simplify lexing!"); 88 89 // Check whether we have a BOM in the beginning of the buffer. If yes - act 90 // accordingly. Right now we support only UTF-8 with and without BOM, so, just 91 // skip the UTF-8 BOM if it's present. 92 if (BufferStart == BufferPtr) { 93 // Determine the size of the BOM. 94 StringRef Buf(BufferStart, BufferEnd - BufferStart); 95 size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 96 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 97 .Default(0); 98 99 // Skip the BOM. 100 BufferPtr += BOMLength; 101 } 102 103 Is_PragmaLexer = false; 104 CurrentConflictMarkerState = CMK_None; 105 106 // Start of the file is a start of line. 
107 IsAtStartOfLine = true; 108 IsAtPhysicalStartOfLine = true; 109 110 HasLeadingSpace = false; 111 HasLeadingEmptyMacro = false; 112 113 // We are not after parsing a #. 114 ParsingPreprocessorDirective = false; 115 116 // We are not after parsing #include. 117 ParsingFilename = false; 118 119 // We are not in raw mode. Raw mode disables diagnostics and interpretation 120 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 121 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 122 // or otherwise skipping over tokens. 123 LexingRawMode = false; 124 125 // Default to not keeping comments. 126 ExtendedTokenMode = 0; 127 } 128 129 /// Lexer constructor - Create a new lexer object for the specified buffer 130 /// with the specified preprocessor managing the lexing process. This lexer 131 /// assumes that the associated file buffer and Preprocessor objects will 132 /// outlive it, so it doesn't take ownership of either of them. 133 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP) 134 : PreprocessorLexer(&PP, FID), 135 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 136 LangOpts(PP.getLangOpts()) { 137 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 138 InputFile->getBufferEnd()); 139 140 resetExtendedTokenMode(); 141 } 142 143 /// Lexer constructor - Create a new raw lexer object. This object is only 144 /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 145 /// range will outlive it, so it doesn't take ownership of it. 146 Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, 147 const char *BufStart, const char *BufPtr, const char *BufEnd) 148 : FileLoc(fileloc), LangOpts(langOpts) { 149 InitLexer(BufStart, BufPtr, BufEnd); 150 151 // We *are* in raw mode. 152 LexingRawMode = true; 153 } 154 155 /// Lexer constructor - Create a new raw lexer object. This object is only 156 /// suitable for calls to 'LexFromRawLexer'. 
This lexer assumes that the text 157 /// range will outlive it, so it doesn't take ownership of it. 158 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile, 159 const SourceManager &SM, const LangOptions &langOpts) 160 : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(), 161 FromFile->getBufferStart(), FromFile->getBufferEnd()) {} 162 163 void Lexer::resetExtendedTokenMode() { 164 assert(PP && "Cannot reset token mode without a preprocessor"); 165 if (LangOpts.TraditionalCPP) 166 SetKeepWhitespaceMode(true); 167 else 168 SetCommentRetentionState(PP->getCommentRetentionState()); 169 } 170 171 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 172 /// _Pragma expansion. This has a variety of magic semantics that this method 173 /// sets up. It returns a new'd Lexer that must be delete'd when done. 174 /// 175 /// On entrance to this routine, TokStartLoc is a macro location which has a 176 /// spelling loc that indicates the bytes to be lexed for the token and an 177 /// expansion location that indicates where all lexed tokens should be 178 /// "expanded from". 179 /// 180 /// TODO: It would really be nice to make _Pragma just be a wrapper around a 181 /// normal lexer that remaps tokens as they fly by. This would require making 182 /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 183 /// interface that could handle this stuff. This would pull GetMappedTokenLoc 184 /// out of the critical path of the lexer! 185 /// 186 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 187 SourceLocation ExpansionLocStart, 188 SourceLocation ExpansionLocEnd, 189 unsigned TokLen, Preprocessor &PP) { 190 SourceManager &SM = PP.getSourceManager(); 191 192 // Create the lexer as if we were going to lex the file normally. 
193 FileID SpellingFID = SM.getFileID(SpellingLoc); 194 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); 195 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 196 197 // Now that the lexer is created, change the start/end locations so that we 198 // just lex the subsection of the file that we want. This is lexing from a 199 // scratch buffer. 200 const char *StrData = SM.getCharacterData(SpellingLoc); 201 202 L->BufferPtr = StrData; 203 L->BufferEnd = StrData+TokLen; 204 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 205 206 // Set the SourceLocation with the remapping information. This ensures that 207 // GetMappedTokenLoc will remap the tokens as they are lexed. 208 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 209 ExpansionLocStart, 210 ExpansionLocEnd, TokLen); 211 212 // Ensure that the lexer thinks it is inside a directive, so that end \n will 213 // return an EOD token. 214 L->ParsingPreprocessorDirective = true; 215 216 // This lexer really is for _Pragma. 217 L->Is_PragmaLexer = true; 218 return L; 219 } 220 221 bool Lexer::skipOver(unsigned NumBytes) { 222 IsAtPhysicalStartOfLine = true; 223 IsAtStartOfLine = true; 224 if ((BufferPtr + NumBytes) > BufferEnd) 225 return true; 226 BufferPtr += NumBytes; 227 return false; 228 } 229 230 template <typename T> static void StringifyImpl(T &Str, char Quote) { 231 typename T::size_type i = 0, e = Str.size(); 232 while (i < e) { 233 if (Str[i] == '\\' || Str[i] == Quote) { 234 Str.insert(Str.begin() + i, '\\'); 235 i += 2; 236 ++e; 237 } else if (Str[i] == '\n' || Str[i] == '\r') { 238 // Replace '\r\n' and '\n\r' to '\\' followed by 'n'. 239 if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && 240 Str[i] != Str[i + 1]) { 241 Str[i] = '\\'; 242 Str[i + 1] = 'n'; 243 } else { 244 // Replace '\n' and '\r' to '\\' followed by 'n'. 
245 Str[i] = '\\'; 246 Str.insert(Str.begin() + i + 1, 'n'); 247 ++e; 248 } 249 i += 2; 250 } else 251 ++i; 252 } 253 } 254 255 std::string Lexer::Stringify(StringRef Str, bool Charify) { 256 std::string Result = Str; 257 char Quote = Charify ? '\'' : '"'; 258 StringifyImpl(Result, Quote); 259 return Result; 260 } 261 262 void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); } 263 264 //===----------------------------------------------------------------------===// 265 // Token Spelling 266 //===----------------------------------------------------------------------===// 267 268 /// Slow case of getSpelling. Extract the characters comprising the 269 /// spelling of this token from the provided input buffer. 270 static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, 271 const LangOptions &LangOpts, char *Spelling) { 272 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); 273 274 size_t Length = 0; 275 const char *BufEnd = BufPtr + Tok.getLength(); 276 277 if (tok::isStringLiteral(Tok.getKind())) { 278 // Munch the encoding-prefix and opening double-quote. 279 while (BufPtr < BufEnd) { 280 unsigned Size; 281 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 282 BufPtr += Size; 283 284 if (Spelling[Length - 1] == '"') 285 break; 286 } 287 288 // Raw string literals need special handling; trigraph expansion and line 289 // splicing do not occur within their d-char-sequence nor within their 290 // r-char-sequence. 291 if (Length >= 2 && 292 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { 293 // Search backwards from the end of the token to find the matching closing 294 // quote. 295 const char *RawEnd = BufEnd; 296 do --RawEnd; while (*RawEnd != '"'); 297 size_t RawLength = RawEnd - BufPtr + 1; 298 299 // Everything between the quotes is included verbatim in the spelling. 
300 memcpy(Spelling + Length, BufPtr, RawLength); 301 Length += RawLength; 302 BufPtr += RawLength; 303 304 // The rest of the token is lexed normally. 305 } 306 } 307 308 while (BufPtr < BufEnd) { 309 unsigned Size; 310 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 311 BufPtr += Size; 312 } 313 314 assert(Length < Tok.getLength() && 315 "NeedsCleaning flag set on token that didn't need cleaning!"); 316 return Length; 317 } 318 319 /// getSpelling() - Return the 'spelling' of this token. The spelling of a 320 /// token are the characters used to represent the token in the source file 321 /// after trigraph expansion and escaped-newline folding. In particular, this 322 /// wants to get the true, uncanonicalized, spelling of things like digraphs 323 /// UCNs, etc. 324 StringRef Lexer::getSpelling(SourceLocation loc, 325 SmallVectorImpl<char> &buffer, 326 const SourceManager &SM, 327 const LangOptions &options, 328 bool *invalid) { 329 // Break down the source location. 330 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 331 332 // Try to the load the file buffer. 333 bool invalidTemp = false; 334 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 335 if (invalidTemp) { 336 if (invalid) *invalid = true; 337 return {}; 338 } 339 340 const char *tokenBegin = file.data() + locInfo.second; 341 342 // Lex from the start of the given location. 343 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 344 file.begin(), tokenBegin, file.end()); 345 Token token; 346 lexer.LexFromRawLexer(token); 347 348 unsigned length = token.getLength(); 349 350 // Common case: no need for cleaning. 351 if (!token.needsCleaning()) 352 return StringRef(tokenBegin, length); 353 354 // Hard case, we need to relex the characters into the string. 
355 buffer.resize(length); 356 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); 357 return StringRef(buffer.data(), buffer.size()); 358 } 359 360 /// getSpelling() - Return the 'spelling' of this token. The spelling of a 361 /// token are the characters used to represent the token in the source file 362 /// after trigraph expansion and escaped-newline folding. In particular, this 363 /// wants to get the true, uncanonicalized, spelling of things like digraphs 364 /// UCNs, etc. 365 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 366 const LangOptions &LangOpts, bool *Invalid) { 367 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 368 369 bool CharDataInvalid = false; 370 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 371 &CharDataInvalid); 372 if (Invalid) 373 *Invalid = CharDataInvalid; 374 if (CharDataInvalid) 375 return {}; 376 377 // If this token contains nothing interesting, return it directly. 378 if (!Tok.needsCleaning()) 379 return std::string(TokStart, TokStart + Tok.getLength()); 380 381 std::string Result; 382 Result.resize(Tok.getLength()); 383 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); 384 return Result; 385 } 386 387 /// getSpelling - This method is used to get the spelling of a token into a 388 /// preallocated buffer, instead of as an std::string. The caller is required 389 /// to allocate enough space for the token, which is guaranteed to be at least 390 /// Tok.getLength() bytes long. The actual length of the token is returned. 391 /// 392 /// Note that this method may do two possible things: it may either fill in 393 /// the buffer specified with characters, or it may *change the input pointer* 394 /// to point to a constant buffer with the data already in it (avoiding a 395 /// copy). The caller is not allowed to modify the returned buffer pointer 396 /// if an internal buffer is returned. 
397 unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 398 const SourceManager &SourceMgr, 399 const LangOptions &LangOpts, bool *Invalid) { 400 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 401 402 const char *TokStart = nullptr; 403 // NOTE: this has to be checked *before* testing for an IdentifierInfo. 404 if (Tok.is(tok::raw_identifier)) 405 TokStart = Tok.getRawIdentifier().data(); 406 else if (!Tok.hasUCN()) { 407 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 408 // Just return the string from the identifier table, which is very quick. 409 Buffer = II->getNameStart(); 410 return II->getLength(); 411 } 412 } 413 414 // NOTE: this can be checked even after testing for an IdentifierInfo. 415 if (Tok.isLiteral()) 416 TokStart = Tok.getLiteralData(); 417 418 if (!TokStart) { 419 // Compute the start of the token in the input lexer buffer. 420 bool CharDataInvalid = false; 421 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 422 if (Invalid) 423 *Invalid = CharDataInvalid; 424 if (CharDataInvalid) { 425 Buffer = ""; 426 return 0; 427 } 428 } 429 430 // If this token contains nothing interesting, return it directly. 431 if (!Tok.needsCleaning()) { 432 Buffer = TokStart; 433 return Tok.getLength(); 434 } 435 436 // Otherwise, hard case, relex the characters into the string. 437 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); 438 } 439 440 /// MeasureTokenLength - Relex the token at the specified location and return 441 /// its length in bytes in the input file. If the token needs cleaning (e.g. 442 /// includes a trigraph or an escaped newline) then this count includes bytes 443 /// that are part of that. 
444 unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 445 const SourceManager &SM, 446 const LangOptions &LangOpts) { 447 Token TheTok; 448 if (getRawToken(Loc, TheTok, SM, LangOpts)) 449 return 0; 450 return TheTok.getLength(); 451 } 452 453 /// Relex the token at the specified location. 454 /// \returns true if there was a failure, false on success. 455 bool Lexer::getRawToken(SourceLocation Loc, Token &Result, 456 const SourceManager &SM, 457 const LangOptions &LangOpts, 458 bool IgnoreWhiteSpace) { 459 // TODO: this could be special cased for common tokens like identifiers, ')', 460 // etc to make this faster, if it mattered. Just look at StrData[0] to handle 461 // all obviously single-char tokens. This could use 462 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 463 // something. 464 465 // If this comes from a macro expansion, we really do want the macro name, not 466 // the token this macro expanded to. 467 Loc = SM.getExpansionLoc(Loc); 468 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 469 bool Invalid = false; 470 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 471 if (Invalid) 472 return true; 473 474 const char *StrData = Buffer.data()+LocInfo.second; 475 476 if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) 477 return true; 478 479 // Create a lexer starting at the beginning of this token. 480 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 481 Buffer.begin(), StrData, Buffer.end()); 482 TheLexer.SetCommentRetentionState(true); 483 TheLexer.LexFromRawLexer(Result); 484 return false; 485 } 486 487 /// Returns the pointer that points to the beginning of line that contains 488 /// the given offset, or null if the offset if invalid. 
static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
  const char *BufStart = Buffer.data();
  // An offset at or past the end of the buffer is rejected.
  if (Offset >= Buffer.size())
    return nullptr;

  // Scan backwards from the offset. The scan stops either at the start of the
  // buffer or just after the nearest vertical-whitespace character that is
  // not an escaped newline.
  const char *LexStart = BufStart + Offset;
  for (; LexStart != BufStart; --LexStart) {
    if (isVerticalWhitespace(LexStart[0]) &&
        !Lexer::isNewLineEscaped(BufStart, LexStart)) {
      // LexStart should point at first character of logical line.
      ++LexStart;
      break;
    }
  }
  return LexStart;
}

/// Given a file location \p Loc, return the location of the start of the
/// token that contains it. \p Loc itself is returned when the buffer cannot
/// be loaded, when \p Loc already sits at the start of its line, or when it
/// does not fall inside any token (e.g. it points into whitespace).
static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
                                              const SourceManager &SM,
                                              const LangOptions &LangOpts) {
  assert(Loc.isFileID());
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return Loc;

  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return Loc;

  // Back up from the current location until we hit the beginning of a line
  // (or the buffer). We'll relex from that point.
  const char *StrData = Buffer.data() + LocInfo.second;
  const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
  if (!LexStart || LexStart == StrData)
    return Loc;

  // Create a lexer starting at the beginning of this token.
  // Subtracting the offset from Loc yields the location of the buffer start,
  // which is what the raw lexer uses as its base location.
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
  Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  // Lex tokens until we find the token that contains the source location.
  Token TheTok;
  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (TheLexer.getBufferLocation() > StrData) {
      // Lexing this token has taken the lexer past the source location we're
      // looking for. If the current token encompasses our source location,
      // return the beginning of that token.
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
        return TheTok.getLocation();

      // We ended up skipping over the source location entirely, which means
      // that it points into whitespace. We're done here.
      break;
    }
  } while (TheTok.getKind() != tok::eof);

  // We've passed our source location; just return the original source location.
  return Loc;
}

/// Return the location of the start of the token containing \p Loc. File
/// locations are handled directly. Macro-argument expansion locations are
/// resolved through their spelling location and the resulting backwards
/// adjustment is applied to \p Loc. Any other macro location is returned
/// unchanged.
SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isFileID())
    return getBeginningOfFileToken(Loc, SM, LangOpts);

  if (!SM.isMacroArgExpansion(Loc))
    return Loc;

  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
  std::pair<FileID, unsigned> BeginFileLocInfo =
      SM.getDecomposedLoc(BeginFileLoc);
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
         FileLocInfo.second >= BeginFileLocInfo.second);
  // The difference is zero or "negative" (computed in unsigned arithmetic);
  // it moves Loc back by the same distance the spelling location moved.
  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
}

namespace {

/// Classification of a preprocessor directive name seen while scanning for
/// the end of the preamble in Lexer::ComputePreamble.
enum PreambleDirectiveKind {
  PDK_Skipped, // A recognized directive; it stays part of the preamble.
  PDK_Unknown  // Not recognized; the preamble ends at its '#'.
};

} // namespace

/// Compute the extent of the "preamble" of \p Buffer: the leading run of
/// comments and recognized preprocessor directives. Scanning stops at the
/// first token that is neither, at an unrecognized directive, at end of file,
/// or, when \p MaxLines is non-zero, at the first line-starting token at or
/// beyond the offset that follows the MaxLines-th newline.
///
/// \returns a PreambleBounds built from the byte offset where the preamble
/// ends and whether the token that ended the scan was at the start of a line.
PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
                                      const LangOptions &LangOpts,
                                      unsigned MaxLines) {
  // Create a lexer starting at the beginning of the file. Note that we use a
  // "fake" file source location at offset 1 so that the lexer will track our
  // position within the file.
  const unsigned StartOffset = 1;
  SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
  Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
                 Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  bool InPreprocessorDirective = false;
  Token TheTok;
  // Location of the first comment in the run of comments currently being
  // scanned; reset to invalid whenever a directive starts.
  SourceLocation ActiveCommentLoc;

  // Translate the line limit into a byte offset: the position just past the
  // MaxLines-th '\n'. It stays 0 (meaning "no limit") when MaxLines is 0 or
  // when the scan reaches the end of the buffer first.
  unsigned MaxLineOffset = 0;
  if (MaxLines) {
    const char *CurPtr = Buffer.begin();
    unsigned CurLine = 0;
    while (CurPtr != Buffer.end()) {
      char ch = *CurPtr++;
      if (ch == '\n') {
        ++CurLine;
        if (CurLine == MaxLines)
          break;
      }
    }
    if (CurPtr != Buffer.end())
      MaxLineOffset = CurPtr - Buffer.begin();
  }

  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (InPreprocessorDirective) {
      // If we've hit the end of the file, we're done.
      if (TheTok.getKind() == tok::eof) {
        break;
      }

      // If we haven't hit the end of the preprocessor directive, skip this
      // token.
      if (!TheTok.isAtStartOfLine())
        continue;

      // We've passed the end of the preprocessor directive, and will look
      // at this token again below.
      InPreprocessorDirective = false;
    }

    // Keep track of the # of lines in the preamble.
    if (TheTok.isAtStartOfLine()) {
      // Undo the fake base offset to get the token's offset in Buffer.
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;

      // If we were asked to limit the number of lines in the preamble,
      // and we're about to exceed that limit, we're done.
      if (MaxLineOffset && TokOffset >= MaxLineOffset)
        break;
    }

    // Comments are okay; skip over them.
    if (TheTok.getKind() == tok::comment) {
      if (ActiveCommentLoc.isInvalid())
        ActiveCommentLoc = TheTok.getLocation();
      continue;
    }

    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
      // This is the start of a preprocessor directive.
      Token HashTok = TheTok;
      InPreprocessorDirective = true;
      ActiveCommentLoc = SourceLocation();

      // Figure out which directive this is. Since we're lexing raw tokens,
      // we don't have an identifier table available. Instead, just look at
      // the raw identifier to recognize and categorize preprocessor directives.
      TheLexer.LexFromRawLexer(TheTok);
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
        StringRef Keyword = TheTok.getRawIdentifier();
        PreambleDirectiveKind PDK
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
              .Case("include", PDK_Skipped)
              .Case("__include_macros", PDK_Skipped)
              .Case("define", PDK_Skipped)
              .Case("undef", PDK_Skipped)
              .Case("line", PDK_Skipped)
              .Case("error", PDK_Skipped)
              .Case("pragma", PDK_Skipped)
              .Case("import", PDK_Skipped)
              .Case("include_next", PDK_Skipped)
              .Case("warning", PDK_Skipped)
              .Case("ident", PDK_Skipped)
              .Case("sccs", PDK_Skipped)
              .Case("assert", PDK_Skipped)
              .Case("unassert", PDK_Skipped)
              .Case("if", PDK_Skipped)
              .Case("ifdef", PDK_Skipped)
              .Case("ifndef", PDK_Skipped)
              .Case("elif", PDK_Skipped)
              .Case("else", PDK_Skipped)
              .Case("endif", PDK_Skipped)
              .Default(PDK_Unknown);

        switch (PDK) {
        case PDK_Skipped:
          // 'continue' applies to the enclosing do-loop: keep scanning, with
          // the remainder of this directive skipped by the check at the top.
          continue;

        case PDK_Unknown:
          // We don't know what this directive is; stop at the '#'.
          break;
        }
      }

      // We only end up here if we didn't recognize the preprocessor
      // directive or it was one that can't occur in the preamble at this
      // point. Roll back the current token to the location of the '#'.
      TheTok = HashTok;
    }

    // We hit a token that we don't recognize as being in the
    // "preprocessing only" part of the file, so we're no longer in
    // the preamble.
    break;
  } while (true);

  SourceLocation End;
  if (ActiveCommentLoc.isValid())
    End = ActiveCommentLoc; // don't truncate a decl comment.
  else
    End = TheTok.getLocation();

  // Both locations carry the same fake base offset, so the difference is the
  // plain byte offset into Buffer.
  return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
                        TheTok.isAtStartOfLine());
}

/// Return the number of physical bytes that must be skipped from
/// \p TokStart to reach character number \p CharNo of the token, counting
/// the bytes consumed by trigraphs and escaped newlines. Returns 0 if the
/// character data for \p TokStart cannot be obtained.
unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts) {
  // Figure out how many physical characters away the specified expansion
  // character is. This needs to take into consideration newlines and
  // trigraphs.
  bool Invalid = false;
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

  // If they request the first char of the token, we're trivially done.
  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    return 0;

  unsigned PhysOffset = 0;

  // The usual case is that tokens don't contain anything interesting. Skip
  // over the uninteresting characters. If a token only consists of simple
  // chars, this method is extremely fast.
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    if (CharNo == 0)
      return PhysOffset;
    ++TokPtr;
    --CharNo;
    ++PhysOffset;
  }

  // If we have a character that may be a trigraph or escaped newline, use a
  // lexer to parse it correctly.
  for (; CharNo; --CharNo) {
    unsigned Size;
    // Only the size is needed here; the decoded character is discarded.
    Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
    TokPtr += Size;
    PhysOffset += Size;
  }

  // Final detail: if we end up on an escaped newline, we want to return the
  // location of the actual byte of the token. For example foo\<newline>bar
  // advanced by 3 should return the location of b, not of \\. One compounding
  // detail of this is that the escape may be made by a trigraph.
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;

  return PhysOffset;
}

/// Computes the source location just past the end of the
/// token at this source location.
///
/// This routine can be used to produce a source location that
/// points just past the end of the token referenced by \p Loc, and
/// is generally used when a diagnostic needs to point just after a
/// token where it expected something different than it received. If
/// the returned source location would not be meaningful (e.g., if
/// it points into a macro), this routine returns an invalid
/// source location.
///
/// \param Offset an offset from the end of the token, where the source
/// location should refer to. The default offset (0) produces a source
/// location pointing just past the end of the token; an offset of 1 produces
/// a source location pointing to the last character in the token, etc.
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isInvalid())
    return {};

  if (Loc.isMacroID()) {
    // On success isAtEndOfMacroExpansion rewrites Loc (via its out-parameter)
    // to the file location it reports as the end of the expansion.
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return {}; // Points inside the macro expansion.
  }

  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  // When Offset is at least the token length (including the case where the
  // measured length is 0), Loc is returned as-is.
  if (Len > Offset)
    Len = Len - Offset;
  else
    return Loc;

  return Loc.getLocWithOffset(Len);
}

/// Returns true if the given MacroID location points at the first
/// token of the macro expansion.
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
                                      const SourceManager &SM,
                                      const LangOptions &LangOpts,
                                      SourceLocation *MacroBegin) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation expansionLoc;
  if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions, this is the first.
    if (MacroBegin)
      *MacroBegin = expansionLoc;
    return true;
  }

  // The expansion location is itself a macro location; repeat the check one
  // level further out.
  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
}

/// Returns true if the given MacroID location points at the last
/// token of the macro expansion.
///
/// On success, \p MacroEnd (when non-null) receives the file location
/// reported for the end of the outermost expansion. Returns false if the
/// token at the spelling location measures as zero length.
bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
                                    const SourceManager &SM,
                                    const LangOptions &LangOpts,
                                    SourceLocation *MacroEnd) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  SourceLocation spellLoc = SM.getSpellingLoc(loc);
  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
  if (tokLen == 0)
    return false;

  // The end-of-expansion query is made with the location just past the token.
  SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
  SourceLocation expansionLoc;
  if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
    return false;

  if (expansionLoc.isFileID()) {
    // No other macro expansions.
    if (MacroEnd)
      *MacroEnd = expansionLoc;
    return true;
  }

  // The expansion location is itself a macro location; repeat the check one
  // level further out.
  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
}

/// Turn a range whose endpoints are both file locations into a character
/// range. A token range has its end moved just past the final token first.
/// Returns an invalid (default-constructed) range if that end cannot be
/// computed, if the begin location has no valid FileID, if the two endpoints
/// are not in the same FileID, or if the begin offset lies after the end
/// offset.
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
                                             const SourceManager &SM,
                                             const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  assert(Begin.isFileID() && End.isFileID());
  if (Range.isTokenRange()) {
    End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
    if (End.isInvalid())
      return {};
  }

  // Break down the source locations.
  FileID FID;
  unsigned BeginOffs;
  std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
  if (FID.isInvalid())
    return {};

  unsigned EndOffs;
  if (!SM.isInFileID(End, FID, &EndOffs) ||
      BeginOffs > EndOffs)
    return {};

  return CharSourceRange::getCharRange(Begin, End);
}

/// Convert \p Range into a character range with file-location endpoints, or
/// return an invalid (default-constructed) range if that is not possible.
///
/// A macro-location endpoint is accepted only if it sits at the matching edge
/// of its macro expansion: the begin must be at the start, and the end must
/// be at the end (token ranges) or at the start (char ranges). The one other
/// accepted case is both endpoints lying in macro-argument expansions that
/// share the same expansion start; those are retried through their immediate
/// spelling locations.
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  if (Begin.isInvalid() || End.isInvalid())
    return {};

  // Case 1: both endpoints are already file locations.
  if (Begin.isFileID() && End.isFileID())
    return makeRangeFromFileLocs(Range, SM, LangOpts);

  // Case 2: only the begin is a macro location.
  if (Begin.isMacroID() && End.isFileID()) {
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
      return {};
    Range.setBegin(Begin);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Case 3: only the end is a macro location.
  if (Begin.isFileID() && End.isMacroID()) {
    if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                          &End)) ||
        (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                           &End)))
      return {};
    Range.setEnd(End);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Case 4: both endpoints are macro locations.
  assert(Begin.isMacroID() && End.isMacroID());
  SourceLocation MacroBegin, MacroEnd;
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                        &MacroEnd)) ||
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                         &MacroEnd)))) {
    Range.setBegin(MacroBegin);
    Range.setEnd(MacroEnd);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  bool Invalid = false;
  const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
                                                        &Invalid);
  if (Invalid)
    return {};

  if (BeginEntry.getExpansion().isMacroArgExpansion()) {
    const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
                                                      &Invalid);
    if (Invalid)
      return {};

    // Both endpoints are macro-argument expansions with the same expansion
    // start: step each one to its immediate spelling location and try again.
    if (EndEntry.getExpansion().isMacroArgExpansion() &&
        BeginEntry.getExpansion().getExpansionLocStart() ==
            EndEntry.getExpansion().getExpansionLocStart()) {
      Range.setBegin(SM.getImmediateSpellingLoc(Begin));
      Range.setEnd(SM.getImmediateSpellingLoc(End));
      return makeFileCharRange(Range, SM, LangOpts);
    }
  }

  return {};
}

/// Return the source text covered by \p Range, as a view into the file
/// buffer. \p Range is first converted with makeFileCharRange.
///
/// \param Invalid if non-null, set to true when the range cannot be mapped
/// to file text or the buffer cannot be loaded, and to false on success.
/// \returns the text, or an empty StringRef on failure.
StringRef Lexer::getSourceText(CharSourceRange Range,
                               const SourceManager &SM,
                               const LangOptions &LangOpts,
                               bool *Invalid) {
  Range = makeFileCharRange(Range, SM, LangOpts);
  if (Range.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Break down the source location.
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
  if (beginInfo.first.isInvalid()) {
    if (Invalid) *Invalid = true;
    return {};
  }

  unsigned EndOffs;
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
      beginInfo.second > EndOffs) {
    if (Invalid) *Invalid = true;
    return {};
  }

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (Invalid) *Invalid = true;
    return {};
  }

  if (Invalid) *Invalid = false;
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
}

/// Return the text of the macro name at the immediate macro expansion that
/// \p Loc (which must be a macro location) belongs to. Macro-argument
/// expansions are walked through first, following the argument's spelling
/// when it came from an inner macro.
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");

  // Find the location of the immediate macro expansion.
  while (true) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).getBegin();
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

StringRef Lexer::getImmediateMacroNameForDiagnostics(
    SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonable to call this on macros");
  // Walk past macro argument expansions.
1026 while (SM.isMacroArgExpansion(Loc)) 1027 Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 1028 1029 // If the macro's spelling has no FileID, then it's actually a token paste 1030 // or stringization (or similar) and not a macro at all. 1031 if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc)))) 1032 return {}; 1033 1034 // Find the spelling location of the start of the non-argument expansion 1035 // range. This is where the macro name was spelled in order to begin 1036 // expanding this macro. 1037 Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin()); 1038 1039 // Dig out the buffer where the macro name was spelled and the extents of the 1040 // name so that we can render it into the expansion note. 1041 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 1042 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 1043 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 1044 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 1045 } 1046 1047 bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) { 1048 return isIdentifierBody(c, LangOpts.DollarIdents); 1049 } 1050 1051 bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { 1052 assert(isVerticalWhitespace(Str[0])); 1053 if (Str - 1 < BufferStart) 1054 return false; 1055 1056 if ((Str[0] == '\n' && Str[-1] == '\r') || 1057 (Str[0] == '\r' && Str[-1] == '\n')) { 1058 if (Str - 2 < BufferStart) 1059 return false; 1060 --Str; 1061 } 1062 --Str; 1063 1064 // Rewind to first non-space character: 1065 while (Str > BufferStart && isHorizontalWhitespace(*Str)) 1066 --Str; 1067 1068 return *Str == '\\'; 1069 } 1070 1071 StringRef Lexer::getIndentationForLine(SourceLocation Loc, 1072 const SourceManager &SM) { 1073 if (Loc.isInvalid() || Loc.isMacroID()) 1074 return {}; 1075 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 1076 if (LocInfo.first.isInvalid()) 1077 return 
{}; 1078 bool Invalid = false; 1079 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 1080 if (Invalid) 1081 return {}; 1082 const char *Line = findBeginningOfLine(Buffer, LocInfo.second); 1083 if (!Line) 1084 return {}; 1085 StringRef Rest = Buffer.substr(Line - Buffer.data()); 1086 size_t NumWhitespaceChars = Rest.find_first_not_of(" \t"); 1087 return NumWhitespaceChars == StringRef::npos 1088 ? "" 1089 : Rest.take_front(NumWhitespaceChars); 1090 } 1091 1092 //===----------------------------------------------------------------------===// 1093 // Diagnostics forwarding code. 1094 //===----------------------------------------------------------------------===// 1095 1096 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 1097 /// lexer buffer was all expanded at a single point, perform the mapping. 1098 /// This is currently only used for _Pragma implementation, so it is the slow 1099 /// path of the hot getSourceLocation method. Do not allow it to be inlined. 1100 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 1101 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 1102 static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 1103 SourceLocation FileLoc, 1104 unsigned CharNo, unsigned TokLen) { 1105 assert(FileLoc.isMacroID() && "Must be a macro expansion"); 1106 1107 // Otherwise, we're lexing "mapped tokens". This is used for things like 1108 // _Pragma handling. Combine the expansion location of FileLoc with the 1109 // spelling location. 1110 SourceManager &SM = PP.getSourceManager(); 1111 1112 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 1113 // characters come from spelling(FileLoc)+Offset. 1114 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 1115 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 1116 1117 // Figure out the expansion loc range, which is the range covered by the 1118 // original _Pragma(...) sequence. 
1119 CharSourceRange II = SM.getImmediateExpansionRange(FileLoc); 1120 1121 return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen); 1122 } 1123 1124 /// getSourceLocation - Return a source location identifier for the specified 1125 /// offset in the current file. 1126 SourceLocation Lexer::getSourceLocation(const char *Loc, 1127 unsigned TokLen) const { 1128 assert(Loc >= BufferStart && Loc <= BufferEnd && 1129 "Location out of range for this buffer!"); 1130 1131 // In the normal case, we're just lexing from a simple file buffer, return 1132 // the file id from FileLoc with the offset specified. 1133 unsigned CharNo = Loc-BufferStart; 1134 if (FileLoc.isFileID()) 1135 return FileLoc.getLocWithOffset(CharNo); 1136 1137 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 1138 // tokens are lexed from where the _Pragma was defined. 1139 assert(PP && "This doesn't work on raw lexers"); 1140 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 1141 } 1142 1143 /// Diag - Forwarding function for diagnostics. This translate a source 1144 /// position in the current buffer into a SourceLocation object for rendering. 1145 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 1146 return PP->Diag(getSourceLocation(Loc), DiagID); 1147 } 1148 1149 //===----------------------------------------------------------------------===// 1150 // Trigraph and Escaped Newline Handling Code. 1151 //===----------------------------------------------------------------------===// 1152 1153 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 1154 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 
1155 static char GetTrigraphCharForLetter(char Letter) { 1156 switch (Letter) { 1157 default: return 0; 1158 case '=': return '#'; 1159 case ')': return ']'; 1160 case '(': return '['; 1161 case '!': return '|'; 1162 case '\'': return '^'; 1163 case '>': return '}'; 1164 case '/': return '\\'; 1165 case '<': return '{'; 1166 case '-': return '~'; 1167 } 1168 } 1169 1170 /// DecodeTrigraphChar - If the specified character is a legal trigraph when 1171 /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 1172 /// return the result character. Finally, emit a warning about trigraph use 1173 /// whether trigraphs are enabled or not. 1174 static char DecodeTrigraphChar(const char *CP, Lexer *L) { 1175 char Res = GetTrigraphCharForLetter(*CP); 1176 if (!Res || !L) return Res; 1177 1178 if (!L->getLangOpts().Trigraphs) { 1179 if (!L->isLexingRawMode()) 1180 L->Diag(CP-2, diag::trigraph_ignored); 1181 return 0; 1182 } 1183 1184 if (!L->isLexingRawMode()) 1185 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 1186 return Res; 1187 } 1188 1189 /// getEscapedNewLineSize - Return the size of the specified escaped newline, 1190 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 1191 /// trigraph equivalent on entry to this function. 1192 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 1193 unsigned Size = 0; 1194 while (isWhitespace(Ptr[Size])) { 1195 ++Size; 1196 1197 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 1198 continue; 1199 1200 // If this is a \r\n or \n\r, skip the other half. 1201 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 1202 Ptr[Size-1] != Ptr[Size]) 1203 ++Size; 1204 1205 return Size; 1206 } 1207 1208 // Not an escaped newline, must be a \t or something else. 1209 return 0; 1210 } 1211 1212 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 1213 /// them), skip over them and return the first non-escaped-newline found, 1214 /// otherwise return P. 
1215 const char *Lexer::SkipEscapedNewLines(const char *P) { 1216 while (true) { 1217 const char *AfterEscape; 1218 if (*P == '\\') { 1219 AfterEscape = P+1; 1220 } else if (*P == '?') { 1221 // If not a trigraph for escape, bail out. 1222 if (P[1] != '?' || P[2] != '/') 1223 return P; 1224 // FIXME: Take LangOpts into account; the language might not 1225 // support trigraphs. 1226 AfterEscape = P+3; 1227 } else { 1228 return P; 1229 } 1230 1231 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 1232 if (NewLineSize == 0) return P; 1233 P = AfterEscape+NewLineSize; 1234 } 1235 } 1236 1237 Optional<Token> Lexer::findNextToken(SourceLocation Loc, 1238 const SourceManager &SM, 1239 const LangOptions &LangOpts) { 1240 if (Loc.isMacroID()) { 1241 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 1242 return None; 1243 } 1244 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); 1245 1246 // Break down the source location. 1247 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 1248 1249 // Try to load the file buffer. 1250 bool InvalidTemp = false; 1251 StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); 1252 if (InvalidTemp) 1253 return None; 1254 1255 const char *TokenBegin = File.data() + LocInfo.second; 1256 1257 // Lex from the start of the given location. 1258 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), 1259 TokenBegin, File.end()); 1260 // Find the token. 1261 Token Tok; 1262 lexer.LexFromRawLexer(Tok); 1263 return Tok; 1264 } 1265 1266 /// Checks that the given token is the first token that occurs after the 1267 /// given location (this excludes comments and whitespace). Returns the location 1268 /// immediately after the specified token. If the token is not found or the 1269 /// location is inside a macro, the returned source location will be invalid. 
1270 SourceLocation Lexer::findLocationAfterToken( 1271 SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, 1272 const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { 1273 Optional<Token> Tok = findNextToken(Loc, SM, LangOpts); 1274 if (!Tok || Tok->isNot(TKind)) 1275 return {}; 1276 SourceLocation TokenLoc = Tok->getLocation(); 1277 1278 // Calculate how much whitespace needs to be skipped if any. 1279 unsigned NumWhitespaceChars = 0; 1280 if (SkipTrailingWhitespaceAndNewLine) { 1281 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); 1282 unsigned char C = *TokenEnd; 1283 while (isHorizontalWhitespace(C)) { 1284 C = *(++TokenEnd); 1285 NumWhitespaceChars++; 1286 } 1287 1288 // Skip \r, \n, \r\n, or \n\r 1289 if (C == '\n' || C == '\r') { 1290 char PrevC = C; 1291 C = *(++TokenEnd); 1292 NumWhitespaceChars++; 1293 if ((C == '\n' || C == '\r') && C != PrevC) 1294 NumWhitespaceChars++; 1295 } 1296 } 1297 1298 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); 1299 } 1300 1301 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 1302 /// get its size, and return it. This is tricky in several cases: 1303 /// 1. If currently at the start of a trigraph, we warn about the trigraph, 1304 /// then either return the trigraph (skipping 3 chars) or the '?', 1305 /// depending on whether trigraphs are enabled or not. 1306 /// 2. If this is an escaped newline (potentially with whitespace between 1307 /// the backslash and newline), implicitly skip the newline and return 1308 /// the char after it. 1309 /// 1310 /// This handles the slow/uncommon case of the getCharAndSize method. Here we 1311 /// know that we can accumulate into Size, and that we have already incremented 1312 /// Ptr by Size bytes. 1313 /// 1314 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 1315 /// be updated to match. 
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // This label is also reached through the `goto Slash` in the trigraph
    // branch below, after "??/" has decoded to a backslash and Ptr/Size have
    // already been advanced past all three characters.

    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      // (Only when a token is being formed and we are not in raw mode.)
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      return getCharAndSizeSlow(Ptr, Size, Tok);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    // Without a Tok a null lexer is passed, which makes DecodeTrigraphChar
    // return the decoded character with no diagnostics and no language-option
    // check.
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // "??/" is a backslash, which may in turn start an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}

/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
1371 /// 1372 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should 1373 /// be updated to match. 1374 char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 1375 const LangOptions &LangOpts) { 1376 // If we have a slash, look for an escaped newline. 1377 if (Ptr[0] == '\\') { 1378 ++Size; 1379 ++Ptr; 1380 Slash: 1381 // Common case, backslash-char where the char is not whitespace. 1382 if (!isWhitespace(Ptr[0])) return '\\'; 1383 1384 // See if we have optional whitespace characters followed by a newline. 1385 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1386 // Found backslash<whitespace><newline>. Parse the char after it. 1387 Size += EscapedNewLineSize; 1388 Ptr += EscapedNewLineSize; 1389 1390 // Use slow version to accumulate a correct size field. 1391 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); 1392 } 1393 1394 // Otherwise, this is not an escaped newline, just return the slash. 1395 return '\\'; 1396 } 1397 1398 // If this is a trigraph, process it. 1399 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 1400 // If this is actually a legal trigraph (not something like "??x"), return 1401 // it. 1402 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 1403 Ptr += 3; 1404 Size += 3; 1405 if (C == '\\') goto Slash; 1406 return C; 1407 } 1408 } 1409 1410 // If this is neither, return a single character. 1411 ++Size; 1412 return *Ptr; 1413 } 1414 1415 //===----------------------------------------------------------------------===// 1416 // Helper methods for lexing. 1417 //===----------------------------------------------------------------------===// 1418 1419 /// Routine that indiscriminately sets the offset into the source file. 1420 void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { 1421 BufferPtr = BufferStart + Offset; 1422 if (BufferPtr > BufferEnd) 1423 BufferPtr = BufferEnd; 1424 // FIXME: What exactly does the StartOfLine bit mean? 
There are two 1425 // possible meanings for the "start" of the line: the first token on the 1426 // unexpanded line, or the first token on the expanded line. 1427 IsAtStartOfLine = StartOfLine; 1428 IsAtPhysicalStartOfLine = StartOfLine; 1429 } 1430 1431 static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { 1432 if (LangOpts.AsmPreprocessor) { 1433 return false; 1434 } else if (LangOpts.DollarIdents && '$' == C) { 1435 return true; 1436 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) { 1437 static const llvm::sys::UnicodeCharSet C11AllowedIDChars( 1438 C11AllowedIDCharRanges); 1439 return C11AllowedIDChars.contains(C); 1440 } else if (LangOpts.CPlusPlus) { 1441 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( 1442 CXX03AllowedIDCharRanges); 1443 return CXX03AllowedIDChars.contains(C); 1444 } else { 1445 static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 1446 C99AllowedIDCharRanges); 1447 return C99AllowedIDChars.contains(C); 1448 } 1449 } 1450 1451 static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) { 1452 assert(isAllowedIDChar(C, LangOpts)); 1453 if (LangOpts.AsmPreprocessor) { 1454 return false; 1455 } else if (LangOpts.CPlusPlus11 || LangOpts.C11) { 1456 static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( 1457 C11DisallowedInitialIDCharRanges); 1458 return !C11DisallowedInitialIDChars.contains(C); 1459 } else if (LangOpts.CPlusPlus) { 1460 return true; 1461 } else { 1462 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 1463 C99DisallowedInitialIDCharRanges); 1464 return !C99DisallowedInitialIDChars.contains(C); 1465 } 1466 } 1467 1468 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, 1469 const char *End) { 1470 return CharSourceRange::getCharRange(L.getSourceLocation(Begin), 1471 L.getSourceLocation(End)); 1472 } 1473 1474 static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, 1475 CharSourceRange Range, bool IsFirst) { 
1476 // Check C99 compatibility. 1477 if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) { 1478 enum { 1479 CannotAppearInIdentifier = 0, 1480 CannotStartIdentifier 1481 }; 1482 1483 static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 1484 C99AllowedIDCharRanges); 1485 static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 1486 C99DisallowedInitialIDCharRanges); 1487 if (!C99AllowedIDChars.contains(C)) { 1488 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 1489 << Range 1490 << CannotAppearInIdentifier; 1491 } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) { 1492 Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 1493 << Range 1494 << CannotStartIdentifier; 1495 } 1496 } 1497 1498 // Check C++98 compatibility. 1499 if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) { 1500 static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( 1501 CXX03AllowedIDCharRanges); 1502 if (!CXX03AllowedIDChars.contains(C)) { 1503 Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id) 1504 << Range; 1505 } 1506 } 1507 } 1508 1509 /// After encountering UTF-8 character C and interpreting it as an identifier 1510 /// character, check whether it's a homoglyph for a common non-identifier 1511 /// source character that is unlikely to be an intentional identifier 1512 /// character and warn if so. 1513 static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, 1514 CharSourceRange Range) { 1515 // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 
1516 struct HomoglyphPair { 1517 uint32_t Character; 1518 char LooksLike; 1519 bool operator<(HomoglyphPair R) const { return Character < R.Character; } 1520 }; 1521 static constexpr HomoglyphPair SortedHomoglyphs[] = { 1522 {U'\u00ad', 0}, // SOFT HYPHEN 1523 {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK 1524 {U'\u037e', ';'}, // GREEK QUESTION MARK 1525 {U'\u200b', 0}, // ZERO WIDTH SPACE 1526 {U'\u200c', 0}, // ZERO WIDTH NON-JOINER 1527 {U'\u200d', 0}, // ZERO WIDTH JOINER 1528 {U'\u2060', 0}, // WORD JOINER 1529 {U'\u2061', 0}, // FUNCTION APPLICATION 1530 {U'\u2062', 0}, // INVISIBLE TIMES 1531 {U'\u2063', 0}, // INVISIBLE SEPARATOR 1532 {U'\u2064', 0}, // INVISIBLE PLUS 1533 {U'\u2212', '-'}, // MINUS SIGN 1534 {U'\u2215', '/'}, // DIVISION SLASH 1535 {U'\u2216', '\\'}, // SET MINUS 1536 {U'\u2217', '*'}, // ASTERISK OPERATOR 1537 {U'\u2223', '|'}, // DIVIDES 1538 {U'\u2227', '^'}, // LOGICAL AND 1539 {U'\u2236', ':'}, // RATIO 1540 {U'\u223c', '~'}, // TILDE OPERATOR 1541 {U'\ua789', ':'}, // MODIFIER LETTER COLON 1542 {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE 1543 {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK 1544 {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN 1545 {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN 1546 {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN 1547 {U'\uff06', '&'}, // FULLWIDTH AMPERSAND 1548 {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS 1549 {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS 1550 {U'\uff0a', '*'}, // FULLWIDTH ASTERISK 1551 {U'\uff0b', '+'}, // FULLWIDTH ASTERISK 1552 {U'\uff0c', ','}, // FULLWIDTH COMMA 1553 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS 1554 {U'\uff0e', '.'}, // FULLWIDTH FULL STOP 1555 {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS 1556 {U'\uff1a', ':'}, // FULLWIDTH COLON 1557 {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON 1558 {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN 1559 {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN 1560 {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN 1561 {U'\uff1f', '?'}, // FULLWIDTH QUESTION 
MARK 1562 {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT 1563 {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET 1564 {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS 1565 {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET 1566 {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT 1567 {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET 1568 {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE 1569 {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET 1570 {U'\uff5e', '~'}, // FULLWIDTH TILDE 1571 {0, 0} 1572 }; 1573 auto Homoglyph = 1574 std::lower_bound(std::begin(SortedHomoglyphs), 1575 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'}); 1576 if (Homoglyph->Character == C) { 1577 llvm::SmallString<5> CharBuf; 1578 { 1579 llvm::raw_svector_ostream CharOS(CharBuf); 1580 llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); 1581 } 1582 if (Homoglyph->LooksLike) { 1583 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; 1584 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) 1585 << Range << CharBuf << LooksLikeStr; 1586 } else { 1587 Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width) 1588 << Range << CharBuf; 1589 } 1590 } 1591 } 1592 1593 bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, 1594 Token &Result) { 1595 const char *UCNPtr = CurPtr + Size; 1596 uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr); 1597 if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts)) 1598 return false; 1599 1600 if (!isLexingRawMode()) 1601 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 1602 makeCharRange(*this, CurPtr, UCNPtr), 1603 /*IsFirst=*/false); 1604 1605 Result.setFlag(Token::HasUCN); 1606 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || 1607 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U')) 1608 CurPtr = UCNPtr; 1609 else 1610 while (CurPtr != UCNPtr) 1611 (void)getAndAdvanceChar(CurPtr, Result); 1612 return true; 1613 } 1614 1615 bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { 1616 
const char *UnicodePtr = CurPtr; // Scratch cursor; CurPtr moves only on success.
  llvm::UTF32 CodePoint;
  llvm::ConversionResult Result =
      llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr,
                                (const llvm::UTF8 *)BufferEnd,
                                &CodePoint,
                                llvm::strictConversion);
  // Reject malformed UTF-8 as well as code points that may not appear in an
  // identifier.
  if (Result != llvm::conversionOK ||
      !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
    return false;

  if (!isLexingRawMode()) {
    maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
                              makeCharRange(*this, CurPtr, UnicodePtr),
                              /*IsFirst=*/false);
    maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
                               makeCharRange(*this, CurPtr, UnicodePtr));
  }

  CurPtr = UnicodePtr;
  return true;
}

/// LexIdentifier - Lex the remainder of an identifier whose first character
/// has already been matched.
///
/// Forms a tok::raw_identifier token in Result. Outside raw mode the
/// identifier is then looked up through the preprocessor, which may turn it
/// into a code-completion token or hand it to PP->HandleIdentifier.
///
/// \param Result token to fill in.
/// \param CurPtr position just past the already-matched first character.
/// \returns true, or the result of PP->HandleIdentifier when the identifier
///          needs preprocessor handling.
bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr; // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  //
  // TODO: Could merge these checks into an InfoTable flag to make the
  // comparison cheaper
  if (isASCII(C) && C != '\\' && C != '?' &&
      (C != '$' || !LangOpts.DollarIdents)) {
    // The slow path below jumps back to this label once it has found the end
    // of the identifier.
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return true;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
    // Note that we have to call PP->LookUpIdentifierInfo() even for code
    // completion, it writes IdentifierInfo into Result, and callers rely on it.

    // If the completion point is at the end of an identifier, we want to treat
    // the identifier as incomplete even if it resolves to a macro or a keyword.
    // This allows e.g. 'class^' to complete to 'classifier'.
    if (isCodeCompletionPoint(CurPtr)) {
      // Return the code-completion token.
      Result.setKind(tok::code_completion);
      // Skip the code-completion char and all immediate identifier characters.
      // This ensures we get consistent behavior when completing at any point in
      // an identifier (i.e. at the start, in the middle, at the end). Note that
      // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
      // simpler.
      assert(*CurPtr == 0 && "Completion character must be 0");
      ++CurPtr;
      // Note that code completion token is not added as a separate character
      // when the completion point is at the end of the buffer. Therefore, we need
      // to check if the buffer has ended.
      if (CurPtr < BufferEnd) {
        while (isIdentifierBody(*CurPtr))
          ++CurPtr;
      }
      BufferPtr = CurPtr;
      return true;
    }

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      return PP->HandleIdentifier(Result);

    return true;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path.

  C = getCharAndSize(CurPtr, Size);
  while (true) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
      // A UCN was consumed; CurPtr has already been advanced past it.
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {
      // A UTF-8 code point was consumed; CurPtr has already been advanced.
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) {
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    // Consume the following run of plain identifier characters before going
    // back to the special-case checks at the top of the loop.
    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}

/// isHexaLiteral - Return true if Start points to the "0x" or "0X" prefix of
/// a hex constant. Callers use this in Microsoft mode, where something like
/// 0x1234567e+1 is supposed to be several different tokens.
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
  unsigned Size;
  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
  if (C1 != '0')
    return false;
  // Size now holds the width of the first character; look at the one after it.
  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
  return (C2 == 'x' || C2 == 'X');
}

/// LexNumericConstant - Lex the remainder of an integer or floating point
/// constant. The token starts at BufferPtr and CurPtr is the position from
/// which lexing continues. Forms a tok::numeric_constant token in Result and
/// returns true.
bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0; // Last character consumed by the loop below, 0 if none.
  while (isPreprocessingNumberBody(C)) {
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
1763 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { 1764 // If we are in Microsoft mode, don't continue if the constant is hex. 1765 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1 1766 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts)) 1767 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 1768 } 1769 1770 // If we have a hex FP constant, continue. 1771 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) { 1772 // Outside C99 and C++17, we accept hexadecimal floating point numbers as a 1773 // not-quite-conforming extension. Only do so if this looks like it's 1774 // actually meant to be a hexfloat, and not if it has a ud-suffix. 1775 bool IsHexFloat = true; 1776 if (!LangOpts.C99) { 1777 if (!isHexaLiteral(BufferPtr, LangOpts)) 1778 IsHexFloat = false; 1779 else if (!getLangOpts().CPlusPlus17 && 1780 std::find(BufferPtr, CurPtr, '_') != CurPtr) 1781 IsHexFloat = false; 1782 } 1783 if (IsHexFloat) 1784 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 1785 } 1786 1787 // If we have a digit separator, continue. 1788 if (C == '\'' && getLangOpts().CPlusPlus14) { 1789 unsigned NextSize; 1790 char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts()); 1791 if (isIdentifierBody(Next)) { 1792 if (!isLexingRawMode()) 1793 Diag(CurPtr, diag::warn_cxx11_compat_digit_separator); 1794 CurPtr = ConsumeChar(CurPtr, Size, Result); 1795 CurPtr = ConsumeChar(CurPtr, NextSize, Result); 1796 return LexNumericConstant(Result, CurPtr); 1797 } 1798 } 1799 1800 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. 1801 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 1802 return LexNumericConstant(Result, CurPtr); 1803 if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 1804 return LexNumericConstant(Result, CurPtr); 1805 1806 // Update the location of token as well as BufferPtr. 
1807 const char *TokStart = BufferPtr; 1808 FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 1809 Result.setLiteralData(TokStart); 1810 return true; 1811 } 1812 1813 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 1814 /// in C++11, or warn on a ud-suffix in C++98. 1815 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, 1816 bool IsStringLiteral) { 1817 assert(getLangOpts().CPlusPlus); 1818 1819 // Maximally munch an identifier. 1820 unsigned Size; 1821 char C = getCharAndSize(CurPtr, Size); 1822 bool Consumed = false; 1823 1824 if (!isIdentifierHead(C)) { 1825 if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 1826 Consumed = true; 1827 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 1828 Consumed = true; 1829 else 1830 return CurPtr; 1831 } 1832 1833 if (!getLangOpts().CPlusPlus11) { 1834 if (!isLexingRawMode()) 1835 Diag(CurPtr, 1836 C == '_' ? diag::warn_cxx11_compat_user_defined_literal 1837 : diag::warn_cxx11_compat_reserved_user_defined_literal) 1838 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 1839 return CurPtr; 1840 } 1841 1842 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix 1843 // that does not start with an underscore is ill-formed. As a conforming 1844 // extension, we treat all such suffixes as if they had whitespace before 1845 // them. We assume a suffix beginning with a UCN or UTF-8 character is more 1846 // likely to be a ud-suffix than a macro, however, and accept that. 1847 if (!Consumed) { 1848 bool IsUDSuffix = false; 1849 if (C == '_') 1850 IsUDSuffix = true; 1851 else if (IsStringLiteral && getLangOpts().CPlusPlus14) { 1852 // In C++1y, we need to look ahead a few characters to see if this is a 1853 // valid suffix for a string literal or a numeric literal (this could be 1854 // the 'operator""if' defining a numeric literal operator). 
1855 const unsigned MaxStandardSuffixLength = 3; 1856 char Buffer[MaxStandardSuffixLength] = { C }; 1857 unsigned Consumed = Size; 1858 unsigned Chars = 1; 1859 while (true) { 1860 unsigned NextSize; 1861 char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, 1862 getLangOpts()); 1863 if (!isIdentifierBody(Next)) { 1864 // End of suffix. Check whether this is on the whitelist. 1865 const StringRef CompleteSuffix(Buffer, Chars); 1866 IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(), 1867 CompleteSuffix); 1868 break; 1869 } 1870 1871 if (Chars == MaxStandardSuffixLength) 1872 // Too long: can't be a standard suffix. 1873 break; 1874 1875 Buffer[Chars++] = Next; 1876 Consumed += NextSize; 1877 } 1878 } 1879 1880 if (!IsUDSuffix) { 1881 if (!isLexingRawMode()) 1882 Diag(CurPtr, getLangOpts().MSVCCompat 1883 ? diag::ext_ms_reserved_user_defined_literal 1884 : diag::ext_reserved_user_defined_literal) 1885 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 1886 return CurPtr; 1887 } 1888 1889 CurPtr = ConsumeChar(CurPtr, Size, Result); 1890 } 1891 1892 Result.setFlag(Token::HasUDSuffix); 1893 while (true) { 1894 C = getCharAndSize(CurPtr, Size); 1895 if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); } 1896 else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {} 1897 else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {} 1898 else break; 1899 } 1900 1901 return CurPtr; 1902 } 1903 1904 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 1905 /// either " or L" or u8" or u" or U". 1906 bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 1907 tok::TokenKind Kind) { 1908 const char *AfterQuote = CurPtr; 1909 // Does this string contain the \0 character? 
const char *NulCharacter = nullptr;

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, getLangOpts().CPlusPlus
           ? diag::warn_cxx98_compat_unicode_literal
           : diag::warn_c99_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // Unterminated literal: return what was lexed, up to but not including
      // the newline / end of file, as an 'unknown' token.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // A NUL that is not the end of the buffer is either the code-completion
      // point or an embedded NUL in the literal.
      if (isCodeCompletionPoint(CurPtr-1)) {
        if (ParsingFilename)
          codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
        else
          PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // In any C++ mode, let LexUDSuffix handle an optional ud-suffix (before
  // C++11 it only warns).
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
///
/// \param Result token to fill in.
/// \param CurPtr position just past the opening quote, i.e. at the delimiter.
/// \param Kind   token kind to form when the literal is well-formed.
/// \returns true always. An invalid delimiter or an unterminated literal
///          yields a tok::unknown token instead of \p Kind.
bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  // Length of the delimiter between the opening quote and '('. Scanning stops
  // after 16 characters; a longer delimiter is diagnosed below.
  unsigned PrefixLen = 0;

  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (true) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        // Hit the end of the buffer: do not include the terminating NUL.
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  while (true) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }
  }

  // In any C++ mode, let LexUDSuffix handle an optional ud-suffix (before
  // C++11 it only warns).
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, true);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
///
/// If no closing '>' is found before the end of the line or buffer, only the
/// '<' is returned, as a tok::less token.
bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  // Does this string contain the \0 character?
  const char *NulCharacter = nullptr;
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||                // Newline.
        (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character.  Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return true;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr - 1)) {
        codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
        cutOffLexing();
        FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
        return true;
      }
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 1;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::header_name);
  Result.setLiteralData(TokStart);
  return true;
}

/// Run code completion for the filename of an #include-style directive.
///
/// \param PathStart       first character of the path, after the opening
///                        quote or '<'.
/// \param CompletionPoint position of the code-completion marker.
/// \param IsAngled        true for <...> paths, false for "..." paths.
void Lexer::codeCompleteIncludedFile(const char *PathStart,
                                     const char *CompletionPoint,
                                     bool IsAngled) {
  // Completion only applies to the filename, after the last slash.
  StringRef PartialPath(PathStart, CompletionPoint - PathStart);
  // A backslash counts as a separator only with MSVCCompat.
  auto Slash = PartialPath.find_last_of(LangOpts.MSVCCompat ? "/\\" : "/");
  StringRef Dir =
      (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
  const char *StartOfFilename =
      (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
  // Code completion filter range is the filename only, up to completion point.
  PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
      StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
  // We should replace the characters up to the closing quote, if any.
while (CompletionPoint < BufferEnd) {
    // Peek at the character after CompletionPoint. Stop in front of a NUL or
    // a line break; otherwise step onto the character, and stop once it is
    // the closing delimiter.
    char Next = *(CompletionPoint + 1);
    if (Next == 0 || Next == '\r' || Next == '\n')
      break;
    ++CompletionPoint;
    if (Next == (IsAngled ? '>' : '"'))
      break;
  }
  PP->setCodeCompletionTokenRange(
      FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
      FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
  PP->CodeCompleteIncludedFile(Dir, IsAngled);
}

/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u8' or u' or U'.
///
/// \param Result token to fill in.
/// \param CurPtr position just past the opening quote.
/// \param Kind   token kind to form when the constant is well-formed.
/// \returns true always. An empty ('') or unterminated constant yields a
///          tok::unknown token instead of \p Kind.
bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  // Does this character contain the \0 character?
  const char *NulCharacter = nullptr;

  if (!isLexingRawMode()) {
    if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
      Diag(BufferPtr, getLangOpts().CPlusPlus
                          ? diag::warn_cxx98_compat_unicode_literal
                          : diag::warn_c99_compat_unicode_literal);
    else if (Kind == tok::utf8_char_constant)
      Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
  }

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // Two quotes in a row: an empty character constant.
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return true;
    }

    if (C == 0) {
      // A NUL that is not the end of the buffer is either the code-completion
      // point or an embedded NUL in the constant.
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        cutOffLexing();
        return true;
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // In any C++ mode, let LexUDSuffix handle an optional ud-suffix (before
  // C++11 it only warns).
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr, false);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char_or_string) << 0;

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
  return true;
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
///
/// \param Result token whose LeadingSpace / StartOfLine flags are updated
///               (or which becomes the whitespace token in keep-whitespace
///               mode).
/// \param CurPtr position just past the first whitespace character.
/// \param TokAtPhysicalStartOfLine set to true when a newline was skipped and
///               no whitespace token is returned.
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
                           bool &TokAtPhysicalStartOfLine) {
  // Whitespace - Skip it, then return the token after the whitespace.
  bool SawNewline = isVerticalWhitespace(CurPtr[-1]);

  unsigned char Char = *CurPtr;

  // Skip consecutive spaces efficiently.
  while (true) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (!isVerticalWhitespace(Char))
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    SawNewline = true;
    Char = *++CurPtr;
  }

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    if (SawNewline) {
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;
    }
    // FIXME: The next token will not have LeadingSpace set.
    return true;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);

  Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
  if (SawNewline) {
    Result.setFlag(Token::StartOfLine);
    TokAtPhysicalStartOfLine = true;
  }

  BufferPtr = CurPtr;
  return false;
}

/// We have just read the // characters from input.  Skip until we find the
/// newline character that terminates the comment.  Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LangOpts.LineComment && !isLexingRawMode()) {
    Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LangOpts.LineComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.
  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    // Remember where the fast loop stopped; CurPtr may be moved back below.
    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    // A newline, or having read past the end of the buffer, ends the comment;
    // step back so CurPtr points at the terminating character.
    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// If in save-comment mode, package up this Line comment in an appropriate
/// way and return it.
///
/// Forms a tok::comment token ending at \p CurPtr. Inside a preprocessor
/// directive (and outside raw mode) the comment's spelling is rewritten from
/// "//..." to "/*...*/" and stored via PP->CreateString. Always returns true.
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence that is
/// directly preceded by a '*', i.e. the block comment ends with an escaped
/// newline between its terminating * and /.  Issue a diagnostic if so.  We know
/// that the newline is inside of a block comment.
///
/// \param CurPtr points at the newline character to examine.
/// \param L      lexer used for language options and for emitting diagnostics.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it.  We allow whitespace
  // between the slash and newline.
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a slash, we know this is an escaped newline.
  if (*CurPtr == '\\') {
    // It only ends the comment if a '*' comes right before the backslash.
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a slash, is it the ?? / trigraph?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment.  Emit a stern warning!
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getLangOpts().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}

// Pull in the vector intrinsics header for whichever SIMD extension the
// target provides; SkipBlockComment has a matching #ifdef for each.
#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
// NOTE(review): presumably <altivec.h> can define 'bool' as a macro, which
// would break the rest of this file -- confirm.
#undef bool
#endif

/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end.  The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
                             bool &TokAtPhysicalStartOfLine) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
2509 if (isKeepWhitespaceMode()) { 2510 FormTokenWithChars(Result, CurPtr, tok::unknown); 2511 return true; 2512 } 2513 2514 BufferPtr = CurPtr; 2515 return false; 2516 } 2517 2518 // Check to see if the first character after the '/*' is another /. If so, 2519 // then this slash does not end the block comment, it is part of it. 2520 if (C == '/') 2521 C = *CurPtr++; 2522 2523 while (true) { 2524 // Skip over all non-interesting characters until we find end of buffer or a 2525 // (probably ending) '/' character. 2526 if (CurPtr + 24 < BufferEnd && 2527 // If there is a code-completion point avoid the fast scan because it 2528 // doesn't check for '\0'. 2529 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { 2530 // While not aligned to a 16-byte boundary. 2531 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) 2532 C = *CurPtr++; 2533 2534 if (C == '/') goto FoundSlash; 2535 2536 #ifdef __SSE2__ 2537 __m128i Slashes = _mm_set1_epi8('/'); 2538 while (CurPtr+16 <= BufferEnd) { 2539 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr, 2540 Slashes)); 2541 if (cmp != 0) { 2542 // Adjust the pointer to point directly after the first slash. It's 2543 // not necessary to set C here, it will be overwritten at the end of 2544 // the outer loop. 2545 CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1; 2546 goto FoundSlash; 2547 } 2548 CurPtr += 16; 2549 } 2550 #elif __ALTIVEC__ 2551 __vector unsigned char Slashes = { 2552 '/', '/', '/', '/', '/', '/', '/', '/', 2553 '/', '/', '/', '/', '/', '/', '/', '/' 2554 }; 2555 while (CurPtr + 16 <= BufferEnd && 2556 !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) 2557 CurPtr += 16; 2558 #else 2559 // Scan for '/' quickly. Many block comments are very large. 
2560 while (CurPtr[0] != '/' && 2561 CurPtr[1] != '/' && 2562 CurPtr[2] != '/' && 2563 CurPtr[3] != '/' && 2564 CurPtr+4 < BufferEnd) { 2565 CurPtr += 4; 2566 } 2567 #endif 2568 2569 // It has to be one of the bytes scanned, increment to it and read one. 2570 C = *CurPtr++; 2571 } 2572 2573 // Loop to scan the remainder. 2574 while (C != '/' && C != '\0') 2575 C = *CurPtr++; 2576 2577 if (C == '/') { 2578 FoundSlash: 2579 if (CurPtr[-2] == '*') // We found the final */. We're done! 2580 break; 2581 2582 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 2583 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { 2584 // We found the final */, though it had an escaped newline between the 2585 // * and /. We're done! 2586 break; 2587 } 2588 } 2589 if (CurPtr[0] == '*' && CurPtr[1] != '/') { 2590 // If this is a /* inside of the comment, emit a warning. Don't do this 2591 // if this is a /*/, which will end the comment. This misses cases with 2592 // embedded escaped newlines, but oh well. 2593 if (!isLexingRawMode()) 2594 Diag(CurPtr-1, diag::warn_nested_block_comment); 2595 } 2596 } else if (C == 0 && CurPtr == BufferEnd+1) { 2597 if (!isLexingRawMode()) 2598 Diag(BufferPtr, diag::err_unterminated_block_comment); 2599 // Note: the user probably forgot a */. We could continue immediately 2600 // after the /*, but this would involve lexing a lot of what really is the 2601 // comment, which surely would confuse the parser. 2602 --CurPtr; 2603 2604 // KeepWhitespaceMode should return this broken comment as a token. Since 2605 // it isn't a well formed comment, just return it as an 'unknown' token. 
2606 if (isKeepWhitespaceMode()) { 2607 FormTokenWithChars(Result, CurPtr, tok::unknown); 2608 return true; 2609 } 2610 2611 BufferPtr = CurPtr; 2612 return false; 2613 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 2614 PP->CodeCompleteNaturalLanguage(); 2615 cutOffLexing(); 2616 return false; 2617 } 2618 2619 C = *CurPtr++; 2620 } 2621 2622 // Notify comment handlers about the comment unless we're in a #if 0 block. 2623 if (PP && !isLexingRawMode() && 2624 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 2625 getSourceLocation(CurPtr)))) { 2626 BufferPtr = CurPtr; 2627 return true; // A token has to be returned. 2628 } 2629 2630 // If we are returning comments as tokens, return this comment as a token. 2631 if (inKeepCommentMode()) { 2632 FormTokenWithChars(Result, CurPtr, tok::comment); 2633 return true; 2634 } 2635 2636 // It is common for the tokens immediately after a /**/ comment to be 2637 // whitespace. Instead of going through the big switch, handle it 2638 // efficiently now. This is safe even in KeepWhitespaceMode because we would 2639 // have already returned above with the comment as a token. 2640 if (isHorizontalWhitespace(*CurPtr)) { 2641 SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine); 2642 return false; 2643 } 2644 2645 // Otherwise, just return so that the next character will be lexed as a token. 2646 BufferPtr = CurPtr; 2647 Result.setFlag(Token::LeadingSpace); 2648 return false; 2649 } 2650 2651 //===----------------------------------------------------------------------===// 2652 // Primary Lexing Entry Points 2653 //===----------------------------------------------------------------------===// 2654 2655 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 2656 /// uninterpreted string. This switches the lexer out of directive mode. 
2657 void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) { 2658 assert(ParsingPreprocessorDirective && ParsingFilename == false && 2659 "Must be in a preprocessing directive!"); 2660 Token Tmp; 2661 Tmp.startToken(); 2662 2663 // CurPtr - Cache BufferPtr in an automatic variable. 2664 const char *CurPtr = BufferPtr; 2665 while (true) { 2666 char Char = getAndAdvanceChar(CurPtr, Tmp); 2667 switch (Char) { 2668 default: 2669 if (Result) 2670 Result->push_back(Char); 2671 break; 2672 case 0: // Null. 2673 // Found end of file? 2674 if (CurPtr-1 != BufferEnd) { 2675 if (isCodeCompletionPoint(CurPtr-1)) { 2676 PP->CodeCompleteNaturalLanguage(); 2677 cutOffLexing(); 2678 return; 2679 } 2680 2681 // Nope, normal character, continue. 2682 if (Result) 2683 Result->push_back(Char); 2684 break; 2685 } 2686 // FALL THROUGH. 2687 LLVM_FALLTHROUGH; 2688 case '\r': 2689 case '\n': 2690 // Okay, we found the end of the line. First, back up past the \0, \r, \n. 2691 assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 2692 BufferPtr = CurPtr-1; 2693 2694 // Next, lex the character, which should handle the EOD transition. 2695 Lex(Tmp); 2696 if (Tmp.is(tok::code_completion)) { 2697 if (PP) 2698 PP->CodeCompleteNaturalLanguage(); 2699 Lex(Tmp); 2700 } 2701 assert(Tmp.is(tok::eod) && "Unexpected token!"); 2702 2703 // Finally, we're done; 2704 return; 2705 } 2706 } 2707 } 2708 2709 /// LexEndOfFile - CurPtr points to the end of this file. Handle this 2710 /// condition, reporting diagnostics and handling other edge cases as required. 2711 /// This returns true if Result contains a token, false if PP.Lex should be 2712 /// called again. 2713 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 2714 // If we hit the end of the file while parsing a preprocessor directive, 2715 // end the preprocessor directive first. The next token returned will 2716 // then be the end of file. 2717 if (ParsingPreprocessorDirective) { 2718 // Done parsing the "line". 
2719 ParsingPreprocessorDirective = false; 2720 // Update the location of token as well as BufferPtr. 2721 FormTokenWithChars(Result, CurPtr, tok::eod); 2722 2723 // Restore comment saving mode, in case it was disabled for directive. 2724 if (PP) 2725 resetExtendedTokenMode(); 2726 return true; // Have a token. 2727 } 2728 2729 // If we are in raw mode, return this event as an EOF token. Let the caller 2730 // that put us in raw mode handle the event. 2731 if (isLexingRawMode()) { 2732 Result.startToken(); 2733 BufferPtr = BufferEnd; 2734 FormTokenWithChars(Result, BufferEnd, tok::eof); 2735 return true; 2736 } 2737 2738 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { 2739 PP->setRecordedPreambleConditionalStack(ConditionalStack); 2740 ConditionalStack.clear(); 2741 } 2742 2743 // Issue diagnostics for unterminated #if and missing newline. 2744 2745 // If we are in a #if directive, emit an error. 2746 while (!ConditionalStack.empty()) { 2747 if (PP->getCodeCompletionFileLoc() != FileLoc) 2748 PP->Diag(ConditionalStack.back().IfLoc, 2749 diag::err_pp_unterminated_conditional); 2750 ConditionalStack.pop_back(); 2751 } 2752 2753 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 2754 // a pedwarn. 2755 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { 2756 DiagnosticsEngine &Diags = PP->getDiagnostics(); 2757 SourceLocation EndLoc = getSourceLocation(BufferEnd); 2758 unsigned DiagID; 2759 2760 if (LangOpts.CPlusPlus11) { 2761 // C++11 [lex.phases] 2.2 p2 2762 // Prefer the C++98 pedantic compatibility warning over the generic, 2763 // non-extension, user-requested "missing newline at EOF" warning. 
2764 if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) { 2765 DiagID = diag::warn_cxx98_compat_no_newline_eof; 2766 } else { 2767 DiagID = diag::warn_no_newline_eof; 2768 } 2769 } else { 2770 DiagID = diag::ext_no_newline_eof; 2771 } 2772 2773 Diag(BufferEnd, DiagID) 2774 << FixItHint::CreateInsertion(EndLoc, "\n"); 2775 } 2776 2777 BufferPtr = CurPtr; 2778 2779 // Finally, let the preprocessor handle this. 2780 return PP->HandleEndOfFile(Result, isPragmaLexer()); 2781 } 2782 2783 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 2784 /// the specified lexer will return a tok::l_paren token, 0 if it is something 2785 /// else and 2 if there are no more tokens in the buffer controlled by the 2786 /// lexer. 2787 unsigned Lexer::isNextPPTokenLParen() { 2788 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 2789 2790 // Switch to 'skipping' mode. This will ensure that we can lex a token 2791 // without emitting diagnostics, disables macro expansion, and will cause EOF 2792 // to return an EOF token instead of popping the include stack. 2793 LexingRawMode = true; 2794 2795 // Save state that can be changed while lexing so that we can restore it. 2796 const char *TmpBufferPtr = BufferPtr; 2797 bool inPPDirectiveMode = ParsingPreprocessorDirective; 2798 bool atStartOfLine = IsAtStartOfLine; 2799 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 2800 bool leadingSpace = HasLeadingSpace; 2801 2802 Token Tok; 2803 Lex(Tok); 2804 2805 // Restore state that may have changed. 2806 BufferPtr = TmpBufferPtr; 2807 ParsingPreprocessorDirective = inPPDirectiveMode; 2808 HasLeadingSpace = leadingSpace; 2809 IsAtStartOfLine = atStartOfLine; 2810 IsAtPhysicalStartOfLine = atPhysicalStartOfLine; 2811 2812 // Restore the lexer back to non-skipping mode. 
2813 LexingRawMode = false; 2814 2815 if (Tok.is(tok::eof)) 2816 return 2; 2817 return Tok.is(tok::l_paren); 2818 } 2819 2820 /// Find the end of a version control conflict marker. 2821 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 2822 ConflictMarkerKind CMK) { 2823 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 2824 size_t TermLen = CMK == CMK_Perforce ? 5 : 7; 2825 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen); 2826 size_t Pos = RestOfBuffer.find(Terminator); 2827 while (Pos != StringRef::npos) { 2828 // Must occur at start of line. 2829 if (Pos == 0 || 2830 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) { 2831 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 2832 Pos = RestOfBuffer.find(Terminator); 2833 continue; 2834 } 2835 return RestOfBuffer.data()+Pos; 2836 } 2837 return nullptr; 2838 } 2839 2840 /// IsStartOfConflictMarker - If the specified pointer is the start of a version 2841 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 2842 /// and recover nicely. This returns true if it is a conflict marker and false 2843 /// if not. 2844 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 2845 // Only a conflict marker if it starts at the beginning of a line. 2846 if (CurPtr != BufferStart && 2847 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2848 return false; 2849 2850 // Check to see if we have <<<<<<< or >>>>. 2851 if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") && 2852 !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> ")) 2853 return false; 2854 2855 // If we have a situation where we don't care about conflict markers, ignore 2856 // it. 2857 if (CurrentConflictMarkerState || isLexingRawMode()) 2858 return false; 2859 2860 ConflictMarkerKind Kind = *CurPtr == '<' ? 
CMK_Normal : CMK_Perforce; 2861 2862 // Check to see if there is an ending marker somewhere in the buffer at the 2863 // start of a line to terminate this conflict marker. 2864 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 2865 // We found a match. We are really in a conflict marker. 2866 // Diagnose this, and ignore to the end of line. 2867 Diag(CurPtr, diag::err_conflict_marker); 2868 CurrentConflictMarkerState = Kind; 2869 2870 // Skip ahead to the end of line. We know this exists because the 2871 // end-of-conflict marker starts with \r or \n. 2872 while (*CurPtr != '\r' && *CurPtr != '\n') { 2873 assert(CurPtr != BufferEnd && "Didn't find end of line"); 2874 ++CurPtr; 2875 } 2876 BufferPtr = CurPtr; 2877 return true; 2878 } 2879 2880 // No end of conflict marker found. 2881 return false; 2882 } 2883 2884 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 2885 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 2886 /// is the end of a conflict marker. Handle it by ignoring up until the end of 2887 /// the line. This returns true if it is a conflict marker and false if not. 2888 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 2889 // Only a conflict marker if it starts at the beginning of a line. 2890 if (CurPtr != BufferStart && 2891 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2892 return false; 2893 2894 // If we have a situation where we don't care about conflict markers, ignore 2895 // it. 2896 if (!CurrentConflictMarkerState || isLexingRawMode()) 2897 return false; 2898 2899 // Check to see if we have the marker (4 characters in a row). 2900 for (unsigned i = 1; i != 4; ++i) 2901 if (CurPtr[i] != CurPtr[0]) 2902 return false; 2903 2904 // If we do have it, search for the end of the conflict marker. This could 2905 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 2906 // be the end of conflict marker. 
2907 if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 2908 CurrentConflictMarkerState)) { 2909 CurPtr = End; 2910 2911 // Skip ahead to the end of line. 2912 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 2913 ++CurPtr; 2914 2915 BufferPtr = CurPtr; 2916 2917 // No longer in the conflict marker. 2918 CurrentConflictMarkerState = CMK_None; 2919 return true; 2920 } 2921 2922 return false; 2923 } 2924 2925 static const char *findPlaceholderEnd(const char *CurPtr, 2926 const char *BufferEnd) { 2927 if (CurPtr == BufferEnd) 2928 return nullptr; 2929 BufferEnd -= 1; // Scan until the second last character. 2930 for (; CurPtr != BufferEnd; ++CurPtr) { 2931 if (CurPtr[0] == '#' && CurPtr[1] == '>') 2932 return CurPtr + 2; 2933 } 2934 return nullptr; 2935 } 2936 2937 bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) { 2938 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!"); 2939 if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode) 2940 return false; 2941 const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd); 2942 if (!End) 2943 return false; 2944 const char *Start = CurPtr - 1; 2945 if (!LangOpts.AllowEditorPlaceholders) 2946 Diag(Start, diag::err_placeholder_in_source); 2947 Result.startToken(); 2948 FormTokenWithChars(Result, End, tok::raw_identifier); 2949 Result.setRawIdentifierData(Start); 2950 PP->LookUpIdentifierInfo(Result); 2951 Result.setFlag(Token::IsEditorPlaceholder); 2952 BufferPtr = End; 2953 return true; 2954 } 2955 2956 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 2957 if (PP && PP->isCodeCompletionEnabled()) { 2958 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 2959 return Loc == PP->getCodeCompletionLoc(); 2960 } 2961 2962 return false; 2963 } 2964 2965 uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, 2966 Token *Result) { 2967 unsigned CharSize; 2968 char Kind = getCharAndSize(StartPtr, CharSize); 2969 
2970 unsigned NumHexDigits; 2971 if (Kind == 'u') 2972 NumHexDigits = 4; 2973 else if (Kind == 'U') 2974 NumHexDigits = 8; 2975 else 2976 return 0; 2977 2978 if (!LangOpts.CPlusPlus && !LangOpts.C99) { 2979 if (Result && !isLexingRawMode()) 2980 Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); 2981 return 0; 2982 } 2983 2984 const char *CurPtr = StartPtr + CharSize; 2985 const char *KindLoc = &CurPtr[-1]; 2986 2987 uint32_t CodePoint = 0; 2988 for (unsigned i = 0; i < NumHexDigits; ++i) { 2989 char C = getCharAndSize(CurPtr, CharSize); 2990 2991 unsigned Value = llvm::hexDigitValue(C); 2992 if (Value == -1U) { 2993 if (Result && !isLexingRawMode()) { 2994 if (i == 0) { 2995 Diag(BufferPtr, diag::warn_ucn_escape_no_digits) 2996 << StringRef(KindLoc, 1); 2997 } else { 2998 Diag(BufferPtr, diag::warn_ucn_escape_incomplete); 2999 3000 // If the user wrote \U1234, suggest a fixit to \u. 3001 if (i == 4 && NumHexDigits == 8) { 3002 CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); 3003 Diag(KindLoc, diag::note_ucn_four_not_eight) 3004 << FixItHint::CreateReplacement(URange, "u"); 3005 } 3006 } 3007 } 3008 3009 return 0; 3010 } 3011 3012 CodePoint <<= 4; 3013 CodePoint += Value; 3014 3015 CurPtr += CharSize; 3016 } 3017 3018 if (Result) { 3019 Result->setFlag(Token::HasUCN); 3020 if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2) 3021 StartPtr = CurPtr; 3022 else 3023 while (StartPtr != CurPtr) 3024 (void)getAndAdvanceChar(StartPtr, *Result); 3025 } else { 3026 StartPtr = CurPtr; 3027 } 3028 3029 // Don't apply C family restrictions to UCNs in assembly mode 3030 if (LangOpts.AsmPreprocessor) 3031 return CodePoint; 3032 3033 // C99 6.4.3p2: A universal character name shall not specify a character whose 3034 // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or 3035 // 0060 (`), nor one in the range D800 through DFFF inclusive.) 
3036 // C++11 [lex.charset]p2: If the hexadecimal value for a 3037 // universal-character-name corresponds to a surrogate code point (in the 3038 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, 3039 // if the hexadecimal value for a universal-character-name outside the 3040 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or 3041 // string literal corresponds to a control character (in either of the 3042 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the 3043 // basic source character set, the program is ill-formed. 3044 if (CodePoint < 0xA0) { 3045 if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60) 3046 return CodePoint; 3047 3048 // We don't use isLexingRawMode() here because we need to warn about bad 3049 // UCNs even when skipping preprocessing tokens in a #if block. 3050 if (Result && PP) { 3051 if (CodePoint < 0x20 || CodePoint >= 0x7F) 3052 Diag(BufferPtr, diag::err_ucn_control_character); 3053 else { 3054 char C = static_cast<char>(CodePoint); 3055 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 3056 } 3057 } 3058 3059 return 0; 3060 } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { 3061 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 3062 // We don't use isLexingRawMode() here because we need to diagnose bad 3063 // UCNs even when skipping preprocessing tokens in a #if block. 
3064 if (Result && PP) { 3065 if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11) 3066 Diag(BufferPtr, diag::warn_ucn_escape_surrogate); 3067 else 3068 Diag(BufferPtr, diag::err_ucn_escape_invalid); 3069 } 3070 return 0; 3071 } 3072 3073 return CodePoint; 3074 } 3075 3076 bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, 3077 const char *CurPtr) { 3078 static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( 3079 UnicodeWhitespaceCharRanges); 3080 if (!isLexingRawMode() && !PP->isPreprocessedOutput() && 3081 UnicodeWhitespaceChars.contains(C)) { 3082 Diag(BufferPtr, diag::ext_unicode_whitespace) 3083 << makeCharRange(*this, BufferPtr, CurPtr); 3084 3085 Result.setFlag(Token::LeadingSpace); 3086 return true; 3087 } 3088 return false; 3089 } 3090 3091 bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { 3092 if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) { 3093 if (!isLexingRawMode() && !ParsingPreprocessorDirective && 3094 !PP->isPreprocessedOutput()) { 3095 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, 3096 makeCharRange(*this, BufferPtr, CurPtr), 3097 /*IsFirst=*/true); 3098 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, 3099 makeCharRange(*this, BufferPtr, CurPtr)); 3100 } 3101 3102 MIOpt.ReadToken(); 3103 return LexIdentifier(Result, CurPtr); 3104 } 3105 3106 if (!isLexingRawMode() && !ParsingPreprocessorDirective && 3107 !PP->isPreprocessedOutput() && 3108 !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) { 3109 // Non-ASCII characters tend to creep into source code unintentionally. 3110 // Instead of letting the parser complain about the unknown token, 3111 // just drop the character. 3112 // Note that we can /only/ do this when the non-ASCII character is actually 3113 // spelled as Unicode, not written as a UCN. 
The standard requires that 3114 // we not throw away any possible preprocessor tokens, but there's a 3115 // loophole in the mapping of Unicode characters to basic character set 3116 // characters that allows us to map these particular characters to, say, 3117 // whitespace. 3118 Diag(BufferPtr, diag::err_non_ascii) 3119 << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr)); 3120 3121 BufferPtr = CurPtr; 3122 return false; 3123 } 3124 3125 // Otherwise, we have an explicit UCN or a character that's unlikely to show 3126 // up by accident. 3127 MIOpt.ReadToken(); 3128 FormTokenWithChars(Result, CurPtr, tok::unknown); 3129 return true; 3130 } 3131 3132 void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { 3133 IsAtStartOfLine = Result.isAtStartOfLine(); 3134 HasLeadingSpace = Result.hasLeadingSpace(); 3135 HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); 3136 // Note that this doesn't affect IsAtPhysicalStartOfLine. 3137 } 3138 3139 bool Lexer::Lex(Token &Result) { 3140 // Start a new token. 3141 Result.startToken(); 3142 3143 // Set up misc whitespace flags for LexTokenInternal. 3144 if (IsAtStartOfLine) { 3145 Result.setFlag(Token::StartOfLine); 3146 IsAtStartOfLine = false; 3147 } 3148 3149 if (HasLeadingSpace) { 3150 Result.setFlag(Token::LeadingSpace); 3151 HasLeadingSpace = false; 3152 } 3153 3154 if (HasLeadingEmptyMacro) { 3155 Result.setFlag(Token::LeadingEmptyMacro); 3156 HasLeadingEmptyMacro = false; 3157 } 3158 3159 bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 3160 IsAtPhysicalStartOfLine = false; 3161 bool isRawLex = isLexingRawMode(); 3162 (void) isRawLex; 3163 bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine); 3164 // (After the LexTokenInternal call, the lexer might be destroyed.) 3165 assert((returnedToken || !isRawLex) && "Raw lex must succeed"); 3166 return returnedToken; 3167 } 3168 3169 /// LexTokenInternal - This implements a simple C family lexer. 
It is an 3170 /// extremely performance critical piece of code. This assumes that the buffer 3171 /// has a null character at the end of the file. This returns a preprocessing 3172 /// token, not a normal token, as such, it is an internal interface. It assumes 3173 /// that the Flags of result have been cleared before calling this. 3174 bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { 3175 LexNextToken: 3176 // New token, can't need cleaning yet. 3177 Result.clearFlag(Token::NeedsCleaning); 3178 Result.setIdentifierInfo(nullptr); 3179 3180 // CurPtr - Cache BufferPtr in an automatic variable. 3181 const char *CurPtr = BufferPtr; 3182 3183 // Small amounts of horizontal whitespace is very common between tokens. 3184 if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 3185 ++CurPtr; 3186 while ((*CurPtr == ' ') || (*CurPtr == '\t')) 3187 ++CurPtr; 3188 3189 // If we are keeping whitespace and other tokens, just return what we just 3190 // skipped. The next lexer invocation will return the token after the 3191 // whitespace. 3192 if (isKeepWhitespaceMode()) { 3193 FormTokenWithChars(Result, CurPtr, tok::unknown); 3194 // FIXME: The next token will not have LeadingSpace set. 3195 return true; 3196 } 3197 3198 BufferPtr = CurPtr; 3199 Result.setFlag(Token::LeadingSpace); 3200 } 3201 3202 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 3203 3204 // Read a character, advancing over it. 3205 char Char = getAndAdvanceChar(CurPtr, Result); 3206 tok::TokenKind Kind; 3207 3208 switch (Char) { 3209 case 0: // Null. 3210 // Found end of file? 3211 if (CurPtr-1 == BufferEnd) 3212 return LexEndOfFile(Result, CurPtr-1); 3213 3214 // Check if we are performing code completion. 3215 if (isCodeCompletionPoint(CurPtr-1)) { 3216 // Return the code-completion token. 
3217 Result.startToken(); 3218 FormTokenWithChars(Result, CurPtr, tok::code_completion); 3219 return true; 3220 } 3221 3222 if (!isLexingRawMode()) 3223 Diag(CurPtr-1, diag::null_in_file); 3224 Result.setFlag(Token::LeadingSpace); 3225 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3226 return true; // KeepWhitespaceMode 3227 3228 // We know the lexer hasn't changed, so just try again with this lexer. 3229 // (We manually eliminate the tail call to avoid recursion.) 3230 goto LexNextToken; 3231 3232 case 26: // DOS & CP/M EOF: "^Z". 3233 // If we're in Microsoft extensions mode, treat this as end of file. 3234 if (LangOpts.MicrosoftExt) { 3235 if (!isLexingRawMode()) 3236 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft); 3237 return LexEndOfFile(Result, CurPtr-1); 3238 } 3239 3240 // If Microsoft extensions are disabled, this is just random garbage. 3241 Kind = tok::unknown; 3242 break; 3243 3244 case '\r': 3245 if (CurPtr[0] == '\n') 3246 (void)getAndAdvanceChar(CurPtr, Result); 3247 LLVM_FALLTHROUGH; 3248 case '\n': 3249 // If we are inside a preprocessor directive and we see the end of line, 3250 // we know we are done with the directive, so return an EOD token. 3251 if (ParsingPreprocessorDirective) { 3252 // Done parsing the "line". 3253 ParsingPreprocessorDirective = false; 3254 3255 // Restore comment saving mode, in case it was disabled for directive. 3256 if (PP) 3257 resetExtendedTokenMode(); 3258 3259 // Since we consumed a newline, we are back at the start of a line. 3260 IsAtStartOfLine = true; 3261 IsAtPhysicalStartOfLine = true; 3262 3263 Kind = tok::eod; 3264 break; 3265 } 3266 3267 // No leading whitespace seen so far. 3268 Result.clearFlag(Token::LeadingSpace); 3269 3270 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3271 return true; // KeepWhitespaceMode 3272 3273 // We only saw whitespace, so just try again with this lexer. 3274 // (We manually eliminate the tail call to avoid recursion.) 
3275 goto LexNextToken; 3276 case ' ': 3277 case '\t': 3278 case '\f': 3279 case '\v': 3280 SkipHorizontalWhitespace: 3281 Result.setFlag(Token::LeadingSpace); 3282 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3283 return true; // KeepWhitespaceMode 3284 3285 SkipIgnoredUnits: 3286 CurPtr = BufferPtr; 3287 3288 // If the next token is obviously a // or /* */ comment, skip it efficiently 3289 // too (without going through the big switch stmt). 3290 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 3291 LangOpts.LineComment && 3292 (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) { 3293 if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 3294 return true; // There is a token to return. 3295 goto SkipIgnoredUnits; 3296 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 3297 if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 3298 return true; // There is a token to return. 3299 goto SkipIgnoredUnits; 3300 } else if (isHorizontalWhitespace(*CurPtr)) { 3301 goto SkipHorizontalWhitespace; 3302 } 3303 // We only saw whitespace, so just try again with this lexer. 3304 // (We manually eliminate the tail call to avoid recursion.) 3305 goto LexNextToken; 3306 3307 // C99 6.4.4.1: Integer Constants. 3308 // C99 6.4.4.2: Floating Constants. 3309 case '0': case '1': case '2': case '3': case '4': 3310 case '5': case '6': case '7': case '8': case '9': 3311 // Notify MIOpt that we read a non-whitespace/non-comment token. 3312 MIOpt.ReadToken(); 3313 return LexNumericConstant(Result, CurPtr); 3314 3315 case 'u': // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal 3316 // Notify MIOpt that we read a non-whitespace/non-comment token. 
3317 MIOpt.ReadToken(); 3318 3319 if (LangOpts.CPlusPlus11 || LangOpts.C11) { 3320 Char = getCharAndSize(CurPtr, SizeTmp); 3321 3322 // UTF-16 string literal 3323 if (Char == '"') 3324 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3325 tok::utf16_string_literal); 3326 3327 // UTF-16 character constant 3328 if (Char == '\'') 3329 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3330 tok::utf16_char_constant); 3331 3332 // UTF-16 raw string literal 3333 if (Char == 'R' && LangOpts.CPlusPlus11 && 3334 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3335 return LexRawStringLiteral(Result, 3336 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3337 SizeTmp2, Result), 3338 tok::utf16_string_literal); 3339 3340 if (Char == '8') { 3341 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 3342 3343 // UTF-8 string literal 3344 if (Char2 == '"') 3345 return LexStringLiteral(Result, 3346 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3347 SizeTmp2, Result), 3348 tok::utf8_string_literal); 3349 if (Char2 == '\'' && LangOpts.CPlusPlus17) 3350 return LexCharConstant( 3351 Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3352 SizeTmp2, Result), 3353 tok::utf8_char_constant); 3354 3355 if (Char2 == 'R' && LangOpts.CPlusPlus11) { 3356 unsigned SizeTmp3; 3357 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 3358 // UTF-8 raw string literal 3359 if (Char3 == '"') { 3360 return LexRawStringLiteral(Result, 3361 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3362 SizeTmp2, Result), 3363 SizeTmp3, Result), 3364 tok::utf8_string_literal); 3365 } 3366 } 3367 } 3368 } 3369 3370 // treat u like the start of an identifier. 3371 return LexIdentifier(Result, CurPtr); 3372 3373 case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal 3374 // Notify MIOpt that we read a non-whitespace/non-comment token. 
3375 MIOpt.ReadToken(); 3376 3377 if (LangOpts.CPlusPlus11 || LangOpts.C11) { 3378 Char = getCharAndSize(CurPtr, SizeTmp); 3379 3380 // UTF-32 string literal 3381 if (Char == '"') 3382 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3383 tok::utf32_string_literal); 3384 3385 // UTF-32 character constant 3386 if (Char == '\'') 3387 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3388 tok::utf32_char_constant); 3389 3390 // UTF-32 raw string literal 3391 if (Char == 'R' && LangOpts.CPlusPlus11 && 3392 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3393 return LexRawStringLiteral(Result, 3394 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3395 SizeTmp2, Result), 3396 tok::utf32_string_literal); 3397 } 3398 3399 // treat U like the start of an identifier. 3400 return LexIdentifier(Result, CurPtr); 3401 3402 case 'R': // Identifier or C++0x raw string literal 3403 // Notify MIOpt that we read a non-whitespace/non-comment token. 3404 MIOpt.ReadToken(); 3405 3406 if (LangOpts.CPlusPlus11) { 3407 Char = getCharAndSize(CurPtr, SizeTmp); 3408 3409 if (Char == '"') 3410 return LexRawStringLiteral(Result, 3411 ConsumeChar(CurPtr, SizeTmp, Result), 3412 tok::string_literal); 3413 } 3414 3415 // treat R like the start of an identifier. 3416 return LexIdentifier(Result, CurPtr); 3417 3418 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 3419 // Notify MIOpt that we read a non-whitespace/non-comment token. 3420 MIOpt.ReadToken(); 3421 Char = getCharAndSize(CurPtr, SizeTmp); 3422 3423 // Wide string literal. 3424 if (Char == '"') 3425 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3426 tok::wide_string_literal); 3427 3428 // Wide raw string literal. 
3429 if (LangOpts.CPlusPlus11 && Char == 'R' && 3430 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3431 return LexRawStringLiteral(Result, 3432 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3433 SizeTmp2, Result), 3434 tok::wide_string_literal); 3435 3436 // Wide character constant. 3437 if (Char == '\'') 3438 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3439 tok::wide_char_constant); 3440 // FALL THROUGH, treating L like the start of an identifier. 3441 LLVM_FALLTHROUGH; 3442 3443 // C99 6.4.2: Identifiers. 3444 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 3445 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 3446 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 3447 case 'V': case 'W': case 'X': case 'Y': case 'Z': 3448 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 3449 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 3450 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 3451 case 'v': case 'w': case 'x': case 'y': case 'z': 3452 case '_': 3453 // Notify MIOpt that we read a non-whitespace/non-comment token. 3454 MIOpt.ReadToken(); 3455 return LexIdentifier(Result, CurPtr); 3456 3457 case '$': // $ in identifiers. 3458 if (LangOpts.DollarIdents) { 3459 if (!isLexingRawMode()) 3460 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 3461 // Notify MIOpt that we read a non-whitespace/non-comment token. 3462 MIOpt.ReadToken(); 3463 return LexIdentifier(Result, CurPtr); 3464 } 3465 3466 Kind = tok::unknown; 3467 break; 3468 3469 // C99 6.4.4: Character Constants. 3470 case '\'': 3471 // Notify MIOpt that we read a non-whitespace/non-comment token. 3472 MIOpt.ReadToken(); 3473 return LexCharConstant(Result, CurPtr, tok::char_constant); 3474 3475 // C99 6.4.5: String Literals. 3476 case '"': 3477 // Notify MIOpt that we read a non-whitespace/non-comment token. 
3478 MIOpt.ReadToken(); 3479 return LexStringLiteral(Result, CurPtr, 3480 ParsingFilename ? tok::header_name 3481 : tok::string_literal); 3482 3483 // C99 6.4.6: Punctuators. 3484 case '?': 3485 Kind = tok::question; 3486 break; 3487 case '[': 3488 Kind = tok::l_square; 3489 break; 3490 case ']': 3491 Kind = tok::r_square; 3492 break; 3493 case '(': 3494 Kind = tok::l_paren; 3495 break; 3496 case ')': 3497 Kind = tok::r_paren; 3498 break; 3499 case '{': 3500 Kind = tok::l_brace; 3501 break; 3502 case '}': 3503 Kind = tok::r_brace; 3504 break; 3505 case '.': 3506 Char = getCharAndSize(CurPtr, SizeTmp); 3507 if (Char >= '0' && Char <= '9') { 3508 // Notify MIOpt that we read a non-whitespace/non-comment token. 3509 MIOpt.ReadToken(); 3510 3511 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 3512 } else if (LangOpts.CPlusPlus && Char == '*') { 3513 Kind = tok::periodstar; 3514 CurPtr += SizeTmp; 3515 } else if (Char == '.' && 3516 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 3517 Kind = tok::ellipsis; 3518 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3519 SizeTmp2, Result); 3520 } else { 3521 Kind = tok::period; 3522 } 3523 break; 3524 case '&': 3525 Char = getCharAndSize(CurPtr, SizeTmp); 3526 if (Char == '&') { 3527 Kind = tok::ampamp; 3528 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3529 } else if (Char == '=') { 3530 Kind = tok::ampequal; 3531 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3532 } else { 3533 Kind = tok::amp; 3534 } 3535 break; 3536 case '*': 3537 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 3538 Kind = tok::starequal; 3539 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3540 } else { 3541 Kind = tok::star; 3542 } 3543 break; 3544 case '+': 3545 Char = getCharAndSize(CurPtr, SizeTmp); 3546 if (Char == '+') { 3547 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3548 Kind = tok::plusplus; 3549 } else if (Char == '=') { 3550 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3551 Kind = tok::plusequal; 3552 } 
else { 3553 Kind = tok::plus; 3554 } 3555 break; 3556 case '-': 3557 Char = getCharAndSize(CurPtr, SizeTmp); 3558 if (Char == '-') { // -- 3559 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3560 Kind = tok::minusminus; 3561 } else if (Char == '>' && LangOpts.CPlusPlus && 3562 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 3563 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3564 SizeTmp2, Result); 3565 Kind = tok::arrowstar; 3566 } else if (Char == '>') { // -> 3567 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3568 Kind = tok::arrow; 3569 } else if (Char == '=') { // -= 3570 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3571 Kind = tok::minusequal; 3572 } else { 3573 Kind = tok::minus; 3574 } 3575 break; 3576 case '~': 3577 Kind = tok::tilde; 3578 break; 3579 case '!': 3580 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 3581 Kind = tok::exclaimequal; 3582 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3583 } else { 3584 Kind = tok::exclaim; 3585 } 3586 break; 3587 case '/': 3588 // 6.4.9: Comments 3589 Char = getCharAndSize(CurPtr, SizeTmp); 3590 if (Char == '/') { // Line comment. 3591 // Even if Line comments are disabled (e.g. in C89 mode), we generally 3592 // want to lex this as a comment. There is one problem with this though, 3593 // that in one particular corner case, this can change the behavior of the 3594 // resultant program. For example, In "foo //**/ bar", C89 would lex 3595 // this as "foo / bar" and languages with Line comments would lex it as 3596 // "foo". Check to see if the character after the second slash is a '*'. 3597 // If so, we will lex that as a "/" instead of the start of a comment. 3598 // However, we never do this if we are just preprocessing. 
3599 bool TreatAsComment = LangOpts.LineComment && 3600 (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP); 3601 if (!TreatAsComment) 3602 if (!(PP && PP->isPreprocessedOutput())) 3603 TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*'; 3604 3605 if (TreatAsComment) { 3606 if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3607 TokAtPhysicalStartOfLine)) 3608 return true; // There is a token to return. 3609 3610 // It is common for the tokens immediately after a // comment to be 3611 // whitespace (indentation for the next line). Instead of going through 3612 // the big switch, handle it efficiently now. 3613 goto SkipIgnoredUnits; 3614 } 3615 } 3616 3617 if (Char == '*') { // /**/ comment. 3618 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3619 TokAtPhysicalStartOfLine)) 3620 return true; // There is a token to return. 3621 3622 // We only saw whitespace, so just try again with this lexer. 3623 // (We manually eliminate the tail call to avoid recursion.) 
3624 goto LexNextToken; 3625 } 3626 3627 if (Char == '=') { 3628 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3629 Kind = tok::slashequal; 3630 } else { 3631 Kind = tok::slash; 3632 } 3633 break; 3634 case '%': 3635 Char = getCharAndSize(CurPtr, SizeTmp); 3636 if (Char == '=') { 3637 Kind = tok::percentequal; 3638 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3639 } else if (LangOpts.Digraphs && Char == '>') { 3640 Kind = tok::r_brace; // '%>' -> '}' 3641 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3642 } else if (LangOpts.Digraphs && Char == ':') { 3643 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3644 Char = getCharAndSize(CurPtr, SizeTmp); 3645 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 3646 Kind = tok::hashhash; // '%:%:' -> '##' 3647 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3648 SizeTmp2, Result); 3649 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize 3650 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3651 if (!isLexingRawMode()) 3652 Diag(BufferPtr, diag::ext_charize_microsoft); 3653 Kind = tok::hashat; 3654 } else { // '%:' -> '#' 3655 // We parsed a # character. If this occurs at the start of the line, 3656 // it's actually the start of a preprocessing directive. Callback to 3657 // the preprocessor to handle it. 3658 // TODO: -fpreprocessed mode?? 
3659 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 3660 goto HandleDirective; 3661 3662 Kind = tok::hash; 3663 } 3664 } else { 3665 Kind = tok::percent; 3666 } 3667 break; 3668 case '<': 3669 Char = getCharAndSize(CurPtr, SizeTmp); 3670 if (ParsingFilename) { 3671 return LexAngledStringLiteral(Result, CurPtr); 3672 } else if (Char == '<') { 3673 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3674 if (After == '=') { 3675 Kind = tok::lesslessequal; 3676 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3677 SizeTmp2, Result); 3678 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 3679 // If this is actually a '<<<<<<<' version control conflict marker, 3680 // recognize it as such and recover nicely. 3681 goto LexNextToken; 3682 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 3683 // If this is '<<<<' and we're in a Perforce-style conflict marker, 3684 // ignore it. 3685 goto LexNextToken; 3686 } else if (LangOpts.CUDA && After == '<') { 3687 Kind = tok::lesslessless; 3688 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3689 SizeTmp2, Result); 3690 } else { 3691 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3692 Kind = tok::lessless; 3693 } 3694 } else if (Char == '=') { 3695 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3696 if (After == '>') { 3697 if (getLangOpts().CPlusPlus2a) { 3698 if (!isLexingRawMode()) 3699 Diag(BufferPtr, diag::warn_cxx17_compat_spaceship); 3700 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3701 SizeTmp2, Result); 3702 Kind = tok::spaceship; 3703 break; 3704 } 3705 // Suggest adding a space between the '<=' and the '>' to avoid a 3706 // change in semantics if this turns up in C++ <=17 mode. 
3707 if (getLangOpts().CPlusPlus && !isLexingRawMode()) { 3708 Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship) 3709 << FixItHint::CreateInsertion( 3710 getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " "); 3711 } 3712 } 3713 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3714 Kind = tok::lessequal; 3715 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' 3716 if (LangOpts.CPlusPlus11 && 3717 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 3718 // C++0x [lex.pptoken]p3: 3719 // Otherwise, if the next three characters are <:: and the subsequent 3720 // character is neither : nor >, the < is treated as a preprocessor 3721 // token by itself and not as the first character of the alternative 3722 // token <:. 3723 unsigned SizeTmp3; 3724 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 3725 if (After != ':' && After != '>') { 3726 Kind = tok::less; 3727 if (!isLexingRawMode()) 3728 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 3729 break; 3730 } 3731 } 3732 3733 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3734 Kind = tok::l_square; 3735 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' 3736 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3737 Kind = tok::l_brace; 3738 } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 && 3739 lexEditorPlaceholder(Result, CurPtr)) { 3740 return true; 3741 } else { 3742 Kind = tok::less; 3743 } 3744 break; 3745 case '>': 3746 Char = getCharAndSize(CurPtr, SizeTmp); 3747 if (Char == '=') { 3748 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3749 Kind = tok::greaterequal; 3750 } else if (Char == '>') { 3751 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3752 if (After == '=') { 3753 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3754 SizeTmp2, Result); 3755 Kind = tok::greatergreaterequal; 3756 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 3757 // If this is actually a '>>>>' conflict marker, recognize it as such 3758 // and recover 
nicely. 3759 goto LexNextToken; 3760 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 3761 // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 3762 goto LexNextToken; 3763 } else if (LangOpts.CUDA && After == '>') { 3764 Kind = tok::greatergreatergreater; 3765 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3766 SizeTmp2, Result); 3767 } else { 3768 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3769 Kind = tok::greatergreater; 3770 } 3771 } else { 3772 Kind = tok::greater; 3773 } 3774 break; 3775 case '^': 3776 Char = getCharAndSize(CurPtr, SizeTmp); 3777 if (Char == '=') { 3778 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3779 Kind = tok::caretequal; 3780 } else if (LangOpts.OpenCL && Char == '^') { 3781 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3782 Kind = tok::caretcaret; 3783 } else { 3784 Kind = tok::caret; 3785 } 3786 break; 3787 case '|': 3788 Char = getCharAndSize(CurPtr, SizeTmp); 3789 if (Char == '=') { 3790 Kind = tok::pipeequal; 3791 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3792 } else if (Char == '|') { 3793 // If this is '|||||||' and we're in a conflict marker, ignore it. 
3794 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 3795 goto LexNextToken; 3796 Kind = tok::pipepipe; 3797 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3798 } else { 3799 Kind = tok::pipe; 3800 } 3801 break; 3802 case ':': 3803 Char = getCharAndSize(CurPtr, SizeTmp); 3804 if (LangOpts.Digraphs && Char == '>') { 3805 Kind = tok::r_square; // ':>' -> ']' 3806 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3807 } else if ((LangOpts.CPlusPlus || 3808 LangOpts.DoubleSquareBracketAttributes) && 3809 Char == ':') { 3810 Kind = tok::coloncolon; 3811 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3812 } else { 3813 Kind = tok::colon; 3814 } 3815 break; 3816 case ';': 3817 Kind = tok::semi; 3818 break; 3819 case '=': 3820 Char = getCharAndSize(CurPtr, SizeTmp); 3821 if (Char == '=') { 3822 // If this is '====' and we're in a conflict marker, ignore it. 3823 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 3824 goto LexNextToken; 3825 3826 Kind = tok::equalequal; 3827 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3828 } else { 3829 Kind = tok::equal; 3830 } 3831 break; 3832 case ',': 3833 Kind = tok::comma; 3834 break; 3835 case '#': 3836 Char = getCharAndSize(CurPtr, SizeTmp); 3837 if (Char == '#') { 3838 Kind = tok::hashhash; 3839 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3840 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize 3841 Kind = tok::hashat; 3842 if (!isLexingRawMode()) 3843 Diag(BufferPtr, diag::ext_charize_microsoft); 3844 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3845 } else { 3846 // We parsed a # character. If this occurs at the start of the line, 3847 // it's actually the start of a preprocessing directive. Callback to 3848 // the preprocessor to handle it. 3849 // TODO: -fpreprocessed mode?? 3850 if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 3851 goto HandleDirective; 3852 3853 Kind = tok::hash; 3854 } 3855 break; 3856 3857 case '@': 3858 // Objective C support. 
3859 if (CurPtr[-1] == '@' && LangOpts.ObjC) 3860 Kind = tok::at; 3861 else 3862 Kind = tok::unknown; 3863 break; 3864 3865 // UCNs (C99 6.4.3, C++11 [lex.charset]p2) 3866 case '\\': 3867 if (!LangOpts.AsmPreprocessor) { 3868 if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) { 3869 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 3870 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3871 return true; // KeepWhitespaceMode 3872 3873 // We only saw whitespace, so just try again with this lexer. 3874 // (We manually eliminate the tail call to avoid recursion.) 3875 goto LexNextToken; 3876 } 3877 3878 return LexUnicode(Result, CodePoint, CurPtr); 3879 } 3880 } 3881 3882 Kind = tok::unknown; 3883 break; 3884 3885 default: { 3886 if (isASCII(Char)) { 3887 Kind = tok::unknown; 3888 break; 3889 } 3890 3891 llvm::UTF32 CodePoint; 3892 3893 // We can't just reset CurPtr to BufferPtr because BufferPtr may point to 3894 // an escaped newline. 3895 --CurPtr; 3896 llvm::ConversionResult Status = 3897 llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, 3898 (const llvm::UTF8 *)BufferEnd, 3899 &CodePoint, 3900 llvm::strictConversion); 3901 if (Status == llvm::conversionOK) { 3902 if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 3903 if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3904 return true; // KeepWhitespaceMode 3905 3906 // We only saw whitespace, so just try again with this lexer. 3907 // (We manually eliminate the tail call to avoid recursion.) 3908 goto LexNextToken; 3909 } 3910 return LexUnicode(Result, CodePoint, CurPtr); 3911 } 3912 3913 if (isLexingRawMode() || ParsingPreprocessorDirective || 3914 PP->isPreprocessedOutput()) { 3915 ++CurPtr; 3916 Kind = tok::unknown; 3917 break; 3918 } 3919 3920 // Non-ASCII characters tend to creep into source code unintentionally. 3921 // Instead of letting the parser complain about the unknown token, 3922 // just diagnose the invalid UTF-8, then drop the character. 
3923 Diag(CurPtr, diag::err_invalid_utf8); 3924 3925 BufferPtr = CurPtr+1; 3926 // We're pretending the character didn't exist, so just try again with 3927 // this lexer. 3928 // (We manually eliminate the tail call to avoid recursion.) 3929 goto LexNextToken; 3930 } 3931 } 3932 3933 // Notify MIOpt that we read a non-whitespace/non-comment token. 3934 MIOpt.ReadToken(); 3935 3936 // Update the location of token as well as BufferPtr. 3937 FormTokenWithChars(Result, CurPtr, Kind); 3938 return true; 3939 3940 HandleDirective: 3941 // We parsed a # character and it's the start of a preprocessing directive. 3942 3943 FormTokenWithChars(Result, CurPtr, tok::hash); 3944 PP->HandleDirective(Result); 3945 3946 if (PP->hadModuleLoaderFatalFailure()) { 3947 // With a fatal failure in the module loader, we abort parsing. 3948 assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof"); 3949 return true; 3950 } 3951 3952 // We parsed the directive; lex a token with the new state. 3953 return false; 3954 } 3955