10b57cec5SDimitry Andric //===- Lexer.cpp - C Language Family Lexer --------------------------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric // 90b57cec5SDimitry Andric // This file implements the Lexer and Token interfaces. 100b57cec5SDimitry Andric // 110b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 120b57cec5SDimitry Andric 130b57cec5SDimitry Andric #include "clang/Lex/Lexer.h" 140b57cec5SDimitry Andric #include "UnicodeCharSets.h" 150b57cec5SDimitry Andric #include "clang/Basic/CharInfo.h" 16e8d8bef9SDimitry Andric #include "clang/Basic/Diagnostic.h" 170b57cec5SDimitry Andric #include "clang/Basic/IdentifierTable.h" 18e8d8bef9SDimitry Andric #include "clang/Basic/LLVM.h" 190b57cec5SDimitry Andric #include "clang/Basic/LangOptions.h" 200b57cec5SDimitry Andric #include "clang/Basic/SourceLocation.h" 210b57cec5SDimitry Andric #include "clang/Basic/SourceManager.h" 220b57cec5SDimitry Andric #include "clang/Basic/TokenKinds.h" 230b57cec5SDimitry Andric #include "clang/Lex/LexDiagnostic.h" 240b57cec5SDimitry Andric #include "clang/Lex/LiteralSupport.h" 250b57cec5SDimitry Andric #include "clang/Lex/MultipleIncludeOpt.h" 260b57cec5SDimitry Andric #include "clang/Lex/Preprocessor.h" 270b57cec5SDimitry Andric #include "clang/Lex/PreprocessorOptions.h" 280b57cec5SDimitry Andric #include "clang/Lex/Token.h" 290b57cec5SDimitry Andric #include "llvm/ADT/None.h" 300b57cec5SDimitry Andric #include "llvm/ADT/Optional.h" 315ffd83dbSDimitry Andric #include "llvm/ADT/STLExtras.h" 320b57cec5SDimitry 
Andric #include "llvm/ADT/StringExtras.h" 330b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h" 34e8d8bef9SDimitry Andric #include "llvm/ADT/StringSwitch.h" 350b57cec5SDimitry Andric #include "llvm/Support/Compiler.h" 360b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h" 370b57cec5SDimitry Andric #include "llvm/Support/MathExtras.h" 38e8d8bef9SDimitry Andric #include "llvm/Support/MemoryBufferRef.h" 390b57cec5SDimitry Andric #include "llvm/Support/NativeFormatting.h" 400b57cec5SDimitry Andric #include "llvm/Support/UnicodeCharRanges.h" 410b57cec5SDimitry Andric #include <algorithm> 420b57cec5SDimitry Andric #include <cassert> 430b57cec5SDimitry Andric #include <cstddef> 440b57cec5SDimitry Andric #include <cstdint> 450b57cec5SDimitry Andric #include <cstring> 460b57cec5SDimitry Andric #include <string> 470b57cec5SDimitry Andric #include <tuple> 480b57cec5SDimitry Andric #include <utility> 490b57cec5SDimitry Andric 500b57cec5SDimitry Andric using namespace clang; 510b57cec5SDimitry Andric 520b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 530b57cec5SDimitry Andric // Token Class Implementation 540b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 550b57cec5SDimitry Andric 560b57cec5SDimitry Andric /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 570b57cec5SDimitry Andric bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 580b57cec5SDimitry Andric if (isAnnotation()) 590b57cec5SDimitry Andric return false; 600b57cec5SDimitry Andric if (IdentifierInfo *II = getIdentifierInfo()) 610b57cec5SDimitry Andric return II->getObjCKeywordID() == objcKey; 620b57cec5SDimitry Andric return false; 630b57cec5SDimitry Andric } 640b57cec5SDimitry Andric 650b57cec5SDimitry Andric /// getObjCKeywordID - Return the ObjC keyword kind. 
660b57cec5SDimitry Andric tok::ObjCKeywordKind Token::getObjCKeywordID() const { 670b57cec5SDimitry Andric if (isAnnotation()) 680b57cec5SDimitry Andric return tok::objc_not_keyword; 690b57cec5SDimitry Andric IdentifierInfo *specId = getIdentifierInfo(); 700b57cec5SDimitry Andric return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 710b57cec5SDimitry Andric } 720b57cec5SDimitry Andric 730b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 740b57cec5SDimitry Andric // Lexer Class Implementation 750b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 760b57cec5SDimitry Andric 770b57cec5SDimitry Andric void Lexer::anchor() {} 780b57cec5SDimitry Andric 790b57cec5SDimitry Andric void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 800b57cec5SDimitry Andric const char *BufEnd) { 810b57cec5SDimitry Andric BufferStart = BufStart; 820b57cec5SDimitry Andric BufferPtr = BufPtr; 830b57cec5SDimitry Andric BufferEnd = BufEnd; 840b57cec5SDimitry Andric 850b57cec5SDimitry Andric assert(BufEnd[0] == 0 && 860b57cec5SDimitry Andric "We assume that the input buffer has a null character at the end" 870b57cec5SDimitry Andric " to simplify lexing!"); 880b57cec5SDimitry Andric 890b57cec5SDimitry Andric // Check whether we have a BOM in the beginning of the buffer. If yes - act 900b57cec5SDimitry Andric // accordingly. Right now we support only UTF-8 with and without BOM, so, just 910b57cec5SDimitry Andric // skip the UTF-8 BOM if it's present. 920b57cec5SDimitry Andric if (BufferStart == BufferPtr) { 930b57cec5SDimitry Andric // Determine the size of the BOM. 
940b57cec5SDimitry Andric StringRef Buf(BufferStart, BufferEnd - BufferStart); 950b57cec5SDimitry Andric size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 960b57cec5SDimitry Andric .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 970b57cec5SDimitry Andric .Default(0); 980b57cec5SDimitry Andric 990b57cec5SDimitry Andric // Skip the BOM. 1000b57cec5SDimitry Andric BufferPtr += BOMLength; 1010b57cec5SDimitry Andric } 1020b57cec5SDimitry Andric 1030b57cec5SDimitry Andric Is_PragmaLexer = false; 1040b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None; 1050b57cec5SDimitry Andric 1060b57cec5SDimitry Andric // Start of the file is a start of line. 1070b57cec5SDimitry Andric IsAtStartOfLine = true; 1080b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 1090b57cec5SDimitry Andric 1100b57cec5SDimitry Andric HasLeadingSpace = false; 1110b57cec5SDimitry Andric HasLeadingEmptyMacro = false; 1120b57cec5SDimitry Andric 1130b57cec5SDimitry Andric // We are not after parsing a #. 1140b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 1150b57cec5SDimitry Andric 1160b57cec5SDimitry Andric // We are not after parsing #include. 1170b57cec5SDimitry Andric ParsingFilename = false; 1180b57cec5SDimitry Andric 1190b57cec5SDimitry Andric // We are not in raw mode. Raw mode disables diagnostics and interpretation 1200b57cec5SDimitry Andric // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 1210b57cec5SDimitry Andric // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 1220b57cec5SDimitry Andric // or otherwise skipping over tokens. 1230b57cec5SDimitry Andric LexingRawMode = false; 1240b57cec5SDimitry Andric 1250b57cec5SDimitry Andric // Default to not keeping comments. 
1260b57cec5SDimitry Andric ExtendedTokenMode = 0; 127e8d8bef9SDimitry Andric 128e8d8bef9SDimitry Andric NewLinePtr = nullptr; 1290b57cec5SDimitry Andric } 1300b57cec5SDimitry Andric 1310b57cec5SDimitry Andric /// Lexer constructor - Create a new lexer object for the specified buffer 1320b57cec5SDimitry Andric /// with the specified preprocessor managing the lexing process. This lexer 1330b57cec5SDimitry Andric /// assumes that the associated file buffer and Preprocessor objects will 1340b57cec5SDimitry Andric /// outlive it, so it doesn't take ownership of either of them. 135e8d8bef9SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, 136349cc55cSDimitry Andric Preprocessor &PP, bool IsFirstIncludeOfFile) 1370b57cec5SDimitry Andric : PreprocessorLexer(&PP, FID), 1380b57cec5SDimitry Andric FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 139349cc55cSDimitry Andric LangOpts(PP.getLangOpts()), IsFirstTimeLexingFile(IsFirstIncludeOfFile) { 140e8d8bef9SDimitry Andric InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(), 141e8d8bef9SDimitry Andric InputFile.getBufferEnd()); 1420b57cec5SDimitry Andric 1430b57cec5SDimitry Andric resetExtendedTokenMode(); 1440b57cec5SDimitry Andric } 1450b57cec5SDimitry Andric 1460b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only 1470b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 1480b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it. 
1490b57cec5SDimitry Andric Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, 150349cc55cSDimitry Andric const char *BufStart, const char *BufPtr, const char *BufEnd, 151349cc55cSDimitry Andric bool IsFirstIncludeOfFile) 152349cc55cSDimitry Andric : FileLoc(fileloc), LangOpts(langOpts), 153349cc55cSDimitry Andric IsFirstTimeLexingFile(IsFirstIncludeOfFile) { 1540b57cec5SDimitry Andric InitLexer(BufStart, BufPtr, BufEnd); 1550b57cec5SDimitry Andric 1560b57cec5SDimitry Andric // We *are* in raw mode. 1570b57cec5SDimitry Andric LexingRawMode = true; 1580b57cec5SDimitry Andric } 1590b57cec5SDimitry Andric 1600b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only 1610b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 1620b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it. 163e8d8bef9SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile, 164349cc55cSDimitry Andric const SourceManager &SM, const LangOptions &langOpts, 165349cc55cSDimitry Andric bool IsFirstIncludeOfFile) 166e8d8bef9SDimitry Andric : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(), 167349cc55cSDimitry Andric FromFile.getBufferStart(), FromFile.getBufferEnd(), 168349cc55cSDimitry Andric IsFirstIncludeOfFile) {} 1690b57cec5SDimitry Andric 1700b57cec5SDimitry Andric void Lexer::resetExtendedTokenMode() { 1710b57cec5SDimitry Andric assert(PP && "Cannot reset token mode without a preprocessor"); 1720b57cec5SDimitry Andric if (LangOpts.TraditionalCPP) 1730b57cec5SDimitry Andric SetKeepWhitespaceMode(true); 1740b57cec5SDimitry Andric else 1750b57cec5SDimitry Andric SetCommentRetentionState(PP->getCommentRetentionState()); 1760b57cec5SDimitry Andric } 1770b57cec5SDimitry Andric 1780b57cec5SDimitry Andric /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 1790b57cec5SDimitry Andric /// _Pragma expansion. 
This has a variety of magic semantics that this method 1800b57cec5SDimitry Andric /// sets up. It returns a new'd Lexer that must be delete'd when done. 1810b57cec5SDimitry Andric /// 1820b57cec5SDimitry Andric /// On entrance to this routine, TokStartLoc is a macro location which has a 1830b57cec5SDimitry Andric /// spelling loc that indicates the bytes to be lexed for the token and an 1840b57cec5SDimitry Andric /// expansion location that indicates where all lexed tokens should be 1850b57cec5SDimitry Andric /// "expanded from". 1860b57cec5SDimitry Andric /// 1870b57cec5SDimitry Andric /// TODO: It would really be nice to make _Pragma just be a wrapper around a 1880b57cec5SDimitry Andric /// normal lexer that remaps tokens as they fly by. This would require making 1890b57cec5SDimitry Andric /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 1900b57cec5SDimitry Andric /// interface that could handle this stuff. This would pull GetMappedTokenLoc 1910b57cec5SDimitry Andric /// out of the critical path of the lexer! 1920b57cec5SDimitry Andric /// 1930b57cec5SDimitry Andric Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 1940b57cec5SDimitry Andric SourceLocation ExpansionLocStart, 1950b57cec5SDimitry Andric SourceLocation ExpansionLocEnd, 1960b57cec5SDimitry Andric unsigned TokLen, Preprocessor &PP) { 1970b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager(); 1980b57cec5SDimitry Andric 1990b57cec5SDimitry Andric // Create the lexer as if we were going to lex the file normally. 
2000b57cec5SDimitry Andric FileID SpellingFID = SM.getFileID(SpellingLoc); 201e8d8bef9SDimitry Andric llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID); 2020b57cec5SDimitry Andric Lexer *L = new Lexer(SpellingFID, InputFile, PP); 2030b57cec5SDimitry Andric 2040b57cec5SDimitry Andric // Now that the lexer is created, change the start/end locations so that we 2050b57cec5SDimitry Andric // just lex the subsection of the file that we want. This is lexing from a 2060b57cec5SDimitry Andric // scratch buffer. 2070b57cec5SDimitry Andric const char *StrData = SM.getCharacterData(SpellingLoc); 2080b57cec5SDimitry Andric 2090b57cec5SDimitry Andric L->BufferPtr = StrData; 2100b57cec5SDimitry Andric L->BufferEnd = StrData+TokLen; 2110b57cec5SDimitry Andric assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 2120b57cec5SDimitry Andric 2130b57cec5SDimitry Andric // Set the SourceLocation with the remapping information. This ensures that 2140b57cec5SDimitry Andric // GetMappedTokenLoc will remap the tokens as they are lexed. 2150b57cec5SDimitry Andric L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 2160b57cec5SDimitry Andric ExpansionLocStart, 2170b57cec5SDimitry Andric ExpansionLocEnd, TokLen); 2180b57cec5SDimitry Andric 2190b57cec5SDimitry Andric // Ensure that the lexer thinks it is inside a directive, so that end \n will 2200b57cec5SDimitry Andric // return an EOD token. 2210b57cec5SDimitry Andric L->ParsingPreprocessorDirective = true; 2220b57cec5SDimitry Andric 2230b57cec5SDimitry Andric // This lexer really is for _Pragma. 
2240b57cec5SDimitry Andric L->Is_PragmaLexer = true; 2250b57cec5SDimitry Andric return L; 2260b57cec5SDimitry Andric } 2270b57cec5SDimitry Andric 228a7dea167SDimitry Andric bool Lexer::skipOver(unsigned NumBytes) { 229a7dea167SDimitry Andric IsAtPhysicalStartOfLine = true; 230a7dea167SDimitry Andric IsAtStartOfLine = true; 231a7dea167SDimitry Andric if ((BufferPtr + NumBytes) > BufferEnd) 232a7dea167SDimitry Andric return true; 233a7dea167SDimitry Andric BufferPtr += NumBytes; 234a7dea167SDimitry Andric return false; 235a7dea167SDimitry Andric } 236a7dea167SDimitry Andric 2370b57cec5SDimitry Andric template <typename T> static void StringifyImpl(T &Str, char Quote) { 2380b57cec5SDimitry Andric typename T::size_type i = 0, e = Str.size(); 2390b57cec5SDimitry Andric while (i < e) { 2400b57cec5SDimitry Andric if (Str[i] == '\\' || Str[i] == Quote) { 2410b57cec5SDimitry Andric Str.insert(Str.begin() + i, '\\'); 2420b57cec5SDimitry Andric i += 2; 2430b57cec5SDimitry Andric ++e; 2440b57cec5SDimitry Andric } else if (Str[i] == '\n' || Str[i] == '\r') { 2450b57cec5SDimitry Andric // Replace '\r\n' and '\n\r' to '\\' followed by 'n'. 2460b57cec5SDimitry Andric if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && 2470b57cec5SDimitry Andric Str[i] != Str[i + 1]) { 2480b57cec5SDimitry Andric Str[i] = '\\'; 2490b57cec5SDimitry Andric Str[i + 1] = 'n'; 2500b57cec5SDimitry Andric } else { 2510b57cec5SDimitry Andric // Replace '\n' and '\r' to '\\' followed by 'n'. 
2520b57cec5SDimitry Andric Str[i] = '\\'; 2530b57cec5SDimitry Andric Str.insert(Str.begin() + i + 1, 'n'); 2540b57cec5SDimitry Andric ++e; 2550b57cec5SDimitry Andric } 2560b57cec5SDimitry Andric i += 2; 2570b57cec5SDimitry Andric } else 2580b57cec5SDimitry Andric ++i; 2590b57cec5SDimitry Andric } 2600b57cec5SDimitry Andric } 2610b57cec5SDimitry Andric 2620b57cec5SDimitry Andric std::string Lexer::Stringify(StringRef Str, bool Charify) { 2635ffd83dbSDimitry Andric std::string Result = std::string(Str); 2640b57cec5SDimitry Andric char Quote = Charify ? '\'' : '"'; 2650b57cec5SDimitry Andric StringifyImpl(Result, Quote); 2660b57cec5SDimitry Andric return Result; 2670b57cec5SDimitry Andric } 2680b57cec5SDimitry Andric 2690b57cec5SDimitry Andric void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); } 2700b57cec5SDimitry Andric 2710b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 2720b57cec5SDimitry Andric // Token Spelling 2730b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 2740b57cec5SDimitry Andric 2750b57cec5SDimitry Andric /// Slow case of getSpelling. Extract the characters comprising the 2760b57cec5SDimitry Andric /// spelling of this token from the provided input buffer. 2770b57cec5SDimitry Andric static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, 2780b57cec5SDimitry Andric const LangOptions &LangOpts, char *Spelling) { 2790b57cec5SDimitry Andric assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); 2800b57cec5SDimitry Andric 2810b57cec5SDimitry Andric size_t Length = 0; 2820b57cec5SDimitry Andric const char *BufEnd = BufPtr + Tok.getLength(); 2830b57cec5SDimitry Andric 2840b57cec5SDimitry Andric if (tok::isStringLiteral(Tok.getKind())) { 2850b57cec5SDimitry Andric // Munch the encoding-prefix and opening double-quote. 
2860b57cec5SDimitry Andric while (BufPtr < BufEnd) { 2870b57cec5SDimitry Andric unsigned Size; 2880b57cec5SDimitry Andric Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 2890b57cec5SDimitry Andric BufPtr += Size; 2900b57cec5SDimitry Andric 2910b57cec5SDimitry Andric if (Spelling[Length - 1] == '"') 2920b57cec5SDimitry Andric break; 2930b57cec5SDimitry Andric } 2940b57cec5SDimitry Andric 2950b57cec5SDimitry Andric // Raw string literals need special handling; trigraph expansion and line 2960b57cec5SDimitry Andric // splicing do not occur within their d-char-sequence nor within their 2970b57cec5SDimitry Andric // r-char-sequence. 2980b57cec5SDimitry Andric if (Length >= 2 && 2990b57cec5SDimitry Andric Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { 3000b57cec5SDimitry Andric // Search backwards from the end of the token to find the matching closing 3010b57cec5SDimitry Andric // quote. 3020b57cec5SDimitry Andric const char *RawEnd = BufEnd; 3030b57cec5SDimitry Andric do --RawEnd; while (*RawEnd != '"'); 3040b57cec5SDimitry Andric size_t RawLength = RawEnd - BufPtr + 1; 3050b57cec5SDimitry Andric 3060b57cec5SDimitry Andric // Everything between the quotes is included verbatim in the spelling. 3070b57cec5SDimitry Andric memcpy(Spelling + Length, BufPtr, RawLength); 3080b57cec5SDimitry Andric Length += RawLength; 3090b57cec5SDimitry Andric BufPtr += RawLength; 3100b57cec5SDimitry Andric 3110b57cec5SDimitry Andric // The rest of the token is lexed normally. 
3120b57cec5SDimitry Andric } 3130b57cec5SDimitry Andric } 3140b57cec5SDimitry Andric 3150b57cec5SDimitry Andric while (BufPtr < BufEnd) { 3160b57cec5SDimitry Andric unsigned Size; 3170b57cec5SDimitry Andric Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 3180b57cec5SDimitry Andric BufPtr += Size; 3190b57cec5SDimitry Andric } 3200b57cec5SDimitry Andric 3210b57cec5SDimitry Andric assert(Length < Tok.getLength() && 3220b57cec5SDimitry Andric "NeedsCleaning flag set on token that didn't need cleaning!"); 3230b57cec5SDimitry Andric return Length; 3240b57cec5SDimitry Andric } 3250b57cec5SDimitry Andric 3260b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a 3270b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file 3280b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this 3290b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs 3300b57cec5SDimitry Andric /// UCNs, etc. 3310b57cec5SDimitry Andric StringRef Lexer::getSpelling(SourceLocation loc, 3320b57cec5SDimitry Andric SmallVectorImpl<char> &buffer, 3330b57cec5SDimitry Andric const SourceManager &SM, 3340b57cec5SDimitry Andric const LangOptions &options, 3350b57cec5SDimitry Andric bool *invalid) { 3360b57cec5SDimitry Andric // Break down the source location. 3370b57cec5SDimitry Andric std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 3380b57cec5SDimitry Andric 3390b57cec5SDimitry Andric // Try to the load the file buffer. 
3400b57cec5SDimitry Andric bool invalidTemp = false; 3410b57cec5SDimitry Andric StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 3420b57cec5SDimitry Andric if (invalidTemp) { 3430b57cec5SDimitry Andric if (invalid) *invalid = true; 3440b57cec5SDimitry Andric return {}; 3450b57cec5SDimitry Andric } 3460b57cec5SDimitry Andric 3470b57cec5SDimitry Andric const char *tokenBegin = file.data() + locInfo.second; 3480b57cec5SDimitry Andric 3490b57cec5SDimitry Andric // Lex from the start of the given location. 3500b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 3510b57cec5SDimitry Andric file.begin(), tokenBegin, file.end()); 3520b57cec5SDimitry Andric Token token; 3530b57cec5SDimitry Andric lexer.LexFromRawLexer(token); 3540b57cec5SDimitry Andric 3550b57cec5SDimitry Andric unsigned length = token.getLength(); 3560b57cec5SDimitry Andric 3570b57cec5SDimitry Andric // Common case: no need for cleaning. 3580b57cec5SDimitry Andric if (!token.needsCleaning()) 3590b57cec5SDimitry Andric return StringRef(tokenBegin, length); 3600b57cec5SDimitry Andric 3610b57cec5SDimitry Andric // Hard case, we need to relex the characters into the string. 3620b57cec5SDimitry Andric buffer.resize(length); 3630b57cec5SDimitry Andric buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); 3640b57cec5SDimitry Andric return StringRef(buffer.data(), buffer.size()); 3650b57cec5SDimitry Andric } 3660b57cec5SDimitry Andric 3670b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a 3680b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file 3690b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this 3700b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs 3710b57cec5SDimitry Andric /// UCNs, etc. 
3720b57cec5SDimitry Andric std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 3730b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) { 3740b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 3750b57cec5SDimitry Andric 3760b57cec5SDimitry Andric bool CharDataInvalid = false; 3770b57cec5SDimitry Andric const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 3780b57cec5SDimitry Andric &CharDataInvalid); 3790b57cec5SDimitry Andric if (Invalid) 3800b57cec5SDimitry Andric *Invalid = CharDataInvalid; 3810b57cec5SDimitry Andric if (CharDataInvalid) 3820b57cec5SDimitry Andric return {}; 3830b57cec5SDimitry Andric 3840b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly. 3850b57cec5SDimitry Andric if (!Tok.needsCleaning()) 3860b57cec5SDimitry Andric return std::string(TokStart, TokStart + Tok.getLength()); 3870b57cec5SDimitry Andric 3880b57cec5SDimitry Andric std::string Result; 3890b57cec5SDimitry Andric Result.resize(Tok.getLength()); 3900b57cec5SDimitry Andric Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); 3910b57cec5SDimitry Andric return Result; 3920b57cec5SDimitry Andric } 3930b57cec5SDimitry Andric 3940b57cec5SDimitry Andric /// getSpelling - This method is used to get the spelling of a token into a 3950b57cec5SDimitry Andric /// preallocated buffer, instead of as an std::string. The caller is required 3960b57cec5SDimitry Andric /// to allocate enough space for the token, which is guaranteed to be at least 3970b57cec5SDimitry Andric /// Tok.getLength() bytes long. The actual length of the token is returned. 
3980b57cec5SDimitry Andric /// 3990b57cec5SDimitry Andric /// Note that this method may do two possible things: it may either fill in 4000b57cec5SDimitry Andric /// the buffer specified with characters, or it may *change the input pointer* 4010b57cec5SDimitry Andric /// to point to a constant buffer with the data already in it (avoiding a 4020b57cec5SDimitry Andric /// copy). The caller is not allowed to modify the returned buffer pointer 4030b57cec5SDimitry Andric /// if an internal buffer is returned. 4040b57cec5SDimitry Andric unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 4050b57cec5SDimitry Andric const SourceManager &SourceMgr, 4060b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) { 4070b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 4080b57cec5SDimitry Andric 4090b57cec5SDimitry Andric const char *TokStart = nullptr; 4100b57cec5SDimitry Andric // NOTE: this has to be checked *before* testing for an IdentifierInfo. 4110b57cec5SDimitry Andric if (Tok.is(tok::raw_identifier)) 4120b57cec5SDimitry Andric TokStart = Tok.getRawIdentifier().data(); 4130b57cec5SDimitry Andric else if (!Tok.hasUCN()) { 4140b57cec5SDimitry Andric if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 4150b57cec5SDimitry Andric // Just return the string from the identifier table, which is very quick. 4160b57cec5SDimitry Andric Buffer = II->getNameStart(); 4170b57cec5SDimitry Andric return II->getLength(); 4180b57cec5SDimitry Andric } 4190b57cec5SDimitry Andric } 4200b57cec5SDimitry Andric 4210b57cec5SDimitry Andric // NOTE: this can be checked even after testing for an IdentifierInfo. 4220b57cec5SDimitry Andric if (Tok.isLiteral()) 4230b57cec5SDimitry Andric TokStart = Tok.getLiteralData(); 4240b57cec5SDimitry Andric 4250b57cec5SDimitry Andric if (!TokStart) { 4260b57cec5SDimitry Andric // Compute the start of the token in the input lexer buffer. 
4270b57cec5SDimitry Andric bool CharDataInvalid = false; 4280b57cec5SDimitry Andric TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 4290b57cec5SDimitry Andric if (Invalid) 4300b57cec5SDimitry Andric *Invalid = CharDataInvalid; 4310b57cec5SDimitry Andric if (CharDataInvalid) { 4320b57cec5SDimitry Andric Buffer = ""; 4330b57cec5SDimitry Andric return 0; 4340b57cec5SDimitry Andric } 4350b57cec5SDimitry Andric } 4360b57cec5SDimitry Andric 4370b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly. 4380b57cec5SDimitry Andric if (!Tok.needsCleaning()) { 4390b57cec5SDimitry Andric Buffer = TokStart; 4400b57cec5SDimitry Andric return Tok.getLength(); 4410b57cec5SDimitry Andric } 4420b57cec5SDimitry Andric 4430b57cec5SDimitry Andric // Otherwise, hard case, relex the characters into the string. 4440b57cec5SDimitry Andric return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); 4450b57cec5SDimitry Andric } 4460b57cec5SDimitry Andric 4470b57cec5SDimitry Andric /// MeasureTokenLength - Relex the token at the specified location and return 4480b57cec5SDimitry Andric /// its length in bytes in the input file. If the token needs cleaning (e.g. 4490b57cec5SDimitry Andric /// includes a trigraph or an escaped newline) then this count includes bytes 4500b57cec5SDimitry Andric /// that are part of that. 4510b57cec5SDimitry Andric unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 4520b57cec5SDimitry Andric const SourceManager &SM, 4530b57cec5SDimitry Andric const LangOptions &LangOpts) { 4540b57cec5SDimitry Andric Token TheTok; 4550b57cec5SDimitry Andric if (getRawToken(Loc, TheTok, SM, LangOpts)) 4560b57cec5SDimitry Andric return 0; 4570b57cec5SDimitry Andric return TheTok.getLength(); 4580b57cec5SDimitry Andric } 4590b57cec5SDimitry Andric 4600b57cec5SDimitry Andric /// Relex the token at the specified location. 4610b57cec5SDimitry Andric /// \returns true if there was a failure, false on success. 
4620b57cec5SDimitry Andric bool Lexer::getRawToken(SourceLocation Loc, Token &Result, 4630b57cec5SDimitry Andric const SourceManager &SM, 4640b57cec5SDimitry Andric const LangOptions &LangOpts, 4650b57cec5SDimitry Andric bool IgnoreWhiteSpace) { 4660b57cec5SDimitry Andric // TODO: this could be special cased for common tokens like identifiers, ')', 4670b57cec5SDimitry Andric // etc to make this faster, if it mattered. Just look at StrData[0] to handle 4680b57cec5SDimitry Andric // all obviously single-char tokens. This could use 4690b57cec5SDimitry Andric // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 4700b57cec5SDimitry Andric // something. 4710b57cec5SDimitry Andric 4720b57cec5SDimitry Andric // If this comes from a macro expansion, we really do want the macro name, not 4730b57cec5SDimitry Andric // the token this macro expanded to. 4740b57cec5SDimitry Andric Loc = SM.getExpansionLoc(Loc); 4750b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 4760b57cec5SDimitry Andric bool Invalid = false; 4770b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 4780b57cec5SDimitry Andric if (Invalid) 4790b57cec5SDimitry Andric return true; 4800b57cec5SDimitry Andric 4810b57cec5SDimitry Andric const char *StrData = Buffer.data()+LocInfo.second; 4820b57cec5SDimitry Andric 4830b57cec5SDimitry Andric if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) 4840b57cec5SDimitry Andric return true; 4850b57cec5SDimitry Andric 4860b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token. 
4870b57cec5SDimitry Andric Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 4880b57cec5SDimitry Andric Buffer.begin(), StrData, Buffer.end()); 4890b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 4900b57cec5SDimitry Andric TheLexer.LexFromRawLexer(Result); 4910b57cec5SDimitry Andric return false; 4920b57cec5SDimitry Andric } 4930b57cec5SDimitry Andric 4940b57cec5SDimitry Andric /// Returns the pointer that points to the beginning of line that contains 4950b57cec5SDimitry Andric /// the given offset, or null if the offset if invalid. 4960b57cec5SDimitry Andric static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) { 4970b57cec5SDimitry Andric const char *BufStart = Buffer.data(); 4980b57cec5SDimitry Andric if (Offset >= Buffer.size()) 4990b57cec5SDimitry Andric return nullptr; 5000b57cec5SDimitry Andric 5010b57cec5SDimitry Andric const char *LexStart = BufStart + Offset; 5020b57cec5SDimitry Andric for (; LexStart != BufStart; --LexStart) { 5030b57cec5SDimitry Andric if (isVerticalWhitespace(LexStart[0]) && 5040b57cec5SDimitry Andric !Lexer::isNewLineEscaped(BufStart, LexStart)) { 5050b57cec5SDimitry Andric // LexStart should point at first character of logical line. 
5060b57cec5SDimitry Andric ++LexStart; 5070b57cec5SDimitry Andric break; 5080b57cec5SDimitry Andric } 5090b57cec5SDimitry Andric } 5100b57cec5SDimitry Andric return LexStart; 5110b57cec5SDimitry Andric } 5120b57cec5SDimitry Andric 5130b57cec5SDimitry Andric static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 5140b57cec5SDimitry Andric const SourceManager &SM, 5150b57cec5SDimitry Andric const LangOptions &LangOpts) { 5160b57cec5SDimitry Andric assert(Loc.isFileID()); 5170b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 5180b57cec5SDimitry Andric if (LocInfo.first.isInvalid()) 5190b57cec5SDimitry Andric return Loc; 5200b57cec5SDimitry Andric 5210b57cec5SDimitry Andric bool Invalid = false; 5220b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 5230b57cec5SDimitry Andric if (Invalid) 5240b57cec5SDimitry Andric return Loc; 5250b57cec5SDimitry Andric 5260b57cec5SDimitry Andric // Back up from the current location until we hit the beginning of a line 5270b57cec5SDimitry Andric // (or the buffer). We'll relex from that point. 5280b57cec5SDimitry Andric const char *StrData = Buffer.data() + LocInfo.second; 5290b57cec5SDimitry Andric const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second); 5300b57cec5SDimitry Andric if (!LexStart || LexStart == StrData) 5310b57cec5SDimitry Andric return Loc; 5320b57cec5SDimitry Andric 5330b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token. 5340b57cec5SDimitry Andric SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 5350b57cec5SDimitry Andric Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart, 5360b57cec5SDimitry Andric Buffer.end()); 5370b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 5380b57cec5SDimitry Andric 5390b57cec5SDimitry Andric // Lex tokens until we find the token that contains the source location. 
5400b57cec5SDimitry Andric Token TheTok; 5410b57cec5SDimitry Andric do { 5420b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 5430b57cec5SDimitry Andric 5440b57cec5SDimitry Andric if (TheLexer.getBufferLocation() > StrData) { 5450b57cec5SDimitry Andric // Lexing this token has taken the lexer past the source location we're 5460b57cec5SDimitry Andric // looking for. If the current token encompasses our source location, 5470b57cec5SDimitry Andric // return the beginning of that token. 5480b57cec5SDimitry Andric if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 5490b57cec5SDimitry Andric return TheTok.getLocation(); 5500b57cec5SDimitry Andric 5510b57cec5SDimitry Andric // We ended up skipping over the source location entirely, which means 5520b57cec5SDimitry Andric // that it points into whitespace. We're done here. 5530b57cec5SDimitry Andric break; 5540b57cec5SDimitry Andric } 5550b57cec5SDimitry Andric } while (TheTok.getKind() != tok::eof); 5560b57cec5SDimitry Andric 5570b57cec5SDimitry Andric // We've passed our source location; just return the original source location. 
5580b57cec5SDimitry Andric return Loc; 5590b57cec5SDimitry Andric } 5600b57cec5SDimitry Andric 5610b57cec5SDimitry Andric SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 5620b57cec5SDimitry Andric const SourceManager &SM, 5630b57cec5SDimitry Andric const LangOptions &LangOpts) { 5640b57cec5SDimitry Andric if (Loc.isFileID()) 5650b57cec5SDimitry Andric return getBeginningOfFileToken(Loc, SM, LangOpts); 5660b57cec5SDimitry Andric 5670b57cec5SDimitry Andric if (!SM.isMacroArgExpansion(Loc)) 5680b57cec5SDimitry Andric return Loc; 5690b57cec5SDimitry Andric 5700b57cec5SDimitry Andric SourceLocation FileLoc = SM.getSpellingLoc(Loc); 5710b57cec5SDimitry Andric SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 5720b57cec5SDimitry Andric std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 5730b57cec5SDimitry Andric std::pair<FileID, unsigned> BeginFileLocInfo = 5740b57cec5SDimitry Andric SM.getDecomposedLoc(BeginFileLoc); 5750b57cec5SDimitry Andric assert(FileLocInfo.first == BeginFileLocInfo.first && 5760b57cec5SDimitry Andric FileLocInfo.second >= BeginFileLocInfo.second); 5770b57cec5SDimitry Andric return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); 5780b57cec5SDimitry Andric } 5790b57cec5SDimitry Andric 5800b57cec5SDimitry Andric namespace { 5810b57cec5SDimitry Andric 5820b57cec5SDimitry Andric enum PreambleDirectiveKind { 5830b57cec5SDimitry Andric PDK_Skipped, 5840b57cec5SDimitry Andric PDK_Unknown 5850b57cec5SDimitry Andric }; 5860b57cec5SDimitry Andric 5870b57cec5SDimitry Andric } // namespace 5880b57cec5SDimitry Andric 5890b57cec5SDimitry Andric PreambleBounds Lexer::ComputePreamble(StringRef Buffer, 5900b57cec5SDimitry Andric const LangOptions &LangOpts, 5910b57cec5SDimitry Andric unsigned MaxLines) { 5920b57cec5SDimitry Andric // Create a lexer starting at the beginning of the file. 
Note that we use a 5930b57cec5SDimitry Andric // "fake" file source location at offset 1 so that the lexer will track our 5940b57cec5SDimitry Andric // position within the file. 595fe6060f1SDimitry Andric const SourceLocation::UIntTy StartOffset = 1; 5960b57cec5SDimitry Andric SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset); 5970b57cec5SDimitry Andric Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(), 5980b57cec5SDimitry Andric Buffer.end()); 5990b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 6000b57cec5SDimitry Andric 6010b57cec5SDimitry Andric bool InPreprocessorDirective = false; 6020b57cec5SDimitry Andric Token TheTok; 6030b57cec5SDimitry Andric SourceLocation ActiveCommentLoc; 6040b57cec5SDimitry Andric 6050b57cec5SDimitry Andric unsigned MaxLineOffset = 0; 6060b57cec5SDimitry Andric if (MaxLines) { 6070b57cec5SDimitry Andric const char *CurPtr = Buffer.begin(); 6080b57cec5SDimitry Andric unsigned CurLine = 0; 6090b57cec5SDimitry Andric while (CurPtr != Buffer.end()) { 6100b57cec5SDimitry Andric char ch = *CurPtr++; 6110b57cec5SDimitry Andric if (ch == '\n') { 6120b57cec5SDimitry Andric ++CurLine; 6130b57cec5SDimitry Andric if (CurLine == MaxLines) 6140b57cec5SDimitry Andric break; 6150b57cec5SDimitry Andric } 6160b57cec5SDimitry Andric } 6170b57cec5SDimitry Andric if (CurPtr != Buffer.end()) 6180b57cec5SDimitry Andric MaxLineOffset = CurPtr - Buffer.begin(); 6190b57cec5SDimitry Andric } 6200b57cec5SDimitry Andric 6210b57cec5SDimitry Andric do { 6220b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 6230b57cec5SDimitry Andric 6240b57cec5SDimitry Andric if (InPreprocessorDirective) { 6250b57cec5SDimitry Andric // If we've hit the end of the file, we're done. 
6260b57cec5SDimitry Andric if (TheTok.getKind() == tok::eof) { 6270b57cec5SDimitry Andric break; 6280b57cec5SDimitry Andric } 6290b57cec5SDimitry Andric 6300b57cec5SDimitry Andric // If we haven't hit the end of the preprocessor directive, skip this 6310b57cec5SDimitry Andric // token. 6320b57cec5SDimitry Andric if (!TheTok.isAtStartOfLine()) 6330b57cec5SDimitry Andric continue; 6340b57cec5SDimitry Andric 6350b57cec5SDimitry Andric // We've passed the end of the preprocessor directive, and will look 6360b57cec5SDimitry Andric // at this token again below. 6370b57cec5SDimitry Andric InPreprocessorDirective = false; 6380b57cec5SDimitry Andric } 6390b57cec5SDimitry Andric 6400b57cec5SDimitry Andric // Keep track of the # of lines in the preamble. 6410b57cec5SDimitry Andric if (TheTok.isAtStartOfLine()) { 6420b57cec5SDimitry Andric unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 6430b57cec5SDimitry Andric 6440b57cec5SDimitry Andric // If we were asked to limit the number of lines in the preamble, 6450b57cec5SDimitry Andric // and we're about to exceed that limit, we're done. 6460b57cec5SDimitry Andric if (MaxLineOffset && TokOffset >= MaxLineOffset) 6470b57cec5SDimitry Andric break; 6480b57cec5SDimitry Andric } 6490b57cec5SDimitry Andric 6500b57cec5SDimitry Andric // Comments are okay; skip over them. 6510b57cec5SDimitry Andric if (TheTok.getKind() == tok::comment) { 6520b57cec5SDimitry Andric if (ActiveCommentLoc.isInvalid()) 6530b57cec5SDimitry Andric ActiveCommentLoc = TheTok.getLocation(); 6540b57cec5SDimitry Andric continue; 6550b57cec5SDimitry Andric } 6560b57cec5SDimitry Andric 6570b57cec5SDimitry Andric if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 6580b57cec5SDimitry Andric // This is the start of a preprocessor directive. 
6590b57cec5SDimitry Andric Token HashTok = TheTok; 6600b57cec5SDimitry Andric InPreprocessorDirective = true; 6610b57cec5SDimitry Andric ActiveCommentLoc = SourceLocation(); 6620b57cec5SDimitry Andric 6630b57cec5SDimitry Andric // Figure out which directive this is. Since we're lexing raw tokens, 6640b57cec5SDimitry Andric // we don't have an identifier table available. Instead, just look at 6650b57cec5SDimitry Andric // the raw identifier to recognize and categorize preprocessor directives. 6660b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 6670b57cec5SDimitry Andric if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 6680b57cec5SDimitry Andric StringRef Keyword = TheTok.getRawIdentifier(); 6690b57cec5SDimitry Andric PreambleDirectiveKind PDK 6700b57cec5SDimitry Andric = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 6710b57cec5SDimitry Andric .Case("include", PDK_Skipped) 6720b57cec5SDimitry Andric .Case("__include_macros", PDK_Skipped) 6730b57cec5SDimitry Andric .Case("define", PDK_Skipped) 6740b57cec5SDimitry Andric .Case("undef", PDK_Skipped) 6750b57cec5SDimitry Andric .Case("line", PDK_Skipped) 6760b57cec5SDimitry Andric .Case("error", PDK_Skipped) 6770b57cec5SDimitry Andric .Case("pragma", PDK_Skipped) 6780b57cec5SDimitry Andric .Case("import", PDK_Skipped) 6790b57cec5SDimitry Andric .Case("include_next", PDK_Skipped) 6800b57cec5SDimitry Andric .Case("warning", PDK_Skipped) 6810b57cec5SDimitry Andric .Case("ident", PDK_Skipped) 6820b57cec5SDimitry Andric .Case("sccs", PDK_Skipped) 6830b57cec5SDimitry Andric .Case("assert", PDK_Skipped) 6840b57cec5SDimitry Andric .Case("unassert", PDK_Skipped) 6850b57cec5SDimitry Andric .Case("if", PDK_Skipped) 6860b57cec5SDimitry Andric .Case("ifdef", PDK_Skipped) 6870b57cec5SDimitry Andric .Case("ifndef", PDK_Skipped) 6880b57cec5SDimitry Andric .Case("elif", PDK_Skipped) 689fe6060f1SDimitry Andric .Case("elifdef", PDK_Skipped) 690fe6060f1SDimitry Andric .Case("elifndef", PDK_Skipped) 
6910b57cec5SDimitry Andric .Case("else", PDK_Skipped) 6920b57cec5SDimitry Andric .Case("endif", PDK_Skipped) 6930b57cec5SDimitry Andric .Default(PDK_Unknown); 6940b57cec5SDimitry Andric 6950b57cec5SDimitry Andric switch (PDK) { 6960b57cec5SDimitry Andric case PDK_Skipped: 6970b57cec5SDimitry Andric continue; 6980b57cec5SDimitry Andric 6990b57cec5SDimitry Andric case PDK_Unknown: 7000b57cec5SDimitry Andric // We don't know what this directive is; stop at the '#'. 7010b57cec5SDimitry Andric break; 7020b57cec5SDimitry Andric } 7030b57cec5SDimitry Andric } 7040b57cec5SDimitry Andric 7050b57cec5SDimitry Andric // We only end up here if we didn't recognize the preprocessor 7060b57cec5SDimitry Andric // directive or it was one that can't occur in the preamble at this 7070b57cec5SDimitry Andric // point. Roll back the current token to the location of the '#'. 7080b57cec5SDimitry Andric TheTok = HashTok; 7090b57cec5SDimitry Andric } 7100b57cec5SDimitry Andric 7110b57cec5SDimitry Andric // We hit a token that we don't recognize as being in the 7120b57cec5SDimitry Andric // "preprocessing only" part of the file, so we're no longer in 7130b57cec5SDimitry Andric // the preamble. 7140b57cec5SDimitry Andric break; 7150b57cec5SDimitry Andric } while (true); 7160b57cec5SDimitry Andric 7170b57cec5SDimitry Andric SourceLocation End; 7180b57cec5SDimitry Andric if (ActiveCommentLoc.isValid()) 7190b57cec5SDimitry Andric End = ActiveCommentLoc; // don't truncate a decl comment. 
7200b57cec5SDimitry Andric else 7210b57cec5SDimitry Andric End = TheTok.getLocation(); 7220b57cec5SDimitry Andric 7230b57cec5SDimitry Andric return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(), 7240b57cec5SDimitry Andric TheTok.isAtStartOfLine()); 7250b57cec5SDimitry Andric } 7260b57cec5SDimitry Andric 7270b57cec5SDimitry Andric unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, 7280b57cec5SDimitry Andric const SourceManager &SM, 7290b57cec5SDimitry Andric const LangOptions &LangOpts) { 7300b57cec5SDimitry Andric // Figure out how many physical characters away the specified expansion 7310b57cec5SDimitry Andric // character is. This needs to take into consideration newlines and 7320b57cec5SDimitry Andric // trigraphs. 7330b57cec5SDimitry Andric bool Invalid = false; 7340b57cec5SDimitry Andric const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 7350b57cec5SDimitry Andric 7360b57cec5SDimitry Andric // If they request the first char of the token, we're trivially done. 7370b57cec5SDimitry Andric if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 7380b57cec5SDimitry Andric return 0; 7390b57cec5SDimitry Andric 7400b57cec5SDimitry Andric unsigned PhysOffset = 0; 7410b57cec5SDimitry Andric 7420b57cec5SDimitry Andric // The usual case is that tokens don't contain anything interesting. Skip 7430b57cec5SDimitry Andric // over the uninteresting characters. If a token only consists of simple 7440b57cec5SDimitry Andric // chars, this method is extremely fast. 
7450b57cec5SDimitry Andric while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 7460b57cec5SDimitry Andric if (CharNo == 0) 7470b57cec5SDimitry Andric return PhysOffset; 7480b57cec5SDimitry Andric ++TokPtr; 7490b57cec5SDimitry Andric --CharNo; 7500b57cec5SDimitry Andric ++PhysOffset; 7510b57cec5SDimitry Andric } 7520b57cec5SDimitry Andric 7530b57cec5SDimitry Andric // If we have a character that may be a trigraph or escaped newline, use a 7540b57cec5SDimitry Andric // lexer to parse it correctly. 7550b57cec5SDimitry Andric for (; CharNo; --CharNo) { 7560b57cec5SDimitry Andric unsigned Size; 7570b57cec5SDimitry Andric Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts); 7580b57cec5SDimitry Andric TokPtr += Size; 7590b57cec5SDimitry Andric PhysOffset += Size; 7600b57cec5SDimitry Andric } 7610b57cec5SDimitry Andric 7620b57cec5SDimitry Andric // Final detail: if we end up on an escaped newline, we want to return the 7630b57cec5SDimitry Andric // location of the actual byte of the token. For example foo\<newline>bar 7640b57cec5SDimitry Andric // advanced by 3 should return the location of b, not of \\. One compounding 7650b57cec5SDimitry Andric // detail of this is that the escape may be made by a trigraph. 7660b57cec5SDimitry Andric if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 7670b57cec5SDimitry Andric PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 7680b57cec5SDimitry Andric 7690b57cec5SDimitry Andric return PhysOffset; 7700b57cec5SDimitry Andric } 7710b57cec5SDimitry Andric 7720b57cec5SDimitry Andric /// Computes the source location just past the end of the 7730b57cec5SDimitry Andric /// token at this source location. 
7740b57cec5SDimitry Andric /// 7750b57cec5SDimitry Andric /// This routine can be used to produce a source location that 7760b57cec5SDimitry Andric /// points just past the end of the token referenced by \p Loc, and 7770b57cec5SDimitry Andric /// is generally used when a diagnostic needs to point just after a 7780b57cec5SDimitry Andric /// token where it expected something different that it received. If 7790b57cec5SDimitry Andric /// the returned source location would not be meaningful (e.g., if 7800b57cec5SDimitry Andric /// it points into a macro), this routine returns an invalid 7810b57cec5SDimitry Andric /// source location. 7820b57cec5SDimitry Andric /// 7830b57cec5SDimitry Andric /// \param Offset an offset from the end of the token, where the source 7840b57cec5SDimitry Andric /// location should refer to. The default offset (0) produces a source 7850b57cec5SDimitry Andric /// location pointing just past the end of the token; an offset of 1 produces 7860b57cec5SDimitry Andric /// a source location pointing to the last character in the token, etc. 7870b57cec5SDimitry Andric SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 7880b57cec5SDimitry Andric const SourceManager &SM, 7890b57cec5SDimitry Andric const LangOptions &LangOpts) { 7900b57cec5SDimitry Andric if (Loc.isInvalid()) 7910b57cec5SDimitry Andric return {}; 7920b57cec5SDimitry Andric 7930b57cec5SDimitry Andric if (Loc.isMacroID()) { 7940b57cec5SDimitry Andric if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 7950b57cec5SDimitry Andric return {}; // Points inside the macro expansion. 
7960b57cec5SDimitry Andric } 7970b57cec5SDimitry Andric 7980b57cec5SDimitry Andric unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 7990b57cec5SDimitry Andric if (Len > Offset) 8000b57cec5SDimitry Andric Len = Len - Offset; 8010b57cec5SDimitry Andric else 8020b57cec5SDimitry Andric return Loc; 8030b57cec5SDimitry Andric 8040b57cec5SDimitry Andric return Loc.getLocWithOffset(Len); 8050b57cec5SDimitry Andric } 8060b57cec5SDimitry Andric 8070b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the first 8080b57cec5SDimitry Andric /// token of the macro expansion. 8090b57cec5SDimitry Andric bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 8100b57cec5SDimitry Andric const SourceManager &SM, 8110b57cec5SDimitry Andric const LangOptions &LangOpts, 8120b57cec5SDimitry Andric SourceLocation *MacroBegin) { 8130b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 8140b57cec5SDimitry Andric 8150b57cec5SDimitry Andric SourceLocation expansionLoc; 8160b57cec5SDimitry Andric if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc)) 8170b57cec5SDimitry Andric return false; 8180b57cec5SDimitry Andric 8190b57cec5SDimitry Andric if (expansionLoc.isFileID()) { 8200b57cec5SDimitry Andric // No other macro expansions, this is the first. 8210b57cec5SDimitry Andric if (MacroBegin) 8220b57cec5SDimitry Andric *MacroBegin = expansionLoc; 8230b57cec5SDimitry Andric return true; 8240b57cec5SDimitry Andric } 8250b57cec5SDimitry Andric 8260b57cec5SDimitry Andric return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); 8270b57cec5SDimitry Andric } 8280b57cec5SDimitry Andric 8290b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the last 8300b57cec5SDimitry Andric /// token of the macro expansion. 
8310b57cec5SDimitry Andric bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 8320b57cec5SDimitry Andric const SourceManager &SM, 8330b57cec5SDimitry Andric const LangOptions &LangOpts, 8340b57cec5SDimitry Andric SourceLocation *MacroEnd) { 8350b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 8360b57cec5SDimitry Andric 8370b57cec5SDimitry Andric SourceLocation spellLoc = SM.getSpellingLoc(loc); 8380b57cec5SDimitry Andric unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 8390b57cec5SDimitry Andric if (tokLen == 0) 8400b57cec5SDimitry Andric return false; 8410b57cec5SDimitry Andric 8420b57cec5SDimitry Andric SourceLocation afterLoc = loc.getLocWithOffset(tokLen); 8430b57cec5SDimitry Andric SourceLocation expansionLoc; 8440b57cec5SDimitry Andric if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) 8450b57cec5SDimitry Andric return false; 8460b57cec5SDimitry Andric 8470b57cec5SDimitry Andric if (expansionLoc.isFileID()) { 8480b57cec5SDimitry Andric // No other macro expansions. 
8490b57cec5SDimitry Andric if (MacroEnd) 8500b57cec5SDimitry Andric *MacroEnd = expansionLoc; 8510b57cec5SDimitry Andric return true; 8520b57cec5SDimitry Andric } 8530b57cec5SDimitry Andric 8540b57cec5SDimitry Andric return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 8550b57cec5SDimitry Andric } 8560b57cec5SDimitry Andric 8570b57cec5SDimitry Andric static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 8580b57cec5SDimitry Andric const SourceManager &SM, 8590b57cec5SDimitry Andric const LangOptions &LangOpts) { 8600b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin(); 8610b57cec5SDimitry Andric SourceLocation End = Range.getEnd(); 8620b57cec5SDimitry Andric assert(Begin.isFileID() && End.isFileID()); 8630b57cec5SDimitry Andric if (Range.isTokenRange()) { 8640b57cec5SDimitry Andric End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 8650b57cec5SDimitry Andric if (End.isInvalid()) 8660b57cec5SDimitry Andric return {}; 8670b57cec5SDimitry Andric } 8680b57cec5SDimitry Andric 8690b57cec5SDimitry Andric // Break down the source locations. 8700b57cec5SDimitry Andric FileID FID; 8710b57cec5SDimitry Andric unsigned BeginOffs; 8720b57cec5SDimitry Andric std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 8730b57cec5SDimitry Andric if (FID.isInvalid()) 8740b57cec5SDimitry Andric return {}; 8750b57cec5SDimitry Andric 8760b57cec5SDimitry Andric unsigned EndOffs; 8770b57cec5SDimitry Andric if (!SM.isInFileID(End, FID, &EndOffs) || 8780b57cec5SDimitry Andric BeginOffs > EndOffs) 8790b57cec5SDimitry Andric return {}; 8800b57cec5SDimitry Andric 8810b57cec5SDimitry Andric return CharSourceRange::getCharRange(Begin, End); 8820b57cec5SDimitry Andric } 8830b57cec5SDimitry Andric 884fe6060f1SDimitry Andric // Assumes that `Loc` is in an expansion. 
885fe6060f1SDimitry Andric static bool isInExpansionTokenRange(const SourceLocation Loc, 886fe6060f1SDimitry Andric const SourceManager &SM) { 887fe6060f1SDimitry Andric return SM.getSLocEntry(SM.getFileID(Loc)) 888fe6060f1SDimitry Andric .getExpansion() 889fe6060f1SDimitry Andric .isExpansionTokenRange(); 890fe6060f1SDimitry Andric } 891fe6060f1SDimitry Andric 8920b57cec5SDimitry Andric CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, 8930b57cec5SDimitry Andric const SourceManager &SM, 8940b57cec5SDimitry Andric const LangOptions &LangOpts) { 8950b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin(); 8960b57cec5SDimitry Andric SourceLocation End = Range.getEnd(); 8970b57cec5SDimitry Andric if (Begin.isInvalid() || End.isInvalid()) 8980b57cec5SDimitry Andric return {}; 8990b57cec5SDimitry Andric 9000b57cec5SDimitry Andric if (Begin.isFileID() && End.isFileID()) 9010b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9020b57cec5SDimitry Andric 9030b57cec5SDimitry Andric if (Begin.isMacroID() && End.isFileID()) { 9040b57cec5SDimitry Andric if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) 9050b57cec5SDimitry Andric return {}; 9060b57cec5SDimitry Andric Range.setBegin(Begin); 9070b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9080b57cec5SDimitry Andric } 9090b57cec5SDimitry Andric 9100b57cec5SDimitry Andric if (Begin.isFileID() && End.isMacroID()) { 911fe6060f1SDimitry Andric if (Range.isTokenRange()) { 912fe6060f1SDimitry Andric if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End)) 913fe6060f1SDimitry Andric return {}; 914fe6060f1SDimitry Andric // Use the *original* end, not the expanded one in `End`. 
915fe6060f1SDimitry Andric Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM)); 916fe6060f1SDimitry Andric } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End)) 9170b57cec5SDimitry Andric return {}; 9180b57cec5SDimitry Andric Range.setEnd(End); 9190b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9200b57cec5SDimitry Andric } 9210b57cec5SDimitry Andric 9220b57cec5SDimitry Andric assert(Begin.isMacroID() && End.isMacroID()); 9230b57cec5SDimitry Andric SourceLocation MacroBegin, MacroEnd; 9240b57cec5SDimitry Andric if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) && 9250b57cec5SDimitry Andric ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts, 9260b57cec5SDimitry Andric &MacroEnd)) || 9270b57cec5SDimitry Andric (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts, 9280b57cec5SDimitry Andric &MacroEnd)))) { 9290b57cec5SDimitry Andric Range.setBegin(MacroBegin); 9300b57cec5SDimitry Andric Range.setEnd(MacroEnd); 931fe6060f1SDimitry Andric // Use the *original* `End`, not the expanded one in `MacroEnd`. 
932fe6060f1SDimitry Andric if (Range.isTokenRange()) 933fe6060f1SDimitry Andric Range.setTokenRange(isInExpansionTokenRange(End, SM)); 9340b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9350b57cec5SDimitry Andric } 9360b57cec5SDimitry Andric 9370b57cec5SDimitry Andric bool Invalid = false; 9380b57cec5SDimitry Andric const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin), 9390b57cec5SDimitry Andric &Invalid); 9400b57cec5SDimitry Andric if (Invalid) 9410b57cec5SDimitry Andric return {}; 9420b57cec5SDimitry Andric 9430b57cec5SDimitry Andric if (BeginEntry.getExpansion().isMacroArgExpansion()) { 9440b57cec5SDimitry Andric const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End), 9450b57cec5SDimitry Andric &Invalid); 9460b57cec5SDimitry Andric if (Invalid) 9470b57cec5SDimitry Andric return {}; 9480b57cec5SDimitry Andric 9490b57cec5SDimitry Andric if (EndEntry.getExpansion().isMacroArgExpansion() && 9500b57cec5SDimitry Andric BeginEntry.getExpansion().getExpansionLocStart() == 9510b57cec5SDimitry Andric EndEntry.getExpansion().getExpansionLocStart()) { 9520b57cec5SDimitry Andric Range.setBegin(SM.getImmediateSpellingLoc(Begin)); 9530b57cec5SDimitry Andric Range.setEnd(SM.getImmediateSpellingLoc(End)); 9540b57cec5SDimitry Andric return makeFileCharRange(Range, SM, LangOpts); 9550b57cec5SDimitry Andric } 9560b57cec5SDimitry Andric } 9570b57cec5SDimitry Andric 9580b57cec5SDimitry Andric return {}; 9590b57cec5SDimitry Andric } 9600b57cec5SDimitry Andric 9610b57cec5SDimitry Andric StringRef Lexer::getSourceText(CharSourceRange Range, 9620b57cec5SDimitry Andric const SourceManager &SM, 9630b57cec5SDimitry Andric const LangOptions &LangOpts, 9640b57cec5SDimitry Andric bool *Invalid) { 9650b57cec5SDimitry Andric Range = makeFileCharRange(Range, SM, LangOpts); 9660b57cec5SDimitry Andric if (Range.isInvalid()) { 9670b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9680b57cec5SDimitry Andric return {}; 
9690b57cec5SDimitry Andric } 9700b57cec5SDimitry Andric 9710b57cec5SDimitry Andric // Break down the source location. 9720b57cec5SDimitry Andric std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin()); 9730b57cec5SDimitry Andric if (beginInfo.first.isInvalid()) { 9740b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9750b57cec5SDimitry Andric return {}; 9760b57cec5SDimitry Andric } 9770b57cec5SDimitry Andric 9780b57cec5SDimitry Andric unsigned EndOffs; 9790b57cec5SDimitry Andric if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) || 9800b57cec5SDimitry Andric beginInfo.second > EndOffs) { 9810b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9820b57cec5SDimitry Andric return {}; 9830b57cec5SDimitry Andric } 9840b57cec5SDimitry Andric 9850b57cec5SDimitry Andric // Try to the load the file buffer. 9860b57cec5SDimitry Andric bool invalidTemp = false; 9870b57cec5SDimitry Andric StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp); 9880b57cec5SDimitry Andric if (invalidTemp) { 9890b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9900b57cec5SDimitry Andric return {}; 9910b57cec5SDimitry Andric } 9920b57cec5SDimitry Andric 9930b57cec5SDimitry Andric if (Invalid) *Invalid = false; 9940b57cec5SDimitry Andric return file.substr(beginInfo.second, EndOffs - beginInfo.second); 9950b57cec5SDimitry Andric } 9960b57cec5SDimitry Andric 9970b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroName(SourceLocation Loc, 9980b57cec5SDimitry Andric const SourceManager &SM, 9990b57cec5SDimitry Andric const LangOptions &LangOpts) { 10000b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 10010b57cec5SDimitry Andric 10020b57cec5SDimitry Andric // Find the location of the immediate macro expansion. 
10030b57cec5SDimitry Andric while (true) { 10040b57cec5SDimitry Andric FileID FID = SM.getFileID(Loc); 10050b57cec5SDimitry Andric const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); 10060b57cec5SDimitry Andric const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); 10070b57cec5SDimitry Andric Loc = Expansion.getExpansionLocStart(); 10080b57cec5SDimitry Andric if (!Expansion.isMacroArgExpansion()) 10090b57cec5SDimitry Andric break; 10100b57cec5SDimitry Andric 10110b57cec5SDimitry Andric // For macro arguments we need to check that the argument did not come 10120b57cec5SDimitry Andric // from an inner macro, e.g: "MAC1( MAC2(foo) )" 10130b57cec5SDimitry Andric 10140b57cec5SDimitry Andric // Loc points to the argument id of the macro definition, move to the 10150b57cec5SDimitry Andric // macro expansion. 10160b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 10170b57cec5SDimitry Andric SourceLocation SpellLoc = Expansion.getSpellingLoc(); 10180b57cec5SDimitry Andric if (SpellLoc.isFileID()) 10190b57cec5SDimitry Andric break; // No inner macro. 10200b57cec5SDimitry Andric 10210b57cec5SDimitry Andric // If spelling location resides in the same FileID as macro expansion 10220b57cec5SDimitry Andric // location, it means there is no inner macro. 10230b57cec5SDimitry Andric FileID MacroFID = SM.getFileID(Loc); 10240b57cec5SDimitry Andric if (SM.isInFileID(SpellLoc, MacroFID)) 10250b57cec5SDimitry Andric break; 10260b57cec5SDimitry Andric 10270b57cec5SDimitry Andric // Argument came from inner macro. 10280b57cec5SDimitry Andric Loc = SpellLoc; 10290b57cec5SDimitry Andric } 10300b57cec5SDimitry Andric 10310b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion 10320b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin 10330b57cec5SDimitry Andric // expanding this macro. 
10340b57cec5SDimitry Andric Loc = SM.getSpellingLoc(Loc); 10350b57cec5SDimitry Andric 10360b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the 10370b57cec5SDimitry Andric // name so that we can render it into the expansion note. 10380b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 10390b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 10400b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 10410b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 10420b57cec5SDimitry Andric } 10430b57cec5SDimitry Andric 10440b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroNameForDiagnostics( 10450b57cec5SDimitry Andric SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { 10460b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 10470b57cec5SDimitry Andric // Walk past macro argument expansions. 10480b57cec5SDimitry Andric while (SM.isMacroArgExpansion(Loc)) 10490b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 10500b57cec5SDimitry Andric 10510b57cec5SDimitry Andric // If the macro's spelling has no FileID, then it's actually a token paste 10520b57cec5SDimitry Andric // or stringization (or similar) and not a macro at all. 10530b57cec5SDimitry Andric if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc)))) 10540b57cec5SDimitry Andric return {}; 10550b57cec5SDimitry Andric 10560b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion 10570b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin 10580b57cec5SDimitry Andric // expanding this macro. 
10590b57cec5SDimitry Andric Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin()); 10600b57cec5SDimitry Andric 10610b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the 10620b57cec5SDimitry Andric // name so that we can render it into the expansion note. 10630b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 10640b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 10650b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 10660b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 10670b57cec5SDimitry Andric } 10680b57cec5SDimitry Andric 1069349cc55cSDimitry Andric bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) { 1070349cc55cSDimitry Andric return isAsciiIdentifierContinue(c, LangOpts.DollarIdents); 10710b57cec5SDimitry Andric } 10720b57cec5SDimitry Andric 10730b57cec5SDimitry Andric bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { 10740b57cec5SDimitry Andric assert(isVerticalWhitespace(Str[0])); 10750b57cec5SDimitry Andric if (Str - 1 < BufferStart) 10760b57cec5SDimitry Andric return false; 10770b57cec5SDimitry Andric 10780b57cec5SDimitry Andric if ((Str[0] == '\n' && Str[-1] == '\r') || 10790b57cec5SDimitry Andric (Str[0] == '\r' && Str[-1] == '\n')) { 10800b57cec5SDimitry Andric if (Str - 2 < BufferStart) 10810b57cec5SDimitry Andric return false; 10820b57cec5SDimitry Andric --Str; 10830b57cec5SDimitry Andric } 10840b57cec5SDimitry Andric --Str; 10850b57cec5SDimitry Andric 10860b57cec5SDimitry Andric // Rewind to first non-space character: 10870b57cec5SDimitry Andric while (Str > BufferStart && isHorizontalWhitespace(*Str)) 10880b57cec5SDimitry Andric --Str; 10890b57cec5SDimitry Andric 10900b57cec5SDimitry Andric return *Str == '\\'; 10910b57cec5SDimitry Andric } 10920b57cec5SDimitry 
Andric 10930b57cec5SDimitry Andric StringRef Lexer::getIndentationForLine(SourceLocation Loc, 10940b57cec5SDimitry Andric const SourceManager &SM) { 10950b57cec5SDimitry Andric if (Loc.isInvalid() || Loc.isMacroID()) 10960b57cec5SDimitry Andric return {}; 10970b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 10980b57cec5SDimitry Andric if (LocInfo.first.isInvalid()) 10990b57cec5SDimitry Andric return {}; 11000b57cec5SDimitry Andric bool Invalid = false; 11010b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 11020b57cec5SDimitry Andric if (Invalid) 11030b57cec5SDimitry Andric return {}; 11040b57cec5SDimitry Andric const char *Line = findBeginningOfLine(Buffer, LocInfo.second); 11050b57cec5SDimitry Andric if (!Line) 11060b57cec5SDimitry Andric return {}; 11070b57cec5SDimitry Andric StringRef Rest = Buffer.substr(Line - Buffer.data()); 11080b57cec5SDimitry Andric size_t NumWhitespaceChars = Rest.find_first_not_of(" \t"); 11090b57cec5SDimitry Andric return NumWhitespaceChars == StringRef::npos 11100b57cec5SDimitry Andric ? "" 11110b57cec5SDimitry Andric : Rest.take_front(NumWhitespaceChars); 11120b57cec5SDimitry Andric } 11130b57cec5SDimitry Andric 11140b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11150b57cec5SDimitry Andric // Diagnostics forwarding code. 11160b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11170b57cec5SDimitry Andric 11180b57cec5SDimitry Andric /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 11190b57cec5SDimitry Andric /// lexer buffer was all expanded at a single point, perform the mapping. 11200b57cec5SDimitry Andric /// This is currently only used for _Pragma implementation, so it is the slow 11210b57cec5SDimitry Andric /// path of the hot getSourceLocation method. Do not allow it to be inlined. 
11220b57cec5SDimitry Andric static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 11230b57cec5SDimitry Andric Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 11240b57cec5SDimitry Andric static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 11250b57cec5SDimitry Andric SourceLocation FileLoc, 11260b57cec5SDimitry Andric unsigned CharNo, unsigned TokLen) { 11270b57cec5SDimitry Andric assert(FileLoc.isMacroID() && "Must be a macro expansion"); 11280b57cec5SDimitry Andric 11290b57cec5SDimitry Andric // Otherwise, we're lexing "mapped tokens". This is used for things like 11300b57cec5SDimitry Andric // _Pragma handling. Combine the expansion location of FileLoc with the 11310b57cec5SDimitry Andric // spelling location. 11320b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager(); 11330b57cec5SDimitry Andric 11340b57cec5SDimitry Andric // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 11350b57cec5SDimitry Andric // characters come from spelling(FileLoc)+Offset. 11360b57cec5SDimitry Andric SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 11370b57cec5SDimitry Andric SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 11380b57cec5SDimitry Andric 11390b57cec5SDimitry Andric // Figure out the expansion loc range, which is the range covered by the 11400b57cec5SDimitry Andric // original _Pragma(...) sequence. 11410b57cec5SDimitry Andric CharSourceRange II = SM.getImmediateExpansionRange(FileLoc); 11420b57cec5SDimitry Andric 11430b57cec5SDimitry Andric return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen); 11440b57cec5SDimitry Andric } 11450b57cec5SDimitry Andric 11460b57cec5SDimitry Andric /// getSourceLocation - Return a source location identifier for the specified 11470b57cec5SDimitry Andric /// offset in the current file. 
11480b57cec5SDimitry Andric SourceLocation Lexer::getSourceLocation(const char *Loc, 11490b57cec5SDimitry Andric unsigned TokLen) const { 11500b57cec5SDimitry Andric assert(Loc >= BufferStart && Loc <= BufferEnd && 11510b57cec5SDimitry Andric "Location out of range for this buffer!"); 11520b57cec5SDimitry Andric 11530b57cec5SDimitry Andric // In the normal case, we're just lexing from a simple file buffer, return 11540b57cec5SDimitry Andric // the file id from FileLoc with the offset specified. 11550b57cec5SDimitry Andric unsigned CharNo = Loc-BufferStart; 11560b57cec5SDimitry Andric if (FileLoc.isFileID()) 11570b57cec5SDimitry Andric return FileLoc.getLocWithOffset(CharNo); 11580b57cec5SDimitry Andric 11590b57cec5SDimitry Andric // Otherwise, this is the _Pragma lexer case, which pretends that all of the 11600b57cec5SDimitry Andric // tokens are lexed from where the _Pragma was defined. 11610b57cec5SDimitry Andric assert(PP && "This doesn't work on raw lexers"); 11620b57cec5SDimitry Andric return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 11630b57cec5SDimitry Andric } 11640b57cec5SDimitry Andric 11650b57cec5SDimitry Andric /// Diag - Forwarding function for diagnostics. This translate a source 11660b57cec5SDimitry Andric /// position in the current buffer into a SourceLocation object for rendering. 11670b57cec5SDimitry Andric DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 11680b57cec5SDimitry Andric return PP->Diag(getSourceLocation(Loc), DiagID); 11690b57cec5SDimitry Andric } 11700b57cec5SDimitry Andric 11710b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11720b57cec5SDimitry Andric // Trigraph and Escaped Newline Handling Code. 11730b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11740b57cec5SDimitry Andric 11750b57cec5SDimitry Andric /// GetTrigraphCharForLetter - Given a character that occurs after a ?? 
/// pair, return the decoded trigraph letter it corresponds to, or '\0' if
/// nothing.
///
/// \p Letter is the third character of a candidate "??x" sequence.  The nine
/// recognised letters map to '#', ']', '[', '|', '^', '}', '\\', '{' and '~';
/// every other character (including '\0') yields 0.
static char GetTrigraphCharForLetter(char Letter) {
  static const struct {
    char Suffix;
    char Decoded;
  } Trigraphs[] = {
      {'=', '#'},  {')', ']'},  {'(', '['}, {'!', '|'}, {'\'', '^'},
      {'>', '}'},  {'/', '\\'}, {'<', '{'}, {'-', '~'},
  };

  for (const auto &Entry : Trigraphs)
    if (Entry.Suffix == Letter)
      return Entry.Decoded;
  return 0;
}

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
/// return the result character.  Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.
11960b57cec5SDimitry Andric static char DecodeTrigraphChar(const char *CP, Lexer *L) { 11970b57cec5SDimitry Andric char Res = GetTrigraphCharForLetter(*CP); 11980b57cec5SDimitry Andric if (!Res || !L) return Res; 11990b57cec5SDimitry Andric 12000b57cec5SDimitry Andric if (!L->getLangOpts().Trigraphs) { 12010b57cec5SDimitry Andric if (!L->isLexingRawMode()) 12020b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_ignored); 12030b57cec5SDimitry Andric return 0; 12040b57cec5SDimitry Andric } 12050b57cec5SDimitry Andric 12060b57cec5SDimitry Andric if (!L->isLexingRawMode()) 12070b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 12080b57cec5SDimitry Andric return Res; 12090b57cec5SDimitry Andric } 12100b57cec5SDimitry Andric 12110b57cec5SDimitry Andric /// getEscapedNewLineSize - Return the size of the specified escaped newline, 12120b57cec5SDimitry Andric /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 12130b57cec5SDimitry Andric /// trigraph equivalent on entry to this function. 12140b57cec5SDimitry Andric unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 12150b57cec5SDimitry Andric unsigned Size = 0; 12160b57cec5SDimitry Andric while (isWhitespace(Ptr[Size])) { 12170b57cec5SDimitry Andric ++Size; 12180b57cec5SDimitry Andric 12190b57cec5SDimitry Andric if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 12200b57cec5SDimitry Andric continue; 12210b57cec5SDimitry Andric 12220b57cec5SDimitry Andric // If this is a \r\n or \n\r, skip the other half. 12230b57cec5SDimitry Andric if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 12240b57cec5SDimitry Andric Ptr[Size-1] != Ptr[Size]) 12250b57cec5SDimitry Andric ++Size; 12260b57cec5SDimitry Andric 12270b57cec5SDimitry Andric return Size; 12280b57cec5SDimitry Andric } 12290b57cec5SDimitry Andric 12300b57cec5SDimitry Andric // Not an escaped newline, must be a \t or something else. 
12310b57cec5SDimitry Andric return 0; 12320b57cec5SDimitry Andric } 12330b57cec5SDimitry Andric 12340b57cec5SDimitry Andric /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 12350b57cec5SDimitry Andric /// them), skip over them and return the first non-escaped-newline found, 12360b57cec5SDimitry Andric /// otherwise return P. 12370b57cec5SDimitry Andric const char *Lexer::SkipEscapedNewLines(const char *P) { 12380b57cec5SDimitry Andric while (true) { 12390b57cec5SDimitry Andric const char *AfterEscape; 12400b57cec5SDimitry Andric if (*P == '\\') { 12410b57cec5SDimitry Andric AfterEscape = P+1; 12420b57cec5SDimitry Andric } else if (*P == '?') { 12430b57cec5SDimitry Andric // If not a trigraph for escape, bail out. 12440b57cec5SDimitry Andric if (P[1] != '?' || P[2] != '/') 12450b57cec5SDimitry Andric return P; 12460b57cec5SDimitry Andric // FIXME: Take LangOpts into account; the language might not 12470b57cec5SDimitry Andric // support trigraphs. 12480b57cec5SDimitry Andric AfterEscape = P+3; 12490b57cec5SDimitry Andric } else { 12500b57cec5SDimitry Andric return P; 12510b57cec5SDimitry Andric } 12520b57cec5SDimitry Andric 12530b57cec5SDimitry Andric unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 12540b57cec5SDimitry Andric if (NewLineSize == 0) return P; 12550b57cec5SDimitry Andric P = AfterEscape+NewLineSize; 12560b57cec5SDimitry Andric } 12570b57cec5SDimitry Andric } 12580b57cec5SDimitry Andric 12590b57cec5SDimitry Andric Optional<Token> Lexer::findNextToken(SourceLocation Loc, 12600b57cec5SDimitry Andric const SourceManager &SM, 12610b57cec5SDimitry Andric const LangOptions &LangOpts) { 12620b57cec5SDimitry Andric if (Loc.isMacroID()) { 12630b57cec5SDimitry Andric if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 12640b57cec5SDimitry Andric return None; 12650b57cec5SDimitry Andric } 12660b57cec5SDimitry Andric Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); 12670b57cec5SDimitry Andric 
12680b57cec5SDimitry Andric // Break down the source location. 12690b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 12700b57cec5SDimitry Andric 12710b57cec5SDimitry Andric // Try to load the file buffer. 12720b57cec5SDimitry Andric bool InvalidTemp = false; 12730b57cec5SDimitry Andric StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); 12740b57cec5SDimitry Andric if (InvalidTemp) 12750b57cec5SDimitry Andric return None; 12760b57cec5SDimitry Andric 12770b57cec5SDimitry Andric const char *TokenBegin = File.data() + LocInfo.second; 12780b57cec5SDimitry Andric 12790b57cec5SDimitry Andric // Lex from the start of the given location. 12800b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), 12810b57cec5SDimitry Andric TokenBegin, File.end()); 12820b57cec5SDimitry Andric // Find the token. 12830b57cec5SDimitry Andric Token Tok; 12840b57cec5SDimitry Andric lexer.LexFromRawLexer(Tok); 12850b57cec5SDimitry Andric return Tok; 12860b57cec5SDimitry Andric } 12870b57cec5SDimitry Andric 12880b57cec5SDimitry Andric /// Checks that the given token is the first token that occurs after the 12890b57cec5SDimitry Andric /// given location (this excludes comments and whitespace). Returns the location 12900b57cec5SDimitry Andric /// immediately after the specified token. If the token is not found or the 12910b57cec5SDimitry Andric /// location is inside a macro, the returned source location will be invalid. 
12920b57cec5SDimitry Andric SourceLocation Lexer::findLocationAfterToken( 12930b57cec5SDimitry Andric SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, 12940b57cec5SDimitry Andric const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { 12950b57cec5SDimitry Andric Optional<Token> Tok = findNextToken(Loc, SM, LangOpts); 12960b57cec5SDimitry Andric if (!Tok || Tok->isNot(TKind)) 12970b57cec5SDimitry Andric return {}; 12980b57cec5SDimitry Andric SourceLocation TokenLoc = Tok->getLocation(); 12990b57cec5SDimitry Andric 13000b57cec5SDimitry Andric // Calculate how much whitespace needs to be skipped if any. 13010b57cec5SDimitry Andric unsigned NumWhitespaceChars = 0; 13020b57cec5SDimitry Andric if (SkipTrailingWhitespaceAndNewLine) { 13030b57cec5SDimitry Andric const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); 13040b57cec5SDimitry Andric unsigned char C = *TokenEnd; 13050b57cec5SDimitry Andric while (isHorizontalWhitespace(C)) { 13060b57cec5SDimitry Andric C = *(++TokenEnd); 13070b57cec5SDimitry Andric NumWhitespaceChars++; 13080b57cec5SDimitry Andric } 13090b57cec5SDimitry Andric 13100b57cec5SDimitry Andric // Skip \r, \n, \r\n, or \n\r 13110b57cec5SDimitry Andric if (C == '\n' || C == '\r') { 13120b57cec5SDimitry Andric char PrevC = C; 13130b57cec5SDimitry Andric C = *(++TokenEnd); 13140b57cec5SDimitry Andric NumWhitespaceChars++; 13150b57cec5SDimitry Andric if ((C == '\n' || C == '\r') && C != PrevC) 13160b57cec5SDimitry Andric NumWhitespaceChars++; 13170b57cec5SDimitry Andric } 13180b57cec5SDimitry Andric } 13190b57cec5SDimitry Andric 13200b57cec5SDimitry Andric return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); 13210b57cec5SDimitry Andric } 13220b57cec5SDimitry Andric 13230b57cec5SDimitry Andric /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 13240b57cec5SDimitry Andric /// get its size, and return it. 
This is tricky in several cases: 13250b57cec5SDimitry Andric /// 1. If currently at the start of a trigraph, we warn about the trigraph, 13260b57cec5SDimitry Andric /// then either return the trigraph (skipping 3 chars) or the '?', 13270b57cec5SDimitry Andric /// depending on whether trigraphs are enabled or not. 13280b57cec5SDimitry Andric /// 2. If this is an escaped newline (potentially with whitespace between 13290b57cec5SDimitry Andric /// the backslash and newline), implicitly skip the newline and return 13300b57cec5SDimitry Andric /// the char after it. 13310b57cec5SDimitry Andric /// 13320b57cec5SDimitry Andric /// This handles the slow/uncommon case of the getCharAndSize method. Here we 13330b57cec5SDimitry Andric /// know that we can accumulate into Size, and that we have already incremented 13340b57cec5SDimitry Andric /// Ptr by Size bytes. 13350b57cec5SDimitry Andric /// 13360b57cec5SDimitry Andric /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 13370b57cec5SDimitry Andric /// be updated to match. 13380b57cec5SDimitry Andric char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 13390b57cec5SDimitry Andric Token *Tok) { 13400b57cec5SDimitry Andric // If we have a slash, look for an escaped newline. 13410b57cec5SDimitry Andric if (Ptr[0] == '\\') { 13420b57cec5SDimitry Andric ++Size; 13430b57cec5SDimitry Andric ++Ptr; 13440b57cec5SDimitry Andric Slash: 13450b57cec5SDimitry Andric // Common case, backslash-char where the char is not whitespace. 13460b57cec5SDimitry Andric if (!isWhitespace(Ptr[0])) return '\\'; 13470b57cec5SDimitry Andric 13480b57cec5SDimitry Andric // See if we have optional whitespace characters between the slash and 13490b57cec5SDimitry Andric // newline. 13500b57cec5SDimitry Andric if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 13510b57cec5SDimitry Andric // Remember that this token needs to be cleaned. 
13520b57cec5SDimitry Andric if (Tok) Tok->setFlag(Token::NeedsCleaning); 13530b57cec5SDimitry Andric 13540b57cec5SDimitry Andric // Warn if there was whitespace between the backslash and newline. 13550b57cec5SDimitry Andric if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 13560b57cec5SDimitry Andric Diag(Ptr, diag::backslash_newline_space); 13570b57cec5SDimitry Andric 13580b57cec5SDimitry Andric // Found backslash<whitespace><newline>. Parse the char after it. 13590b57cec5SDimitry Andric Size += EscapedNewLineSize; 13600b57cec5SDimitry Andric Ptr += EscapedNewLineSize; 13610b57cec5SDimitry Andric 13620b57cec5SDimitry Andric // Use slow version to accumulate a correct size field. 13630b57cec5SDimitry Andric return getCharAndSizeSlow(Ptr, Size, Tok); 13640b57cec5SDimitry Andric } 13650b57cec5SDimitry Andric 13660b57cec5SDimitry Andric // Otherwise, this is not an escaped newline, just return the slash. 13670b57cec5SDimitry Andric return '\\'; 13680b57cec5SDimitry Andric } 13690b57cec5SDimitry Andric 13700b57cec5SDimitry Andric // If this is a trigraph, process it. 13710b57cec5SDimitry Andric if (Ptr[0] == '?' && Ptr[1] == '?') { 13720b57cec5SDimitry Andric // If this is actually a legal trigraph (not something like "??x"), emit 13730b57cec5SDimitry Andric // a trigraph warning. If so, and if trigraphs are enabled, return it. 13740b57cec5SDimitry Andric if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) { 13750b57cec5SDimitry Andric // Remember that this token needs to be cleaned. 13760b57cec5SDimitry Andric if (Tok) Tok->setFlag(Token::NeedsCleaning); 13770b57cec5SDimitry Andric 13780b57cec5SDimitry Andric Ptr += 3; 13790b57cec5SDimitry Andric Size += 3; 13800b57cec5SDimitry Andric if (C == '\\') goto Slash; 13810b57cec5SDimitry Andric return C; 13820b57cec5SDimitry Andric } 13830b57cec5SDimitry Andric } 13840b57cec5SDimitry Andric 13850b57cec5SDimitry Andric // If this is neither, return a single character. 
13860b57cec5SDimitry Andric ++Size; 13870b57cec5SDimitry Andric return *Ptr; 13880b57cec5SDimitry Andric } 13890b57cec5SDimitry Andric 13900b57cec5SDimitry Andric /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 13910b57cec5SDimitry Andric /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 13920b57cec5SDimitry Andric /// and that we have already incremented Ptr by Size bytes. 13930b57cec5SDimitry Andric /// 13940b57cec5SDimitry Andric /// NOTE: When this method is updated, getCharAndSizeSlow (above) should 13950b57cec5SDimitry Andric /// be updated to match. 13960b57cec5SDimitry Andric char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 13970b57cec5SDimitry Andric const LangOptions &LangOpts) { 13980b57cec5SDimitry Andric // If we have a slash, look for an escaped newline. 13990b57cec5SDimitry Andric if (Ptr[0] == '\\') { 14000b57cec5SDimitry Andric ++Size; 14010b57cec5SDimitry Andric ++Ptr; 14020b57cec5SDimitry Andric Slash: 14030b57cec5SDimitry Andric // Common case, backslash-char where the char is not whitespace. 14040b57cec5SDimitry Andric if (!isWhitespace(Ptr[0])) return '\\'; 14050b57cec5SDimitry Andric 14060b57cec5SDimitry Andric // See if we have optional whitespace characters followed by a newline. 14070b57cec5SDimitry Andric if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 14080b57cec5SDimitry Andric // Found backslash<whitespace><newline>. Parse the char after it. 14090b57cec5SDimitry Andric Size += EscapedNewLineSize; 14100b57cec5SDimitry Andric Ptr += EscapedNewLineSize; 14110b57cec5SDimitry Andric 14120b57cec5SDimitry Andric // Use slow version to accumulate a correct size field. 14130b57cec5SDimitry Andric return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); 14140b57cec5SDimitry Andric } 14150b57cec5SDimitry Andric 14160b57cec5SDimitry Andric // Otherwise, this is not an escaped newline, just return the slash. 
14170b57cec5SDimitry Andric return '\\'; 14180b57cec5SDimitry Andric } 14190b57cec5SDimitry Andric 14200b57cec5SDimitry Andric // If this is a trigraph, process it. 14210b57cec5SDimitry Andric if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 14220b57cec5SDimitry Andric // If this is actually a legal trigraph (not something like "??x"), return 14230b57cec5SDimitry Andric // it. 14240b57cec5SDimitry Andric if (char C = GetTrigraphCharForLetter(Ptr[2])) { 14250b57cec5SDimitry Andric Ptr += 3; 14260b57cec5SDimitry Andric Size += 3; 14270b57cec5SDimitry Andric if (C == '\\') goto Slash; 14280b57cec5SDimitry Andric return C; 14290b57cec5SDimitry Andric } 14300b57cec5SDimitry Andric } 14310b57cec5SDimitry Andric 14320b57cec5SDimitry Andric // If this is neither, return a single character. 14330b57cec5SDimitry Andric ++Size; 14340b57cec5SDimitry Andric return *Ptr; 14350b57cec5SDimitry Andric } 14360b57cec5SDimitry Andric 14370b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 14380b57cec5SDimitry Andric // Helper methods for lexing. 14390b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 14400b57cec5SDimitry Andric 14410b57cec5SDimitry Andric /// Routine that indiscriminately sets the offset into the source file. 14420b57cec5SDimitry Andric void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { 14430b57cec5SDimitry Andric BufferPtr = BufferStart + Offset; 14440b57cec5SDimitry Andric if (BufferPtr > BufferEnd) 14450b57cec5SDimitry Andric BufferPtr = BufferEnd; 14460b57cec5SDimitry Andric // FIXME: What exactly does the StartOfLine bit mean? There are two 14470b57cec5SDimitry Andric // possible meanings for the "start" of the line: the first token on the 14480b57cec5SDimitry Andric // unexpanded line, or the first token on the expanded line. 
14490b57cec5SDimitry Andric IsAtStartOfLine = StartOfLine; 14500b57cec5SDimitry Andric IsAtPhysicalStartOfLine = StartOfLine; 14510b57cec5SDimitry Andric } 14520b57cec5SDimitry Andric 1453349cc55cSDimitry Andric static bool isUnicodeWhitespace(uint32_t Codepoint) { 1454349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( 1455349cc55cSDimitry Andric UnicodeWhitespaceCharRanges); 1456349cc55cSDimitry Andric return UnicodeWhitespaceChars.contains(Codepoint); 1457349cc55cSDimitry Andric } 1458349cc55cSDimitry Andric 14590b57cec5SDimitry Andric static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { 14600b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) { 14610b57cec5SDimitry Andric return false; 1462480093f4SDimitry Andric } else if (LangOpts.DollarIdents && '$' == C) { 1463480093f4SDimitry Andric return true; 1464349cc55cSDimitry Andric } else if (LangOpts.CPlusPlus) { 1465349cc55cSDimitry Andric // A non-leading codepoint must have the XID_Continue property. 1466349cc55cSDimitry Andric // XIDContinueRanges doesn't contains characters also in XIDStartRanges, 1467349cc55cSDimitry Andric // so we need to check both tables. 1468349cc55cSDimitry Andric // '_' doesn't have the XID_Continue property but is allowed in C++. 
1469349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1470349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges); 1471349cc55cSDimitry Andric return C == '_' || XIDStartChars.contains(C) || 1472349cc55cSDimitry Andric XIDContinueChars.contains(C); 1473349cc55cSDimitry Andric } else if (LangOpts.C11) { 14740b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11AllowedIDChars( 14750b57cec5SDimitry Andric C11AllowedIDCharRanges); 14760b57cec5SDimitry Andric return C11AllowedIDChars.contains(C); 14770b57cec5SDimitry Andric } else { 14780b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 14790b57cec5SDimitry Andric C99AllowedIDCharRanges); 14800b57cec5SDimitry Andric return C99AllowedIDChars.contains(C); 14810b57cec5SDimitry Andric } 14820b57cec5SDimitry Andric } 14830b57cec5SDimitry Andric 14840b57cec5SDimitry Andric static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) { 14850b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) { 14860b57cec5SDimitry Andric return false; 1487349cc55cSDimitry Andric } 1488349cc55cSDimitry Andric if (LangOpts.CPlusPlus) { 1489349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1490349cc55cSDimitry Andric // '_' doesn't have the XID_Start property but is allowed in C++. 
1491349cc55cSDimitry Andric return C == '_' || XIDStartChars.contains(C); 1492349cc55cSDimitry Andric } 1493349cc55cSDimitry Andric if (!isAllowedIDChar(C, LangOpts)) 1494349cc55cSDimitry Andric return false; 1495349cc55cSDimitry Andric if (LangOpts.C11) { 14960b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( 14970b57cec5SDimitry Andric C11DisallowedInitialIDCharRanges); 14980b57cec5SDimitry Andric return !C11DisallowedInitialIDChars.contains(C); 1499349cc55cSDimitry Andric } 15000b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 15010b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges); 15020b57cec5SDimitry Andric return !C99DisallowedInitialIDChars.contains(C); 15030b57cec5SDimitry Andric } 15040b57cec5SDimitry Andric 15050b57cec5SDimitry Andric static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, 15060b57cec5SDimitry Andric const char *End) { 15070b57cec5SDimitry Andric return CharSourceRange::getCharRange(L.getSourceLocation(Begin), 15080b57cec5SDimitry Andric L.getSourceLocation(End)); 15090b57cec5SDimitry Andric } 15100b57cec5SDimitry Andric 15110b57cec5SDimitry Andric static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, 15120b57cec5SDimitry Andric CharSourceRange Range, bool IsFirst) { 15130b57cec5SDimitry Andric // Check C99 compatibility. 
15140b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) { 15150b57cec5SDimitry Andric enum { 15160b57cec5SDimitry Andric CannotAppearInIdentifier = 0, 15170b57cec5SDimitry Andric CannotStartIdentifier 15180b57cec5SDimitry Andric }; 15190b57cec5SDimitry Andric 15200b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 15210b57cec5SDimitry Andric C99AllowedIDCharRanges); 15220b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 15230b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges); 15240b57cec5SDimitry Andric if (!C99AllowedIDChars.contains(C)) { 15250b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 15260b57cec5SDimitry Andric << Range 15270b57cec5SDimitry Andric << CannotAppearInIdentifier; 15280b57cec5SDimitry Andric } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) { 15290b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 15300b57cec5SDimitry Andric << Range 15310b57cec5SDimitry Andric << CannotStartIdentifier; 15320b57cec5SDimitry Andric } 15330b57cec5SDimitry Andric } 15340b57cec5SDimitry Andric } 15350b57cec5SDimitry Andric 15360b57cec5SDimitry Andric /// After encountering UTF-8 character C and interpreting it as an identifier 15370b57cec5SDimitry Andric /// character, check whether it's a homoglyph for a common non-identifier 15380b57cec5SDimitry Andric /// source character that is unlikely to be an intentional identifier 15390b57cec5SDimitry Andric /// character and warn if so. 15400b57cec5SDimitry Andric static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, 15410b57cec5SDimitry Andric CharSourceRange Range) { 15420b57cec5SDimitry Andric // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 
15430b57cec5SDimitry Andric struct HomoglyphPair { 15440b57cec5SDimitry Andric uint32_t Character; 15450b57cec5SDimitry Andric char LooksLike; 15460b57cec5SDimitry Andric bool operator<(HomoglyphPair R) const { return Character < R.Character; } 15470b57cec5SDimitry Andric }; 15480b57cec5SDimitry Andric static constexpr HomoglyphPair SortedHomoglyphs[] = { 15490b57cec5SDimitry Andric {U'\u00ad', 0}, // SOFT HYPHEN 15500b57cec5SDimitry Andric {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK 15510b57cec5SDimitry Andric {U'\u037e', ';'}, // GREEK QUESTION MARK 15520b57cec5SDimitry Andric {U'\u200b', 0}, // ZERO WIDTH SPACE 15530b57cec5SDimitry Andric {U'\u200c', 0}, // ZERO WIDTH NON-JOINER 15540b57cec5SDimitry Andric {U'\u200d', 0}, // ZERO WIDTH JOINER 15550b57cec5SDimitry Andric {U'\u2060', 0}, // WORD JOINER 15560b57cec5SDimitry Andric {U'\u2061', 0}, // FUNCTION APPLICATION 15570b57cec5SDimitry Andric {U'\u2062', 0}, // INVISIBLE TIMES 15580b57cec5SDimitry Andric {U'\u2063', 0}, // INVISIBLE SEPARATOR 15590b57cec5SDimitry Andric {U'\u2064', 0}, // INVISIBLE PLUS 15600b57cec5SDimitry Andric {U'\u2212', '-'}, // MINUS SIGN 15610b57cec5SDimitry Andric {U'\u2215', '/'}, // DIVISION SLASH 15620b57cec5SDimitry Andric {U'\u2216', '\\'}, // SET MINUS 15630b57cec5SDimitry Andric {U'\u2217', '*'}, // ASTERISK OPERATOR 15640b57cec5SDimitry Andric {U'\u2223', '|'}, // DIVIDES 15650b57cec5SDimitry Andric {U'\u2227', '^'}, // LOGICAL AND 15660b57cec5SDimitry Andric {U'\u2236', ':'}, // RATIO 15670b57cec5SDimitry Andric {U'\u223c', '~'}, // TILDE OPERATOR 15680b57cec5SDimitry Andric {U'\ua789', ':'}, // MODIFIER LETTER COLON 15690b57cec5SDimitry Andric {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE 15700b57cec5SDimitry Andric {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK 15710b57cec5SDimitry Andric {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN 15720b57cec5SDimitry Andric {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN 15730b57cec5SDimitry Andric {U'\uff05', '%'}, // FULLWIDTH PERCENT 
SIGN 15740b57cec5SDimitry Andric {U'\uff06', '&'}, // FULLWIDTH AMPERSAND 15750b57cec5SDimitry Andric {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS 15760b57cec5SDimitry Andric {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS 15770b57cec5SDimitry Andric {U'\uff0a', '*'}, // FULLWIDTH ASTERISK 15780b57cec5SDimitry Andric {U'\uff0b', '+'}, // FULLWIDTH ASTERISK 15790b57cec5SDimitry Andric {U'\uff0c', ','}, // FULLWIDTH COMMA 15800b57cec5SDimitry Andric {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS 15810b57cec5SDimitry Andric {U'\uff0e', '.'}, // FULLWIDTH FULL STOP 15820b57cec5SDimitry Andric {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS 15830b57cec5SDimitry Andric {U'\uff1a', ':'}, // FULLWIDTH COLON 15840b57cec5SDimitry Andric {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON 15850b57cec5SDimitry Andric {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN 15860b57cec5SDimitry Andric {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN 15870b57cec5SDimitry Andric {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN 15880b57cec5SDimitry Andric {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK 15890b57cec5SDimitry Andric {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT 15900b57cec5SDimitry Andric {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET 15910b57cec5SDimitry Andric {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS 15920b57cec5SDimitry Andric {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET 15930b57cec5SDimitry Andric {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT 15940b57cec5SDimitry Andric {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET 15950b57cec5SDimitry Andric {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE 15960b57cec5SDimitry Andric {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET 15970b57cec5SDimitry Andric {U'\uff5e', '~'}, // FULLWIDTH TILDE 15980b57cec5SDimitry Andric {0, 0} 15990b57cec5SDimitry Andric }; 16000b57cec5SDimitry Andric auto Homoglyph = 16010b57cec5SDimitry Andric std::lower_bound(std::begin(SortedHomoglyphs), 16020b57cec5SDimitry Andric std::end(SortedHomoglyphs) - 1, 
HomoglyphPair{C, '\0'}); 16030b57cec5SDimitry Andric if (Homoglyph->Character == C) { 16040b57cec5SDimitry Andric llvm::SmallString<5> CharBuf; 16050b57cec5SDimitry Andric { 16060b57cec5SDimitry Andric llvm::raw_svector_ostream CharOS(CharBuf); 16070b57cec5SDimitry Andric llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); 16080b57cec5SDimitry Andric } 16090b57cec5SDimitry Andric if (Homoglyph->LooksLike) { 16100b57cec5SDimitry Andric const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; 16110b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) 16120b57cec5SDimitry Andric << Range << CharBuf << LooksLikeStr; 16130b57cec5SDimitry Andric } else { 16140b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width) 16150b57cec5SDimitry Andric << Range << CharBuf; 16160b57cec5SDimitry Andric } 16170b57cec5SDimitry Andric } 16180b57cec5SDimitry Andric } 16190b57cec5SDimitry Andric 1620349cc55cSDimitry Andric static void diagnoseInvalidUnicodeCodepointInIdentifier( 1621349cc55cSDimitry Andric DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, 1622349cc55cSDimitry Andric CharSourceRange Range, bool IsFirst) { 1623349cc55cSDimitry Andric if (isASCII(CodePoint)) 1624349cc55cSDimitry Andric return; 1625349cc55cSDimitry Andric 1626349cc55cSDimitry Andric bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts); 1627349cc55cSDimitry Andric bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts); 1628349cc55cSDimitry Andric 1629349cc55cSDimitry Andric if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue)) 1630349cc55cSDimitry Andric return; 1631349cc55cSDimitry Andric 1632349cc55cSDimitry Andric bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue; 1633349cc55cSDimitry Andric 1634349cc55cSDimitry Andric llvm::SmallString<5> CharBuf; 1635349cc55cSDimitry Andric llvm::raw_svector_ostream CharOS(CharBuf); 1636349cc55cSDimitry Andric llvm::write_hex(CharOS, 
CodePoint, llvm::HexPrintStyle::Upper, 4); 1637349cc55cSDimitry Andric 1638349cc55cSDimitry Andric if (!IsFirst || InvalidOnlyAtStart) { 1639349cc55cSDimitry Andric Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier) 1640349cc55cSDimitry Andric << Range << CharBuf << int(InvalidOnlyAtStart) 1641349cc55cSDimitry Andric << FixItHint::CreateRemoval(Range); 1642349cc55cSDimitry Andric } else { 1643349cc55cSDimitry Andric Diags.Report(Range.getBegin(), diag::err_character_not_allowed) 1644349cc55cSDimitry Andric << Range << CharBuf << FixItHint::CreateRemoval(Range); 1645349cc55cSDimitry Andric } 1646349cc55cSDimitry Andric } 1647349cc55cSDimitry Andric 16480b57cec5SDimitry Andric bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, 16490b57cec5SDimitry Andric Token &Result) { 16500b57cec5SDimitry Andric const char *UCNPtr = CurPtr + Size; 16510b57cec5SDimitry Andric uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr); 1652349cc55cSDimitry Andric if (CodePoint == 0) { 16530b57cec5SDimitry Andric return false; 1654349cc55cSDimitry Andric } 16550b57cec5SDimitry Andric 1656349cc55cSDimitry Andric if (!isAllowedIDChar(CodePoint, LangOpts)) { 1657349cc55cSDimitry Andric if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) 1658349cc55cSDimitry Andric return false; 1659349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1660349cc55cSDimitry Andric !PP->isPreprocessedOutput()) 1661349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier( 1662349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, CodePoint, 1663349cc55cSDimitry Andric makeCharRange(*this, CurPtr, UCNPtr), 1664349cc55cSDimitry Andric /*IsFirst=*/false); 1665349cc55cSDimitry Andric 1666349cc55cSDimitry Andric // We got a unicode codepoint that is neither a space nor a 1667349cc55cSDimitry Andric // a valid identifier part. 
1668349cc55cSDimitry Andric // Carry on as if the codepoint was valid for recovery purposes. 1669349cc55cSDimitry Andric } else if (!isLexingRawMode()) 16700b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 16710b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UCNPtr), 16720b57cec5SDimitry Andric /*IsFirst=*/false); 16730b57cec5SDimitry Andric 16740b57cec5SDimitry Andric Result.setFlag(Token::HasUCN); 16750b57cec5SDimitry Andric if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || 16760b57cec5SDimitry Andric (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U')) 16770b57cec5SDimitry Andric CurPtr = UCNPtr; 16780b57cec5SDimitry Andric else 16790b57cec5SDimitry Andric while (CurPtr != UCNPtr) 16800b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result); 16810b57cec5SDimitry Andric return true; 16820b57cec5SDimitry Andric } 16830b57cec5SDimitry Andric 16840b57cec5SDimitry Andric bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { 16850b57cec5SDimitry Andric const char *UnicodePtr = CurPtr; 16860b57cec5SDimitry Andric llvm::UTF32 CodePoint; 16870b57cec5SDimitry Andric llvm::ConversionResult Result = 16880b57cec5SDimitry Andric llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr, 16890b57cec5SDimitry Andric (const llvm::UTF8 *)BufferEnd, 16900b57cec5SDimitry Andric &CodePoint, 16910b57cec5SDimitry Andric llvm::strictConversion); 1692349cc55cSDimitry Andric if (Result != llvm::conversionOK) 16930b57cec5SDimitry Andric return false; 16940b57cec5SDimitry Andric 1695349cc55cSDimitry Andric if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) { 1696349cc55cSDimitry Andric if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) 1697349cc55cSDimitry Andric return false; 1698349cc55cSDimitry Andric 1699349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1700349cc55cSDimitry Andric !PP->isPreprocessedOutput()) 1701349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier( 
1702349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, CodePoint, 1703349cc55cSDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false); 1704349cc55cSDimitry Andric // We got a unicode codepoint that is neither a space nor a 1705349cc55cSDimitry Andric // a valid identifier part. Carry on as if the codepoint was 1706349cc55cSDimitry Andric // valid for recovery purposes. 1707349cc55cSDimitry Andric } else if (!isLexingRawMode()) { 17080b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 17090b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr), 17100b57cec5SDimitry Andric /*IsFirst=*/false); 17110b57cec5SDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, 17120b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr)); 17130b57cec5SDimitry Andric } 17140b57cec5SDimitry Andric 17150b57cec5SDimitry Andric CurPtr = UnicodePtr; 17160b57cec5SDimitry Andric return true; 17170b57cec5SDimitry Andric } 17180b57cec5SDimitry Andric 1719349cc55cSDimitry Andric bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C, 1720349cc55cSDimitry Andric const char *CurPtr) { 1721349cc55cSDimitry Andric if (isAllowedInitiallyIDChar(C, LangOpts)) { 1722349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1723349cc55cSDimitry Andric !PP->isPreprocessedOutput()) { 1724349cc55cSDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, 1725349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr), 1726349cc55cSDimitry Andric /*IsFirst=*/true); 1727349cc55cSDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, 1728349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr)); 1729349cc55cSDimitry Andric } 1730349cc55cSDimitry Andric 1731349cc55cSDimitry Andric MIOpt.ReadToken(); 1732349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 1733349cc55cSDimitry Andric } 1734349cc55cSDimitry Andric 1735349cc55cSDimitry 
Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1736349cc55cSDimitry Andric !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && 1737349cc55cSDimitry Andric !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) { 1738349cc55cSDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally. 1739349cc55cSDimitry Andric // Instead of letting the parser complain about the unknown token, 1740349cc55cSDimitry Andric // just drop the character. 1741349cc55cSDimitry Andric // Note that we can /only/ do this when the non-ASCII character is actually 1742349cc55cSDimitry Andric // spelled as Unicode, not written as a UCN. The standard requires that 1743349cc55cSDimitry Andric // we not throw away any possible preprocessor tokens, but there's a 1744349cc55cSDimitry Andric // loophole in the mapping of Unicode characters to basic character set 1745349cc55cSDimitry Andric // characters that allows us to map these particular characters to, say, 1746349cc55cSDimitry Andric // whitespace. 1747349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier( 1748349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, C, 1749349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true); 1750349cc55cSDimitry Andric BufferPtr = CurPtr; 1751349cc55cSDimitry Andric return false; 1752349cc55cSDimitry Andric } 1753349cc55cSDimitry Andric 1754349cc55cSDimitry Andric // Otherwise, we have an explicit UCN or a character that's unlikely to show 1755349cc55cSDimitry Andric // up by accident. 1756349cc55cSDimitry Andric MIOpt.ReadToken(); 1757349cc55cSDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 1758349cc55cSDimitry Andric return true; 1759349cc55cSDimitry Andric } 1760349cc55cSDimitry Andric 1761349cc55cSDimitry Andric bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) { 1762349cc55cSDimitry Andric // Match [_A-Za-z0-9]*, we have already matched an identifier start. 
1763349cc55cSDimitry Andric while (true) { 1764349cc55cSDimitry Andric unsigned char C = *CurPtr; 1765349cc55cSDimitry Andric // Fast path. 1766349cc55cSDimitry Andric if (isAsciiIdentifierContinue(C)) { 1767349cc55cSDimitry Andric ++CurPtr; 1768349cc55cSDimitry Andric continue; 1769349cc55cSDimitry Andric } 1770349cc55cSDimitry Andric 17710b57cec5SDimitry Andric unsigned Size; 1772349cc55cSDimitry Andric // Slow path: handle trigraph, unicode codepoints, UCNs. 1773349cc55cSDimitry Andric C = getCharAndSize(CurPtr, Size); 1774349cc55cSDimitry Andric if (isAsciiIdentifierContinue(C)) { 1775349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1776349cc55cSDimitry Andric continue; 1777349cc55cSDimitry Andric } 1778349cc55cSDimitry Andric if (C == '$') { 1779349cc55cSDimitry Andric // If we hit a $ and they are not supported in identifiers, we are done. 1780349cc55cSDimitry Andric if (!LangOpts.DollarIdents) 1781349cc55cSDimitry Andric break; 1782349cc55cSDimitry Andric // Otherwise, emit a diagnostic and continue. 1783349cc55cSDimitry Andric if (!isLexingRawMode()) 1784349cc55cSDimitry Andric Diag(CurPtr, diag::ext_dollar_in_identifier); 1785349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1786349cc55cSDimitry Andric continue; 1787349cc55cSDimitry Andric } 1788349cc55cSDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 1789349cc55cSDimitry Andric continue; 1790349cc55cSDimitry Andric if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 1791349cc55cSDimitry Andric continue; 1792349cc55cSDimitry Andric // Neither an expected Unicode codepoint nor a UCN. 
1793349cc55cSDimitry Andric break; 1794349cc55cSDimitry Andric } 17950b57cec5SDimitry Andric 17960b57cec5SDimitry Andric const char *IdStart = BufferPtr; 17970b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::raw_identifier); 17980b57cec5SDimitry Andric Result.setRawIdentifierData(IdStart); 17990b57cec5SDimitry Andric 18000b57cec5SDimitry Andric // If we are in raw mode, return this identifier raw. There is no need to 18010b57cec5SDimitry Andric // look up identifier information or attempt to macro expand it. 18020b57cec5SDimitry Andric if (LexingRawMode) 18030b57cec5SDimitry Andric return true; 18040b57cec5SDimitry Andric 18050b57cec5SDimitry Andric // Fill in Result.IdentifierInfo and update the token kind, 18060b57cec5SDimitry Andric // looking up the identifier in the identifier table. 18070b57cec5SDimitry Andric IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 18080b57cec5SDimitry Andric // Note that we have to call PP->LookUpIdentifierInfo() even for code 18090b57cec5SDimitry Andric // completion, it writes IdentifierInfo into Result, and callers rely on it. 18100b57cec5SDimitry Andric 18110b57cec5SDimitry Andric // If the completion point is at the end of an identifier, we want to treat 18120b57cec5SDimitry Andric // the identifier as incomplete even if it resolves to a macro or a keyword. 18130b57cec5SDimitry Andric // This allows e.g. 'class^' to complete to 'classifier'. 18140b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr)) { 18150b57cec5SDimitry Andric // Return the code-completion token. 18160b57cec5SDimitry Andric Result.setKind(tok::code_completion); 18170b57cec5SDimitry Andric // Skip the code-completion char and all immediate identifier characters. 18180b57cec5SDimitry Andric // This ensures we get consistent behavior when completing at any point in 18190b57cec5SDimitry Andric // an identifier (i.e. at the start, in the middle, at the end). Note that 18200b57cec5SDimitry Andric // only simple cases (i.e. 
[a-zA-Z0-9_]) are supported to keep the code 18210b57cec5SDimitry Andric // simpler. 18220b57cec5SDimitry Andric assert(*CurPtr == 0 && "Completion character must be 0"); 18230b57cec5SDimitry Andric ++CurPtr; 18240b57cec5SDimitry Andric // Note that code completion token is not added as a separate character 18250b57cec5SDimitry Andric // when the completion point is at the end of the buffer. Therefore, we need 18260b57cec5SDimitry Andric // to check if the buffer has ended. 18270b57cec5SDimitry Andric if (CurPtr < BufferEnd) { 1828349cc55cSDimitry Andric while (isAsciiIdentifierContinue(*CurPtr)) 18290b57cec5SDimitry Andric ++CurPtr; 18300b57cec5SDimitry Andric } 18310b57cec5SDimitry Andric BufferPtr = CurPtr; 18320b57cec5SDimitry Andric return true; 18330b57cec5SDimitry Andric } 18340b57cec5SDimitry Andric 18350b57cec5SDimitry Andric // Finally, now that we know we have an identifier, pass this off to the 18360b57cec5SDimitry Andric // preprocessor, which may macro expand it or something. 18370b57cec5SDimitry Andric if (II->isHandleIdentifierCase()) 18380b57cec5SDimitry Andric return PP->HandleIdentifier(Result); 18390b57cec5SDimitry Andric 18400b57cec5SDimitry Andric return true; 18410b57cec5SDimitry Andric } 18420b57cec5SDimitry Andric 18430b57cec5SDimitry Andric /// isHexaLiteral - Return true if Start points to a hex constant. 18440b57cec5SDimitry Andric /// in microsoft mode (where this is supposed to be several different tokens). 
18450b57cec5SDimitry Andric bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) { 18460b57cec5SDimitry Andric unsigned Size; 18470b57cec5SDimitry Andric char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts); 18480b57cec5SDimitry Andric if (C1 != '0') 18490b57cec5SDimitry Andric return false; 18500b57cec5SDimitry Andric char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts); 18510b57cec5SDimitry Andric return (C2 == 'x' || C2 == 'X'); 18520b57cec5SDimitry Andric } 18530b57cec5SDimitry Andric 18540b57cec5SDimitry Andric /// LexNumericConstant - Lex the remainder of a integer or floating point 18550b57cec5SDimitry Andric /// constant. From[-1] is the first character lexed. Return the end of the 18560b57cec5SDimitry Andric /// constant. 18570b57cec5SDimitry Andric bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 18580b57cec5SDimitry Andric unsigned Size; 18590b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size); 18600b57cec5SDimitry Andric char PrevCh = 0; 18610b57cec5SDimitry Andric while (isPreprocessingNumberBody(C)) { 18620b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 18630b57cec5SDimitry Andric PrevCh = C; 18640b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 18650b57cec5SDimitry Andric } 18660b57cec5SDimitry Andric 18670b57cec5SDimitry Andric // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 18680b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { 18690b57cec5SDimitry Andric // If we are in Microsoft mode, don't continue if the constant is hex. 
18700b57cec5SDimitry Andric // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1 18710b57cec5SDimitry Andric if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts)) 18720b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 18730b57cec5SDimitry Andric } 18740b57cec5SDimitry Andric 18750b57cec5SDimitry Andric // If we have a hex FP constant, continue. 18760b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) { 18770b57cec5SDimitry Andric // Outside C99 and C++17, we accept hexadecimal floating point numbers as a 18780b57cec5SDimitry Andric // not-quite-conforming extension. Only do so if this looks like it's 18790b57cec5SDimitry Andric // actually meant to be a hexfloat, and not if it has a ud-suffix. 18800b57cec5SDimitry Andric bool IsHexFloat = true; 18810b57cec5SDimitry Andric if (!LangOpts.C99) { 18820b57cec5SDimitry Andric if (!isHexaLiteral(BufferPtr, LangOpts)) 18830b57cec5SDimitry Andric IsHexFloat = false; 18840b57cec5SDimitry Andric else if (!getLangOpts().CPlusPlus17 && 18850b57cec5SDimitry Andric std::find(BufferPtr, CurPtr, '_') != CurPtr) 18860b57cec5SDimitry Andric IsHexFloat = false; 18870b57cec5SDimitry Andric } 18880b57cec5SDimitry Andric if (IsHexFloat) 18890b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 18900b57cec5SDimitry Andric } 18910b57cec5SDimitry Andric 18920b57cec5SDimitry Andric // If we have a digit separator, continue. 1893fe6060f1SDimitry Andric if (C == '\'' && (getLangOpts().CPlusPlus14 || getLangOpts().C2x)) { 18940b57cec5SDimitry Andric unsigned NextSize; 18950b57cec5SDimitry Andric char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts()); 1896349cc55cSDimitry Andric if (isAsciiIdentifierContinue(Next)) { 18970b57cec5SDimitry Andric if (!isLexingRawMode()) 1898fe6060f1SDimitry Andric Diag(CurPtr, getLangOpts().CPlusPlus 1899fe6060f1SDimitry Andric ? 
diag::warn_cxx11_compat_digit_separator 1900fe6060f1SDimitry Andric : diag::warn_c2x_compat_digit_separator); 19010b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 19020b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, NextSize, Result); 19030b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 19040b57cec5SDimitry Andric } 19050b57cec5SDimitry Andric } 19060b57cec5SDimitry Andric 19070b57cec5SDimitry Andric // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. 19080b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 19090b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 19100b57cec5SDimitry Andric if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 19110b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 19120b57cec5SDimitry Andric 19130b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 19140b57cec5SDimitry Andric const char *TokStart = BufferPtr; 19150b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 19160b57cec5SDimitry Andric Result.setLiteralData(TokStart); 19170b57cec5SDimitry Andric return true; 19180b57cec5SDimitry Andric } 19190b57cec5SDimitry Andric 19200b57cec5SDimitry Andric /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 19210b57cec5SDimitry Andric /// in C++11, or warn on a ud-suffix in C++98. 19220b57cec5SDimitry Andric const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, 19230b57cec5SDimitry Andric bool IsStringLiteral) { 19240b57cec5SDimitry Andric assert(getLangOpts().CPlusPlus); 19250b57cec5SDimitry Andric 19260b57cec5SDimitry Andric // Maximally munch an identifier. 
19270b57cec5SDimitry Andric unsigned Size; 19280b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size); 19290b57cec5SDimitry Andric bool Consumed = false; 19300b57cec5SDimitry Andric 1931349cc55cSDimitry Andric if (!isAsciiIdentifierStart(C)) { 19320b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 19330b57cec5SDimitry Andric Consumed = true; 19340b57cec5SDimitry Andric else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 19350b57cec5SDimitry Andric Consumed = true; 19360b57cec5SDimitry Andric else 19370b57cec5SDimitry Andric return CurPtr; 19380b57cec5SDimitry Andric } 19390b57cec5SDimitry Andric 19400b57cec5SDimitry Andric if (!getLangOpts().CPlusPlus11) { 19410b57cec5SDimitry Andric if (!isLexingRawMode()) 19420b57cec5SDimitry Andric Diag(CurPtr, 19430b57cec5SDimitry Andric C == '_' ? diag::warn_cxx11_compat_user_defined_literal 19440b57cec5SDimitry Andric : diag::warn_cxx11_compat_reserved_user_defined_literal) 19450b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 19460b57cec5SDimitry Andric return CurPtr; 19470b57cec5SDimitry Andric } 19480b57cec5SDimitry Andric 19490b57cec5SDimitry Andric // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix 19500b57cec5SDimitry Andric // that does not start with an underscore is ill-formed. As a conforming 19510b57cec5SDimitry Andric // extension, we treat all such suffixes as if they had whitespace before 19520b57cec5SDimitry Andric // them. We assume a suffix beginning with a UCN or UTF-8 character is more 19530b57cec5SDimitry Andric // likely to be a ud-suffix than a macro, however, and accept that. 
19540b57cec5SDimitry Andric if (!Consumed) { 19550b57cec5SDimitry Andric bool IsUDSuffix = false; 19560b57cec5SDimitry Andric if (C == '_') 19570b57cec5SDimitry Andric IsUDSuffix = true; 19580b57cec5SDimitry Andric else if (IsStringLiteral && getLangOpts().CPlusPlus14) { 19590b57cec5SDimitry Andric // In C++1y, we need to look ahead a few characters to see if this is a 19600b57cec5SDimitry Andric // valid suffix for a string literal or a numeric literal (this could be 19610b57cec5SDimitry Andric // the 'operator""if' defining a numeric literal operator). 19620b57cec5SDimitry Andric const unsigned MaxStandardSuffixLength = 3; 19630b57cec5SDimitry Andric char Buffer[MaxStandardSuffixLength] = { C }; 19640b57cec5SDimitry Andric unsigned Consumed = Size; 19650b57cec5SDimitry Andric unsigned Chars = 1; 19660b57cec5SDimitry Andric while (true) { 19670b57cec5SDimitry Andric unsigned NextSize; 19680b57cec5SDimitry Andric char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, 19690b57cec5SDimitry Andric getLangOpts()); 1970349cc55cSDimitry Andric if (!isAsciiIdentifierContinue(Next)) { 19715ffd83dbSDimitry Andric // End of suffix. Check whether this is on the allowed list. 19720b57cec5SDimitry Andric const StringRef CompleteSuffix(Buffer, Chars); 19730b57cec5SDimitry Andric IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(), 19740b57cec5SDimitry Andric CompleteSuffix); 19750b57cec5SDimitry Andric break; 19760b57cec5SDimitry Andric } 19770b57cec5SDimitry Andric 19780b57cec5SDimitry Andric if (Chars == MaxStandardSuffixLength) 19790b57cec5SDimitry Andric // Too long: can't be a standard suffix. 
19800b57cec5SDimitry Andric break; 19810b57cec5SDimitry Andric 19820b57cec5SDimitry Andric Buffer[Chars++] = Next; 19830b57cec5SDimitry Andric Consumed += NextSize; 19840b57cec5SDimitry Andric } 19850b57cec5SDimitry Andric } 19860b57cec5SDimitry Andric 19870b57cec5SDimitry Andric if (!IsUDSuffix) { 19880b57cec5SDimitry Andric if (!isLexingRawMode()) 19890b57cec5SDimitry Andric Diag(CurPtr, getLangOpts().MSVCCompat 19900b57cec5SDimitry Andric ? diag::ext_ms_reserved_user_defined_literal 19910b57cec5SDimitry Andric : diag::ext_reserved_user_defined_literal) 19920b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 19930b57cec5SDimitry Andric return CurPtr; 19940b57cec5SDimitry Andric } 19950b57cec5SDimitry Andric 19960b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 19970b57cec5SDimitry Andric } 19980b57cec5SDimitry Andric 19990b57cec5SDimitry Andric Result.setFlag(Token::HasUDSuffix); 20000b57cec5SDimitry Andric while (true) { 20010b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 2002349cc55cSDimitry Andric if (isAsciiIdentifierContinue(C)) { 2003349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 2004349cc55cSDimitry Andric } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { 2005349cc55cSDimitry Andric } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { 2006349cc55cSDimitry Andric } else 2007349cc55cSDimitry Andric break; 20080b57cec5SDimitry Andric } 20090b57cec5SDimitry Andric 20100b57cec5SDimitry Andric return CurPtr; 20110b57cec5SDimitry Andric } 20120b57cec5SDimitry Andric 20130b57cec5SDimitry Andric /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 20140b57cec5SDimitry Andric /// either " or L" or u8" or u" or U". 
20150b57cec5SDimitry Andric bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 20160b57cec5SDimitry Andric tok::TokenKind Kind) { 20170b57cec5SDimitry Andric const char *AfterQuote = CurPtr; 20180b57cec5SDimitry Andric // Does this string contain the \0 character? 20190b57cec5SDimitry Andric const char *NulCharacter = nullptr; 20200b57cec5SDimitry Andric 20210b57cec5SDimitry Andric if (!isLexingRawMode() && 20220b57cec5SDimitry Andric (Kind == tok::utf8_string_literal || 20230b57cec5SDimitry Andric Kind == tok::utf16_string_literal || 20240b57cec5SDimitry Andric Kind == tok::utf32_string_literal)) 20250b57cec5SDimitry Andric Diag(BufferPtr, getLangOpts().CPlusPlus 20260b57cec5SDimitry Andric ? diag::warn_cxx98_compat_unicode_literal 20270b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal); 20280b57cec5SDimitry Andric 20290b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 20300b57cec5SDimitry Andric while (C != '"') { 20310b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by 20320b57cec5SDimitry Andric // getAndAdvanceChar. 20330b57cec5SDimitry Andric if (C == '\\') 20340b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 20350b57cec5SDimitry Andric 20360b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 20370b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
20380b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 20390b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1; 20400b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 20410b57cec5SDimitry Andric return true; 20420b57cec5SDimitry Andric } 20430b57cec5SDimitry Andric 20440b57cec5SDimitry Andric if (C == 0) { 20450b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 20460b57cec5SDimitry Andric if (ParsingFilename) 20470b57cec5SDimitry Andric codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false); 20480b57cec5SDimitry Andric else 20490b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 20500b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 20510b57cec5SDimitry Andric cutOffLexing(); 20520b57cec5SDimitry Andric return true; 20530b57cec5SDimitry Andric } 20540b57cec5SDimitry Andric 20550b57cec5SDimitry Andric NulCharacter = CurPtr-1; 20560b57cec5SDimitry Andric } 20570b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 20580b57cec5SDimitry Andric } 20590b57cec5SDimitry Andric 20600b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 20610b57cec5SDimitry Andric if (getLangOpts().CPlusPlus) 20620b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true); 20630b57cec5SDimitry Andric 20640b57cec5SDimitry Andric // If a nul character existed in the string, warn about it. 20650b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 20660b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1; 20670b57cec5SDimitry Andric 20680b57cec5SDimitry Andric // Update the location of the token as well as the BufferPtr instance var. 
20690b57cec5SDimitry Andric const char *TokStart = BufferPtr; 20700b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 20710b57cec5SDimitry Andric Result.setLiteralData(TokStart); 20720b57cec5SDimitry Andric return true; 20730b57cec5SDimitry Andric } 20740b57cec5SDimitry Andric 20750b57cec5SDimitry Andric /// LexRawStringLiteral - Lex the remainder of a raw string literal, after 20760b57cec5SDimitry Andric /// having lexed R", LR", u8R", uR", or UR". 20770b57cec5SDimitry Andric bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, 20780b57cec5SDimitry Andric tok::TokenKind Kind) { 20790b57cec5SDimitry Andric // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: 20800b57cec5SDimitry Andric // Between the initial and final double quote characters of the raw string, 20810b57cec5SDimitry Andric // any transformations performed in phases 1 and 2 (trigraphs, 20820b57cec5SDimitry Andric // universal-character-names, and line splicing) are reverted. 20830b57cec5SDimitry Andric 20840b57cec5SDimitry Andric if (!isLexingRawMode()) 20850b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal); 20860b57cec5SDimitry Andric 20870b57cec5SDimitry Andric unsigned PrefixLen = 0; 20880b57cec5SDimitry Andric 20890b57cec5SDimitry Andric while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) 20900b57cec5SDimitry Andric ++PrefixLen; 20910b57cec5SDimitry Andric 20920b57cec5SDimitry Andric // If the last character was not a '(', then we didn't lex a valid delimiter. 
20930b57cec5SDimitry Andric if (CurPtr[PrefixLen] != '(') { 20940b57cec5SDimitry Andric if (!isLexingRawMode()) { 20950b57cec5SDimitry Andric const char *PrefixEnd = &CurPtr[PrefixLen]; 20960b57cec5SDimitry Andric if (PrefixLen == 16) { 20970b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_raw_delim_too_long); 20980b57cec5SDimitry Andric } else { 20990b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_invalid_char_raw_delim) 21000b57cec5SDimitry Andric << StringRef(PrefixEnd, 1); 21010b57cec5SDimitry Andric } 21020b57cec5SDimitry Andric } 21030b57cec5SDimitry Andric 21040b57cec5SDimitry Andric // Search for the next '"' in hopes of salvaging the lexer. Unfortunately, 21050b57cec5SDimitry Andric // it's possible the '"' was intended to be part of the raw string, but 21060b57cec5SDimitry Andric // there's not much we can do about that. 21070b57cec5SDimitry Andric while (true) { 21080b57cec5SDimitry Andric char C = *CurPtr++; 21090b57cec5SDimitry Andric 21100b57cec5SDimitry Andric if (C == '"') 21110b57cec5SDimitry Andric break; 21120b57cec5SDimitry Andric if (C == 0 && CurPtr-1 == BufferEnd) { 21130b57cec5SDimitry Andric --CurPtr; 21140b57cec5SDimitry Andric break; 21150b57cec5SDimitry Andric } 21160b57cec5SDimitry Andric } 21170b57cec5SDimitry Andric 21180b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 21190b57cec5SDimitry Andric return true; 21200b57cec5SDimitry Andric } 21210b57cec5SDimitry Andric 21220b57cec5SDimitry Andric // Save prefix and move CurPtr past it 21230b57cec5SDimitry Andric const char *Prefix = CurPtr; 21240b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '(' 21250b57cec5SDimitry Andric 21260b57cec5SDimitry Andric while (true) { 21270b57cec5SDimitry Andric char C = *CurPtr++; 21280b57cec5SDimitry Andric 21290b57cec5SDimitry Andric if (C == ')') { 21300b57cec5SDimitry Andric // Check for prefix match and closing quote. 
21310b57cec5SDimitry Andric if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') { 21320b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '"' 21330b57cec5SDimitry Andric break; 21340b57cec5SDimitry Andric } 21350b57cec5SDimitry Andric } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. 21360b57cec5SDimitry Andric if (!isLexingRawMode()) 21370b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_raw_string) 21380b57cec5SDimitry Andric << StringRef(Prefix, PrefixLen); 21390b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 21400b57cec5SDimitry Andric return true; 21410b57cec5SDimitry Andric } 21420b57cec5SDimitry Andric } 21430b57cec5SDimitry Andric 21440b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 21450b57cec5SDimitry Andric if (getLangOpts().CPlusPlus) 21460b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true); 21470b57cec5SDimitry Andric 21480b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 21490b57cec5SDimitry Andric const char *TokStart = BufferPtr; 21500b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 21510b57cec5SDimitry Andric Result.setLiteralData(TokStart); 21520b57cec5SDimitry Andric return true; 21530b57cec5SDimitry Andric } 21540b57cec5SDimitry Andric 21550b57cec5SDimitry Andric /// LexAngledStringLiteral - Lex the remainder of an angled string literal, 21560b57cec5SDimitry Andric /// after having lexed the '<' character. This is used for #include filenames. 21570b57cec5SDimitry Andric bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 21580b57cec5SDimitry Andric // Does this string contain the \0 character? 
21590b57cec5SDimitry Andric const char *NulCharacter = nullptr; 21600b57cec5SDimitry Andric const char *AfterLessPos = CurPtr; 21610b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 21620b57cec5SDimitry Andric while (C != '>') { 21630b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by 21640b57cec5SDimitry Andric // getAndAdvanceChar. 21650b57cec5SDimitry Andric if (C == '\\') 21660b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 21670b57cec5SDimitry Andric 2168fe6060f1SDimitry Andric if (isVerticalWhitespace(C) || // Newline. 21690b57cec5SDimitry Andric (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file. 21700b57cec5SDimitry Andric // If the filename is unterminated, then it must just be a lone < 21710b57cec5SDimitry Andric // character. Return this as such. 21720b57cec5SDimitry Andric FormTokenWithChars(Result, AfterLessPos, tok::less); 21730b57cec5SDimitry Andric return true; 21740b57cec5SDimitry Andric } 21750b57cec5SDimitry Andric 21760b57cec5SDimitry Andric if (C == 0) { 21770b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr - 1)) { 21780b57cec5SDimitry Andric codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true); 21790b57cec5SDimitry Andric cutOffLexing(); 21800b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 21810b57cec5SDimitry Andric return true; 21820b57cec5SDimitry Andric } 21830b57cec5SDimitry Andric NulCharacter = CurPtr-1; 21840b57cec5SDimitry Andric } 21850b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 21860b57cec5SDimitry Andric } 21870b57cec5SDimitry Andric 21880b57cec5SDimitry Andric // If a nul character existed in the string, warn about it. 21890b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 21900b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1; 21910b57cec5SDimitry Andric 21920b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 
21930b57cec5SDimitry Andric const char *TokStart = BufferPtr; 21940b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::header_name); 21950b57cec5SDimitry Andric Result.setLiteralData(TokStart); 21960b57cec5SDimitry Andric return true; 21970b57cec5SDimitry Andric } 21980b57cec5SDimitry Andric 21990b57cec5SDimitry Andric void Lexer::codeCompleteIncludedFile(const char *PathStart, 22000b57cec5SDimitry Andric const char *CompletionPoint, 22010b57cec5SDimitry Andric bool IsAngled) { 22020b57cec5SDimitry Andric // Completion only applies to the filename, after the last slash. 22030b57cec5SDimitry Andric StringRef PartialPath(PathStart, CompletionPoint - PathStart); 22045ffd83dbSDimitry Andric llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/"; 22055ffd83dbSDimitry Andric auto Slash = PartialPath.find_last_of(SlashChars); 22060b57cec5SDimitry Andric StringRef Dir = 22070b57cec5SDimitry Andric (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash); 22080b57cec5SDimitry Andric const char *StartOfFilename = 22090b57cec5SDimitry Andric (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1; 22100b57cec5SDimitry Andric // Code completion filter range is the filename only, up to completion point. 22110b57cec5SDimitry Andric PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get( 22120b57cec5SDimitry Andric StringRef(StartOfFilename, CompletionPoint - StartOfFilename))); 22135ffd83dbSDimitry Andric // We should replace the characters up to the closing quote or closest slash, 22145ffd83dbSDimitry Andric // if any. 22150b57cec5SDimitry Andric while (CompletionPoint < BufferEnd) { 22160b57cec5SDimitry Andric char Next = *(CompletionPoint + 1); 22170b57cec5SDimitry Andric if (Next == 0 || Next == '\r' || Next == '\n') 22180b57cec5SDimitry Andric break; 22190b57cec5SDimitry Andric ++CompletionPoint; 22200b57cec5SDimitry Andric if (Next == (IsAngled ? 
'>' : '"')) 22210b57cec5SDimitry Andric break; 22225ffd83dbSDimitry Andric if (llvm::is_contained(SlashChars, Next)) 22235ffd83dbSDimitry Andric break; 22240b57cec5SDimitry Andric } 22255ffd83dbSDimitry Andric 22260b57cec5SDimitry Andric PP->setCodeCompletionTokenRange( 22270b57cec5SDimitry Andric FileLoc.getLocWithOffset(StartOfFilename - BufferStart), 22280b57cec5SDimitry Andric FileLoc.getLocWithOffset(CompletionPoint - BufferStart)); 22290b57cec5SDimitry Andric PP->CodeCompleteIncludedFile(Dir, IsAngled); 22300b57cec5SDimitry Andric } 22310b57cec5SDimitry Andric 22320b57cec5SDimitry Andric /// LexCharConstant - Lex the remainder of a character constant, after having 22330b57cec5SDimitry Andric /// lexed either ' or L' or u8' or u' or U'. 22340b57cec5SDimitry Andric bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, 22350b57cec5SDimitry Andric tok::TokenKind Kind) { 22360b57cec5SDimitry Andric // Does this character contain the \0 character? 22370b57cec5SDimitry Andric const char *NulCharacter = nullptr; 22380b57cec5SDimitry Andric 22390b57cec5SDimitry Andric if (!isLexingRawMode()) { 22400b57cec5SDimitry Andric if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant) 22410b57cec5SDimitry Andric Diag(BufferPtr, getLangOpts().CPlusPlus 22420b57cec5SDimitry Andric ? 
diag::warn_cxx98_compat_unicode_literal 22430b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal); 22440b57cec5SDimitry Andric else if (Kind == tok::utf8_char_constant) 22450b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal); 22460b57cec5SDimitry Andric } 22470b57cec5SDimitry Andric 22480b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 22490b57cec5SDimitry Andric if (C == '\'') { 22500b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 22510b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_empty_character); 22520b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 22530b57cec5SDimitry Andric return true; 22540b57cec5SDimitry Andric } 22550b57cec5SDimitry Andric 22560b57cec5SDimitry Andric while (C != '\'') { 22570b57cec5SDimitry Andric // Skip escaped characters. 22580b57cec5SDimitry Andric if (C == '\\') 22590b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 22600b57cec5SDimitry Andric 22610b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 22620b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
22630b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 22640b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0; 22650b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 22660b57cec5SDimitry Andric return true; 22670b57cec5SDimitry Andric } 22680b57cec5SDimitry Andric 22690b57cec5SDimitry Andric if (C == 0) { 22700b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 22710b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 22720b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 22730b57cec5SDimitry Andric cutOffLexing(); 22740b57cec5SDimitry Andric return true; 22750b57cec5SDimitry Andric } 22760b57cec5SDimitry Andric 22770b57cec5SDimitry Andric NulCharacter = CurPtr-1; 22780b57cec5SDimitry Andric } 22790b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 22800b57cec5SDimitry Andric } 22810b57cec5SDimitry Andric 22820b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 22830b57cec5SDimitry Andric if (getLangOpts().CPlusPlus) 22840b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, false); 22850b57cec5SDimitry Andric 22860b57cec5SDimitry Andric // If a nul character existed in the character, warn about it. 22870b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 22880b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 0; 22890b57cec5SDimitry Andric 22900b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 22910b57cec5SDimitry Andric const char *TokStart = BufferPtr; 22920b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 22930b57cec5SDimitry Andric Result.setLiteralData(TokStart); 22940b57cec5SDimitry Andric return true; 22950b57cec5SDimitry Andric } 22960b57cec5SDimitry Andric 22970b57cec5SDimitry Andric /// SkipWhitespace - Efficiently skip over a series of whitespace characters. 
22980b57cec5SDimitry Andric /// Update BufferPtr to point to the next non-whitespace character and return. 22990b57cec5SDimitry Andric /// 23000b57cec5SDimitry Andric /// This method forms a token and returns true if KeepWhitespaceMode is enabled. 23010b57cec5SDimitry Andric bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr, 23020b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 23030b57cec5SDimitry Andric // Whitespace - Skip it, then return the token after the whitespace. 23040b57cec5SDimitry Andric bool SawNewline = isVerticalWhitespace(CurPtr[-1]); 23050b57cec5SDimitry Andric 23060b57cec5SDimitry Andric unsigned char Char = *CurPtr; 23070b57cec5SDimitry Andric 2308e8d8bef9SDimitry Andric const char *lastNewLine = nullptr; 2309e8d8bef9SDimitry Andric auto setLastNewLine = [&](const char *Ptr) { 2310e8d8bef9SDimitry Andric lastNewLine = Ptr; 2311e8d8bef9SDimitry Andric if (!NewLinePtr) 2312e8d8bef9SDimitry Andric NewLinePtr = Ptr; 2313e8d8bef9SDimitry Andric }; 2314e8d8bef9SDimitry Andric if (SawNewline) 2315e8d8bef9SDimitry Andric setLastNewLine(CurPtr - 1); 2316e8d8bef9SDimitry Andric 23170b57cec5SDimitry Andric // Skip consecutive spaces efficiently. 23180b57cec5SDimitry Andric while (true) { 23190b57cec5SDimitry Andric // Skip horizontal whitespace very aggressively. 23200b57cec5SDimitry Andric while (isHorizontalWhitespace(Char)) 23210b57cec5SDimitry Andric Char = *++CurPtr; 23220b57cec5SDimitry Andric 23230b57cec5SDimitry Andric // Otherwise if we have something other than whitespace, we're done. 23240b57cec5SDimitry Andric if (!isVerticalWhitespace(Char)) 23250b57cec5SDimitry Andric break; 23260b57cec5SDimitry Andric 23270b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 23280b57cec5SDimitry Andric // End of preprocessor directive line, let LexTokenInternal handle this. 
23290b57cec5SDimitry Andric BufferPtr = CurPtr; 23300b57cec5SDimitry Andric return false; 23310b57cec5SDimitry Andric } 23320b57cec5SDimitry Andric 23330b57cec5SDimitry Andric // OK, but handle newline. 2334e8d8bef9SDimitry Andric if (*CurPtr == '\n') 2335e8d8bef9SDimitry Andric setLastNewLine(CurPtr); 23360b57cec5SDimitry Andric SawNewline = true; 23370b57cec5SDimitry Andric Char = *++CurPtr; 23380b57cec5SDimitry Andric } 23390b57cec5SDimitry Andric 23400b57cec5SDimitry Andric // If the client wants us to return whitespace, return it now. 23410b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 23420b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 23430b57cec5SDimitry Andric if (SawNewline) { 23440b57cec5SDimitry Andric IsAtStartOfLine = true; 23450b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 23460b57cec5SDimitry Andric } 23470b57cec5SDimitry Andric // FIXME: The next token will not have LeadingSpace set. 23480b57cec5SDimitry Andric return true; 23490b57cec5SDimitry Andric } 23500b57cec5SDimitry Andric 23510b57cec5SDimitry Andric // If this isn't immediately after a newline, there is leading space. 
23520b57cec5SDimitry Andric char PrevChar = CurPtr[-1]; 23530b57cec5SDimitry Andric bool HasLeadingSpace = !isVerticalWhitespace(PrevChar); 23540b57cec5SDimitry Andric 23550b57cec5SDimitry Andric Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace); 23560b57cec5SDimitry Andric if (SawNewline) { 23570b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 23580b57cec5SDimitry Andric TokAtPhysicalStartOfLine = true; 2359e8d8bef9SDimitry Andric 2360e8d8bef9SDimitry Andric if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) { 2361e8d8bef9SDimitry Andric if (auto *Handler = PP->getEmptylineHandler()) 2362e8d8bef9SDimitry Andric Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1), 2363e8d8bef9SDimitry Andric getSourceLocation(lastNewLine))); 2364e8d8bef9SDimitry Andric } 23650b57cec5SDimitry Andric } 23660b57cec5SDimitry Andric 23670b57cec5SDimitry Andric BufferPtr = CurPtr; 23680b57cec5SDimitry Andric return false; 23690b57cec5SDimitry Andric } 23700b57cec5SDimitry Andric 23710b57cec5SDimitry Andric /// We have just read the // characters from input. Skip until we find the 23720b57cec5SDimitry Andric /// newline character that terminates the comment. Then update BufferPtr and 23730b57cec5SDimitry Andric /// return. 23740b57cec5SDimitry Andric /// 23750b57cec5SDimitry Andric /// If we're in KeepCommentMode or any CommentHandler has inserted 23760b57cec5SDimitry Andric /// some tokens, this will store the first token and return true. 23770b57cec5SDimitry Andric bool Lexer::SkipLineComment(Token &Result, const char *CurPtr, 23780b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 23790b57cec5SDimitry Andric // If Line comments aren't explicitly enabled for this language, emit an 23800b57cec5SDimitry Andric // extension warning. 
23810b57cec5SDimitry Andric if (!LangOpts.LineComment && !isLexingRawMode()) { 23820b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_line_comment); 23830b57cec5SDimitry Andric 23840b57cec5SDimitry Andric // Mark them enabled so we only emit one warning for this translation 23850b57cec5SDimitry Andric // unit. 23860b57cec5SDimitry Andric LangOpts.LineComment = true; 23870b57cec5SDimitry Andric } 23880b57cec5SDimitry Andric 23890b57cec5SDimitry Andric // Scan over the body of the comment. The common case, when scanning, is that 23900b57cec5SDimitry Andric // the comment contains normal ascii characters with nothing interesting in 23910b57cec5SDimitry Andric // them. As such, optimize for this case with the inner loop. 23920b57cec5SDimitry Andric // 23930b57cec5SDimitry Andric // This loop terminates with CurPtr pointing at the newline (or end of buffer) 23940b57cec5SDimitry Andric // character that ends the line comment. 23950b57cec5SDimitry Andric char C; 23960b57cec5SDimitry Andric while (true) { 23970b57cec5SDimitry Andric C = *CurPtr; 23980b57cec5SDimitry Andric // Skip over characters in the fast loop. 23990b57cec5SDimitry Andric while (C != 0 && // Potentially EOF. 24000b57cec5SDimitry Andric C != '\n' && C != '\r') // Newline or DOS-style newline. 24010b57cec5SDimitry Andric C = *++CurPtr; 24020b57cec5SDimitry Andric 24030b57cec5SDimitry Andric const char *NextLine = CurPtr; 24040b57cec5SDimitry Andric if (C != 0) { 24050b57cec5SDimitry Andric // We found a newline, see if it's escaped. 24060b57cec5SDimitry Andric const char *EscapePtr = CurPtr-1; 24070b57cec5SDimitry Andric bool HasSpace = false; 24080b57cec5SDimitry Andric while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace. 24090b57cec5SDimitry Andric --EscapePtr; 24100b57cec5SDimitry Andric HasSpace = true; 24110b57cec5SDimitry Andric } 24120b57cec5SDimitry Andric 24130b57cec5SDimitry Andric if (*EscapePtr == '\\') 24140b57cec5SDimitry Andric // Escaped newline. 
24150b57cec5SDimitry Andric CurPtr = EscapePtr; 24160b57cec5SDimitry Andric else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' && 24170b57cec5SDimitry Andric EscapePtr[-2] == '?' && LangOpts.Trigraphs) 24180b57cec5SDimitry Andric // Trigraph-escaped newline. 24190b57cec5SDimitry Andric CurPtr = EscapePtr-2; 24200b57cec5SDimitry Andric else 24210b57cec5SDimitry Andric break; // This is a newline, we're done. 24220b57cec5SDimitry Andric 24230b57cec5SDimitry Andric // If there was space between the backslash and newline, warn about it. 24240b57cec5SDimitry Andric if (HasSpace && !isLexingRawMode()) 24250b57cec5SDimitry Andric Diag(EscapePtr, diag::backslash_newline_space); 24260b57cec5SDimitry Andric } 24270b57cec5SDimitry Andric 24280b57cec5SDimitry Andric // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to 24290b57cec5SDimitry Andric // properly decode the character. Read it in raw mode to avoid emitting 24300b57cec5SDimitry Andric // diagnostics about things like trigraphs. If we see an escaped newline, 24310b57cec5SDimitry Andric // we'll handle it below. 24320b57cec5SDimitry Andric const char *OldPtr = CurPtr; 24330b57cec5SDimitry Andric bool OldRawMode = isLexingRawMode(); 24340b57cec5SDimitry Andric LexingRawMode = true; 24350b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 24360b57cec5SDimitry Andric LexingRawMode = OldRawMode; 24370b57cec5SDimitry Andric 24380b57cec5SDimitry Andric // If we only read only one character, then no special handling is needed. 24390b57cec5SDimitry Andric // We're done and can skip forward to the newline. 24400b57cec5SDimitry Andric if (C != 0 && CurPtr == OldPtr+1) { 24410b57cec5SDimitry Andric CurPtr = NextLine; 24420b57cec5SDimitry Andric break; 24430b57cec5SDimitry Andric } 24440b57cec5SDimitry Andric 24450b57cec5SDimitry Andric // If we read multiple characters, and one of those characters was a \r or 24460b57cec5SDimitry Andric // \n, then we had an escaped newline within the comment. 
Emit diagnostic 24470b57cec5SDimitry Andric // unless the next line is also a // comment. 24480b57cec5SDimitry Andric if (CurPtr != OldPtr + 1 && C != '/' && 24490b57cec5SDimitry Andric (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) { 24500b57cec5SDimitry Andric for (; OldPtr != CurPtr; ++OldPtr) 24510b57cec5SDimitry Andric if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 24520b57cec5SDimitry Andric // Okay, we found a // comment that ends in a newline, if the next 24530b57cec5SDimitry Andric // line is also a // comment, but has spaces, don't emit a diagnostic. 24540b57cec5SDimitry Andric if (isWhitespace(C)) { 24550b57cec5SDimitry Andric const char *ForwardPtr = CurPtr; 24560b57cec5SDimitry Andric while (isWhitespace(*ForwardPtr)) // Skip whitespace. 24570b57cec5SDimitry Andric ++ForwardPtr; 24580b57cec5SDimitry Andric if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 24590b57cec5SDimitry Andric break; 24600b57cec5SDimitry Andric } 24610b57cec5SDimitry Andric 24620b57cec5SDimitry Andric if (!isLexingRawMode()) 24630b57cec5SDimitry Andric Diag(OldPtr-1, diag::ext_multi_line_line_comment); 24640b57cec5SDimitry Andric break; 24650b57cec5SDimitry Andric } 24660b57cec5SDimitry Andric } 24670b57cec5SDimitry Andric 24680b57cec5SDimitry Andric if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) { 24690b57cec5SDimitry Andric --CurPtr; 24700b57cec5SDimitry Andric break; 24710b57cec5SDimitry Andric } 24720b57cec5SDimitry Andric 24730b57cec5SDimitry Andric if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 24740b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 24750b57cec5SDimitry Andric cutOffLexing(); 24760b57cec5SDimitry Andric return false; 24770b57cec5SDimitry Andric } 24780b57cec5SDimitry Andric } 24790b57cec5SDimitry Andric 24800b57cec5SDimitry Andric // Found but did not consume the newline. Notify comment handlers about the 24810b57cec5SDimitry Andric // comment unless we're in a #if 0 block. 
24820b57cec5SDimitry Andric if (PP && !isLexingRawMode() && 24830b57cec5SDimitry Andric PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 24840b57cec5SDimitry Andric getSourceLocation(CurPtr)))) { 24850b57cec5SDimitry Andric BufferPtr = CurPtr; 24860b57cec5SDimitry Andric return true; // A token has to be returned. 24870b57cec5SDimitry Andric } 24880b57cec5SDimitry Andric 24890b57cec5SDimitry Andric // If we are returning comments as tokens, return this comment as a token. 24900b57cec5SDimitry Andric if (inKeepCommentMode()) 24910b57cec5SDimitry Andric return SaveLineComment(Result, CurPtr); 24920b57cec5SDimitry Andric 24930b57cec5SDimitry Andric // If we are inside a preprocessor directive and we see the end of line, 24940b57cec5SDimitry Andric // return immediately, so that the lexer can return this as an EOD token. 24950b57cec5SDimitry Andric if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 24960b57cec5SDimitry Andric BufferPtr = CurPtr; 24970b57cec5SDimitry Andric return false; 24980b57cec5SDimitry Andric } 24990b57cec5SDimitry Andric 25000b57cec5SDimitry Andric // Otherwise, eat the \n character. We don't care if this is a \n\r or 25010b57cec5SDimitry Andric // \r\n sequence. This is an efficiency hack (because we know the \n can't 25020b57cec5SDimitry Andric // contribute to another token), it isn't needed for correctness. Note that 25030b57cec5SDimitry Andric // this is ok even in KeepWhitespaceMode, because we would have returned the 25040b57cec5SDimitry Andric /// comment above in that mode. 2505e8d8bef9SDimitry Andric NewLinePtr = CurPtr++; 25060b57cec5SDimitry Andric 25070b57cec5SDimitry Andric // The next returned token is at the start of the line. 25080b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 25090b57cec5SDimitry Andric TokAtPhysicalStartOfLine = true; 25100b57cec5SDimitry Andric // No leading whitespace seen so far. 
25110b57cec5SDimitry Andric Result.clearFlag(Token::LeadingSpace); 25120b57cec5SDimitry Andric BufferPtr = CurPtr; 25130b57cec5SDimitry Andric return false; 25140b57cec5SDimitry Andric } 25150b57cec5SDimitry Andric 25160b57cec5SDimitry Andric /// If in save-comment mode, package up this Line comment in an appropriate 25170b57cec5SDimitry Andric /// way and return it. 25180b57cec5SDimitry Andric bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) { 25190b57cec5SDimitry Andric // If we're not in a preprocessor directive, just return the // comment 25200b57cec5SDimitry Andric // directly. 25210b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::comment); 25220b57cec5SDimitry Andric 25230b57cec5SDimitry Andric if (!ParsingPreprocessorDirective || LexingRawMode) 25240b57cec5SDimitry Andric return true; 25250b57cec5SDimitry Andric 25260b57cec5SDimitry Andric // If this Line-style comment is in a macro definition, transmogrify it into 25270b57cec5SDimitry Andric // a C-style block comment. 25280b57cec5SDimitry Andric bool Invalid = false; 25290b57cec5SDimitry Andric std::string Spelling = PP->getSpelling(Result, &Invalid); 25300b57cec5SDimitry Andric if (Invalid) 25310b57cec5SDimitry Andric return true; 25320b57cec5SDimitry Andric 25330b57cec5SDimitry Andric assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?"); 25340b57cec5SDimitry Andric Spelling[1] = '*'; // Change prefix to "/*". 25350b57cec5SDimitry Andric Spelling += "*/"; // add suffix. 
25360b57cec5SDimitry Andric 25370b57cec5SDimitry Andric Result.setKind(tok::comment); 25380b57cec5SDimitry Andric PP->CreateString(Spelling, Result, 25390b57cec5SDimitry Andric Result.getLocation(), Result.getLocation()); 25400b57cec5SDimitry Andric return true; 25410b57cec5SDimitry Andric } 25420b57cec5SDimitry Andric 25430b57cec5SDimitry Andric /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline 25440b57cec5SDimitry Andric /// character (either \\n or \\r) is part of an escaped newline sequence. Issue 25450b57cec5SDimitry Andric /// a diagnostic if so. We know that the newline is inside of a block comment. 25460b57cec5SDimitry Andric static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, 25470b57cec5SDimitry Andric Lexer *L) { 25480b57cec5SDimitry Andric assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); 25490b57cec5SDimitry Andric 2550fe6060f1SDimitry Andric // Position of the first trigraph in the ending sequence. 2551*04eeddc0SDimitry Andric const char *TrigraphPos = nullptr; 2552fe6060f1SDimitry Andric // Position of the first whitespace after a '\' in the ending sequence. 2553*04eeddc0SDimitry Andric const char *SpacePos = nullptr; 2554fe6060f1SDimitry Andric 2555fe6060f1SDimitry Andric while (true) { 25560b57cec5SDimitry Andric // Back up off the newline. 25570b57cec5SDimitry Andric --CurPtr; 25580b57cec5SDimitry Andric 25590b57cec5SDimitry Andric // If this is a two-character newline sequence, skip the other character. 25600b57cec5SDimitry Andric if (CurPtr[0] == '\n' || CurPtr[0] == '\r') { 25610b57cec5SDimitry Andric // \n\n or \r\r -> not escaped newline. 25620b57cec5SDimitry Andric if (CurPtr[0] == CurPtr[1]) 25630b57cec5SDimitry Andric return false; 25640b57cec5SDimitry Andric // \n\r or \r\n -> skip the newline. 25650b57cec5SDimitry Andric --CurPtr; 25660b57cec5SDimitry Andric } 25670b57cec5SDimitry Andric 25680b57cec5SDimitry Andric // If we have horizontal whitespace, skip over it. 
We allow whitespace 25690b57cec5SDimitry Andric // between the slash and newline. 25700b57cec5SDimitry Andric while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 2571fe6060f1SDimitry Andric SpacePos = CurPtr; 25720b57cec5SDimitry Andric --CurPtr; 25730b57cec5SDimitry Andric } 25740b57cec5SDimitry Andric 2575fe6060f1SDimitry Andric // If we have a slash, this is an escaped newline. 25760b57cec5SDimitry Andric if (*CurPtr == '\\') { 2577fe6060f1SDimitry Andric --CurPtr; 2578fe6060f1SDimitry Andric } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') { 2579fe6060f1SDimitry Andric // This is a trigraph encoding of a slash. 2580fe6060f1SDimitry Andric TrigraphPos = CurPtr - 2; 2581fe6060f1SDimitry Andric CurPtr -= 3; 25820b57cec5SDimitry Andric } else { 25830b57cec5SDimitry Andric return false; 2584fe6060f1SDimitry Andric } 25850b57cec5SDimitry Andric 2586fe6060f1SDimitry Andric // If the character preceding the escaped newline is a '*', then after line 2587fe6060f1SDimitry Andric // splicing we have a '*/' ending the comment. 2588fe6060f1SDimitry Andric if (*CurPtr == '*') 2589fe6060f1SDimitry Andric break; 25900b57cec5SDimitry Andric 2591fe6060f1SDimitry Andric if (*CurPtr != '\n' && *CurPtr != '\r') 2592fe6060f1SDimitry Andric return false; 2593fe6060f1SDimitry Andric } 2594fe6060f1SDimitry Andric 2595fe6060f1SDimitry Andric if (TrigraphPos) { 25960b57cec5SDimitry Andric // If no trigraphs are enabled, warn that we ignored this trigraph and 25970b57cec5SDimitry Andric // ignore this * character. 
25980b57cec5SDimitry Andric if (!L->getLangOpts().Trigraphs) { 25990b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2600fe6060f1SDimitry Andric L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment); 26010b57cec5SDimitry Andric return false; 26020b57cec5SDimitry Andric } 26030b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2604fe6060f1SDimitry Andric L->Diag(TrigraphPos, diag::trigraph_ends_block_comment); 26050b57cec5SDimitry Andric } 26060b57cec5SDimitry Andric 26070b57cec5SDimitry Andric // Warn about having an escaped newline between the */ characters. 26080b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2609fe6060f1SDimitry Andric L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end); 26100b57cec5SDimitry Andric 26110b57cec5SDimitry Andric // If there was space between the backslash and newline, warn about it. 2612fe6060f1SDimitry Andric if (SpacePos && !L->isLexingRawMode()) 2613fe6060f1SDimitry Andric L->Diag(SpacePos, diag::backslash_newline_space); 26140b57cec5SDimitry Andric 26150b57cec5SDimitry Andric return true; 26160b57cec5SDimitry Andric } 26170b57cec5SDimitry Andric 26180b57cec5SDimitry Andric #ifdef __SSE2__ 26190b57cec5SDimitry Andric #include <emmintrin.h> 26200b57cec5SDimitry Andric #elif __ALTIVEC__ 26210b57cec5SDimitry Andric #include <altivec.h> 26220b57cec5SDimitry Andric #undef bool 26230b57cec5SDimitry Andric #endif 26240b57cec5SDimitry Andric 26250b57cec5SDimitry Andric /// We have just read from input the / and * characters that started a comment. 26260b57cec5SDimitry Andric /// Read until we find the * and / characters that terminate the comment. 26270b57cec5SDimitry Andric /// Note that we don't bother decoding trigraphs or escaped newlines in block 26280b57cec5SDimitry Andric /// comments, because they cannot cause the comment to end. The only thing 26290b57cec5SDimitry Andric /// that can happen is the comment could end with an escaped newline between 26300b57cec5SDimitry Andric /// the terminating * and /. 
26310b57cec5SDimitry Andric /// 26320b57cec5SDimitry Andric /// If we're in KeepCommentMode or any CommentHandler has inserted 26330b57cec5SDimitry Andric /// some tokens, this will store the first token and return true. 26340b57cec5SDimitry Andric bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, 26350b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 26360b57cec5SDimitry Andric // Scan one character past where we should, looking for a '/' character. Once 26370b57cec5SDimitry Andric // we find it, check to see if it was preceded by a *. This common 26380b57cec5SDimitry Andric // optimization helps people who like to put a lot of * characters in their 26390b57cec5SDimitry Andric // comments. 26400b57cec5SDimitry Andric 26410b57cec5SDimitry Andric // The first character we get with newlines and trigraphs skipped to handle 26420b57cec5SDimitry Andric // the degenerate /*/ case below correctly if the * has an escaped newline 26430b57cec5SDimitry Andric // after it. 26440b57cec5SDimitry Andric unsigned CharSize; 26450b57cec5SDimitry Andric unsigned char C = getCharAndSize(CurPtr, CharSize); 26460b57cec5SDimitry Andric CurPtr += CharSize; 26470b57cec5SDimitry Andric if (C == 0 && CurPtr == BufferEnd+1) { 26480b57cec5SDimitry Andric if (!isLexingRawMode()) 26490b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment); 26500b57cec5SDimitry Andric --CurPtr; 26510b57cec5SDimitry Andric 26520b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. Since 26530b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token. 
26540b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 26550b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 26560b57cec5SDimitry Andric return true; 26570b57cec5SDimitry Andric } 26580b57cec5SDimitry Andric 26590b57cec5SDimitry Andric BufferPtr = CurPtr; 26600b57cec5SDimitry Andric return false; 26610b57cec5SDimitry Andric } 26620b57cec5SDimitry Andric 26630b57cec5SDimitry Andric // Check to see if the first character after the '/*' is another /. If so, 26640b57cec5SDimitry Andric // then this slash does not end the block comment, it is part of it. 26650b57cec5SDimitry Andric if (C == '/') 26660b57cec5SDimitry Andric C = *CurPtr++; 26670b57cec5SDimitry Andric 26680b57cec5SDimitry Andric while (true) { 26690b57cec5SDimitry Andric // Skip over all non-interesting characters until we find end of buffer or a 26700b57cec5SDimitry Andric // (probably ending) '/' character. 26710b57cec5SDimitry Andric if (CurPtr + 24 < BufferEnd && 26720b57cec5SDimitry Andric // If there is a code-completion point avoid the fast scan because it 26730b57cec5SDimitry Andric // doesn't check for '\0'. 26740b57cec5SDimitry Andric !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { 26750b57cec5SDimitry Andric // While not aligned to a 16-byte boundary. 26760b57cec5SDimitry Andric while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) 26770b57cec5SDimitry Andric C = *CurPtr++; 26780b57cec5SDimitry Andric 26790b57cec5SDimitry Andric if (C == '/') goto FoundSlash; 26800b57cec5SDimitry Andric 26810b57cec5SDimitry Andric #ifdef __SSE2__ 26820b57cec5SDimitry Andric __m128i Slashes = _mm_set1_epi8('/'); 26830b57cec5SDimitry Andric while (CurPtr+16 <= BufferEnd) { 26840b57cec5SDimitry Andric int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr, 26850b57cec5SDimitry Andric Slashes)); 26860b57cec5SDimitry Andric if (cmp != 0) { 26870b57cec5SDimitry Andric // Adjust the pointer to point directly after the first slash. 
It's 26880b57cec5SDimitry Andric // not necessary to set C here, it will be overwritten at the end of 26890b57cec5SDimitry Andric // the outer loop. 26900b57cec5SDimitry Andric CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1; 26910b57cec5SDimitry Andric goto FoundSlash; 26920b57cec5SDimitry Andric } 26930b57cec5SDimitry Andric CurPtr += 16; 26940b57cec5SDimitry Andric } 26950b57cec5SDimitry Andric #elif __ALTIVEC__ 26960b57cec5SDimitry Andric __vector unsigned char Slashes = { 26970b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/', 26980b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/' 26990b57cec5SDimitry Andric }; 27000b57cec5SDimitry Andric while (CurPtr + 16 <= BufferEnd && 270113138422SDimitry Andric !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) 27020b57cec5SDimitry Andric CurPtr += 16; 27030b57cec5SDimitry Andric #else 27040b57cec5SDimitry Andric // Scan for '/' quickly. Many block comments are very large. 27050b57cec5SDimitry Andric while (CurPtr[0] != '/' && 27060b57cec5SDimitry Andric CurPtr[1] != '/' && 27070b57cec5SDimitry Andric CurPtr[2] != '/' && 27080b57cec5SDimitry Andric CurPtr[3] != '/' && 27090b57cec5SDimitry Andric CurPtr+4 < BufferEnd) { 27100b57cec5SDimitry Andric CurPtr += 4; 27110b57cec5SDimitry Andric } 27120b57cec5SDimitry Andric #endif 27130b57cec5SDimitry Andric 27140b57cec5SDimitry Andric // It has to be one of the bytes scanned, increment to it and read one. 27150b57cec5SDimitry Andric C = *CurPtr++; 27160b57cec5SDimitry Andric } 27170b57cec5SDimitry Andric 27180b57cec5SDimitry Andric // Loop to scan the remainder. 27190b57cec5SDimitry Andric while (C != '/' && C != '\0') 27200b57cec5SDimitry Andric C = *CurPtr++; 27210b57cec5SDimitry Andric 27220b57cec5SDimitry Andric if (C == '/') { 27230b57cec5SDimitry Andric FoundSlash: 27240b57cec5SDimitry Andric if (CurPtr[-2] == '*') // We found the final */. We're done! 
27250b57cec5SDimitry Andric break; 27260b57cec5SDimitry Andric 27270b57cec5SDimitry Andric if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 27280b57cec5SDimitry Andric if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { 27290b57cec5SDimitry Andric // We found the final */, though it had an escaped newline between the 27300b57cec5SDimitry Andric // * and /. We're done! 27310b57cec5SDimitry Andric break; 27320b57cec5SDimitry Andric } 27330b57cec5SDimitry Andric } 27340b57cec5SDimitry Andric if (CurPtr[0] == '*' && CurPtr[1] != '/') { 27350b57cec5SDimitry Andric // If this is a /* inside of the comment, emit a warning. Don't do this 27360b57cec5SDimitry Andric // if this is a /*/, which will end the comment. This misses cases with 27370b57cec5SDimitry Andric // embedded escaped newlines, but oh well. 27380b57cec5SDimitry Andric if (!isLexingRawMode()) 27390b57cec5SDimitry Andric Diag(CurPtr-1, diag::warn_nested_block_comment); 27400b57cec5SDimitry Andric } 27410b57cec5SDimitry Andric } else if (C == 0 && CurPtr == BufferEnd+1) { 27420b57cec5SDimitry Andric if (!isLexingRawMode()) 27430b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment); 27440b57cec5SDimitry Andric // Note: the user probably forgot a */. We could continue immediately 27450b57cec5SDimitry Andric // after the /*, but this would involve lexing a lot of what really is the 27460b57cec5SDimitry Andric // comment, which surely would confuse the parser. 27470b57cec5SDimitry Andric --CurPtr; 27480b57cec5SDimitry Andric 27490b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. Since 27500b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token. 
27510b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 27520b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 27530b57cec5SDimitry Andric return true; 27540b57cec5SDimitry Andric } 27550b57cec5SDimitry Andric 27560b57cec5SDimitry Andric BufferPtr = CurPtr; 27570b57cec5SDimitry Andric return false; 27580b57cec5SDimitry Andric } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 27590b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 27600b57cec5SDimitry Andric cutOffLexing(); 27610b57cec5SDimitry Andric return false; 27620b57cec5SDimitry Andric } 27630b57cec5SDimitry Andric 27640b57cec5SDimitry Andric C = *CurPtr++; 27650b57cec5SDimitry Andric } 27660b57cec5SDimitry Andric 27670b57cec5SDimitry Andric // Notify comment handlers about the comment unless we're in a #if 0 block. 27680b57cec5SDimitry Andric if (PP && !isLexingRawMode() && 27690b57cec5SDimitry Andric PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 27700b57cec5SDimitry Andric getSourceLocation(CurPtr)))) { 27710b57cec5SDimitry Andric BufferPtr = CurPtr; 27720b57cec5SDimitry Andric return true; // A token has to be returned. 27730b57cec5SDimitry Andric } 27740b57cec5SDimitry Andric 27750b57cec5SDimitry Andric // If we are returning comments as tokens, return this comment as a token. 27760b57cec5SDimitry Andric if (inKeepCommentMode()) { 27770b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::comment); 27780b57cec5SDimitry Andric return true; 27790b57cec5SDimitry Andric } 27800b57cec5SDimitry Andric 27810b57cec5SDimitry Andric // It is common for the tokens immediately after a /**/ comment to be 27820b57cec5SDimitry Andric // whitespace. Instead of going through the big switch, handle it 27830b57cec5SDimitry Andric // efficiently now. This is safe even in KeepWhitespaceMode because we would 27840b57cec5SDimitry Andric // have already returned above with the comment as a token. 
27850b57cec5SDimitry Andric if (isHorizontalWhitespace(*CurPtr)) { 27860b57cec5SDimitry Andric SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine); 27870b57cec5SDimitry Andric return false; 27880b57cec5SDimitry Andric } 27890b57cec5SDimitry Andric 27900b57cec5SDimitry Andric // Otherwise, just return so that the next character will be lexed as a token. 27910b57cec5SDimitry Andric BufferPtr = CurPtr; 27920b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 27930b57cec5SDimitry Andric return false; 27940b57cec5SDimitry Andric } 27950b57cec5SDimitry Andric 27960b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 27970b57cec5SDimitry Andric // Primary Lexing Entry Points 27980b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 27990b57cec5SDimitry Andric 28000b57cec5SDimitry Andric /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 28010b57cec5SDimitry Andric /// uninterpreted string. This switches the lexer out of directive mode. 28020b57cec5SDimitry Andric void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) { 28030b57cec5SDimitry Andric assert(ParsingPreprocessorDirective && ParsingFilename == false && 28040b57cec5SDimitry Andric "Must be in a preprocessing directive!"); 28050b57cec5SDimitry Andric Token Tmp; 2806480093f4SDimitry Andric Tmp.startToken(); 28070b57cec5SDimitry Andric 28080b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable. 28090b57cec5SDimitry Andric const char *CurPtr = BufferPtr; 28100b57cec5SDimitry Andric while (true) { 28110b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Tmp); 28120b57cec5SDimitry Andric switch (Char) { 28130b57cec5SDimitry Andric default: 28140b57cec5SDimitry Andric if (Result) 28150b57cec5SDimitry Andric Result->push_back(Char); 28160b57cec5SDimitry Andric break; 28170b57cec5SDimitry Andric case 0: // Null. 
28180b57cec5SDimitry Andric // Found end of file? 28190b57cec5SDimitry Andric if (CurPtr-1 != BufferEnd) { 28200b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 28210b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 28220b57cec5SDimitry Andric cutOffLexing(); 28230b57cec5SDimitry Andric return; 28240b57cec5SDimitry Andric } 28250b57cec5SDimitry Andric 28260b57cec5SDimitry Andric // Nope, normal character, continue. 28270b57cec5SDimitry Andric if (Result) 28280b57cec5SDimitry Andric Result->push_back(Char); 28290b57cec5SDimitry Andric break; 28300b57cec5SDimitry Andric } 28310b57cec5SDimitry Andric // FALL THROUGH. 28320b57cec5SDimitry Andric LLVM_FALLTHROUGH; 28330b57cec5SDimitry Andric case '\r': 28340b57cec5SDimitry Andric case '\n': 28350b57cec5SDimitry Andric // Okay, we found the end of the line. First, back up past the \0, \r, \n. 28360b57cec5SDimitry Andric assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 28370b57cec5SDimitry Andric BufferPtr = CurPtr-1; 28380b57cec5SDimitry Andric 28390b57cec5SDimitry Andric // Next, lex the character, which should handle the EOD transition. 28400b57cec5SDimitry Andric Lex(Tmp); 28410b57cec5SDimitry Andric if (Tmp.is(tok::code_completion)) { 28420b57cec5SDimitry Andric if (PP) 28430b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 28440b57cec5SDimitry Andric Lex(Tmp); 28450b57cec5SDimitry Andric } 28460b57cec5SDimitry Andric assert(Tmp.is(tok::eod) && "Unexpected token!"); 28470b57cec5SDimitry Andric 28480b57cec5SDimitry Andric // Finally, we're done; 28490b57cec5SDimitry Andric return; 28500b57cec5SDimitry Andric } 28510b57cec5SDimitry Andric } 28520b57cec5SDimitry Andric } 28530b57cec5SDimitry Andric 28540b57cec5SDimitry Andric /// LexEndOfFile - CurPtr points to the end of this file. Handle this 28550b57cec5SDimitry Andric /// condition, reporting diagnostics and handling other edge cases as required. 
28560b57cec5SDimitry Andric /// This returns true if Result contains a token, false if PP.Lex should be 28570b57cec5SDimitry Andric /// called again. 28580b57cec5SDimitry Andric bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 28590b57cec5SDimitry Andric // If we hit the end of the file while parsing a preprocessor directive, 28600b57cec5SDimitry Andric // end the preprocessor directive first. The next token returned will 28610b57cec5SDimitry Andric // then be the end of file. 28620b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 28630b57cec5SDimitry Andric // Done parsing the "line". 28640b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 28650b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 28660b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::eod); 28670b57cec5SDimitry Andric 28680b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive. 28690b57cec5SDimitry Andric if (PP) 28700b57cec5SDimitry Andric resetExtendedTokenMode(); 28710b57cec5SDimitry Andric return true; // Have a token. 28720b57cec5SDimitry Andric } 28730b57cec5SDimitry Andric 28740b57cec5SDimitry Andric // If we are in raw mode, return this event as an EOF token. Let the caller 28750b57cec5SDimitry Andric // that put us in raw mode handle the event. 28760b57cec5SDimitry Andric if (isLexingRawMode()) { 28770b57cec5SDimitry Andric Result.startToken(); 28780b57cec5SDimitry Andric BufferPtr = BufferEnd; 28790b57cec5SDimitry Andric FormTokenWithChars(Result, BufferEnd, tok::eof); 28800b57cec5SDimitry Andric return true; 28810b57cec5SDimitry Andric } 28820b57cec5SDimitry Andric 28830b57cec5SDimitry Andric if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { 28840b57cec5SDimitry Andric PP->setRecordedPreambleConditionalStack(ConditionalStack); 2885fe6060f1SDimitry Andric // If the preamble cuts off the end of a header guard, consider it guarded. 
2886fe6060f1SDimitry Andric // The guard is valid for the preamble content itself, and for tools the 2887fe6060f1SDimitry Andric // most useful answer is "yes, this file has a header guard". 2888fe6060f1SDimitry Andric if (!ConditionalStack.empty()) 2889fe6060f1SDimitry Andric MIOpt.ExitTopLevelConditional(); 28900b57cec5SDimitry Andric ConditionalStack.clear(); 28910b57cec5SDimitry Andric } 28920b57cec5SDimitry Andric 28930b57cec5SDimitry Andric // Issue diagnostics for unterminated #if and missing newline. 28940b57cec5SDimitry Andric 28950b57cec5SDimitry Andric // If we are in a #if directive, emit an error. 28960b57cec5SDimitry Andric while (!ConditionalStack.empty()) { 28970b57cec5SDimitry Andric if (PP->getCodeCompletionFileLoc() != FileLoc) 28980b57cec5SDimitry Andric PP->Diag(ConditionalStack.back().IfLoc, 28990b57cec5SDimitry Andric diag::err_pp_unterminated_conditional); 29000b57cec5SDimitry Andric ConditionalStack.pop_back(); 29010b57cec5SDimitry Andric } 29020b57cec5SDimitry Andric 29036e75b2fbSDimitry Andric SourceLocation EndLoc = getSourceLocation(BufferEnd); 29040b57cec5SDimitry Andric // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 29050b57cec5SDimitry Andric // a pedwarn. 29060b57cec5SDimitry Andric if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { 29070b57cec5SDimitry Andric DiagnosticsEngine &Diags = PP->getDiagnostics(); 29080b57cec5SDimitry Andric unsigned DiagID; 29090b57cec5SDimitry Andric 29100b57cec5SDimitry Andric if (LangOpts.CPlusPlus11) { 29110b57cec5SDimitry Andric // C++11 [lex.phases] 2.2 p2 29120b57cec5SDimitry Andric // Prefer the C++98 pedantic compatibility warning over the generic, 29130b57cec5SDimitry Andric // non-extension, user-requested "missing newline at EOF" warning. 
29140b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) { 29150b57cec5SDimitry Andric DiagID = diag::warn_cxx98_compat_no_newline_eof; 29160b57cec5SDimitry Andric } else { 29170b57cec5SDimitry Andric DiagID = diag::warn_no_newline_eof; 29180b57cec5SDimitry Andric } 29190b57cec5SDimitry Andric } else { 29200b57cec5SDimitry Andric DiagID = diag::ext_no_newline_eof; 29210b57cec5SDimitry Andric } 29220b57cec5SDimitry Andric 29230b57cec5SDimitry Andric Diag(BufferEnd, DiagID) 29240b57cec5SDimitry Andric << FixItHint::CreateInsertion(EndLoc, "\n"); 29250b57cec5SDimitry Andric } 29260b57cec5SDimitry Andric 29270b57cec5SDimitry Andric BufferPtr = CurPtr; 29280b57cec5SDimitry Andric 29290b57cec5SDimitry Andric // Finally, let the preprocessor handle this. 29306e75b2fbSDimitry Andric return PP->HandleEndOfFile(Result, EndLoc, isPragmaLexer()); 29310b57cec5SDimitry Andric } 29320b57cec5SDimitry Andric 29330b57cec5SDimitry Andric /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 29340b57cec5SDimitry Andric /// the specified lexer will return a tok::l_paren token, 0 if it is something 29350b57cec5SDimitry Andric /// else and 2 if there are no more tokens in the buffer controlled by the 29360b57cec5SDimitry Andric /// lexer. 29370b57cec5SDimitry Andric unsigned Lexer::isNextPPTokenLParen() { 29380b57cec5SDimitry Andric assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 29390b57cec5SDimitry Andric 29400b57cec5SDimitry Andric // Switch to 'skipping' mode. This will ensure that we can lex a token 29410b57cec5SDimitry Andric // without emitting diagnostics, disables macro expansion, and will cause EOF 29420b57cec5SDimitry Andric // to return an EOF token instead of popping the include stack. 29430b57cec5SDimitry Andric LexingRawMode = true; 29440b57cec5SDimitry Andric 29450b57cec5SDimitry Andric // Save state that can be changed while lexing so that we can restore it. 
29460b57cec5SDimitry Andric const char *TmpBufferPtr = BufferPtr; 29470b57cec5SDimitry Andric bool inPPDirectiveMode = ParsingPreprocessorDirective; 29480b57cec5SDimitry Andric bool atStartOfLine = IsAtStartOfLine; 29490b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 29500b57cec5SDimitry Andric bool leadingSpace = HasLeadingSpace; 29510b57cec5SDimitry Andric 29520b57cec5SDimitry Andric Token Tok; 29530b57cec5SDimitry Andric Lex(Tok); 29540b57cec5SDimitry Andric 29550b57cec5SDimitry Andric // Restore state that may have changed. 29560b57cec5SDimitry Andric BufferPtr = TmpBufferPtr; 29570b57cec5SDimitry Andric ParsingPreprocessorDirective = inPPDirectiveMode; 29580b57cec5SDimitry Andric HasLeadingSpace = leadingSpace; 29590b57cec5SDimitry Andric IsAtStartOfLine = atStartOfLine; 29600b57cec5SDimitry Andric IsAtPhysicalStartOfLine = atPhysicalStartOfLine; 29610b57cec5SDimitry Andric 29620b57cec5SDimitry Andric // Restore the lexer back to non-skipping mode. 29630b57cec5SDimitry Andric LexingRawMode = false; 29640b57cec5SDimitry Andric 29650b57cec5SDimitry Andric if (Tok.is(tok::eof)) 29660b57cec5SDimitry Andric return 2; 29670b57cec5SDimitry Andric return Tok.is(tok::l_paren); 29680b57cec5SDimitry Andric } 29690b57cec5SDimitry Andric 29700b57cec5SDimitry Andric /// Find the end of a version control conflict marker. 29710b57cec5SDimitry Andric static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 29720b57cec5SDimitry Andric ConflictMarkerKind CMK) { 29730b57cec5SDimitry Andric const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 29740b57cec5SDimitry Andric size_t TermLen = CMK == CMK_Perforce ? 5 : 7; 29750b57cec5SDimitry Andric auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen); 29760b57cec5SDimitry Andric size_t Pos = RestOfBuffer.find(Terminator); 29770b57cec5SDimitry Andric while (Pos != StringRef::npos) { 29780b57cec5SDimitry Andric // Must occur at start of line. 
29790b57cec5SDimitry Andric if (Pos == 0 || 29800b57cec5SDimitry Andric (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) { 29810b57cec5SDimitry Andric RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 29820b57cec5SDimitry Andric Pos = RestOfBuffer.find(Terminator); 29830b57cec5SDimitry Andric continue; 29840b57cec5SDimitry Andric } 29850b57cec5SDimitry Andric return RestOfBuffer.data()+Pos; 29860b57cec5SDimitry Andric } 29870b57cec5SDimitry Andric return nullptr; 29880b57cec5SDimitry Andric } 29890b57cec5SDimitry Andric 29900b57cec5SDimitry Andric /// IsStartOfConflictMarker - If the specified pointer is the start of a version 29910b57cec5SDimitry Andric /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 29920b57cec5SDimitry Andric /// and recover nicely. This returns true if it is a conflict marker and false 29930b57cec5SDimitry Andric /// if not. 29940b57cec5SDimitry Andric bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 29950b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line. 29960b57cec5SDimitry Andric if (CurPtr != BufferStart && 29970b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 29980b57cec5SDimitry Andric return false; 29990b57cec5SDimitry Andric 30000b57cec5SDimitry Andric // Check to see if we have <<<<<<< or >>>>. 30010b57cec5SDimitry Andric if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") && 30020b57cec5SDimitry Andric !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> ")) 30030b57cec5SDimitry Andric return false; 30040b57cec5SDimitry Andric 30050b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore 30060b57cec5SDimitry Andric // it. 30070b57cec5SDimitry Andric if (CurrentConflictMarkerState || isLexingRawMode()) 30080b57cec5SDimitry Andric return false; 30090b57cec5SDimitry Andric 30100b57cec5SDimitry Andric ConflictMarkerKind Kind = *CurPtr == '<' ? 
CMK_Normal : CMK_Perforce; 30110b57cec5SDimitry Andric 30120b57cec5SDimitry Andric // Check to see if there is an ending marker somewhere in the buffer at the 30130b57cec5SDimitry Andric // start of a line to terminate this conflict marker. 30140b57cec5SDimitry Andric if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 30150b57cec5SDimitry Andric // We found a match. We are really in a conflict marker. 30160b57cec5SDimitry Andric // Diagnose this, and ignore to the end of line. 30170b57cec5SDimitry Andric Diag(CurPtr, diag::err_conflict_marker); 30180b57cec5SDimitry Andric CurrentConflictMarkerState = Kind; 30190b57cec5SDimitry Andric 30200b57cec5SDimitry Andric // Skip ahead to the end of line. We know this exists because the 30210b57cec5SDimitry Andric // end-of-conflict marker starts with \r or \n. 30220b57cec5SDimitry Andric while (*CurPtr != '\r' && *CurPtr != '\n') { 30230b57cec5SDimitry Andric assert(CurPtr != BufferEnd && "Didn't find end of line"); 30240b57cec5SDimitry Andric ++CurPtr; 30250b57cec5SDimitry Andric } 30260b57cec5SDimitry Andric BufferPtr = CurPtr; 30270b57cec5SDimitry Andric return true; 30280b57cec5SDimitry Andric } 30290b57cec5SDimitry Andric 30300b57cec5SDimitry Andric // No end of conflict marker found. 30310b57cec5SDimitry Andric return false; 30320b57cec5SDimitry Andric } 30330b57cec5SDimitry Andric 30340b57cec5SDimitry Andric /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 30350b57cec5SDimitry Andric /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 30360b57cec5SDimitry Andric /// is the end of a conflict marker. Handle it by ignoring up until the end of 30370b57cec5SDimitry Andric /// the line. This returns true if it is a conflict marker and false if not. 30380b57cec5SDimitry Andric bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 30390b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line. 
30400b57cec5SDimitry Andric if (CurPtr != BufferStart && 30410b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 30420b57cec5SDimitry Andric return false; 30430b57cec5SDimitry Andric 30440b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore 30450b57cec5SDimitry Andric // it. 30460b57cec5SDimitry Andric if (!CurrentConflictMarkerState || isLexingRawMode()) 30470b57cec5SDimitry Andric return false; 30480b57cec5SDimitry Andric 30490b57cec5SDimitry Andric // Check to see if we have the marker (4 characters in a row). 30500b57cec5SDimitry Andric for (unsigned i = 1; i != 4; ++i) 30510b57cec5SDimitry Andric if (CurPtr[i] != CurPtr[0]) 30520b57cec5SDimitry Andric return false; 30530b57cec5SDimitry Andric 30540b57cec5SDimitry Andric // If we do have it, search for the end of the conflict marker. This could 30550b57cec5SDimitry Andric // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 30560b57cec5SDimitry Andric // be the end of conflict marker. 30570b57cec5SDimitry Andric if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 30580b57cec5SDimitry Andric CurrentConflictMarkerState)) { 30590b57cec5SDimitry Andric CurPtr = End; 30600b57cec5SDimitry Andric 30610b57cec5SDimitry Andric // Skip ahead to the end of line. 30620b57cec5SDimitry Andric while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 30630b57cec5SDimitry Andric ++CurPtr; 30640b57cec5SDimitry Andric 30650b57cec5SDimitry Andric BufferPtr = CurPtr; 30660b57cec5SDimitry Andric 30670b57cec5SDimitry Andric // No longer in the conflict marker. 
30680b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None; 30690b57cec5SDimitry Andric return true; 30700b57cec5SDimitry Andric } 30710b57cec5SDimitry Andric 30720b57cec5SDimitry Andric return false; 30730b57cec5SDimitry Andric } 30740b57cec5SDimitry Andric 30750b57cec5SDimitry Andric static const char *findPlaceholderEnd(const char *CurPtr, 30760b57cec5SDimitry Andric const char *BufferEnd) { 30770b57cec5SDimitry Andric if (CurPtr == BufferEnd) 30780b57cec5SDimitry Andric return nullptr; 30790b57cec5SDimitry Andric BufferEnd -= 1; // Scan until the second last character. 30800b57cec5SDimitry Andric for (; CurPtr != BufferEnd; ++CurPtr) { 30810b57cec5SDimitry Andric if (CurPtr[0] == '#' && CurPtr[1] == '>') 30820b57cec5SDimitry Andric return CurPtr + 2; 30830b57cec5SDimitry Andric } 30840b57cec5SDimitry Andric return nullptr; 30850b57cec5SDimitry Andric } 30860b57cec5SDimitry Andric 30870b57cec5SDimitry Andric bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) { 30880b57cec5SDimitry Andric assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!"); 30890b57cec5SDimitry Andric if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode) 30900b57cec5SDimitry Andric return false; 30910b57cec5SDimitry Andric const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd); 30920b57cec5SDimitry Andric if (!End) 30930b57cec5SDimitry Andric return false; 30940b57cec5SDimitry Andric const char *Start = CurPtr - 1; 30950b57cec5SDimitry Andric if (!LangOpts.AllowEditorPlaceholders) 30960b57cec5SDimitry Andric Diag(Start, diag::err_placeholder_in_source); 30970b57cec5SDimitry Andric Result.startToken(); 30980b57cec5SDimitry Andric FormTokenWithChars(Result, End, tok::raw_identifier); 30990b57cec5SDimitry Andric Result.setRawIdentifierData(Start); 31000b57cec5SDimitry Andric PP->LookUpIdentifierInfo(Result); 31010b57cec5SDimitry Andric Result.setFlag(Token::IsEditorPlaceholder); 31020b57cec5SDimitry Andric BufferPtr = End; 
31030b57cec5SDimitry Andric return true; 31040b57cec5SDimitry Andric } 31050b57cec5SDimitry Andric 31060b57cec5SDimitry Andric bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 31070b57cec5SDimitry Andric if (PP && PP->isCodeCompletionEnabled()) { 31080b57cec5SDimitry Andric SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 31090b57cec5SDimitry Andric return Loc == PP->getCodeCompletionLoc(); 31100b57cec5SDimitry Andric } 31110b57cec5SDimitry Andric 31120b57cec5SDimitry Andric return false; 31130b57cec5SDimitry Andric } 31140b57cec5SDimitry Andric 31150b57cec5SDimitry Andric uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, 31160b57cec5SDimitry Andric Token *Result) { 31170b57cec5SDimitry Andric unsigned CharSize; 31180b57cec5SDimitry Andric char Kind = getCharAndSize(StartPtr, CharSize); 3119349cc55cSDimitry Andric bool Delimited = false; 3120349cc55cSDimitry Andric bool FoundEndDelimiter = false; 3121349cc55cSDimitry Andric unsigned Count = 0; 3122349cc55cSDimitry Andric bool Diagnose = Result && !isLexingRawMode(); 31230b57cec5SDimitry Andric 31240b57cec5SDimitry Andric unsigned NumHexDigits; 31250b57cec5SDimitry Andric if (Kind == 'u') 31260b57cec5SDimitry Andric NumHexDigits = 4; 31270b57cec5SDimitry Andric else if (Kind == 'U') 31280b57cec5SDimitry Andric NumHexDigits = 8; 31290b57cec5SDimitry Andric else 31300b57cec5SDimitry Andric return 0; 31310b57cec5SDimitry Andric 31320b57cec5SDimitry Andric if (!LangOpts.CPlusPlus && !LangOpts.C99) { 3133349cc55cSDimitry Andric if (Diagnose) 31340b57cec5SDimitry Andric Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); 31350b57cec5SDimitry Andric return 0; 31360b57cec5SDimitry Andric } 31370b57cec5SDimitry Andric 31380b57cec5SDimitry Andric const char *CurPtr = StartPtr + CharSize; 31390b57cec5SDimitry Andric const char *KindLoc = &CurPtr[-1]; 31400b57cec5SDimitry Andric 31410b57cec5SDimitry Andric uint32_t CodePoint = 0; 3142349cc55cSDimitry Andric while (Count != 
NumHexDigits || Delimited) { 31430b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, CharSize); 3144349cc55cSDimitry Andric if (!Delimited && C == '{') { 3145349cc55cSDimitry Andric Delimited = true; 3146349cc55cSDimitry Andric CurPtr += CharSize; 3147349cc55cSDimitry Andric continue; 3148349cc55cSDimitry Andric } 3149349cc55cSDimitry Andric 3150349cc55cSDimitry Andric if (Delimited && C == '}') { 3151349cc55cSDimitry Andric CurPtr += CharSize; 3152349cc55cSDimitry Andric FoundEndDelimiter = true; 3153349cc55cSDimitry Andric break; 3154349cc55cSDimitry Andric } 31550b57cec5SDimitry Andric 31560b57cec5SDimitry Andric unsigned Value = llvm::hexDigitValue(C); 31570b57cec5SDimitry Andric if (Value == -1U) { 3158349cc55cSDimitry Andric if (!Delimited) 3159349cc55cSDimitry Andric break; 3160349cc55cSDimitry Andric if (Diagnose) 3161349cc55cSDimitry Andric Diag(BufferPtr, diag::warn_delimited_ucn_incomplete) 3162349cc55cSDimitry Andric << StringRef(&C, 1); 3163349cc55cSDimitry Andric return 0; 3164349cc55cSDimitry Andric } 31650b57cec5SDimitry Andric 3166349cc55cSDimitry Andric if (CodePoint & 0xF000'0000) { 3167349cc55cSDimitry Andric if (Diagnose) 3168349cc55cSDimitry Andric Diag(KindLoc, diag::err_escape_too_large) << 0; 3169349cc55cSDimitry Andric return 0; 3170349cc55cSDimitry Andric } 3171349cc55cSDimitry Andric 3172349cc55cSDimitry Andric CodePoint <<= 4; 3173349cc55cSDimitry Andric CodePoint |= Value; 3174349cc55cSDimitry Andric CurPtr += CharSize; 3175349cc55cSDimitry Andric Count++; 3176349cc55cSDimitry Andric } 3177349cc55cSDimitry Andric 3178349cc55cSDimitry Andric if (Count == 0) { 3179349cc55cSDimitry Andric if (Diagnose) 3180349cc55cSDimitry Andric Diag(StartPtr, FoundEndDelimiter ? 
diag::warn_delimited_ucn_empty 3181349cc55cSDimitry Andric : diag::warn_ucn_escape_no_digits) 3182349cc55cSDimitry Andric << StringRef(KindLoc, 1); 3183349cc55cSDimitry Andric return 0; 3184349cc55cSDimitry Andric } 3185349cc55cSDimitry Andric 3186349cc55cSDimitry Andric if (!Delimited && Count != NumHexDigits) { 3187349cc55cSDimitry Andric if (Diagnose) { 3188349cc55cSDimitry Andric Diag(BufferPtr, diag::warn_ucn_escape_incomplete); 31890b57cec5SDimitry Andric // If the user wrote \U1234, suggest a fixit to \u. 3190349cc55cSDimitry Andric if (Count == 4 && NumHexDigits == 8) { 31910b57cec5SDimitry Andric CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); 31920b57cec5SDimitry Andric Diag(KindLoc, diag::note_ucn_four_not_eight) 31930b57cec5SDimitry Andric << FixItHint::CreateReplacement(URange, "u"); 31940b57cec5SDimitry Andric } 31950b57cec5SDimitry Andric } 31960b57cec5SDimitry Andric return 0; 31970b57cec5SDimitry Andric } 31980b57cec5SDimitry Andric 3199349cc55cSDimitry Andric if (Delimited && PP) { 3200349cc55cSDimitry Andric Diag(BufferPtr, diag::ext_delimited_escape_sequence); 32010b57cec5SDimitry Andric } 32020b57cec5SDimitry Andric 32030b57cec5SDimitry Andric if (Result) { 32040b57cec5SDimitry Andric Result->setFlag(Token::HasUCN); 3205349cc55cSDimitry Andric if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 
2 : 0))) 32060b57cec5SDimitry Andric StartPtr = CurPtr; 32070b57cec5SDimitry Andric else 32080b57cec5SDimitry Andric while (StartPtr != CurPtr) 32090b57cec5SDimitry Andric (void)getAndAdvanceChar(StartPtr, *Result); 32100b57cec5SDimitry Andric } else { 32110b57cec5SDimitry Andric StartPtr = CurPtr; 32120b57cec5SDimitry Andric } 32130b57cec5SDimitry Andric 32140b57cec5SDimitry Andric // Don't apply C family restrictions to UCNs in assembly mode 32150b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) 32160b57cec5SDimitry Andric return CodePoint; 32170b57cec5SDimitry Andric 32180b57cec5SDimitry Andric // C99 6.4.3p2: A universal character name shall not specify a character whose 32190b57cec5SDimitry Andric // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or 32200b57cec5SDimitry Andric // 0060 (`), nor one in the range D800 through DFFF inclusive.) 32210b57cec5SDimitry Andric // C++11 [lex.charset]p2: If the hexadecimal value for a 32220b57cec5SDimitry Andric // universal-character-name corresponds to a surrogate code point (in the 32230b57cec5SDimitry Andric // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, 32240b57cec5SDimitry Andric // if the hexadecimal value for a universal-character-name outside the 32250b57cec5SDimitry Andric // c-char-sequence, s-char-sequence, or r-char-sequence of a character or 32260b57cec5SDimitry Andric // string literal corresponds to a control character (in either of the 32270b57cec5SDimitry Andric // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the 32280b57cec5SDimitry Andric // basic source character set, the program is ill-formed. 
32290b57cec5SDimitry Andric if (CodePoint < 0xA0) { 32300b57cec5SDimitry Andric if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60) 32310b57cec5SDimitry Andric return CodePoint; 32320b57cec5SDimitry Andric 32330b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to warn about bad 32340b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block. 32350b57cec5SDimitry Andric if (Result && PP) { 32360b57cec5SDimitry Andric if (CodePoint < 0x20 || CodePoint >= 0x7F) 32370b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_control_character); 32380b57cec5SDimitry Andric else { 32390b57cec5SDimitry Andric char C = static_cast<char>(CodePoint); 32400b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 32410b57cec5SDimitry Andric } 32420b57cec5SDimitry Andric } 32430b57cec5SDimitry Andric 32440b57cec5SDimitry Andric return 0; 32450b57cec5SDimitry Andric } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { 32460b57cec5SDimitry Andric // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 32470b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to diagnose bad 32480b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block. 
32490b57cec5SDimitry Andric if (Result && PP) { 32500b57cec5SDimitry Andric if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11) 32510b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_ucn_escape_surrogate); 32520b57cec5SDimitry Andric else 32530b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_invalid); 32540b57cec5SDimitry Andric } 32550b57cec5SDimitry Andric return 0; 32560b57cec5SDimitry Andric } 32570b57cec5SDimitry Andric 32580b57cec5SDimitry Andric return CodePoint; 32590b57cec5SDimitry Andric } 32600b57cec5SDimitry Andric 32610b57cec5SDimitry Andric bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, 32620b57cec5SDimitry Andric const char *CurPtr) { 32630b57cec5SDimitry Andric if (!isLexingRawMode() && !PP->isPreprocessedOutput() && 3264349cc55cSDimitry Andric isUnicodeWhitespace(C)) { 32650b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unicode_whitespace) 32660b57cec5SDimitry Andric << makeCharRange(*this, BufferPtr, CurPtr); 32670b57cec5SDimitry Andric 32680b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 32690b57cec5SDimitry Andric return true; 32700b57cec5SDimitry Andric } 32710b57cec5SDimitry Andric return false; 32720b57cec5SDimitry Andric } 32730b57cec5SDimitry Andric 32740b57cec5SDimitry Andric void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { 32750b57cec5SDimitry Andric IsAtStartOfLine = Result.isAtStartOfLine(); 32760b57cec5SDimitry Andric HasLeadingSpace = Result.hasLeadingSpace(); 32770b57cec5SDimitry Andric HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); 32780b57cec5SDimitry Andric // Note that this doesn't affect IsAtPhysicalStartOfLine. 32790b57cec5SDimitry Andric } 32800b57cec5SDimitry Andric 32810b57cec5SDimitry Andric bool Lexer::Lex(Token &Result) { 32820b57cec5SDimitry Andric // Start a new token. 32830b57cec5SDimitry Andric Result.startToken(); 32840b57cec5SDimitry Andric 32850b57cec5SDimitry Andric // Set up misc whitespace flags for LexTokenInternal. 
32860b57cec5SDimitry Andric if (IsAtStartOfLine) { 32870b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 32880b57cec5SDimitry Andric IsAtStartOfLine = false; 32890b57cec5SDimitry Andric } 32900b57cec5SDimitry Andric 32910b57cec5SDimitry Andric if (HasLeadingSpace) { 32920b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 32930b57cec5SDimitry Andric HasLeadingSpace = false; 32940b57cec5SDimitry Andric } 32950b57cec5SDimitry Andric 32960b57cec5SDimitry Andric if (HasLeadingEmptyMacro) { 32970b57cec5SDimitry Andric Result.setFlag(Token::LeadingEmptyMacro); 32980b57cec5SDimitry Andric HasLeadingEmptyMacro = false; 32990b57cec5SDimitry Andric } 33000b57cec5SDimitry Andric 33010b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 33020b57cec5SDimitry Andric IsAtPhysicalStartOfLine = false; 33030b57cec5SDimitry Andric bool isRawLex = isLexingRawMode(); 33040b57cec5SDimitry Andric (void) isRawLex; 33050b57cec5SDimitry Andric bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine); 33060b57cec5SDimitry Andric // (After the LexTokenInternal call, the lexer might be destroyed.) 33070b57cec5SDimitry Andric assert((returnedToken || !isRawLex) && "Raw lex must succeed"); 33080b57cec5SDimitry Andric return returnedToken; 33090b57cec5SDimitry Andric } 33100b57cec5SDimitry Andric 33110b57cec5SDimitry Andric /// LexTokenInternal - This implements a simple C family lexer. It is an 33120b57cec5SDimitry Andric /// extremely performance critical piece of code. This assumes that the buffer 33130b57cec5SDimitry Andric /// has a null character at the end of the file. This returns a preprocessing 33140b57cec5SDimitry Andric /// token, not a normal token, as such, it is an internal interface. It assumes 33150b57cec5SDimitry Andric /// that the Flags of result have been cleared before calling this. 
33160b57cec5SDimitry Andric bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { 33170b57cec5SDimitry Andric LexNextToken: 33180b57cec5SDimitry Andric // New token, can't need cleaning yet. 33190b57cec5SDimitry Andric Result.clearFlag(Token::NeedsCleaning); 33200b57cec5SDimitry Andric Result.setIdentifierInfo(nullptr); 33210b57cec5SDimitry Andric 33220b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable. 33230b57cec5SDimitry Andric const char *CurPtr = BufferPtr; 33240b57cec5SDimitry Andric 33250b57cec5SDimitry Andric // Small amounts of horizontal whitespace is very common between tokens. 3326fe6060f1SDimitry Andric if (isHorizontalWhitespace(*CurPtr)) { 3327fe6060f1SDimitry Andric do { 33280b57cec5SDimitry Andric ++CurPtr; 3329fe6060f1SDimitry Andric } while (isHorizontalWhitespace(*CurPtr)); 33300b57cec5SDimitry Andric 33310b57cec5SDimitry Andric // If we are keeping whitespace and other tokens, just return what we just 33320b57cec5SDimitry Andric // skipped. The next lexer invocation will return the token after the 33330b57cec5SDimitry Andric // whitespace. 33340b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 33350b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 33360b57cec5SDimitry Andric // FIXME: The next token will not have LeadingSpace set. 33370b57cec5SDimitry Andric return true; 33380b57cec5SDimitry Andric } 33390b57cec5SDimitry Andric 33400b57cec5SDimitry Andric BufferPtr = CurPtr; 33410b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 33420b57cec5SDimitry Andric } 33430b57cec5SDimitry Andric 33440b57cec5SDimitry Andric unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 33450b57cec5SDimitry Andric 33460b57cec5SDimitry Andric // Read a character, advancing over it. 
33470b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Result); 33480b57cec5SDimitry Andric tok::TokenKind Kind; 33490b57cec5SDimitry Andric 3350e8d8bef9SDimitry Andric if (!isVerticalWhitespace(Char)) 3351e8d8bef9SDimitry Andric NewLinePtr = nullptr; 3352e8d8bef9SDimitry Andric 33530b57cec5SDimitry Andric switch (Char) { 33540b57cec5SDimitry Andric case 0: // Null. 33550b57cec5SDimitry Andric // Found end of file? 33560b57cec5SDimitry Andric if (CurPtr-1 == BufferEnd) 33570b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1); 33580b57cec5SDimitry Andric 33590b57cec5SDimitry Andric // Check if we are performing code completion. 33600b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 33610b57cec5SDimitry Andric // Return the code-completion token. 33620b57cec5SDimitry Andric Result.startToken(); 33630b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::code_completion); 33640b57cec5SDimitry Andric return true; 33650b57cec5SDimitry Andric } 33660b57cec5SDimitry Andric 33670b57cec5SDimitry Andric if (!isLexingRawMode()) 33680b57cec5SDimitry Andric Diag(CurPtr-1, diag::null_in_file); 33690b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 33700b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 33710b57cec5SDimitry Andric return true; // KeepWhitespaceMode 33720b57cec5SDimitry Andric 33730b57cec5SDimitry Andric // We know the lexer hasn't changed, so just try again with this lexer. 33740b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 33750b57cec5SDimitry Andric goto LexNextToken; 33760b57cec5SDimitry Andric 33770b57cec5SDimitry Andric case 26: // DOS & CP/M EOF: "^Z". 33780b57cec5SDimitry Andric // If we're in Microsoft extensions mode, treat this as end of file. 
33790b57cec5SDimitry Andric if (LangOpts.MicrosoftExt) { 33800b57cec5SDimitry Andric if (!isLexingRawMode()) 33810b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft); 33820b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1); 33830b57cec5SDimitry Andric } 33840b57cec5SDimitry Andric 33850b57cec5SDimitry Andric // If Microsoft extensions are disabled, this is just random garbage. 33860b57cec5SDimitry Andric Kind = tok::unknown; 33870b57cec5SDimitry Andric break; 33880b57cec5SDimitry Andric 33890b57cec5SDimitry Andric case '\r': 33900b57cec5SDimitry Andric if (CurPtr[0] == '\n') 33910b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result); 33920b57cec5SDimitry Andric LLVM_FALLTHROUGH; 33930b57cec5SDimitry Andric case '\n': 33940b57cec5SDimitry Andric // If we are inside a preprocessor directive and we see the end of line, 33950b57cec5SDimitry Andric // we know we are done with the directive, so return an EOD token. 33960b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 33970b57cec5SDimitry Andric // Done parsing the "line". 33980b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 33990b57cec5SDimitry Andric 34000b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive. 34010b57cec5SDimitry Andric if (PP) 34020b57cec5SDimitry Andric resetExtendedTokenMode(); 34030b57cec5SDimitry Andric 34040b57cec5SDimitry Andric // Since we consumed a newline, we are back at the start of a line. 34050b57cec5SDimitry Andric IsAtStartOfLine = true; 34060b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 3407e8d8bef9SDimitry Andric NewLinePtr = CurPtr - 1; 34080b57cec5SDimitry Andric 34090b57cec5SDimitry Andric Kind = tok::eod; 34100b57cec5SDimitry Andric break; 34110b57cec5SDimitry Andric } 34120b57cec5SDimitry Andric 34130b57cec5SDimitry Andric // No leading whitespace seen so far. 
34140b57cec5SDimitry Andric Result.clearFlag(Token::LeadingSpace); 34150b57cec5SDimitry Andric 34160b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 34170b57cec5SDimitry Andric return true; // KeepWhitespaceMode 34180b57cec5SDimitry Andric 34190b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 34200b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 34210b57cec5SDimitry Andric goto LexNextToken; 34220b57cec5SDimitry Andric case ' ': 34230b57cec5SDimitry Andric case '\t': 34240b57cec5SDimitry Andric case '\f': 34250b57cec5SDimitry Andric case '\v': 34260b57cec5SDimitry Andric SkipHorizontalWhitespace: 34270b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 34280b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 34290b57cec5SDimitry Andric return true; // KeepWhitespaceMode 34300b57cec5SDimitry Andric 34310b57cec5SDimitry Andric SkipIgnoredUnits: 34320b57cec5SDimitry Andric CurPtr = BufferPtr; 34330b57cec5SDimitry Andric 34340b57cec5SDimitry Andric // If the next token is obviously a // or /* */ comment, skip it efficiently 34350b57cec5SDimitry Andric // too (without going through the big switch stmt). 34360b57cec5SDimitry Andric if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 34370b57cec5SDimitry Andric LangOpts.LineComment && 34380b57cec5SDimitry Andric (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) { 34390b57cec5SDimitry Andric if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 34400b57cec5SDimitry Andric return true; // There is a token to return. 34410b57cec5SDimitry Andric goto SkipIgnoredUnits; 34420b57cec5SDimitry Andric } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 34430b57cec5SDimitry Andric if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 34440b57cec5SDimitry Andric return true; // There is a token to return. 
34450b57cec5SDimitry Andric goto SkipIgnoredUnits; 34460b57cec5SDimitry Andric } else if (isHorizontalWhitespace(*CurPtr)) { 34470b57cec5SDimitry Andric goto SkipHorizontalWhitespace; 34480b57cec5SDimitry Andric } 34490b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 34500b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 34510b57cec5SDimitry Andric goto LexNextToken; 34520b57cec5SDimitry Andric 34530b57cec5SDimitry Andric // C99 6.4.4.1: Integer Constants. 34540b57cec5SDimitry Andric // C99 6.4.4.2: Floating Constants. 34550b57cec5SDimitry Andric case '0': case '1': case '2': case '3': case '4': 34560b57cec5SDimitry Andric case '5': case '6': case '7': case '8': case '9': 34570b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 34580b57cec5SDimitry Andric MIOpt.ReadToken(); 34590b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 34600b57cec5SDimitry Andric 34610b57cec5SDimitry Andric case 'u': // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal 34620b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 
34630b57cec5SDimitry Andric MIOpt.ReadToken(); 34640b57cec5SDimitry Andric 34650b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) { 34660b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 34670b57cec5SDimitry Andric 34680b57cec5SDimitry Andric // UTF-16 string literal 34690b57cec5SDimitry Andric if (Char == '"') 34700b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 34710b57cec5SDimitry Andric tok::utf16_string_literal); 34720b57cec5SDimitry Andric 34730b57cec5SDimitry Andric // UTF-16 character constant 34740b57cec5SDimitry Andric if (Char == '\'') 34750b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 34760b57cec5SDimitry Andric tok::utf16_char_constant); 34770b57cec5SDimitry Andric 34780b57cec5SDimitry Andric // UTF-16 raw string literal 34790b57cec5SDimitry Andric if (Char == 'R' && LangOpts.CPlusPlus11 && 34800b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 34810b57cec5SDimitry Andric return LexRawStringLiteral(Result, 34820b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 34830b57cec5SDimitry Andric SizeTmp2, Result), 34840b57cec5SDimitry Andric tok::utf16_string_literal); 34850b57cec5SDimitry Andric 34860b57cec5SDimitry Andric if (Char == '8') { 34870b57cec5SDimitry Andric char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 34880b57cec5SDimitry Andric 34890b57cec5SDimitry Andric // UTF-8 string literal 34900b57cec5SDimitry Andric if (Char2 == '"') 34910b57cec5SDimitry Andric return LexStringLiteral(Result, 34920b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 34930b57cec5SDimitry Andric SizeTmp2, Result), 34940b57cec5SDimitry Andric tok::utf8_string_literal); 34950b57cec5SDimitry Andric if (Char2 == '\'' && LangOpts.CPlusPlus17) 34960b57cec5SDimitry Andric return LexCharConstant( 34970b57cec5SDimitry Andric Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 
34980b57cec5SDimitry Andric SizeTmp2, Result), 34990b57cec5SDimitry Andric tok::utf8_char_constant); 35000b57cec5SDimitry Andric 35010b57cec5SDimitry Andric if (Char2 == 'R' && LangOpts.CPlusPlus11) { 35020b57cec5SDimitry Andric unsigned SizeTmp3; 35030b57cec5SDimitry Andric char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 35040b57cec5SDimitry Andric // UTF-8 raw string literal 35050b57cec5SDimitry Andric if (Char3 == '"') { 35060b57cec5SDimitry Andric return LexRawStringLiteral(Result, 35070b57cec5SDimitry Andric ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 35080b57cec5SDimitry Andric SizeTmp2, Result), 35090b57cec5SDimitry Andric SizeTmp3, Result), 35100b57cec5SDimitry Andric tok::utf8_string_literal); 35110b57cec5SDimitry Andric } 35120b57cec5SDimitry Andric } 35130b57cec5SDimitry Andric } 35140b57cec5SDimitry Andric } 35150b57cec5SDimitry Andric 35160b57cec5SDimitry Andric // treat u like the start of an identifier. 3517349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 35180b57cec5SDimitry Andric 35190b57cec5SDimitry Andric case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal 35200b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 
35210b57cec5SDimitry Andric MIOpt.ReadToken(); 35220b57cec5SDimitry Andric 35230b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) { 35240b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 35250b57cec5SDimitry Andric 35260b57cec5SDimitry Andric // UTF-32 string literal 35270b57cec5SDimitry Andric if (Char == '"') 35280b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 35290b57cec5SDimitry Andric tok::utf32_string_literal); 35300b57cec5SDimitry Andric 35310b57cec5SDimitry Andric // UTF-32 character constant 35320b57cec5SDimitry Andric if (Char == '\'') 35330b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 35340b57cec5SDimitry Andric tok::utf32_char_constant); 35350b57cec5SDimitry Andric 35360b57cec5SDimitry Andric // UTF-32 raw string literal 35370b57cec5SDimitry Andric if (Char == 'R' && LangOpts.CPlusPlus11 && 35380b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 35390b57cec5SDimitry Andric return LexRawStringLiteral(Result, 35400b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 35410b57cec5SDimitry Andric SizeTmp2, Result), 35420b57cec5SDimitry Andric tok::utf32_string_literal); 35430b57cec5SDimitry Andric } 35440b57cec5SDimitry Andric 35450b57cec5SDimitry Andric // treat U like the start of an identifier. 3546349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 35470b57cec5SDimitry Andric 35480b57cec5SDimitry Andric case 'R': // Identifier or C++0x raw string literal 35490b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 
35500b57cec5SDimitry Andric MIOpt.ReadToken(); 35510b57cec5SDimitry Andric 35520b57cec5SDimitry Andric if (LangOpts.CPlusPlus11) { 35530b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 35540b57cec5SDimitry Andric 35550b57cec5SDimitry Andric if (Char == '"') 35560b57cec5SDimitry Andric return LexRawStringLiteral(Result, 35570b57cec5SDimitry Andric ConsumeChar(CurPtr, SizeTmp, Result), 35580b57cec5SDimitry Andric tok::string_literal); 35590b57cec5SDimitry Andric } 35600b57cec5SDimitry Andric 35610b57cec5SDimitry Andric // treat R like the start of an identifier. 3562349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 35630b57cec5SDimitry Andric 35640b57cec5SDimitry Andric case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 35650b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 35660b57cec5SDimitry Andric MIOpt.ReadToken(); 35670b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 35680b57cec5SDimitry Andric 35690b57cec5SDimitry Andric // Wide string literal. 35700b57cec5SDimitry Andric if (Char == '"') 35710b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 35720b57cec5SDimitry Andric tok::wide_string_literal); 35730b57cec5SDimitry Andric 35740b57cec5SDimitry Andric // Wide raw string literal. 35750b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 && Char == 'R' && 35760b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 35770b57cec5SDimitry Andric return LexRawStringLiteral(Result, 35780b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 35790b57cec5SDimitry Andric SizeTmp2, Result), 35800b57cec5SDimitry Andric tok::wide_string_literal); 35810b57cec5SDimitry Andric 35820b57cec5SDimitry Andric // Wide character constant. 
35830b57cec5SDimitry Andric if (Char == '\'') 35840b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 35850b57cec5SDimitry Andric tok::wide_char_constant); 35860b57cec5SDimitry Andric // FALL THROUGH, treating L like the start of an identifier. 35870b57cec5SDimitry Andric LLVM_FALLTHROUGH; 35880b57cec5SDimitry Andric 35890b57cec5SDimitry Andric // C99 6.4.2: Identifiers. 35900b57cec5SDimitry Andric case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 35910b57cec5SDimitry Andric case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 35920b57cec5SDimitry Andric case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 35930b57cec5SDimitry Andric case 'V': case 'W': case 'X': case 'Y': case 'Z': 35940b57cec5SDimitry Andric case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 35950b57cec5SDimitry Andric case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 35960b57cec5SDimitry Andric case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 35970b57cec5SDimitry Andric case 'v': case 'w': case 'x': case 'y': case 'z': 35980b57cec5SDimitry Andric case '_': 35990b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 36000b57cec5SDimitry Andric MIOpt.ReadToken(); 3601349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 36020b57cec5SDimitry Andric 36030b57cec5SDimitry Andric case '$': // $ in identifiers. 36040b57cec5SDimitry Andric if (LangOpts.DollarIdents) { 36050b57cec5SDimitry Andric if (!isLexingRawMode()) 36060b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_dollar_in_identifier); 36070b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 
36080b57cec5SDimitry Andric MIOpt.ReadToken(); 3609349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 36100b57cec5SDimitry Andric } 36110b57cec5SDimitry Andric 36120b57cec5SDimitry Andric Kind = tok::unknown; 36130b57cec5SDimitry Andric break; 36140b57cec5SDimitry Andric 36150b57cec5SDimitry Andric // C99 6.4.4: Character Constants. 36160b57cec5SDimitry Andric case '\'': 36170b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 36180b57cec5SDimitry Andric MIOpt.ReadToken(); 36190b57cec5SDimitry Andric return LexCharConstant(Result, CurPtr, tok::char_constant); 36200b57cec5SDimitry Andric 36210b57cec5SDimitry Andric // C99 6.4.5: String Literals. 36220b57cec5SDimitry Andric case '"': 36230b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 36240b57cec5SDimitry Andric MIOpt.ReadToken(); 36250b57cec5SDimitry Andric return LexStringLiteral(Result, CurPtr, 36260b57cec5SDimitry Andric ParsingFilename ? tok::header_name 36270b57cec5SDimitry Andric : tok::string_literal); 36280b57cec5SDimitry Andric 36290b57cec5SDimitry Andric // C99 6.4.6: Punctuators. 
36300b57cec5SDimitry Andric case '?': 36310b57cec5SDimitry Andric Kind = tok::question; 36320b57cec5SDimitry Andric break; 36330b57cec5SDimitry Andric case '[': 36340b57cec5SDimitry Andric Kind = tok::l_square; 36350b57cec5SDimitry Andric break; 36360b57cec5SDimitry Andric case ']': 36370b57cec5SDimitry Andric Kind = tok::r_square; 36380b57cec5SDimitry Andric break; 36390b57cec5SDimitry Andric case '(': 36400b57cec5SDimitry Andric Kind = tok::l_paren; 36410b57cec5SDimitry Andric break; 36420b57cec5SDimitry Andric case ')': 36430b57cec5SDimitry Andric Kind = tok::r_paren; 36440b57cec5SDimitry Andric break; 36450b57cec5SDimitry Andric case '{': 36460b57cec5SDimitry Andric Kind = tok::l_brace; 36470b57cec5SDimitry Andric break; 36480b57cec5SDimitry Andric case '}': 36490b57cec5SDimitry Andric Kind = tok::r_brace; 36500b57cec5SDimitry Andric break; 36510b57cec5SDimitry Andric case '.': 36520b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 36530b57cec5SDimitry Andric if (Char >= '0' && Char <= '9') { 36540b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 36550b57cec5SDimitry Andric MIOpt.ReadToken(); 36560b57cec5SDimitry Andric 36570b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 36580b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus && Char == '*') { 36590b57cec5SDimitry Andric Kind = tok::periodstar; 36600b57cec5SDimitry Andric CurPtr += SizeTmp; 36610b57cec5SDimitry Andric } else if (Char == '.' 
&& 36620b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 36630b57cec5SDimitry Andric Kind = tok::ellipsis; 36640b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 36650b57cec5SDimitry Andric SizeTmp2, Result); 36660b57cec5SDimitry Andric } else { 36670b57cec5SDimitry Andric Kind = tok::period; 36680b57cec5SDimitry Andric } 36690b57cec5SDimitry Andric break; 36700b57cec5SDimitry Andric case '&': 36710b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 36720b57cec5SDimitry Andric if (Char == '&') { 36730b57cec5SDimitry Andric Kind = tok::ampamp; 36740b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 36750b57cec5SDimitry Andric } else if (Char == '=') { 36760b57cec5SDimitry Andric Kind = tok::ampequal; 36770b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 36780b57cec5SDimitry Andric } else { 36790b57cec5SDimitry Andric Kind = tok::amp; 36800b57cec5SDimitry Andric } 36810b57cec5SDimitry Andric break; 36820b57cec5SDimitry Andric case '*': 36830b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') { 36840b57cec5SDimitry Andric Kind = tok::starequal; 36850b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 36860b57cec5SDimitry Andric } else { 36870b57cec5SDimitry Andric Kind = tok::star; 36880b57cec5SDimitry Andric } 36890b57cec5SDimitry Andric break; 36900b57cec5SDimitry Andric case '+': 36910b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 36920b57cec5SDimitry Andric if (Char == '+') { 36930b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 36940b57cec5SDimitry Andric Kind = tok::plusplus; 36950b57cec5SDimitry Andric } else if (Char == '=') { 36960b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 36970b57cec5SDimitry Andric Kind = tok::plusequal; 36980b57cec5SDimitry Andric } else { 36990b57cec5SDimitry Andric Kind = tok::plus; 37000b57cec5SDimitry Andric } 37010b57cec5SDimitry 
Andric break; 37020b57cec5SDimitry Andric case '-': 37030b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 37040b57cec5SDimitry Andric if (Char == '-') { // -- 37050b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37060b57cec5SDimitry Andric Kind = tok::minusminus; 37070b57cec5SDimitry Andric } else if (Char == '>' && LangOpts.CPlusPlus && 37080b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 37090b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 37100b57cec5SDimitry Andric SizeTmp2, Result); 37110b57cec5SDimitry Andric Kind = tok::arrowstar; 37120b57cec5SDimitry Andric } else if (Char == '>') { // -> 37130b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37140b57cec5SDimitry Andric Kind = tok::arrow; 37150b57cec5SDimitry Andric } else if (Char == '=') { // -= 37160b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37170b57cec5SDimitry Andric Kind = tok::minusequal; 37180b57cec5SDimitry Andric } else { 37190b57cec5SDimitry Andric Kind = tok::minus; 37200b57cec5SDimitry Andric } 37210b57cec5SDimitry Andric break; 37220b57cec5SDimitry Andric case '~': 37230b57cec5SDimitry Andric Kind = tok::tilde; 37240b57cec5SDimitry Andric break; 37250b57cec5SDimitry Andric case '!': 37260b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') { 37270b57cec5SDimitry Andric Kind = tok::exclaimequal; 37280b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37290b57cec5SDimitry Andric } else { 37300b57cec5SDimitry Andric Kind = tok::exclaim; 37310b57cec5SDimitry Andric } 37320b57cec5SDimitry Andric break; 37330b57cec5SDimitry Andric case '/': 37340b57cec5SDimitry Andric // 6.4.9: Comments 37350b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 37360b57cec5SDimitry Andric if (Char == '/') { // Line comment. 37370b57cec5SDimitry Andric // Even if Line comments are disabled (e.g. 
in C89 mode), we generally 37380b57cec5SDimitry Andric // want to lex this as a comment. There is one problem with this though, 37390b57cec5SDimitry Andric // that in one particular corner case, this can change the behavior of the 37400b57cec5SDimitry Andric // resultant program. For example, In "foo //**/ bar", C89 would lex 37410b57cec5SDimitry Andric // this as "foo / bar" and languages with Line comments would lex it as 37420b57cec5SDimitry Andric // "foo". Check to see if the character after the second slash is a '*'. 37430b57cec5SDimitry Andric // If so, we will lex that as a "/" instead of the start of a comment. 37440b57cec5SDimitry Andric // However, we never do this if we are just preprocessing. 37450b57cec5SDimitry Andric bool TreatAsComment = LangOpts.LineComment && 37460b57cec5SDimitry Andric (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP); 37470b57cec5SDimitry Andric if (!TreatAsComment) 37480b57cec5SDimitry Andric if (!(PP && PP->isPreprocessedOutput())) 37490b57cec5SDimitry Andric TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*'; 37500b57cec5SDimitry Andric 37510b57cec5SDimitry Andric if (TreatAsComment) { 37520b57cec5SDimitry Andric if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 37530b57cec5SDimitry Andric TokAtPhysicalStartOfLine)) 37540b57cec5SDimitry Andric return true; // There is a token to return. 37550b57cec5SDimitry Andric 37560b57cec5SDimitry Andric // It is common for the tokens immediately after a // comment to be 37570b57cec5SDimitry Andric // whitespace (indentation for the next line). Instead of going through 37580b57cec5SDimitry Andric // the big switch, handle it efficiently now. 37590b57cec5SDimitry Andric goto SkipIgnoredUnits; 37600b57cec5SDimitry Andric } 37610b57cec5SDimitry Andric } 37620b57cec5SDimitry Andric 37630b57cec5SDimitry Andric if (Char == '*') { // /**/ comment. 
37640b57cec5SDimitry Andric if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 37650b57cec5SDimitry Andric TokAtPhysicalStartOfLine)) 37660b57cec5SDimitry Andric return true; // There is a token to return. 37670b57cec5SDimitry Andric 37680b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 37690b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 37700b57cec5SDimitry Andric goto LexNextToken; 37710b57cec5SDimitry Andric } 37720b57cec5SDimitry Andric 37730b57cec5SDimitry Andric if (Char == '=') { 37740b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37750b57cec5SDimitry Andric Kind = tok::slashequal; 37760b57cec5SDimitry Andric } else { 37770b57cec5SDimitry Andric Kind = tok::slash; 37780b57cec5SDimitry Andric } 37790b57cec5SDimitry Andric break; 37800b57cec5SDimitry Andric case '%': 37810b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 37820b57cec5SDimitry Andric if (Char == '=') { 37830b57cec5SDimitry Andric Kind = tok::percentequal; 37840b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37850b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '>') { 37860b57cec5SDimitry Andric Kind = tok::r_brace; // '%>' -> '}' 37870b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37880b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') { 37890b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37900b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 37910b57cec5SDimitry Andric if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 37920b57cec5SDimitry Andric Kind = tok::hashhash; // '%:%:' -> '##' 37930b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 37940b57cec5SDimitry Andric SizeTmp2, Result); 37950b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize 37960b57cec5SDimitry 
Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37970b57cec5SDimitry Andric if (!isLexingRawMode()) 37980b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft); 37990b57cec5SDimitry Andric Kind = tok::hashat; 38000b57cec5SDimitry Andric } else { // '%:' -> '#' 38010b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line, 38020b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to 38030b57cec5SDimitry Andric // the preprocessor to handle it. 38040b57cec5SDimitry Andric // TODO: -fpreprocessed mode?? 38050b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 38060b57cec5SDimitry Andric goto HandleDirective; 38070b57cec5SDimitry Andric 38080b57cec5SDimitry Andric Kind = tok::hash; 38090b57cec5SDimitry Andric } 38100b57cec5SDimitry Andric } else { 38110b57cec5SDimitry Andric Kind = tok::percent; 38120b57cec5SDimitry Andric } 38130b57cec5SDimitry Andric break; 38140b57cec5SDimitry Andric case '<': 38150b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 38160b57cec5SDimitry Andric if (ParsingFilename) { 38170b57cec5SDimitry Andric return LexAngledStringLiteral(Result, CurPtr); 38180b57cec5SDimitry Andric } else if (Char == '<') { 38190b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 38200b57cec5SDimitry Andric if (After == '=') { 38210b57cec5SDimitry Andric Kind = tok::lesslessequal; 38220b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 38230b57cec5SDimitry Andric SizeTmp2, Result); 38240b57cec5SDimitry Andric } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 38250b57cec5SDimitry Andric // If this is actually a '<<<<<<<' version control conflict marker, 38260b57cec5SDimitry Andric // recognize it as such and recover nicely. 
38270b57cec5SDimitry Andric goto LexNextToken; 38280b57cec5SDimitry Andric } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 38290b57cec5SDimitry Andric // If this is '<<<<' and we're in a Perforce-style conflict marker, 38300b57cec5SDimitry Andric // ignore it. 38310b57cec5SDimitry Andric goto LexNextToken; 38320b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '<') { 38330b57cec5SDimitry Andric Kind = tok::lesslessless; 38340b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 38350b57cec5SDimitry Andric SizeTmp2, Result); 38360b57cec5SDimitry Andric } else { 38370b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38380b57cec5SDimitry Andric Kind = tok::lessless; 38390b57cec5SDimitry Andric } 38400b57cec5SDimitry Andric } else if (Char == '=') { 38410b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 38420b57cec5SDimitry Andric if (After == '>') { 38435ffd83dbSDimitry Andric if (getLangOpts().CPlusPlus20) { 38440b57cec5SDimitry Andric if (!isLexingRawMode()) 38450b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx17_compat_spaceship); 38460b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 38470b57cec5SDimitry Andric SizeTmp2, Result); 38480b57cec5SDimitry Andric Kind = tok::spaceship; 38490b57cec5SDimitry Andric break; 38500b57cec5SDimitry Andric } 38510b57cec5SDimitry Andric // Suggest adding a space between the '<=' and the '>' to avoid a 38520b57cec5SDimitry Andric // change in semantics if this turns up in C++ <=17 mode. 
38530b57cec5SDimitry Andric if (getLangOpts().CPlusPlus && !isLexingRawMode()) { 38545ffd83dbSDimitry Andric Diag(BufferPtr, diag::warn_cxx20_compat_spaceship) 38550b57cec5SDimitry Andric << FixItHint::CreateInsertion( 38560b57cec5SDimitry Andric getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " "); 38570b57cec5SDimitry Andric } 38580b57cec5SDimitry Andric } 38590b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38600b57cec5SDimitry Andric Kind = tok::lessequal; 38610b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' 38620b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 && 38630b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 38640b57cec5SDimitry Andric // C++0x [lex.pptoken]p3: 38650b57cec5SDimitry Andric // Otherwise, if the next three characters are <:: and the subsequent 38660b57cec5SDimitry Andric // character is neither : nor >, the < is treated as a preprocessor 38670b57cec5SDimitry Andric // token by itself and not as the first character of the alternative 38680b57cec5SDimitry Andric // token <:. 
38690b57cec5SDimitry Andric unsigned SizeTmp3; 38700b57cec5SDimitry Andric char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 38710b57cec5SDimitry Andric if (After != ':' && After != '>') { 38720b57cec5SDimitry Andric Kind = tok::less; 38730b57cec5SDimitry Andric if (!isLexingRawMode()) 38740b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 38750b57cec5SDimitry Andric break; 38760b57cec5SDimitry Andric } 38770b57cec5SDimitry Andric } 38780b57cec5SDimitry Andric 38790b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38800b57cec5SDimitry Andric Kind = tok::l_square; 38810b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' 38820b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38830b57cec5SDimitry Andric Kind = tok::l_brace; 38840b57cec5SDimitry Andric } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 && 38850b57cec5SDimitry Andric lexEditorPlaceholder(Result, CurPtr)) { 38860b57cec5SDimitry Andric return true; 38870b57cec5SDimitry Andric } else { 38880b57cec5SDimitry Andric Kind = tok::less; 38890b57cec5SDimitry Andric } 38900b57cec5SDimitry Andric break; 38910b57cec5SDimitry Andric case '>': 38920b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 38930b57cec5SDimitry Andric if (Char == '=') { 38940b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38950b57cec5SDimitry Andric Kind = tok::greaterequal; 38960b57cec5SDimitry Andric } else if (Char == '>') { 38970b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 38980b57cec5SDimitry Andric if (After == '=') { 38990b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 39000b57cec5SDimitry Andric SizeTmp2, Result); 39010b57cec5SDimitry Andric Kind = tok::greatergreaterequal; 39020b57cec5SDimitry Andric } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 39030b57cec5SDimitry Andric // If this is 
actually a '>>>>' conflict marker, recognize it as such 39040b57cec5SDimitry Andric // and recover nicely. 39050b57cec5SDimitry Andric goto LexNextToken; 39060b57cec5SDimitry Andric } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 39070b57cec5SDimitry Andric // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 39080b57cec5SDimitry Andric goto LexNextToken; 39090b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '>') { 39100b57cec5SDimitry Andric Kind = tok::greatergreatergreater; 39110b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 39120b57cec5SDimitry Andric SizeTmp2, Result); 39130b57cec5SDimitry Andric } else { 39140b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39150b57cec5SDimitry Andric Kind = tok::greatergreater; 39160b57cec5SDimitry Andric } 39170b57cec5SDimitry Andric } else { 39180b57cec5SDimitry Andric Kind = tok::greater; 39190b57cec5SDimitry Andric } 39200b57cec5SDimitry Andric break; 39210b57cec5SDimitry Andric case '^': 39220b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39230b57cec5SDimitry Andric if (Char == '=') { 39240b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39250b57cec5SDimitry Andric Kind = tok::caretequal; 39260b57cec5SDimitry Andric } else if (LangOpts.OpenCL && Char == '^') { 39270b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39280b57cec5SDimitry Andric Kind = tok::caretcaret; 39290b57cec5SDimitry Andric } else { 39300b57cec5SDimitry Andric Kind = tok::caret; 39310b57cec5SDimitry Andric } 39320b57cec5SDimitry Andric break; 39330b57cec5SDimitry Andric case '|': 39340b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39350b57cec5SDimitry Andric if (Char == '=') { 39360b57cec5SDimitry Andric Kind = tok::pipeequal; 39370b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39380b57cec5SDimitry Andric } else if (Char == '|') { 39390b57cec5SDimitry Andric 
// If this is '|||||||' and we're in a conflict marker, ignore it. 39400b57cec5SDimitry Andric if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 39410b57cec5SDimitry Andric goto LexNextToken; 39420b57cec5SDimitry Andric Kind = tok::pipepipe; 39430b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39440b57cec5SDimitry Andric } else { 39450b57cec5SDimitry Andric Kind = tok::pipe; 39460b57cec5SDimitry Andric } 39470b57cec5SDimitry Andric break; 39480b57cec5SDimitry Andric case ':': 39490b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39500b57cec5SDimitry Andric if (LangOpts.Digraphs && Char == '>') { 39510b57cec5SDimitry Andric Kind = tok::r_square; // ':>' -> ']' 39520b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39530b57cec5SDimitry Andric } else if ((LangOpts.CPlusPlus || 39540b57cec5SDimitry Andric LangOpts.DoubleSquareBracketAttributes) && 39550b57cec5SDimitry Andric Char == ':') { 39560b57cec5SDimitry Andric Kind = tok::coloncolon; 39570b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39580b57cec5SDimitry Andric } else { 39590b57cec5SDimitry Andric Kind = tok::colon; 39600b57cec5SDimitry Andric } 39610b57cec5SDimitry Andric break; 39620b57cec5SDimitry Andric case ';': 39630b57cec5SDimitry Andric Kind = tok::semi; 39640b57cec5SDimitry Andric break; 39650b57cec5SDimitry Andric case '=': 39660b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39670b57cec5SDimitry Andric if (Char == '=') { 39680b57cec5SDimitry Andric // If this is '====' and we're in a conflict marker, ignore it. 
39690b57cec5SDimitry Andric if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 39700b57cec5SDimitry Andric goto LexNextToken; 39710b57cec5SDimitry Andric 39720b57cec5SDimitry Andric Kind = tok::equalequal; 39730b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39740b57cec5SDimitry Andric } else { 39750b57cec5SDimitry Andric Kind = tok::equal; 39760b57cec5SDimitry Andric } 39770b57cec5SDimitry Andric break; 39780b57cec5SDimitry Andric case ',': 39790b57cec5SDimitry Andric Kind = tok::comma; 39800b57cec5SDimitry Andric break; 39810b57cec5SDimitry Andric case '#': 39820b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39830b57cec5SDimitry Andric if (Char == '#') { 39840b57cec5SDimitry Andric Kind = tok::hashhash; 39850b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39860b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize 39870b57cec5SDimitry Andric Kind = tok::hashat; 39880b57cec5SDimitry Andric if (!isLexingRawMode()) 39890b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft); 39900b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39910b57cec5SDimitry Andric } else { 39920b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line, 39930b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to 39940b57cec5SDimitry Andric // the preprocessor to handle it. 39950b57cec5SDimitry Andric // TODO: -fpreprocessed mode?? 39960b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 39970b57cec5SDimitry Andric goto HandleDirective; 39980b57cec5SDimitry Andric 39990b57cec5SDimitry Andric Kind = tok::hash; 40000b57cec5SDimitry Andric } 40010b57cec5SDimitry Andric break; 40020b57cec5SDimitry Andric 40030b57cec5SDimitry Andric case '@': 40040b57cec5SDimitry Andric // Objective C support. 
40050b57cec5SDimitry Andric if (CurPtr[-1] == '@' && LangOpts.ObjC) 40060b57cec5SDimitry Andric Kind = tok::at; 40070b57cec5SDimitry Andric else 40080b57cec5SDimitry Andric Kind = tok::unknown; 40090b57cec5SDimitry Andric break; 40100b57cec5SDimitry Andric 40110b57cec5SDimitry Andric // UCNs (C99 6.4.3, C++11 [lex.charset]p2) 40120b57cec5SDimitry Andric case '\\': 40130b57cec5SDimitry Andric if (!LangOpts.AsmPreprocessor) { 40140b57cec5SDimitry Andric if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) { 40150b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 40160b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 40170b57cec5SDimitry Andric return true; // KeepWhitespaceMode 40180b57cec5SDimitry Andric 40190b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 40200b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 40210b57cec5SDimitry Andric goto LexNextToken; 40220b57cec5SDimitry Andric } 40230b57cec5SDimitry Andric 4024349cc55cSDimitry Andric return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); 40250b57cec5SDimitry Andric } 40260b57cec5SDimitry Andric } 40270b57cec5SDimitry Andric 40280b57cec5SDimitry Andric Kind = tok::unknown; 40290b57cec5SDimitry Andric break; 40300b57cec5SDimitry Andric 40310b57cec5SDimitry Andric default: { 40320b57cec5SDimitry Andric if (isASCII(Char)) { 40330b57cec5SDimitry Andric Kind = tok::unknown; 40340b57cec5SDimitry Andric break; 40350b57cec5SDimitry Andric } 40360b57cec5SDimitry Andric 40370b57cec5SDimitry Andric llvm::UTF32 CodePoint; 40380b57cec5SDimitry Andric 40390b57cec5SDimitry Andric // We can't just reset CurPtr to BufferPtr because BufferPtr may point to 40400b57cec5SDimitry Andric // an escaped newline. 
40410b57cec5SDimitry Andric --CurPtr; 40420b57cec5SDimitry Andric llvm::ConversionResult Status = 40430b57cec5SDimitry Andric llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, 40440b57cec5SDimitry Andric (const llvm::UTF8 *)BufferEnd, 40450b57cec5SDimitry Andric &CodePoint, 40460b57cec5SDimitry Andric llvm::strictConversion); 40470b57cec5SDimitry Andric if (Status == llvm::conversionOK) { 40480b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 40490b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 40500b57cec5SDimitry Andric return true; // KeepWhitespaceMode 40510b57cec5SDimitry Andric 40520b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 40530b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 40540b57cec5SDimitry Andric goto LexNextToken; 40550b57cec5SDimitry Andric } 4056349cc55cSDimitry Andric return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); 40570b57cec5SDimitry Andric } 40580b57cec5SDimitry Andric 40590b57cec5SDimitry Andric if (isLexingRawMode() || ParsingPreprocessorDirective || 40600b57cec5SDimitry Andric PP->isPreprocessedOutput()) { 40610b57cec5SDimitry Andric ++CurPtr; 40620b57cec5SDimitry Andric Kind = tok::unknown; 40630b57cec5SDimitry Andric break; 40640b57cec5SDimitry Andric } 40650b57cec5SDimitry Andric 40660b57cec5SDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally. 40670b57cec5SDimitry Andric // Instead of letting the parser complain about the unknown token, 40680b57cec5SDimitry Andric // just diagnose the invalid UTF-8, then drop the character. 40690b57cec5SDimitry Andric Diag(CurPtr, diag::err_invalid_utf8); 40700b57cec5SDimitry Andric 40710b57cec5SDimitry Andric BufferPtr = CurPtr+1; 40720b57cec5SDimitry Andric // We're pretending the character didn't exist, so just try again with 40730b57cec5SDimitry Andric // this lexer. 
40740b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 40750b57cec5SDimitry Andric goto LexNextToken; 40760b57cec5SDimitry Andric } 40770b57cec5SDimitry Andric } 40780b57cec5SDimitry Andric 40790b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 40800b57cec5SDimitry Andric MIOpt.ReadToken(); 40810b57cec5SDimitry Andric 40820b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 40830b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 40840b57cec5SDimitry Andric return true; 40850b57cec5SDimitry Andric 40860b57cec5SDimitry Andric HandleDirective: 40870b57cec5SDimitry Andric // We parsed a # character and it's the start of a preprocessing directive. 40880b57cec5SDimitry Andric 40890b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::hash); 40900b57cec5SDimitry Andric PP->HandleDirective(Result); 40910b57cec5SDimitry Andric 40920b57cec5SDimitry Andric if (PP->hadModuleLoaderFatalFailure()) { 40930b57cec5SDimitry Andric // With a fatal failure in the module loader, we abort parsing. 40940b57cec5SDimitry Andric assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof"); 40950b57cec5SDimitry Andric return true; 40960b57cec5SDimitry Andric } 40970b57cec5SDimitry Andric 40980b57cec5SDimitry Andric // We parsed the directive; lex a token with the new state. 40990b57cec5SDimitry Andric return false; 41000b57cec5SDimitry Andric } 4101