10b57cec5SDimitry Andric //===- Lexer.cpp - C Language Family Lexer --------------------------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric // 90b57cec5SDimitry Andric // This file implements the Lexer and Token interfaces. 100b57cec5SDimitry Andric // 110b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 120b57cec5SDimitry Andric 130b57cec5SDimitry Andric #include "clang/Lex/Lexer.h" 140b57cec5SDimitry Andric #include "UnicodeCharSets.h" 150b57cec5SDimitry Andric #include "clang/Basic/CharInfo.h" 160b57cec5SDimitry Andric #include "clang/Basic/IdentifierTable.h" 170b57cec5SDimitry Andric #include "clang/Basic/LangOptions.h" 180b57cec5SDimitry Andric #include "clang/Basic/SourceLocation.h" 190b57cec5SDimitry Andric #include "clang/Basic/SourceManager.h" 200b57cec5SDimitry Andric #include "clang/Basic/TokenKinds.h" 210b57cec5SDimitry Andric #include "clang/Lex/LexDiagnostic.h" 220b57cec5SDimitry Andric #include "clang/Lex/LiteralSupport.h" 230b57cec5SDimitry Andric #include "clang/Lex/MultipleIncludeOpt.h" 240b57cec5SDimitry Andric #include "clang/Lex/Preprocessor.h" 250b57cec5SDimitry Andric #include "clang/Lex/PreprocessorOptions.h" 260b57cec5SDimitry Andric #include "clang/Lex/Token.h" 270b57cec5SDimitry Andric #include "clang/Basic/Diagnostic.h" 280b57cec5SDimitry Andric #include "clang/Basic/LLVM.h" 290b57cec5SDimitry Andric #include "clang/Basic/TokenKinds.h" 300b57cec5SDimitry Andric #include "llvm/ADT/None.h" 310b57cec5SDimitry Andric #include "llvm/ADT/Optional.h" 32*5ffd83dbSDimitry Andric #include "llvm/ADT/STLExtras.h" 330b57cec5SDimitry Andric #include "llvm/ADT/StringExtras.h" 340b57cec5SDimitry Andric #include "llvm/ADT/StringSwitch.h" 350b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h" 360b57cec5SDimitry Andric #include "llvm/Support/Compiler.h" 370b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h" 380b57cec5SDimitry Andric #include "llvm/Support/MathExtras.h" 390b57cec5SDimitry Andric #include "llvm/Support/MemoryBuffer.h" 400b57cec5SDimitry Andric #include "llvm/Support/NativeFormatting.h" 410b57cec5SDimitry Andric #include "llvm/Support/UnicodeCharRanges.h" 420b57cec5SDimitry Andric #include <algorithm> 430b57cec5SDimitry Andric #include <cassert> 440b57cec5SDimitry Andric #include <cstddef> 450b57cec5SDimitry Andric #include <cstdint> 460b57cec5SDimitry Andric #include <cstring> 470b57cec5SDimitry Andric #include <string> 480b57cec5SDimitry Andric #include <tuple> 490b57cec5SDimitry Andric #include <utility> 500b57cec5SDimitry Andric 510b57cec5SDimitry Andric using namespace clang; 520b57cec5SDimitry Andric 530b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 540b57cec5SDimitry Andric // Token Class Implementation 550b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 560b57cec5SDimitry Andric 570b57cec5SDimitry Andric /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 580b57cec5SDimitry Andric bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 590b57cec5SDimitry Andric if (isAnnotation()) 600b57cec5SDimitry Andric return false; 610b57cec5SDimitry Andric if (IdentifierInfo *II = getIdentifierInfo()) 620b57cec5SDimitry Andric return II->getObjCKeywordID() == objcKey; 630b57cec5SDimitry Andric return false; 640b57cec5SDimitry Andric } 650b57cec5SDimitry Andric 660b57cec5SDimitry Andric /// getObjCKeywordID - Return the ObjC keyword kind. 670b57cec5SDimitry Andric tok::ObjCKeywordKind Token::getObjCKeywordID() const { 680b57cec5SDimitry Andric if (isAnnotation()) 690b57cec5SDimitry Andric return tok::objc_not_keyword; 700b57cec5SDimitry Andric IdentifierInfo *specId = getIdentifierInfo(); 710b57cec5SDimitry Andric return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 720b57cec5SDimitry Andric } 730b57cec5SDimitry Andric 740b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 750b57cec5SDimitry Andric // Lexer Class Implementation 760b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 770b57cec5SDimitry Andric 780b57cec5SDimitry Andric void Lexer::anchor() {} 790b57cec5SDimitry Andric 800b57cec5SDimitry Andric void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 810b57cec5SDimitry Andric const char *BufEnd) { 820b57cec5SDimitry Andric BufferStart = BufStart; 830b57cec5SDimitry Andric BufferPtr = BufPtr; 840b57cec5SDimitry Andric BufferEnd = BufEnd; 850b57cec5SDimitry Andric 860b57cec5SDimitry Andric assert(BufEnd[0] == 0 && 870b57cec5SDimitry Andric "We assume that the input buffer has a null character at the end" 880b57cec5SDimitry Andric " to simplify lexing!"); 890b57cec5SDimitry Andric 900b57cec5SDimitry Andric // Check whether we have a BOM in the beginning of the buffer. If yes - act 910b57cec5SDimitry Andric // accordingly. Right now we support only UTF-8 with and without BOM, so, just 920b57cec5SDimitry Andric // skip the UTF-8 BOM if it's present. 930b57cec5SDimitry Andric if (BufferStart == BufferPtr) { 940b57cec5SDimitry Andric // Determine the size of the BOM. 950b57cec5SDimitry Andric StringRef Buf(BufferStart, BufferEnd - BufferStart); 960b57cec5SDimitry Andric size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 970b57cec5SDimitry Andric .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 980b57cec5SDimitry Andric .Default(0); 990b57cec5SDimitry Andric 1000b57cec5SDimitry Andric // Skip the BOM. 1010b57cec5SDimitry Andric BufferPtr += BOMLength; 1020b57cec5SDimitry Andric } 1030b57cec5SDimitry Andric 1040b57cec5SDimitry Andric Is_PragmaLexer = false; 1050b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None; 1060b57cec5SDimitry Andric 1070b57cec5SDimitry Andric // Start of the file is a start of line. 1080b57cec5SDimitry Andric IsAtStartOfLine = true; 1090b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 1100b57cec5SDimitry Andric 1110b57cec5SDimitry Andric HasLeadingSpace = false; 1120b57cec5SDimitry Andric HasLeadingEmptyMacro = false; 1130b57cec5SDimitry Andric 1140b57cec5SDimitry Andric // We are not after parsing a #. 1150b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 1160b57cec5SDimitry Andric 1170b57cec5SDimitry Andric // We are not after parsing #include. 1180b57cec5SDimitry Andric ParsingFilename = false; 1190b57cec5SDimitry Andric 1200b57cec5SDimitry Andric // We are not in raw mode. Raw mode disables diagnostics and interpretation 1210b57cec5SDimitry Andric // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 1220b57cec5SDimitry Andric // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 1230b57cec5SDimitry Andric // or otherwise skipping over tokens. 1240b57cec5SDimitry Andric LexingRawMode = false; 1250b57cec5SDimitry Andric 1260b57cec5SDimitry Andric // Default to not keeping comments. 1270b57cec5SDimitry Andric ExtendedTokenMode = 0; 1280b57cec5SDimitry Andric } 1290b57cec5SDimitry Andric 1300b57cec5SDimitry Andric /// Lexer constructor - Create a new lexer object for the specified buffer 1310b57cec5SDimitry Andric /// with the specified preprocessor managing the lexing process. This lexer 1320b57cec5SDimitry Andric /// assumes that the associated file buffer and Preprocessor objects will 1330b57cec5SDimitry Andric /// outlive it, so it doesn't take ownership of either of them. 1340b57cec5SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP) 1350b57cec5SDimitry Andric : PreprocessorLexer(&PP, FID), 1360b57cec5SDimitry Andric FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 1370b57cec5SDimitry Andric LangOpts(PP.getLangOpts()) { 1380b57cec5SDimitry Andric InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 1390b57cec5SDimitry Andric InputFile->getBufferEnd()); 1400b57cec5SDimitry Andric 1410b57cec5SDimitry Andric resetExtendedTokenMode(); 1420b57cec5SDimitry Andric } 1430b57cec5SDimitry Andric 1440b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only 1450b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 1460b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it. 1470b57cec5SDimitry Andric Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, 1480b57cec5SDimitry Andric const char *BufStart, const char *BufPtr, const char *BufEnd) 1490b57cec5SDimitry Andric : FileLoc(fileloc), LangOpts(langOpts) { 1500b57cec5SDimitry Andric InitLexer(BufStart, BufPtr, BufEnd); 1510b57cec5SDimitry Andric 1520b57cec5SDimitry Andric // We *are* in raw mode. 1530b57cec5SDimitry Andric LexingRawMode = true; 1540b57cec5SDimitry Andric } 1550b57cec5SDimitry Andric 1560b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only 1570b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 1580b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it. 1590b57cec5SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile, 1600b57cec5SDimitry Andric const SourceManager &SM, const LangOptions &langOpts) 1610b57cec5SDimitry Andric : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(), 1620b57cec5SDimitry Andric FromFile->getBufferStart(), FromFile->getBufferEnd()) {} 1630b57cec5SDimitry Andric 1640b57cec5SDimitry Andric void Lexer::resetExtendedTokenMode() { 1650b57cec5SDimitry Andric assert(PP && "Cannot reset token mode without a preprocessor"); 1660b57cec5SDimitry Andric if (LangOpts.TraditionalCPP) 1670b57cec5SDimitry Andric SetKeepWhitespaceMode(true); 1680b57cec5SDimitry Andric else 1690b57cec5SDimitry Andric SetCommentRetentionState(PP->getCommentRetentionState()); 1700b57cec5SDimitry Andric } 1710b57cec5SDimitry Andric 1720b57cec5SDimitry Andric /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 1730b57cec5SDimitry Andric /// _Pragma expansion. This has a variety of magic semantics that this method 1740b57cec5SDimitry Andric /// sets up. It returns a new'd Lexer that must be delete'd when done. 1750b57cec5SDimitry Andric /// 1760b57cec5SDimitry Andric /// On entrance to this routine, TokStartLoc is a macro location which has a 1770b57cec5SDimitry Andric /// spelling loc that indicates the bytes to be lexed for the token and an 1780b57cec5SDimitry Andric /// expansion location that indicates where all lexed tokens should be 1790b57cec5SDimitry Andric /// "expanded from". 1800b57cec5SDimitry Andric /// 1810b57cec5SDimitry Andric /// TODO: It would really be nice to make _Pragma just be a wrapper around a 1820b57cec5SDimitry Andric /// normal lexer that remaps tokens as they fly by. This would require making 1830b57cec5SDimitry Andric /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 1840b57cec5SDimitry Andric /// interface that could handle this stuff. This would pull GetMappedTokenLoc 1850b57cec5SDimitry Andric /// out of the critical path of the lexer! 1860b57cec5SDimitry Andric /// 1870b57cec5SDimitry Andric Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 1880b57cec5SDimitry Andric SourceLocation ExpansionLocStart, 1890b57cec5SDimitry Andric SourceLocation ExpansionLocEnd, 1900b57cec5SDimitry Andric unsigned TokLen, Preprocessor &PP) { 1910b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager(); 1920b57cec5SDimitry Andric 1930b57cec5SDimitry Andric // Create the lexer as if we were going to lex the file normally. 1940b57cec5SDimitry Andric FileID SpellingFID = SM.getFileID(SpellingLoc); 1950b57cec5SDimitry Andric const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); 1960b57cec5SDimitry Andric Lexer *L = new Lexer(SpellingFID, InputFile, PP); 1970b57cec5SDimitry Andric 1980b57cec5SDimitry Andric // Now that the lexer is created, change the start/end locations so that we 1990b57cec5SDimitry Andric // just lex the subsection of the file that we want. This is lexing from a 2000b57cec5SDimitry Andric // scratch buffer. 2010b57cec5SDimitry Andric const char *StrData = SM.getCharacterData(SpellingLoc); 2020b57cec5SDimitry Andric 2030b57cec5SDimitry Andric L->BufferPtr = StrData; 2040b57cec5SDimitry Andric L->BufferEnd = StrData+TokLen; 2050b57cec5SDimitry Andric assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 2060b57cec5SDimitry Andric 2070b57cec5SDimitry Andric // Set the SourceLocation with the remapping information. This ensures that 2080b57cec5SDimitry Andric // GetMappedTokenLoc will remap the tokens as they are lexed. 2090b57cec5SDimitry Andric L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 2100b57cec5SDimitry Andric ExpansionLocStart, 2110b57cec5SDimitry Andric ExpansionLocEnd, TokLen); 2120b57cec5SDimitry Andric 2130b57cec5SDimitry Andric // Ensure that the lexer thinks it is inside a directive, so that end \n will 2140b57cec5SDimitry Andric // return an EOD token. 2150b57cec5SDimitry Andric L->ParsingPreprocessorDirective = true; 2160b57cec5SDimitry Andric 2170b57cec5SDimitry Andric // This lexer really is for _Pragma. 2180b57cec5SDimitry Andric L->Is_PragmaLexer = true; 2190b57cec5SDimitry Andric return L; 2200b57cec5SDimitry Andric } 2210b57cec5SDimitry Andric 222a7dea167SDimitry Andric bool Lexer::skipOver(unsigned NumBytes) { 223a7dea167SDimitry Andric IsAtPhysicalStartOfLine = true; 224a7dea167SDimitry Andric IsAtStartOfLine = true; 225a7dea167SDimitry Andric if ((BufferPtr + NumBytes) > BufferEnd) 226a7dea167SDimitry Andric return true; 227a7dea167SDimitry Andric BufferPtr += NumBytes; 228a7dea167SDimitry Andric return false; 229a7dea167SDimitry Andric } 230a7dea167SDimitry Andric 2310b57cec5SDimitry Andric template <typename T> static void StringifyImpl(T &Str, char Quote) { 2320b57cec5SDimitry Andric typename T::size_type i = 0, e = Str.size(); 2330b57cec5SDimitry Andric while (i < e) { 2340b57cec5SDimitry Andric if (Str[i] == '\\' || Str[i] == Quote) { 2350b57cec5SDimitry Andric Str.insert(Str.begin() + i, '\\'); 2360b57cec5SDimitry Andric i += 2; 2370b57cec5SDimitry Andric ++e; 2380b57cec5SDimitry Andric } else if (Str[i] == '\n' || Str[i] == '\r') { 2390b57cec5SDimitry Andric // Replace '\r\n' and '\n\r' to '\\' followed by 'n'. 2400b57cec5SDimitry Andric if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && 2410b57cec5SDimitry Andric Str[i] != Str[i + 1]) { 2420b57cec5SDimitry Andric Str[i] = '\\'; 2430b57cec5SDimitry Andric Str[i + 1] = 'n'; 2440b57cec5SDimitry Andric } else { 2450b57cec5SDimitry Andric // Replace '\n' and '\r' to '\\' followed by 'n'. 2460b57cec5SDimitry Andric Str[i] = '\\'; 2470b57cec5SDimitry Andric Str.insert(Str.begin() + i + 1, 'n'); 2480b57cec5SDimitry Andric ++e; 2490b57cec5SDimitry Andric } 2500b57cec5SDimitry Andric i += 2; 2510b57cec5SDimitry Andric } else 2520b57cec5SDimitry Andric ++i; 2530b57cec5SDimitry Andric } 2540b57cec5SDimitry Andric } 2550b57cec5SDimitry Andric 2560b57cec5SDimitry Andric std::string Lexer::Stringify(StringRef Str, bool Charify) { 257*5ffd83dbSDimitry Andric std::string Result = std::string(Str); 2580b57cec5SDimitry Andric char Quote = Charify ? '\'' : '"'; 2590b57cec5SDimitry Andric StringifyImpl(Result, Quote); 2600b57cec5SDimitry Andric return Result; 2610b57cec5SDimitry Andric } 2620b57cec5SDimitry Andric 2630b57cec5SDimitry Andric void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); } 2640b57cec5SDimitry Andric 2650b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 2660b57cec5SDimitry Andric // Token Spelling 2670b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 2680b57cec5SDimitry Andric 2690b57cec5SDimitry Andric /// Slow case of getSpelling. Extract the characters comprising the 2700b57cec5SDimitry Andric /// spelling of this token from the provided input buffer. 2710b57cec5SDimitry Andric static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, 2720b57cec5SDimitry Andric const LangOptions &LangOpts, char *Spelling) { 2730b57cec5SDimitry Andric assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); 2740b57cec5SDimitry Andric 2750b57cec5SDimitry Andric size_t Length = 0; 2760b57cec5SDimitry Andric const char *BufEnd = BufPtr + Tok.getLength(); 2770b57cec5SDimitry Andric 2780b57cec5SDimitry Andric if (tok::isStringLiteral(Tok.getKind())) { 2790b57cec5SDimitry Andric // Munch the encoding-prefix and opening double-quote. 2800b57cec5SDimitry Andric while (BufPtr < BufEnd) { 2810b57cec5SDimitry Andric unsigned Size; 2820b57cec5SDimitry Andric Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 2830b57cec5SDimitry Andric BufPtr += Size; 2840b57cec5SDimitry Andric 2850b57cec5SDimitry Andric if (Spelling[Length - 1] == '"') 2860b57cec5SDimitry Andric break; 2870b57cec5SDimitry Andric } 2880b57cec5SDimitry Andric 2890b57cec5SDimitry Andric // Raw string literals need special handling; trigraph expansion and line 2900b57cec5SDimitry Andric // splicing do not occur within their d-char-sequence nor within their 2910b57cec5SDimitry Andric // r-char-sequence. 2920b57cec5SDimitry Andric if (Length >= 2 && 2930b57cec5SDimitry Andric Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { 2940b57cec5SDimitry Andric // Search backwards from the end of the token to find the matching closing 2950b57cec5SDimitry Andric // quote. 2960b57cec5SDimitry Andric const char *RawEnd = BufEnd; 2970b57cec5SDimitry Andric do --RawEnd; while (*RawEnd != '"'); 2980b57cec5SDimitry Andric size_t RawLength = RawEnd - BufPtr + 1; 2990b57cec5SDimitry Andric 3000b57cec5SDimitry Andric // Everything between the quotes is included verbatim in the spelling. 3010b57cec5SDimitry Andric memcpy(Spelling + Length, BufPtr, RawLength); 3020b57cec5SDimitry Andric Length += RawLength; 3030b57cec5SDimitry Andric BufPtr += RawLength; 3040b57cec5SDimitry Andric 3050b57cec5SDimitry Andric // The rest of the token is lexed normally. 3060b57cec5SDimitry Andric } 3070b57cec5SDimitry Andric } 3080b57cec5SDimitry Andric 3090b57cec5SDimitry Andric while (BufPtr < BufEnd) { 3100b57cec5SDimitry Andric unsigned Size; 3110b57cec5SDimitry Andric Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 3120b57cec5SDimitry Andric BufPtr += Size; 3130b57cec5SDimitry Andric } 3140b57cec5SDimitry Andric 3150b57cec5SDimitry Andric assert(Length < Tok.getLength() && 3160b57cec5SDimitry Andric "NeedsCleaning flag set on token that didn't need cleaning!"); 3170b57cec5SDimitry Andric return Length; 3180b57cec5SDimitry Andric } 3190b57cec5SDimitry Andric 3200b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a 3210b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file 3220b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this 3230b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs 3240b57cec5SDimitry Andric /// UCNs, etc. 3250b57cec5SDimitry Andric StringRef Lexer::getSpelling(SourceLocation loc, 3260b57cec5SDimitry Andric SmallVectorImpl<char> &buffer, 3270b57cec5SDimitry Andric const SourceManager &SM, 3280b57cec5SDimitry Andric const LangOptions &options, 3290b57cec5SDimitry Andric bool *invalid) { 3300b57cec5SDimitry Andric // Break down the source location. 3310b57cec5SDimitry Andric std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 3320b57cec5SDimitry Andric 3330b57cec5SDimitry Andric // Try to the load the file buffer. 3340b57cec5SDimitry Andric bool invalidTemp = false; 3350b57cec5SDimitry Andric StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 3360b57cec5SDimitry Andric if (invalidTemp) { 3370b57cec5SDimitry Andric if (invalid) *invalid = true; 3380b57cec5SDimitry Andric return {}; 3390b57cec5SDimitry Andric } 3400b57cec5SDimitry Andric 3410b57cec5SDimitry Andric const char *tokenBegin = file.data() + locInfo.second; 3420b57cec5SDimitry Andric 3430b57cec5SDimitry Andric // Lex from the start of the given location. 3440b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 3450b57cec5SDimitry Andric file.begin(), tokenBegin, file.end()); 3460b57cec5SDimitry Andric Token token; 3470b57cec5SDimitry Andric lexer.LexFromRawLexer(token); 3480b57cec5SDimitry Andric 3490b57cec5SDimitry Andric unsigned length = token.getLength(); 3500b57cec5SDimitry Andric 3510b57cec5SDimitry Andric // Common case: no need for cleaning. 3520b57cec5SDimitry Andric if (!token.needsCleaning()) 3530b57cec5SDimitry Andric return StringRef(tokenBegin, length); 3540b57cec5SDimitry Andric 3550b57cec5SDimitry Andric // Hard case, we need to relex the characters into the string. 3560b57cec5SDimitry Andric buffer.resize(length); 3570b57cec5SDimitry Andric buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); 3580b57cec5SDimitry Andric return StringRef(buffer.data(), buffer.size()); 3590b57cec5SDimitry Andric } 3600b57cec5SDimitry Andric 3610b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a 3620b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file 3630b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this 3640b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs 3650b57cec5SDimitry Andric /// UCNs, etc. 3660b57cec5SDimitry Andric std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 3670b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) { 3680b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 3690b57cec5SDimitry Andric 3700b57cec5SDimitry Andric bool CharDataInvalid = false; 3710b57cec5SDimitry Andric const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 3720b57cec5SDimitry Andric &CharDataInvalid); 3730b57cec5SDimitry Andric if (Invalid) 3740b57cec5SDimitry Andric *Invalid = CharDataInvalid; 3750b57cec5SDimitry Andric if (CharDataInvalid) 3760b57cec5SDimitry Andric return {}; 3770b57cec5SDimitry Andric 3780b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly. 3790b57cec5SDimitry Andric if (!Tok.needsCleaning()) 3800b57cec5SDimitry Andric return std::string(TokStart, TokStart + Tok.getLength()); 3810b57cec5SDimitry Andric 3820b57cec5SDimitry Andric std::string Result; 3830b57cec5SDimitry Andric Result.resize(Tok.getLength()); 3840b57cec5SDimitry Andric Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); 3850b57cec5SDimitry Andric return Result; 3860b57cec5SDimitry Andric } 3870b57cec5SDimitry Andric 3880b57cec5SDimitry Andric /// getSpelling - This method is used to get the spelling of a token into a 3890b57cec5SDimitry Andric /// preallocated buffer, instead of as an std::string. The caller is required 3900b57cec5SDimitry Andric /// to allocate enough space for the token, which is guaranteed to be at least 3910b57cec5SDimitry Andric /// Tok.getLength() bytes long. The actual length of the token is returned. 3920b57cec5SDimitry Andric /// 3930b57cec5SDimitry Andric /// Note that this method may do two possible things: it may either fill in 3940b57cec5SDimitry Andric /// the buffer specified with characters, or it may *change the input pointer* 3950b57cec5SDimitry Andric /// to point to a constant buffer with the data already in it (avoiding a 3960b57cec5SDimitry Andric /// copy). The caller is not allowed to modify the returned buffer pointer 3970b57cec5SDimitry Andric /// if an internal buffer is returned. 3980b57cec5SDimitry Andric unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 3990b57cec5SDimitry Andric const SourceManager &SourceMgr, 4000b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) { 4010b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 4020b57cec5SDimitry Andric 4030b57cec5SDimitry Andric const char *TokStart = nullptr; 4040b57cec5SDimitry Andric // NOTE: this has to be checked *before* testing for an IdentifierInfo. 4050b57cec5SDimitry Andric if (Tok.is(tok::raw_identifier)) 4060b57cec5SDimitry Andric TokStart = Tok.getRawIdentifier().data(); 4070b57cec5SDimitry Andric else if (!Tok.hasUCN()) { 4080b57cec5SDimitry Andric if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 4090b57cec5SDimitry Andric // Just return the string from the identifier table, which is very quick. 4100b57cec5SDimitry Andric Buffer = II->getNameStart(); 4110b57cec5SDimitry Andric return II->getLength(); 4120b57cec5SDimitry Andric } 4130b57cec5SDimitry Andric } 4140b57cec5SDimitry Andric 4150b57cec5SDimitry Andric // NOTE: this can be checked even after testing for an IdentifierInfo. 4160b57cec5SDimitry Andric if (Tok.isLiteral()) 4170b57cec5SDimitry Andric TokStart = Tok.getLiteralData(); 4180b57cec5SDimitry Andric 4190b57cec5SDimitry Andric if (!TokStart) { 4200b57cec5SDimitry Andric // Compute the start of the token in the input lexer buffer. 4210b57cec5SDimitry Andric bool CharDataInvalid = false; 4220b57cec5SDimitry Andric TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 4230b57cec5SDimitry Andric if (Invalid) 4240b57cec5SDimitry Andric *Invalid = CharDataInvalid; 4250b57cec5SDimitry Andric if (CharDataInvalid) { 4260b57cec5SDimitry Andric Buffer = ""; 4270b57cec5SDimitry Andric return 0; 4280b57cec5SDimitry Andric } 4290b57cec5SDimitry Andric } 4300b57cec5SDimitry Andric 4310b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly. 4320b57cec5SDimitry Andric if (!Tok.needsCleaning()) { 4330b57cec5SDimitry Andric Buffer = TokStart; 4340b57cec5SDimitry Andric return Tok.getLength(); 4350b57cec5SDimitry Andric } 4360b57cec5SDimitry Andric 4370b57cec5SDimitry Andric // Otherwise, hard case, relex the characters into the string. 4380b57cec5SDimitry Andric return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); 4390b57cec5SDimitry Andric } 4400b57cec5SDimitry Andric 4410b57cec5SDimitry Andric /// MeasureTokenLength - Relex the token at the specified location and return 4420b57cec5SDimitry Andric /// its length in bytes in the input file. If the token needs cleaning (e.g. 4430b57cec5SDimitry Andric /// includes a trigraph or an escaped newline) then this count includes bytes 4440b57cec5SDimitry Andric /// that are part of that. 4450b57cec5SDimitry Andric unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 4460b57cec5SDimitry Andric const SourceManager &SM, 4470b57cec5SDimitry Andric const LangOptions &LangOpts) { 4480b57cec5SDimitry Andric Token TheTok; 4490b57cec5SDimitry Andric if (getRawToken(Loc, TheTok, SM, LangOpts)) 4500b57cec5SDimitry Andric return 0; 4510b57cec5SDimitry Andric return TheTok.getLength(); 4520b57cec5SDimitry Andric } 4530b57cec5SDimitry Andric 4540b57cec5SDimitry Andric /// Relex the token at the specified location. 4550b57cec5SDimitry Andric /// \returns true if there was a failure, false on success. 4560b57cec5SDimitry Andric bool Lexer::getRawToken(SourceLocation Loc, Token &Result, 4570b57cec5SDimitry Andric const SourceManager &SM, 4580b57cec5SDimitry Andric const LangOptions &LangOpts, 4590b57cec5SDimitry Andric bool IgnoreWhiteSpace) { 4600b57cec5SDimitry Andric // TODO: this could be special cased for common tokens like identifiers, ')', 4610b57cec5SDimitry Andric // etc to make this faster, if it mattered. Just look at StrData[0] to handle 4620b57cec5SDimitry Andric // all obviously single-char tokens. This could use 4630b57cec5SDimitry Andric // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 4640b57cec5SDimitry Andric // something. 4650b57cec5SDimitry Andric 4660b57cec5SDimitry Andric // If this comes from a macro expansion, we really do want the macro name, not 4670b57cec5SDimitry Andric // the token this macro expanded to. 4680b57cec5SDimitry Andric Loc = SM.getExpansionLoc(Loc); 4690b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 4700b57cec5SDimitry Andric bool Invalid = false; 4710b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 4720b57cec5SDimitry Andric if (Invalid) 4730b57cec5SDimitry Andric return true; 4740b57cec5SDimitry Andric 4750b57cec5SDimitry Andric const char *StrData = Buffer.data()+LocInfo.second; 4760b57cec5SDimitry Andric 4770b57cec5SDimitry Andric if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) 4780b57cec5SDimitry Andric return true; 4790b57cec5SDimitry Andric 4800b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token. 4810b57cec5SDimitry Andric Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 4820b57cec5SDimitry Andric Buffer.begin(), StrData, Buffer.end()); 4830b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 4840b57cec5SDimitry Andric TheLexer.LexFromRawLexer(Result); 4850b57cec5SDimitry Andric return false; 4860b57cec5SDimitry Andric } 4870b57cec5SDimitry Andric 4880b57cec5SDimitry Andric /// Returns the pointer that points to the beginning of line that contains 4890b57cec5SDimitry Andric /// the given offset, or null if the offset if invalid. 4900b57cec5SDimitry Andric static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) { 4910b57cec5SDimitry Andric const char *BufStart = Buffer.data(); 4920b57cec5SDimitry Andric if (Offset >= Buffer.size()) 4930b57cec5SDimitry Andric return nullptr; 4940b57cec5SDimitry Andric 4950b57cec5SDimitry Andric const char *LexStart = BufStart + Offset; 4960b57cec5SDimitry Andric for (; LexStart != BufStart; --LexStart) { 4970b57cec5SDimitry Andric if (isVerticalWhitespace(LexStart[0]) && 4980b57cec5SDimitry Andric !Lexer::isNewLineEscaped(BufStart, LexStart)) { 4990b57cec5SDimitry Andric // LexStart should point at first character of logical line. 5000b57cec5SDimitry Andric ++LexStart; 5010b57cec5SDimitry Andric break; 5020b57cec5SDimitry Andric } 5030b57cec5SDimitry Andric } 5040b57cec5SDimitry Andric return LexStart; 5050b57cec5SDimitry Andric } 5060b57cec5SDimitry Andric 5070b57cec5SDimitry Andric static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 5080b57cec5SDimitry Andric const SourceManager &SM, 5090b57cec5SDimitry Andric const LangOptions &LangOpts) { 5100b57cec5SDimitry Andric assert(Loc.isFileID()); 5110b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 5120b57cec5SDimitry Andric if (LocInfo.first.isInvalid()) 5130b57cec5SDimitry Andric return Loc; 5140b57cec5SDimitry Andric 5150b57cec5SDimitry Andric bool Invalid = false; 5160b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 5170b57cec5SDimitry Andric if (Invalid) 5180b57cec5SDimitry Andric return Loc; 5190b57cec5SDimitry Andric 5200b57cec5SDimitry Andric // Back up from the current location until we hit the beginning of a line 5210b57cec5SDimitry Andric // (or the buffer). We'll relex from that point. 5220b57cec5SDimitry Andric const char *StrData = Buffer.data() + LocInfo.second; 5230b57cec5SDimitry Andric const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second); 5240b57cec5SDimitry Andric if (!LexStart || LexStart == StrData) 5250b57cec5SDimitry Andric return Loc; 5260b57cec5SDimitry Andric 5270b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token. 5280b57cec5SDimitry Andric SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 5290b57cec5SDimitry Andric Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart, 5300b57cec5SDimitry Andric Buffer.end()); 5310b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 5320b57cec5SDimitry Andric 5330b57cec5SDimitry Andric // Lex tokens until we find the token that contains the source location. 5340b57cec5SDimitry Andric Token TheTok; 5350b57cec5SDimitry Andric do { 5360b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 5370b57cec5SDimitry Andric 5380b57cec5SDimitry Andric if (TheLexer.getBufferLocation() > StrData) { 5390b57cec5SDimitry Andric // Lexing this token has taken the lexer past the source location we're 5400b57cec5SDimitry Andric // looking for. If the current token encompasses our source location, 5410b57cec5SDimitry Andric // return the beginning of that token. 5420b57cec5SDimitry Andric if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 5430b57cec5SDimitry Andric return TheTok.getLocation(); 5440b57cec5SDimitry Andric 5450b57cec5SDimitry Andric // We ended up skipping over the source location entirely, which means 5460b57cec5SDimitry Andric // that it points into whitespace. We're done here. 5470b57cec5SDimitry Andric break; 5480b57cec5SDimitry Andric } 5490b57cec5SDimitry Andric } while (TheTok.getKind() != tok::eof); 5500b57cec5SDimitry Andric 5510b57cec5SDimitry Andric // We've passed our source location; just return the original source location. 5520b57cec5SDimitry Andric return Loc; 5530b57cec5SDimitry Andric } 5540b57cec5SDimitry Andric 5550b57cec5SDimitry Andric SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 5560b57cec5SDimitry Andric const SourceManager &SM, 5570b57cec5SDimitry Andric const LangOptions &LangOpts) { 5580b57cec5SDimitry Andric if (Loc.isFileID()) 5590b57cec5SDimitry Andric return getBeginningOfFileToken(Loc, SM, LangOpts); 5600b57cec5SDimitry Andric 5610b57cec5SDimitry Andric if (!SM.isMacroArgExpansion(Loc)) 5620b57cec5SDimitry Andric return Loc; 5630b57cec5SDimitry Andric 5640b57cec5SDimitry Andric SourceLocation FileLoc = SM.getSpellingLoc(Loc); 5650b57cec5SDimitry Andric SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 5660b57cec5SDimitry Andric std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 5670b57cec5SDimitry Andric std::pair<FileID, unsigned> BeginFileLocInfo = 5680b57cec5SDimitry Andric SM.getDecomposedLoc(BeginFileLoc); 5690b57cec5SDimitry Andric assert(FileLocInfo.first == BeginFileLocInfo.first && 5700b57cec5SDimitry Andric FileLocInfo.second >= BeginFileLocInfo.second); 5710b57cec5SDimitry Andric return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); 5720b57cec5SDimitry Andric } 5730b57cec5SDimitry Andric 5740b57cec5SDimitry Andric namespace { 5750b57cec5SDimitry Andric 5760b57cec5SDimitry Andric enum PreambleDirectiveKind { 5770b57cec5SDimitry Andric PDK_Skipped, 5780b57cec5SDimitry Andric PDK_Unknown 5790b57cec5SDimitry Andric }; 5800b57cec5SDimitry Andric 5810b57cec5SDimitry Andric } // namespace 5820b57cec5SDimitry Andric 5830b57cec5SDimitry Andric PreambleBounds Lexer::ComputePreamble(StringRef Buffer, 5840b57cec5SDimitry Andric const LangOptions &LangOpts, 5850b57cec5SDimitry Andric unsigned MaxLines) { 5860b57cec5SDimitry Andric // Create a lexer starting at the beginning of the file. Note that we use a 5870b57cec5SDimitry Andric // "fake" file source location at offset 1 so that the lexer will track our 5880b57cec5SDimitry Andric // position within the file. 5890b57cec5SDimitry Andric const unsigned StartOffset = 1; 5900b57cec5SDimitry Andric SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset); 5910b57cec5SDimitry Andric Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(), 5920b57cec5SDimitry Andric Buffer.end()); 5930b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 5940b57cec5SDimitry Andric 5950b57cec5SDimitry Andric bool InPreprocessorDirective = false; 5960b57cec5SDimitry Andric Token TheTok; 5970b57cec5SDimitry Andric SourceLocation ActiveCommentLoc; 5980b57cec5SDimitry Andric 5990b57cec5SDimitry Andric unsigned MaxLineOffset = 0; 6000b57cec5SDimitry Andric if (MaxLines) { 6010b57cec5SDimitry Andric const char *CurPtr = Buffer.begin(); 6020b57cec5SDimitry Andric unsigned CurLine = 0; 6030b57cec5SDimitry Andric while (CurPtr != Buffer.end()) { 6040b57cec5SDimitry Andric char ch = *CurPtr++; 6050b57cec5SDimitry Andric if (ch == '\n') { 6060b57cec5SDimitry Andric ++CurLine; 6070b57cec5SDimitry Andric if (CurLine == MaxLines) 6080b57cec5SDimitry Andric break; 6090b57cec5SDimitry Andric } 6100b57cec5SDimitry Andric } 6110b57cec5SDimitry Andric if (CurPtr != Buffer.end()) 6120b57cec5SDimitry Andric MaxLineOffset = CurPtr - Buffer.begin(); 6130b57cec5SDimitry Andric } 6140b57cec5SDimitry Andric 6150b57cec5SDimitry Andric do { 6160b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 6170b57cec5SDimitry Andric 6180b57cec5SDimitry Andric if (InPreprocessorDirective) { 6190b57cec5SDimitry Andric // If we've hit the end of the file, we're done. 6200b57cec5SDimitry Andric if (TheTok.getKind() == tok::eof) { 6210b57cec5SDimitry Andric break; 6220b57cec5SDimitry Andric } 6230b57cec5SDimitry Andric 6240b57cec5SDimitry Andric // If we haven't hit the end of the preprocessor directive, skip this 6250b57cec5SDimitry Andric // token. 6260b57cec5SDimitry Andric if (!TheTok.isAtStartOfLine()) 6270b57cec5SDimitry Andric continue; 6280b57cec5SDimitry Andric 6290b57cec5SDimitry Andric // We've passed the end of the preprocessor directive, and will look 6300b57cec5SDimitry Andric // at this token again below. 6310b57cec5SDimitry Andric InPreprocessorDirective = false; 6320b57cec5SDimitry Andric } 6330b57cec5SDimitry Andric 6340b57cec5SDimitry Andric // Keep track of the # of lines in the preamble. 6350b57cec5SDimitry Andric if (TheTok.isAtStartOfLine()) { 6360b57cec5SDimitry Andric unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 6370b57cec5SDimitry Andric 6380b57cec5SDimitry Andric // If we were asked to limit the number of lines in the preamble, 6390b57cec5SDimitry Andric // and we're about to exceed that limit, we're done. 6400b57cec5SDimitry Andric if (MaxLineOffset && TokOffset >= MaxLineOffset) 6410b57cec5SDimitry Andric break; 6420b57cec5SDimitry Andric } 6430b57cec5SDimitry Andric 6440b57cec5SDimitry Andric // Comments are okay; skip over them. 6450b57cec5SDimitry Andric if (TheTok.getKind() == tok::comment) { 6460b57cec5SDimitry Andric if (ActiveCommentLoc.isInvalid()) 6470b57cec5SDimitry Andric ActiveCommentLoc = TheTok.getLocation(); 6480b57cec5SDimitry Andric continue; 6490b57cec5SDimitry Andric } 6500b57cec5SDimitry Andric 6510b57cec5SDimitry Andric if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 6520b57cec5SDimitry Andric // This is the start of a preprocessor directive. 6530b57cec5SDimitry Andric Token HashTok = TheTok; 6540b57cec5SDimitry Andric InPreprocessorDirective = true; 6550b57cec5SDimitry Andric ActiveCommentLoc = SourceLocation(); 6560b57cec5SDimitry Andric 6570b57cec5SDimitry Andric // Figure out which directive this is. Since we're lexing raw tokens, 6580b57cec5SDimitry Andric // we don't have an identifier table available. Instead, just look at 6590b57cec5SDimitry Andric // the raw identifier to recognize and categorize preprocessor directives. 6600b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 6610b57cec5SDimitry Andric if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 6620b57cec5SDimitry Andric StringRef Keyword = TheTok.getRawIdentifier(); 6630b57cec5SDimitry Andric PreambleDirectiveKind PDK 6640b57cec5SDimitry Andric = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 6650b57cec5SDimitry Andric .Case("include", PDK_Skipped) 6660b57cec5SDimitry Andric .Case("__include_macros", PDK_Skipped) 6670b57cec5SDimitry Andric .Case("define", PDK_Skipped) 6680b57cec5SDimitry Andric .Case("undef", PDK_Skipped) 6690b57cec5SDimitry Andric .Case("line", PDK_Skipped) 6700b57cec5SDimitry Andric .Case("error", PDK_Skipped) 6710b57cec5SDimitry Andric .Case("pragma", PDK_Skipped) 6720b57cec5SDimitry Andric .Case("import", PDK_Skipped) 6730b57cec5SDimitry Andric .Case("include_next", PDK_Skipped) 6740b57cec5SDimitry Andric .Case("warning", PDK_Skipped) 6750b57cec5SDimitry Andric .Case("ident", PDK_Skipped) 6760b57cec5SDimitry Andric .Case("sccs", PDK_Skipped) 6770b57cec5SDimitry Andric .Case("assert", PDK_Skipped) 6780b57cec5SDimitry Andric .Case("unassert", PDK_Skipped) 6790b57cec5SDimitry Andric .Case("if", PDK_Skipped) 6800b57cec5SDimitry Andric .Case("ifdef", PDK_Skipped) 6810b57cec5SDimitry Andric .Case("ifndef", PDK_Skipped) 6820b57cec5SDimitry Andric .Case("elif", PDK_Skipped) 6830b57cec5SDimitry Andric .Case("else", PDK_Skipped) 6840b57cec5SDimitry Andric .Case("endif", PDK_Skipped) 6850b57cec5SDimitry Andric .Default(PDK_Unknown); 6860b57cec5SDimitry Andric 6870b57cec5SDimitry Andric switch (PDK) { 6880b57cec5SDimitry Andric case PDK_Skipped: 6890b57cec5SDimitry Andric continue; 6900b57cec5SDimitry Andric 6910b57cec5SDimitry Andric case PDK_Unknown: 6920b57cec5SDimitry Andric // We don't know what this directive is; stop at the '#'. 6930b57cec5SDimitry Andric break; 6940b57cec5SDimitry Andric } 6950b57cec5SDimitry Andric } 6960b57cec5SDimitry Andric 6970b57cec5SDimitry Andric // We only end up here if we didn't recognize the preprocessor 6980b57cec5SDimitry Andric // directive or it was one that can't occur in the preamble at this 6990b57cec5SDimitry Andric // point. Roll back the current token to the location of the '#'. 7000b57cec5SDimitry Andric TheTok = HashTok; 7010b57cec5SDimitry Andric } 7020b57cec5SDimitry Andric 7030b57cec5SDimitry Andric // We hit a token that we don't recognize as being in the 7040b57cec5SDimitry Andric // "preprocessing only" part of the file, so we're no longer in 7050b57cec5SDimitry Andric // the preamble. 7060b57cec5SDimitry Andric break; 7070b57cec5SDimitry Andric } while (true); 7080b57cec5SDimitry Andric 7090b57cec5SDimitry Andric SourceLocation End; 7100b57cec5SDimitry Andric if (ActiveCommentLoc.isValid()) 7110b57cec5SDimitry Andric End = ActiveCommentLoc; // don't truncate a decl comment. 7120b57cec5SDimitry Andric else 7130b57cec5SDimitry Andric End = TheTok.getLocation(); 7140b57cec5SDimitry Andric 7150b57cec5SDimitry Andric return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(), 7160b57cec5SDimitry Andric TheTok.isAtStartOfLine()); 7170b57cec5SDimitry Andric } 7180b57cec5SDimitry Andric 7190b57cec5SDimitry Andric unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, 7200b57cec5SDimitry Andric const SourceManager &SM, 7210b57cec5SDimitry Andric const LangOptions &LangOpts) { 7220b57cec5SDimitry Andric // Figure out how many physical characters away the specified expansion 7230b57cec5SDimitry Andric // character is. This needs to take into consideration newlines and 7240b57cec5SDimitry Andric // trigraphs. 7250b57cec5SDimitry Andric bool Invalid = false; 7260b57cec5SDimitry Andric const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 7270b57cec5SDimitry Andric 7280b57cec5SDimitry Andric // If they request the first char of the token, we're trivially done. 7290b57cec5SDimitry Andric if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 7300b57cec5SDimitry Andric return 0; 7310b57cec5SDimitry Andric 7320b57cec5SDimitry Andric unsigned PhysOffset = 0; 7330b57cec5SDimitry Andric 7340b57cec5SDimitry Andric // The usual case is that tokens don't contain anything interesting. Skip 7350b57cec5SDimitry Andric // over the uninteresting characters. If a token only consists of simple 7360b57cec5SDimitry Andric // chars, this method is extremely fast. 7370b57cec5SDimitry Andric while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 7380b57cec5SDimitry Andric if (CharNo == 0) 7390b57cec5SDimitry Andric return PhysOffset; 7400b57cec5SDimitry Andric ++TokPtr; 7410b57cec5SDimitry Andric --CharNo; 7420b57cec5SDimitry Andric ++PhysOffset; 7430b57cec5SDimitry Andric } 7440b57cec5SDimitry Andric 7450b57cec5SDimitry Andric // If we have a character that may be a trigraph or escaped newline, use a 7460b57cec5SDimitry Andric // lexer to parse it correctly. 7470b57cec5SDimitry Andric for (; CharNo; --CharNo) { 7480b57cec5SDimitry Andric unsigned Size; 7490b57cec5SDimitry Andric Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts); 7500b57cec5SDimitry Andric TokPtr += Size; 7510b57cec5SDimitry Andric PhysOffset += Size; 7520b57cec5SDimitry Andric } 7530b57cec5SDimitry Andric 7540b57cec5SDimitry Andric // Final detail: if we end up on an escaped newline, we want to return the 7550b57cec5SDimitry Andric // location of the actual byte of the token. For example foo\<newline>bar 7560b57cec5SDimitry Andric // advanced by 3 should return the location of b, not of \\. One compounding 7570b57cec5SDimitry Andric // detail of this is that the escape may be made by a trigraph. 7580b57cec5SDimitry Andric if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 7590b57cec5SDimitry Andric PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 7600b57cec5SDimitry Andric 7610b57cec5SDimitry Andric return PhysOffset; 7620b57cec5SDimitry Andric } 7630b57cec5SDimitry Andric 7640b57cec5SDimitry Andric /// Computes the source location just past the end of the 7650b57cec5SDimitry Andric /// token at this source location. 7660b57cec5SDimitry Andric /// 7670b57cec5SDimitry Andric /// This routine can be used to produce a source location that 7680b57cec5SDimitry Andric /// points just past the end of the token referenced by \p Loc, and 7690b57cec5SDimitry Andric /// is generally used when a diagnostic needs to point just after a 7700b57cec5SDimitry Andric /// token where it expected something different that it received. If 7710b57cec5SDimitry Andric /// the returned source location would not be meaningful (e.g., if 7720b57cec5SDimitry Andric /// it points into a macro), this routine returns an invalid 7730b57cec5SDimitry Andric /// source location. 7740b57cec5SDimitry Andric /// 7750b57cec5SDimitry Andric /// \param Offset an offset from the end of the token, where the source 7760b57cec5SDimitry Andric /// location should refer to. The default offset (0) produces a source 7770b57cec5SDimitry Andric /// location pointing just past the end of the token; an offset of 1 produces 7780b57cec5SDimitry Andric /// a source location pointing to the last character in the token, etc. 7790b57cec5SDimitry Andric SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 7800b57cec5SDimitry Andric const SourceManager &SM, 7810b57cec5SDimitry Andric const LangOptions &LangOpts) { 7820b57cec5SDimitry Andric if (Loc.isInvalid()) 7830b57cec5SDimitry Andric return {}; 7840b57cec5SDimitry Andric 7850b57cec5SDimitry Andric if (Loc.isMacroID()) { 7860b57cec5SDimitry Andric if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 7870b57cec5SDimitry Andric return {}; // Points inside the macro expansion. 7880b57cec5SDimitry Andric } 7890b57cec5SDimitry Andric 7900b57cec5SDimitry Andric unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 7910b57cec5SDimitry Andric if (Len > Offset) 7920b57cec5SDimitry Andric Len = Len - Offset; 7930b57cec5SDimitry Andric else 7940b57cec5SDimitry Andric return Loc; 7950b57cec5SDimitry Andric 7960b57cec5SDimitry Andric return Loc.getLocWithOffset(Len); 7970b57cec5SDimitry Andric } 7980b57cec5SDimitry Andric 7990b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the first 8000b57cec5SDimitry Andric /// token of the macro expansion. 8010b57cec5SDimitry Andric bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 8020b57cec5SDimitry Andric const SourceManager &SM, 8030b57cec5SDimitry Andric const LangOptions &LangOpts, 8040b57cec5SDimitry Andric SourceLocation *MacroBegin) { 8050b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 8060b57cec5SDimitry Andric 8070b57cec5SDimitry Andric SourceLocation expansionLoc; 8080b57cec5SDimitry Andric if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc)) 8090b57cec5SDimitry Andric return false; 8100b57cec5SDimitry Andric 8110b57cec5SDimitry Andric if (expansionLoc.isFileID()) { 8120b57cec5SDimitry Andric // No other macro expansions, this is the first. 8130b57cec5SDimitry Andric if (MacroBegin) 8140b57cec5SDimitry Andric *MacroBegin = expansionLoc; 8150b57cec5SDimitry Andric return true; 8160b57cec5SDimitry Andric } 8170b57cec5SDimitry Andric 8180b57cec5SDimitry Andric return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); 8190b57cec5SDimitry Andric } 8200b57cec5SDimitry Andric 8210b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the last 8220b57cec5SDimitry Andric /// token of the macro expansion. 8230b57cec5SDimitry Andric bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 8240b57cec5SDimitry Andric const SourceManager &SM, 8250b57cec5SDimitry Andric const LangOptions &LangOpts, 8260b57cec5SDimitry Andric SourceLocation *MacroEnd) { 8270b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 8280b57cec5SDimitry Andric 8290b57cec5SDimitry Andric SourceLocation spellLoc = SM.getSpellingLoc(loc); 8300b57cec5SDimitry Andric unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 8310b57cec5SDimitry Andric if (tokLen == 0) 8320b57cec5SDimitry Andric return false; 8330b57cec5SDimitry Andric 8340b57cec5SDimitry Andric SourceLocation afterLoc = loc.getLocWithOffset(tokLen); 8350b57cec5SDimitry Andric SourceLocation expansionLoc; 8360b57cec5SDimitry Andric if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) 8370b57cec5SDimitry Andric return false; 8380b57cec5SDimitry Andric 8390b57cec5SDimitry Andric if (expansionLoc.isFileID()) { 8400b57cec5SDimitry Andric // No other macro expansions. 8410b57cec5SDimitry Andric if (MacroEnd) 8420b57cec5SDimitry Andric *MacroEnd = expansionLoc; 8430b57cec5SDimitry Andric return true; 8440b57cec5SDimitry Andric } 8450b57cec5SDimitry Andric 8460b57cec5SDimitry Andric return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 8470b57cec5SDimitry Andric } 8480b57cec5SDimitry Andric 8490b57cec5SDimitry Andric static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 8500b57cec5SDimitry Andric const SourceManager &SM, 8510b57cec5SDimitry Andric const LangOptions &LangOpts) { 8520b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin(); 8530b57cec5SDimitry Andric SourceLocation End = Range.getEnd(); 8540b57cec5SDimitry Andric assert(Begin.isFileID() && End.isFileID()); 8550b57cec5SDimitry Andric if (Range.isTokenRange()) { 8560b57cec5SDimitry Andric End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 8570b57cec5SDimitry Andric if (End.isInvalid()) 8580b57cec5SDimitry Andric return {}; 8590b57cec5SDimitry Andric } 8600b57cec5SDimitry Andric 8610b57cec5SDimitry Andric // Break down the source locations. 8620b57cec5SDimitry Andric FileID FID; 8630b57cec5SDimitry Andric unsigned BeginOffs; 8640b57cec5SDimitry Andric std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 8650b57cec5SDimitry Andric if (FID.isInvalid()) 8660b57cec5SDimitry Andric return {}; 8670b57cec5SDimitry Andric 8680b57cec5SDimitry Andric unsigned EndOffs; 8690b57cec5SDimitry Andric if (!SM.isInFileID(End, FID, &EndOffs) || 8700b57cec5SDimitry Andric BeginOffs > EndOffs) 8710b57cec5SDimitry Andric return {}; 8720b57cec5SDimitry Andric 8730b57cec5SDimitry Andric return CharSourceRange::getCharRange(Begin, End); 8740b57cec5SDimitry Andric } 8750b57cec5SDimitry Andric 8760b57cec5SDimitry Andric CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, 8770b57cec5SDimitry Andric const SourceManager &SM, 8780b57cec5SDimitry Andric const LangOptions &LangOpts) { 8790b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin(); 8800b57cec5SDimitry Andric SourceLocation End = Range.getEnd(); 8810b57cec5SDimitry Andric if (Begin.isInvalid() || End.isInvalid()) 8820b57cec5SDimitry Andric return {}; 8830b57cec5SDimitry Andric 8840b57cec5SDimitry Andric if (Begin.isFileID() && End.isFileID()) 8850b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 8860b57cec5SDimitry Andric 8870b57cec5SDimitry Andric if (Begin.isMacroID() && End.isFileID()) { 8880b57cec5SDimitry Andric if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) 8890b57cec5SDimitry Andric return {}; 8900b57cec5SDimitry Andric Range.setBegin(Begin); 8910b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 8920b57cec5SDimitry Andric } 8930b57cec5SDimitry Andric 8940b57cec5SDimitry Andric if (Begin.isFileID() && End.isMacroID()) { 8950b57cec5SDimitry Andric if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts, 8960b57cec5SDimitry Andric &End)) || 8970b57cec5SDimitry Andric (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts, 8980b57cec5SDimitry Andric &End))) 8990b57cec5SDimitry Andric return {}; 9000b57cec5SDimitry Andric Range.setEnd(End); 9010b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9020b57cec5SDimitry Andric } 9030b57cec5SDimitry Andric 9040b57cec5SDimitry Andric assert(Begin.isMacroID() && End.isMacroID()); 9050b57cec5SDimitry Andric SourceLocation MacroBegin, MacroEnd; 9060b57cec5SDimitry Andric if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) && 9070b57cec5SDimitry Andric ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts, 9080b57cec5SDimitry Andric &MacroEnd)) || 9090b57cec5SDimitry Andric (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts, 9100b57cec5SDimitry Andric &MacroEnd)))) { 9110b57cec5SDimitry Andric Range.setBegin(MacroBegin); 9120b57cec5SDimitry Andric Range.setEnd(MacroEnd); 9130b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9140b57cec5SDimitry Andric } 9150b57cec5SDimitry Andric 9160b57cec5SDimitry Andric bool Invalid = false; 9170b57cec5SDimitry Andric const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin), 9180b57cec5SDimitry Andric &Invalid); 9190b57cec5SDimitry Andric if (Invalid) 9200b57cec5SDimitry Andric return {}; 9210b57cec5SDimitry Andric 9220b57cec5SDimitry Andric if (BeginEntry.getExpansion().isMacroArgExpansion()) { 9230b57cec5SDimitry Andric const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End), 9240b57cec5SDimitry Andric &Invalid); 9250b57cec5SDimitry Andric if (Invalid) 9260b57cec5SDimitry Andric return {}; 9270b57cec5SDimitry Andric 9280b57cec5SDimitry Andric if (EndEntry.getExpansion().isMacroArgExpansion() && 9290b57cec5SDimitry Andric BeginEntry.getExpansion().getExpansionLocStart() == 9300b57cec5SDimitry Andric EndEntry.getExpansion().getExpansionLocStart()) { 9310b57cec5SDimitry Andric Range.setBegin(SM.getImmediateSpellingLoc(Begin)); 9320b57cec5SDimitry Andric Range.setEnd(SM.getImmediateSpellingLoc(End)); 9330b57cec5SDimitry Andric return makeFileCharRange(Range, SM, LangOpts); 9340b57cec5SDimitry Andric } 9350b57cec5SDimitry Andric } 9360b57cec5SDimitry Andric 9370b57cec5SDimitry Andric return {}; 9380b57cec5SDimitry Andric } 9390b57cec5SDimitry Andric 9400b57cec5SDimitry Andric StringRef Lexer::getSourceText(CharSourceRange Range, 9410b57cec5SDimitry Andric const SourceManager &SM, 9420b57cec5SDimitry Andric const LangOptions &LangOpts, 9430b57cec5SDimitry Andric bool *Invalid) { 9440b57cec5SDimitry Andric Range = makeFileCharRange(Range, SM, LangOpts); 9450b57cec5SDimitry Andric if (Range.isInvalid()) { 9460b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9470b57cec5SDimitry Andric return {}; 9480b57cec5SDimitry Andric } 9490b57cec5SDimitry Andric 9500b57cec5SDimitry Andric // Break down the source location. 9510b57cec5SDimitry Andric std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin()); 9520b57cec5SDimitry Andric if (beginInfo.first.isInvalid()) { 9530b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9540b57cec5SDimitry Andric return {}; 9550b57cec5SDimitry Andric } 9560b57cec5SDimitry Andric 9570b57cec5SDimitry Andric unsigned EndOffs; 9580b57cec5SDimitry Andric if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) || 9590b57cec5SDimitry Andric beginInfo.second > EndOffs) { 9600b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9610b57cec5SDimitry Andric return {}; 9620b57cec5SDimitry Andric } 9630b57cec5SDimitry Andric 9640b57cec5SDimitry Andric // Try to the load the file buffer. 9650b57cec5SDimitry Andric bool invalidTemp = false; 9660b57cec5SDimitry Andric StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp); 9670b57cec5SDimitry Andric if (invalidTemp) { 9680b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9690b57cec5SDimitry Andric return {}; 9700b57cec5SDimitry Andric } 9710b57cec5SDimitry Andric 9720b57cec5SDimitry Andric if (Invalid) *Invalid = false; 9730b57cec5SDimitry Andric return file.substr(beginInfo.second, EndOffs - beginInfo.second); 9740b57cec5SDimitry Andric } 9750b57cec5SDimitry Andric 9760b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroName(SourceLocation Loc, 9770b57cec5SDimitry Andric const SourceManager &SM, 9780b57cec5SDimitry Andric const LangOptions &LangOpts) { 9790b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 9800b57cec5SDimitry Andric 9810b57cec5SDimitry Andric // Find the location of the immediate macro expansion. 9820b57cec5SDimitry Andric while (true) { 9830b57cec5SDimitry Andric FileID FID = SM.getFileID(Loc); 9840b57cec5SDimitry Andric const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); 9850b57cec5SDimitry Andric const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); 9860b57cec5SDimitry Andric Loc = Expansion.getExpansionLocStart(); 9870b57cec5SDimitry Andric if (!Expansion.isMacroArgExpansion()) 9880b57cec5SDimitry Andric break; 9890b57cec5SDimitry Andric 9900b57cec5SDimitry Andric // For macro arguments we need to check that the argument did not come 9910b57cec5SDimitry Andric // from an inner macro, e.g: "MAC1( MAC2(foo) )" 9920b57cec5SDimitry Andric 9930b57cec5SDimitry Andric // Loc points to the argument id of the macro definition, move to the 9940b57cec5SDimitry Andric // macro expansion. 9950b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 9960b57cec5SDimitry Andric SourceLocation SpellLoc = Expansion.getSpellingLoc(); 9970b57cec5SDimitry Andric if (SpellLoc.isFileID()) 9980b57cec5SDimitry Andric break; // No inner macro. 9990b57cec5SDimitry Andric 10000b57cec5SDimitry Andric // If spelling location resides in the same FileID as macro expansion 10010b57cec5SDimitry Andric // location, it means there is no inner macro. 10020b57cec5SDimitry Andric FileID MacroFID = SM.getFileID(Loc); 10030b57cec5SDimitry Andric if (SM.isInFileID(SpellLoc, MacroFID)) 10040b57cec5SDimitry Andric break; 10050b57cec5SDimitry Andric 10060b57cec5SDimitry Andric // Argument came from inner macro. 10070b57cec5SDimitry Andric Loc = SpellLoc; 10080b57cec5SDimitry Andric } 10090b57cec5SDimitry Andric 10100b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion 10110b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin 10120b57cec5SDimitry Andric // expanding this macro. 10130b57cec5SDimitry Andric Loc = SM.getSpellingLoc(Loc); 10140b57cec5SDimitry Andric 10150b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the 10160b57cec5SDimitry Andric // name so that we can render it into the expansion note. 10170b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 10180b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 10190b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 10200b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 10210b57cec5SDimitry Andric } 10220b57cec5SDimitry Andric 10230b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroNameForDiagnostics( 10240b57cec5SDimitry Andric SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { 10250b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 10260b57cec5SDimitry Andric // Walk past macro argument expansions. 10270b57cec5SDimitry Andric while (SM.isMacroArgExpansion(Loc)) 10280b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 10290b57cec5SDimitry Andric 10300b57cec5SDimitry Andric // If the macro's spelling has no FileID, then it's actually a token paste 10310b57cec5SDimitry Andric // or stringization (or similar) and not a macro at all. 10320b57cec5SDimitry Andric if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc)))) 10330b57cec5SDimitry Andric return {}; 10340b57cec5SDimitry Andric 10350b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion 10360b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin 10370b57cec5SDimitry Andric // expanding this macro. 10380b57cec5SDimitry Andric Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin()); 10390b57cec5SDimitry Andric 10400b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the 10410b57cec5SDimitry Andric // name so that we can render it into the expansion note. 10420b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 10430b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 10440b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 10450b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 10460b57cec5SDimitry Andric } 10470b57cec5SDimitry Andric 10480b57cec5SDimitry Andric bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) { 10490b57cec5SDimitry Andric return isIdentifierBody(c, LangOpts.DollarIdents); 10500b57cec5SDimitry Andric } 10510b57cec5SDimitry Andric 10520b57cec5SDimitry Andric bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { 10530b57cec5SDimitry Andric assert(isVerticalWhitespace(Str[0])); 10540b57cec5SDimitry Andric if (Str - 1 < BufferStart) 10550b57cec5SDimitry Andric return false; 10560b57cec5SDimitry Andric 10570b57cec5SDimitry Andric if ((Str[0] == '\n' && Str[-1] == '\r') || 10580b57cec5SDimitry Andric (Str[0] == '\r' && Str[-1] == '\n')) { 10590b57cec5SDimitry Andric if (Str - 2 < BufferStart) 10600b57cec5SDimitry Andric return false; 10610b57cec5SDimitry Andric --Str; 10620b57cec5SDimitry Andric } 10630b57cec5SDimitry Andric --Str; 10640b57cec5SDimitry Andric 10650b57cec5SDimitry Andric // Rewind to first non-space character: 10660b57cec5SDimitry Andric while (Str > BufferStart && isHorizontalWhitespace(*Str)) 10670b57cec5SDimitry Andric --Str; 10680b57cec5SDimitry Andric 10690b57cec5SDimitry Andric return *Str == '\\'; 10700b57cec5SDimitry Andric } 10710b57cec5SDimitry Andric 10720b57cec5SDimitry Andric StringRef Lexer::getIndentationForLine(SourceLocation Loc, 10730b57cec5SDimitry Andric const SourceManager &SM) { 10740b57cec5SDimitry Andric if (Loc.isInvalid() || Loc.isMacroID()) 10750b57cec5SDimitry Andric return {}; 10760b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 10770b57cec5SDimitry Andric if (LocInfo.first.isInvalid()) 10780b57cec5SDimitry Andric return {}; 10790b57cec5SDimitry Andric bool Invalid = false; 10800b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 10810b57cec5SDimitry Andric if (Invalid) 10820b57cec5SDimitry Andric return {}; 10830b57cec5SDimitry Andric const char *Line = findBeginningOfLine(Buffer, LocInfo.second); 10840b57cec5SDimitry Andric if (!Line) 10850b57cec5SDimitry Andric return {}; 10860b57cec5SDimitry Andric StringRef Rest = Buffer.substr(Line - Buffer.data()); 10870b57cec5SDimitry Andric size_t NumWhitespaceChars = Rest.find_first_not_of(" \t"); 10880b57cec5SDimitry Andric return NumWhitespaceChars == StringRef::npos 10890b57cec5SDimitry Andric ? "" 10900b57cec5SDimitry Andric : Rest.take_front(NumWhitespaceChars); 10910b57cec5SDimitry Andric } 10920b57cec5SDimitry Andric 10930b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 10940b57cec5SDimitry Andric // Diagnostics forwarding code. 10950b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 10960b57cec5SDimitry Andric 10970b57cec5SDimitry Andric /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 10980b57cec5SDimitry Andric /// lexer buffer was all expanded at a single point, perform the mapping. 10990b57cec5SDimitry Andric /// This is currently only used for _Pragma implementation, so it is the slow 11000b57cec5SDimitry Andric /// path of the hot getSourceLocation method. Do not allow it to be inlined. 11010b57cec5SDimitry Andric static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 11020b57cec5SDimitry Andric Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 11030b57cec5SDimitry Andric static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 11040b57cec5SDimitry Andric SourceLocation FileLoc, 11050b57cec5SDimitry Andric unsigned CharNo, unsigned TokLen) { 11060b57cec5SDimitry Andric assert(FileLoc.isMacroID() && "Must be a macro expansion"); 11070b57cec5SDimitry Andric 11080b57cec5SDimitry Andric // Otherwise, we're lexing "mapped tokens". This is used for things like 11090b57cec5SDimitry Andric // _Pragma handling. Combine the expansion location of FileLoc with the 11100b57cec5SDimitry Andric // spelling location. 11110b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager(); 11120b57cec5SDimitry Andric 11130b57cec5SDimitry Andric // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 11140b57cec5SDimitry Andric // characters come from spelling(FileLoc)+Offset. 11150b57cec5SDimitry Andric SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 11160b57cec5SDimitry Andric SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 11170b57cec5SDimitry Andric 11180b57cec5SDimitry Andric // Figure out the expansion loc range, which is the range covered by the 11190b57cec5SDimitry Andric // original _Pragma(...) sequence. 11200b57cec5SDimitry Andric CharSourceRange II = SM.getImmediateExpansionRange(FileLoc); 11210b57cec5SDimitry Andric 11220b57cec5SDimitry Andric return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen); 11230b57cec5SDimitry Andric } 11240b57cec5SDimitry Andric 11250b57cec5SDimitry Andric /// getSourceLocation - Return a source location identifier for the specified 11260b57cec5SDimitry Andric /// offset in the current file. 11270b57cec5SDimitry Andric SourceLocation Lexer::getSourceLocation(const char *Loc, 11280b57cec5SDimitry Andric unsigned TokLen) const { 11290b57cec5SDimitry Andric assert(Loc >= BufferStart && Loc <= BufferEnd && 11300b57cec5SDimitry Andric "Location out of range for this buffer!"); 11310b57cec5SDimitry Andric 11320b57cec5SDimitry Andric // In the normal case, we're just lexing from a simple file buffer, return 11330b57cec5SDimitry Andric // the file id from FileLoc with the offset specified. 11340b57cec5SDimitry Andric unsigned CharNo = Loc-BufferStart; 11350b57cec5SDimitry Andric if (FileLoc.isFileID()) 11360b57cec5SDimitry Andric return FileLoc.getLocWithOffset(CharNo); 11370b57cec5SDimitry Andric 11380b57cec5SDimitry Andric // Otherwise, this is the _Pragma lexer case, which pretends that all of the 11390b57cec5SDimitry Andric // tokens are lexed from where the _Pragma was defined. 11400b57cec5SDimitry Andric assert(PP && "This doesn't work on raw lexers"); 11410b57cec5SDimitry Andric return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 11420b57cec5SDimitry Andric } 11430b57cec5SDimitry Andric 11440b57cec5SDimitry Andric /// Diag - Forwarding function for diagnostics. This translate a source 11450b57cec5SDimitry Andric /// position in the current buffer into a SourceLocation object for rendering. 11460b57cec5SDimitry Andric DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 11470b57cec5SDimitry Andric return PP->Diag(getSourceLocation(Loc), DiagID); 11480b57cec5SDimitry Andric } 11490b57cec5SDimitry Andric 11500b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11510b57cec5SDimitry Andric // Trigraph and Escaped Newline Handling Code. 11520b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11530b57cec5SDimitry Andric 11540b57cec5SDimitry Andric /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 11550b57cec5SDimitry Andric /// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 11560b57cec5SDimitry Andric static char GetTrigraphCharForLetter(char Letter) { 11570b57cec5SDimitry Andric switch (Letter) { 11580b57cec5SDimitry Andric default: return 0; 11590b57cec5SDimitry Andric case '=': return '#'; 11600b57cec5SDimitry Andric case ')': return ']'; 11610b57cec5SDimitry Andric case '(': return '['; 11620b57cec5SDimitry Andric case '!': return '|'; 11630b57cec5SDimitry Andric case '\'': return '^'; 11640b57cec5SDimitry Andric case '>': return '}'; 11650b57cec5SDimitry Andric case '/': return '\\'; 11660b57cec5SDimitry Andric case '<': return '{'; 11670b57cec5SDimitry Andric case '-': return '~'; 11680b57cec5SDimitry Andric } 11690b57cec5SDimitry Andric } 11700b57cec5SDimitry Andric 11710b57cec5SDimitry Andric /// DecodeTrigraphChar - If the specified character is a legal trigraph when 11720b57cec5SDimitry Andric /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 11730b57cec5SDimitry Andric /// return the result character. Finally, emit a warning about trigraph use 11740b57cec5SDimitry Andric /// whether trigraphs are enabled or not. 11750b57cec5SDimitry Andric static char DecodeTrigraphChar(const char *CP, Lexer *L) { 11760b57cec5SDimitry Andric char Res = GetTrigraphCharForLetter(*CP); 11770b57cec5SDimitry Andric if (!Res || !L) return Res; 11780b57cec5SDimitry Andric 11790b57cec5SDimitry Andric if (!L->getLangOpts().Trigraphs) { 11800b57cec5SDimitry Andric if (!L->isLexingRawMode()) 11810b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_ignored); 11820b57cec5SDimitry Andric return 0; 11830b57cec5SDimitry Andric } 11840b57cec5SDimitry Andric 11850b57cec5SDimitry Andric if (!L->isLexingRawMode()) 11860b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 11870b57cec5SDimitry Andric return Res; 11880b57cec5SDimitry Andric } 11890b57cec5SDimitry Andric 11900b57cec5SDimitry Andric /// getEscapedNewLineSize - Return the size of the specified escaped newline, 11910b57cec5SDimitry Andric /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 11920b57cec5SDimitry Andric /// trigraph equivalent on entry to this function. 11930b57cec5SDimitry Andric unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 11940b57cec5SDimitry Andric unsigned Size = 0; 11950b57cec5SDimitry Andric while (isWhitespace(Ptr[Size])) { 11960b57cec5SDimitry Andric ++Size; 11970b57cec5SDimitry Andric 11980b57cec5SDimitry Andric if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 11990b57cec5SDimitry Andric continue; 12000b57cec5SDimitry Andric 12010b57cec5SDimitry Andric // If this is a \r\n or \n\r, skip the other half. 12020b57cec5SDimitry Andric if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 12030b57cec5SDimitry Andric Ptr[Size-1] != Ptr[Size]) 12040b57cec5SDimitry Andric ++Size; 12050b57cec5SDimitry Andric 12060b57cec5SDimitry Andric return Size; 12070b57cec5SDimitry Andric } 12080b57cec5SDimitry Andric 12090b57cec5SDimitry Andric // Not an escaped newline, must be a \t or something else. 12100b57cec5SDimitry Andric return 0; 12110b57cec5SDimitry Andric } 12120b57cec5SDimitry Andric 12130b57cec5SDimitry Andric /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 12140b57cec5SDimitry Andric /// them), skip over them and return the first non-escaped-newline found, 12150b57cec5SDimitry Andric /// otherwise return P. 12160b57cec5SDimitry Andric const char *Lexer::SkipEscapedNewLines(const char *P) { 12170b57cec5SDimitry Andric while (true) { 12180b57cec5SDimitry Andric const char *AfterEscape; 12190b57cec5SDimitry Andric if (*P == '\\') { 12200b57cec5SDimitry Andric AfterEscape = P+1; 12210b57cec5SDimitry Andric } else if (*P == '?') { 12220b57cec5SDimitry Andric // If not a trigraph for escape, bail out. 12230b57cec5SDimitry Andric if (P[1] != '?' || P[2] != '/') 12240b57cec5SDimitry Andric return P; 12250b57cec5SDimitry Andric // FIXME: Take LangOpts into account; the language might not 12260b57cec5SDimitry Andric // support trigraphs. 12270b57cec5SDimitry Andric AfterEscape = P+3; 12280b57cec5SDimitry Andric } else { 12290b57cec5SDimitry Andric return P; 12300b57cec5SDimitry Andric } 12310b57cec5SDimitry Andric 12320b57cec5SDimitry Andric unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 12330b57cec5SDimitry Andric if (NewLineSize == 0) return P; 12340b57cec5SDimitry Andric P = AfterEscape+NewLineSize; 12350b57cec5SDimitry Andric } 12360b57cec5SDimitry Andric } 12370b57cec5SDimitry Andric 12380b57cec5SDimitry Andric Optional<Token> Lexer::findNextToken(SourceLocation Loc, 12390b57cec5SDimitry Andric const SourceManager &SM, 12400b57cec5SDimitry Andric const LangOptions &LangOpts) { 12410b57cec5SDimitry Andric if (Loc.isMacroID()) { 12420b57cec5SDimitry Andric if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 12430b57cec5SDimitry Andric return None; 12440b57cec5SDimitry Andric } 12450b57cec5SDimitry Andric Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); 12460b57cec5SDimitry Andric 12470b57cec5SDimitry Andric // Break down the source location. 12480b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 12490b57cec5SDimitry Andric 12500b57cec5SDimitry Andric // Try to load the file buffer. 12510b57cec5SDimitry Andric bool InvalidTemp = false; 12520b57cec5SDimitry Andric StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); 12530b57cec5SDimitry Andric if (InvalidTemp) 12540b57cec5SDimitry Andric return None; 12550b57cec5SDimitry Andric 12560b57cec5SDimitry Andric const char *TokenBegin = File.data() + LocInfo.second; 12570b57cec5SDimitry Andric 12580b57cec5SDimitry Andric // Lex from the start of the given location. 12590b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), 12600b57cec5SDimitry Andric TokenBegin, File.end()); 12610b57cec5SDimitry Andric // Find the token. 12620b57cec5SDimitry Andric Token Tok; 12630b57cec5SDimitry Andric lexer.LexFromRawLexer(Tok); 12640b57cec5SDimitry Andric return Tok; 12650b57cec5SDimitry Andric } 12660b57cec5SDimitry Andric 12670b57cec5SDimitry Andric /// Checks that the given token is the first token that occurs after the 12680b57cec5SDimitry Andric /// given location (this excludes comments and whitespace). Returns the location 12690b57cec5SDimitry Andric /// immediately after the specified token. If the token is not found or the 12700b57cec5SDimitry Andric /// location is inside a macro, the returned source location will be invalid. 12710b57cec5SDimitry Andric SourceLocation Lexer::findLocationAfterToken( 12720b57cec5SDimitry Andric SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, 12730b57cec5SDimitry Andric const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { 12740b57cec5SDimitry Andric Optional<Token> Tok = findNextToken(Loc, SM, LangOpts); 12750b57cec5SDimitry Andric if (!Tok || Tok->isNot(TKind)) 12760b57cec5SDimitry Andric return {}; 12770b57cec5SDimitry Andric SourceLocation TokenLoc = Tok->getLocation(); 12780b57cec5SDimitry Andric 12790b57cec5SDimitry Andric // Calculate how much whitespace needs to be skipped if any. 12800b57cec5SDimitry Andric unsigned NumWhitespaceChars = 0; 12810b57cec5SDimitry Andric if (SkipTrailingWhitespaceAndNewLine) { 12820b57cec5SDimitry Andric const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); 12830b57cec5SDimitry Andric unsigned char C = *TokenEnd; 12840b57cec5SDimitry Andric while (isHorizontalWhitespace(C)) { 12850b57cec5SDimitry Andric C = *(++TokenEnd); 12860b57cec5SDimitry Andric NumWhitespaceChars++; 12870b57cec5SDimitry Andric } 12880b57cec5SDimitry Andric 12890b57cec5SDimitry Andric // Skip \r, \n, \r\n, or \n\r 12900b57cec5SDimitry Andric if (C == '\n' || C == '\r') { 12910b57cec5SDimitry Andric char PrevC = C; 12920b57cec5SDimitry Andric C = *(++TokenEnd); 12930b57cec5SDimitry Andric NumWhitespaceChars++; 12940b57cec5SDimitry Andric if ((C == '\n' || C == '\r') && C != PrevC) 12950b57cec5SDimitry Andric NumWhitespaceChars++; 12960b57cec5SDimitry Andric } 12970b57cec5SDimitry Andric } 12980b57cec5SDimitry Andric 12990b57cec5SDimitry Andric return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); 13000b57cec5SDimitry Andric } 13010b57cec5SDimitry Andric 13020b57cec5SDimitry Andric /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 13030b57cec5SDimitry Andric /// get its size, and return it. This is tricky in several cases: 13040b57cec5SDimitry Andric /// 1. If currently at the start of a trigraph, we warn about the trigraph, 13050b57cec5SDimitry Andric /// then either return the trigraph (skipping 3 chars) or the '?', 13060b57cec5SDimitry Andric /// depending on whether trigraphs are enabled or not. 13070b57cec5SDimitry Andric /// 2. If this is an escaped newline (potentially with whitespace between 13080b57cec5SDimitry Andric /// the backslash and newline), implicitly skip the newline and return 13090b57cec5SDimitry Andric /// the char after it. 13100b57cec5SDimitry Andric /// 13110b57cec5SDimitry Andric /// This handles the slow/uncommon case of the getCharAndSize method. Here we 13120b57cec5SDimitry Andric /// know that we can accumulate into Size, and that we have already incremented 13130b57cec5SDimitry Andric /// Ptr by Size bytes. 13140b57cec5SDimitry Andric /// 13150b57cec5SDimitry Andric /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 13160b57cec5SDimitry Andric /// be updated to match. 13170b57cec5SDimitry Andric char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 13180b57cec5SDimitry Andric Token *Tok) { 13190b57cec5SDimitry Andric // If we have a slash, look for an escaped newline. 13200b57cec5SDimitry Andric if (Ptr[0] == '\\') { 13210b57cec5SDimitry Andric ++Size; 13220b57cec5SDimitry Andric ++Ptr; 13230b57cec5SDimitry Andric Slash: 13240b57cec5SDimitry Andric // Common case, backslash-char where the char is not whitespace. 13250b57cec5SDimitry Andric if (!isWhitespace(Ptr[0])) return '\\'; 13260b57cec5SDimitry Andric 13270b57cec5SDimitry Andric // See if we have optional whitespace characters between the slash and 13280b57cec5SDimitry Andric // newline. 13290b57cec5SDimitry Andric if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 13300b57cec5SDimitry Andric // Remember that this token needs to be cleaned. 13310b57cec5SDimitry Andric if (Tok) Tok->setFlag(Token::NeedsCleaning); 13320b57cec5SDimitry Andric 13330b57cec5SDimitry Andric // Warn if there was whitespace between the backslash and newline. 13340b57cec5SDimitry Andric if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 13350b57cec5SDimitry Andric Diag(Ptr, diag::backslash_newline_space); 13360b57cec5SDimitry Andric 13370b57cec5SDimitry Andric // Found backslash<whitespace><newline>. Parse the char after it. 13380b57cec5SDimitry Andric Size += EscapedNewLineSize; 13390b57cec5SDimitry Andric Ptr += EscapedNewLineSize; 13400b57cec5SDimitry Andric 13410b57cec5SDimitry Andric // Use slow version to accumulate a correct size field. 13420b57cec5SDimitry Andric return getCharAndSizeSlow(Ptr, Size, Tok); 13430b57cec5SDimitry Andric } 13440b57cec5SDimitry Andric 13450b57cec5SDimitry Andric // Otherwise, this is not an escaped newline, just return the slash. 13460b57cec5SDimitry Andric return '\\'; 13470b57cec5SDimitry Andric } 13480b57cec5SDimitry Andric 13490b57cec5SDimitry Andric // If this is a trigraph, process it. 13500b57cec5SDimitry Andric if (Ptr[0] == '?' && Ptr[1] == '?') { 13510b57cec5SDimitry Andric // If this is actually a legal trigraph (not something like "??x"), emit 13520b57cec5SDimitry Andric // a trigraph warning. If so, and if trigraphs are enabled, return it. 13530b57cec5SDimitry Andric if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) { 13540b57cec5SDimitry Andric // Remember that this token needs to be cleaned. 13550b57cec5SDimitry Andric if (Tok) Tok->setFlag(Token::NeedsCleaning); 13560b57cec5SDimitry Andric 13570b57cec5SDimitry Andric Ptr += 3; 13580b57cec5SDimitry Andric Size += 3; 13590b57cec5SDimitry Andric if (C == '\\') goto Slash; 13600b57cec5SDimitry Andric return C; 13610b57cec5SDimitry Andric } 13620b57cec5SDimitry Andric } 13630b57cec5SDimitry Andric 13640b57cec5SDimitry Andric // If this is neither, return a single character. 13650b57cec5SDimitry Andric ++Size; 13660b57cec5SDimitry Andric return *Ptr; 13670b57cec5SDimitry Andric } 13680b57cec5SDimitry Andric 13690b57cec5SDimitry Andric /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 13700b57cec5SDimitry Andric /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 13710b57cec5SDimitry Andric /// and that we have already incremented Ptr by Size bytes. 13720b57cec5SDimitry Andric /// 13730b57cec5SDimitry Andric /// NOTE: When this method is updated, getCharAndSizeSlow (above) should 13740b57cec5SDimitry Andric /// be updated to match. 13750b57cec5SDimitry Andric char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 13760b57cec5SDimitry Andric const LangOptions &LangOpts) { 13770b57cec5SDimitry Andric // If we have a slash, look for an escaped newline. 13780b57cec5SDimitry Andric if (Ptr[0] == '\\') { 13790b57cec5SDimitry Andric ++Size; 13800b57cec5SDimitry Andric ++Ptr; 13810b57cec5SDimitry Andric Slash: 13820b57cec5SDimitry Andric // Common case, backslash-char where the char is not whitespace. 13830b57cec5SDimitry Andric if (!isWhitespace(Ptr[0])) return '\\'; 13840b57cec5SDimitry Andric 13850b57cec5SDimitry Andric // See if we have optional whitespace characters followed by a newline. 13860b57cec5SDimitry Andric if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 13870b57cec5SDimitry Andric // Found backslash<whitespace><newline>. Parse the char after it. 13880b57cec5SDimitry Andric Size += EscapedNewLineSize; 13890b57cec5SDimitry Andric Ptr += EscapedNewLineSize; 13900b57cec5SDimitry Andric 13910b57cec5SDimitry Andric // Use slow version to accumulate a correct size field. 13920b57cec5SDimitry Andric return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); 13930b57cec5SDimitry Andric } 13940b57cec5SDimitry Andric 13950b57cec5SDimitry Andric // Otherwise, this is not an escaped newline, just return the slash. 13960b57cec5SDimitry Andric return '\\'; 13970b57cec5SDimitry Andric } 13980b57cec5SDimitry Andric 13990b57cec5SDimitry Andric // If this is a trigraph, process it. 14000b57cec5SDimitry Andric if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 14010b57cec5SDimitry Andric // If this is actually a legal trigraph (not something like "??x"), return 14020b57cec5SDimitry Andric // it. 14030b57cec5SDimitry Andric if (char C = GetTrigraphCharForLetter(Ptr[2])) { 14040b57cec5SDimitry Andric Ptr += 3; 14050b57cec5SDimitry Andric Size += 3; 14060b57cec5SDimitry Andric if (C == '\\') goto Slash; 14070b57cec5SDimitry Andric return C; 14080b57cec5SDimitry Andric } 14090b57cec5SDimitry Andric } 14100b57cec5SDimitry Andric 14110b57cec5SDimitry Andric // If this is neither, return a single character. 14120b57cec5SDimitry Andric ++Size; 14130b57cec5SDimitry Andric return *Ptr; 14140b57cec5SDimitry Andric } 14150b57cec5SDimitry Andric 14160b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 14170b57cec5SDimitry Andric // Helper methods for lexing. 14180b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 14190b57cec5SDimitry Andric 14200b57cec5SDimitry Andric /// Routine that indiscriminately sets the offset into the source file. 14210b57cec5SDimitry Andric void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { 14220b57cec5SDimitry Andric BufferPtr = BufferStart + Offset; 14230b57cec5SDimitry Andric if (BufferPtr > BufferEnd) 14240b57cec5SDimitry Andric BufferPtr = BufferEnd; 14250b57cec5SDimitry Andric // FIXME: What exactly does the StartOfLine bit mean? There are two 14260b57cec5SDimitry Andric // possible meanings for the "start" of the line: the first token on the 14270b57cec5SDimitry Andric // unexpanded line, or the first token on the expanded line. 14280b57cec5SDimitry Andric IsAtStartOfLine = StartOfLine; 14290b57cec5SDimitry Andric IsAtPhysicalStartOfLine = StartOfLine; 14300b57cec5SDimitry Andric } 14310b57cec5SDimitry Andric 14320b57cec5SDimitry Andric static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { 14330b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) { 14340b57cec5SDimitry Andric return false; 1435480093f4SDimitry Andric } else if (LangOpts.DollarIdents && '$' == C) { 1436480093f4SDimitry Andric return true; 14370b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus11 || LangOpts.C11) { 14380b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11AllowedIDChars( 14390b57cec5SDimitry Andric C11AllowedIDCharRanges); 14400b57cec5SDimitry Andric return C11AllowedIDChars.contains(C); 14410b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus) { 14420b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( 14430b57cec5SDimitry Andric CXX03AllowedIDCharRanges); 14440b57cec5SDimitry Andric return CXX03AllowedIDChars.contains(C); 14450b57cec5SDimitry Andric } else { 14460b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 14470b57cec5SDimitry Andric C99AllowedIDCharRanges); 14480b57cec5SDimitry Andric return C99AllowedIDChars.contains(C); 14490b57cec5SDimitry Andric } 14500b57cec5SDimitry Andric } 14510b57cec5SDimitry Andric 14520b57cec5SDimitry Andric static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) { 14530b57cec5SDimitry Andric assert(isAllowedIDChar(C, LangOpts)); 14540b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) { 14550b57cec5SDimitry Andric return false; 14560b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus11 || LangOpts.C11) { 14570b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( 14580b57cec5SDimitry Andric C11DisallowedInitialIDCharRanges); 14590b57cec5SDimitry Andric return !C11DisallowedInitialIDChars.contains(C); 14600b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus) { 14610b57cec5SDimitry Andric return true; 14620b57cec5SDimitry Andric } else { 14630b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 14640b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges); 14650b57cec5SDimitry Andric return !C99DisallowedInitialIDChars.contains(C); 14660b57cec5SDimitry Andric } 14670b57cec5SDimitry Andric } 14680b57cec5SDimitry Andric 14690b57cec5SDimitry Andric static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, 14700b57cec5SDimitry Andric const char *End) { 14710b57cec5SDimitry Andric return CharSourceRange::getCharRange(L.getSourceLocation(Begin), 14720b57cec5SDimitry Andric L.getSourceLocation(End)); 14730b57cec5SDimitry Andric } 14740b57cec5SDimitry Andric 14750b57cec5SDimitry Andric static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, 14760b57cec5SDimitry Andric CharSourceRange Range, bool IsFirst) { 14770b57cec5SDimitry Andric // Check C99 compatibility. 14780b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) { 14790b57cec5SDimitry Andric enum { 14800b57cec5SDimitry Andric CannotAppearInIdentifier = 0, 14810b57cec5SDimitry Andric CannotStartIdentifier 14820b57cec5SDimitry Andric }; 14830b57cec5SDimitry Andric 14840b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 14850b57cec5SDimitry Andric C99AllowedIDCharRanges); 14860b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 14870b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges); 14880b57cec5SDimitry Andric if (!C99AllowedIDChars.contains(C)) { 14890b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 14900b57cec5SDimitry Andric << Range 14910b57cec5SDimitry Andric << CannotAppearInIdentifier; 14920b57cec5SDimitry Andric } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) { 14930b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 14940b57cec5SDimitry Andric << Range 14950b57cec5SDimitry Andric << CannotStartIdentifier; 14960b57cec5SDimitry Andric } 14970b57cec5SDimitry Andric } 14980b57cec5SDimitry Andric 14990b57cec5SDimitry Andric // Check C++98 compatibility. 15000b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) { 15010b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( 15020b57cec5SDimitry Andric CXX03AllowedIDCharRanges); 15030b57cec5SDimitry Andric if (!CXX03AllowedIDChars.contains(C)) { 15040b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id) 15050b57cec5SDimitry Andric << Range; 15060b57cec5SDimitry Andric } 15070b57cec5SDimitry Andric } 15080b57cec5SDimitry Andric } 15090b57cec5SDimitry Andric 15100b57cec5SDimitry Andric /// After encountering UTF-8 character C and interpreting it as an identifier 15110b57cec5SDimitry Andric /// character, check whether it's a homoglyph for a common non-identifier 15120b57cec5SDimitry Andric /// source character that is unlikely to be an intentional identifier 15130b57cec5SDimitry Andric /// character and warn if so. 15140b57cec5SDimitry Andric static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, 15150b57cec5SDimitry Andric CharSourceRange Range) { 15160b57cec5SDimitry Andric // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 15170b57cec5SDimitry Andric struct HomoglyphPair { 15180b57cec5SDimitry Andric uint32_t Character; 15190b57cec5SDimitry Andric char LooksLike; 15200b57cec5SDimitry Andric bool operator<(HomoglyphPair R) const { return Character < R.Character; } 15210b57cec5SDimitry Andric }; 15220b57cec5SDimitry Andric static constexpr HomoglyphPair SortedHomoglyphs[] = { 15230b57cec5SDimitry Andric {U'\u00ad', 0}, // SOFT HYPHEN 15240b57cec5SDimitry Andric {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK 15250b57cec5SDimitry Andric {U'\u037e', ';'}, // GREEK QUESTION MARK 15260b57cec5SDimitry Andric {U'\u200b', 0}, // ZERO WIDTH SPACE 15270b57cec5SDimitry Andric {U'\u200c', 0}, // ZERO WIDTH NON-JOINER 15280b57cec5SDimitry Andric {U'\u200d', 0}, // ZERO WIDTH JOINER 15290b57cec5SDimitry Andric {U'\u2060', 0}, // WORD JOINER 15300b57cec5SDimitry Andric {U'\u2061', 0}, // FUNCTION APPLICATION 15310b57cec5SDimitry Andric {U'\u2062', 0}, // INVISIBLE TIMES 15320b57cec5SDimitry Andric {U'\u2063', 0}, // INVISIBLE SEPARATOR 15330b57cec5SDimitry Andric {U'\u2064', 0}, // INVISIBLE PLUS 15340b57cec5SDimitry Andric {U'\u2212', '-'}, // MINUS SIGN 15350b57cec5SDimitry Andric {U'\u2215', '/'}, // DIVISION SLASH 15360b57cec5SDimitry Andric {U'\u2216', '\\'}, // SET MINUS 15370b57cec5SDimitry Andric {U'\u2217', '*'}, // ASTERISK OPERATOR 15380b57cec5SDimitry Andric {U'\u2223', '|'}, // DIVIDES 15390b57cec5SDimitry Andric {U'\u2227', '^'}, // LOGICAL AND 15400b57cec5SDimitry Andric {U'\u2236', ':'}, // RATIO 15410b57cec5SDimitry Andric {U'\u223c', '~'}, // TILDE OPERATOR 15420b57cec5SDimitry Andric {U'\ua789', ':'}, // MODIFIER LETTER COLON 15430b57cec5SDimitry Andric {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE 15440b57cec5SDimitry Andric {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK 15450b57cec5SDimitry Andric {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN 15460b57cec5SDimitry Andric {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN 15470b57cec5SDimitry Andric {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN 15480b57cec5SDimitry Andric {U'\uff06', '&'}, // FULLWIDTH AMPERSAND 15490b57cec5SDimitry Andric {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS 15500b57cec5SDimitry Andric {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS 15510b57cec5SDimitry Andric {U'\uff0a', '*'}, // FULLWIDTH ASTERISK 15520b57cec5SDimitry Andric {U'\uff0b', '+'}, // FULLWIDTH ASTERISK 15530b57cec5SDimitry Andric {U'\uff0c', ','}, // FULLWIDTH COMMA 15540b57cec5SDimitry Andric {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS 15550b57cec5SDimitry Andric {U'\uff0e', '.'}, // FULLWIDTH FULL STOP 15560b57cec5SDimitry Andric {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS 15570b57cec5SDimitry Andric {U'\uff1a', ':'}, // FULLWIDTH COLON 15580b57cec5SDimitry Andric {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON 15590b57cec5SDimitry Andric {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN 15600b57cec5SDimitry Andric {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN 15610b57cec5SDimitry Andric {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN 15620b57cec5SDimitry Andric {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK 15630b57cec5SDimitry Andric {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT 15640b57cec5SDimitry Andric {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET 15650b57cec5SDimitry Andric {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS 15660b57cec5SDimitry Andric {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET 15670b57cec5SDimitry Andric {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT 15680b57cec5SDimitry Andric {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET 15690b57cec5SDimitry Andric {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE 15700b57cec5SDimitry Andric {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET 15710b57cec5SDimitry Andric {U'\uff5e', '~'}, // FULLWIDTH TILDE 15720b57cec5SDimitry Andric {0, 0} 15730b57cec5SDimitry Andric }; 15740b57cec5SDimitry Andric auto Homoglyph = 15750b57cec5SDimitry Andric std::lower_bound(std::begin(SortedHomoglyphs), 15760b57cec5SDimitry Andric std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'}); 15770b57cec5SDimitry Andric if (Homoglyph->Character == C) { 15780b57cec5SDimitry Andric llvm::SmallString<5> CharBuf; 15790b57cec5SDimitry Andric { 15800b57cec5SDimitry Andric llvm::raw_svector_ostream CharOS(CharBuf); 15810b57cec5SDimitry Andric llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); 15820b57cec5SDimitry Andric } 15830b57cec5SDimitry Andric if (Homoglyph->LooksLike) { 15840b57cec5SDimitry Andric const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; 15850b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) 15860b57cec5SDimitry Andric << Range << CharBuf << LooksLikeStr; 15870b57cec5SDimitry Andric } else { 15880b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width) 15890b57cec5SDimitry Andric << Range << CharBuf; 15900b57cec5SDimitry Andric } 15910b57cec5SDimitry Andric } 15920b57cec5SDimitry Andric } 15930b57cec5SDimitry Andric 15940b57cec5SDimitry Andric bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, 15950b57cec5SDimitry Andric Token &Result) { 15960b57cec5SDimitry Andric const char *UCNPtr = CurPtr + Size; 15970b57cec5SDimitry Andric uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr); 15980b57cec5SDimitry Andric if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts)) 15990b57cec5SDimitry Andric return false; 16000b57cec5SDimitry Andric 16010b57cec5SDimitry Andric if (!isLexingRawMode()) 16020b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 16030b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UCNPtr), 16040b57cec5SDimitry Andric /*IsFirst=*/false); 16050b57cec5SDimitry Andric 16060b57cec5SDimitry Andric Result.setFlag(Token::HasUCN); 16070b57cec5SDimitry Andric if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || 16080b57cec5SDimitry Andric (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U')) 16090b57cec5SDimitry Andric CurPtr = UCNPtr; 16100b57cec5SDimitry Andric else 16110b57cec5SDimitry Andric while (CurPtr != UCNPtr) 16120b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result); 16130b57cec5SDimitry Andric return true; 16140b57cec5SDimitry Andric } 16150b57cec5SDimitry Andric 16160b57cec5SDimitry Andric bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { 16170b57cec5SDimitry Andric const char *UnicodePtr = CurPtr; 16180b57cec5SDimitry Andric llvm::UTF32 CodePoint; 16190b57cec5SDimitry Andric llvm::ConversionResult Result = 16200b57cec5SDimitry Andric llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr, 16210b57cec5SDimitry Andric (const llvm::UTF8 *)BufferEnd, 16220b57cec5SDimitry Andric &CodePoint, 16230b57cec5SDimitry Andric llvm::strictConversion); 16240b57cec5SDimitry Andric if (Result != llvm::conversionOK || 16250b57cec5SDimitry Andric !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) 16260b57cec5SDimitry Andric return false; 16270b57cec5SDimitry Andric 16280b57cec5SDimitry Andric if (!isLexingRawMode()) { 16290b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 16300b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr), 16310b57cec5SDimitry Andric /*IsFirst=*/false); 16320b57cec5SDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, 16330b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr)); 16340b57cec5SDimitry Andric } 16350b57cec5SDimitry Andric 16360b57cec5SDimitry Andric CurPtr = UnicodePtr; 16370b57cec5SDimitry Andric return true; 16380b57cec5SDimitry Andric } 16390b57cec5SDimitry Andric 16400b57cec5SDimitry Andric bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) { 16410b57cec5SDimitry Andric // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] 16420b57cec5SDimitry Andric unsigned Size; 16430b57cec5SDimitry Andric unsigned char C = *CurPtr++; 16440b57cec5SDimitry Andric while (isIdentifierBody(C)) 16450b57cec5SDimitry Andric C = *CurPtr++; 16460b57cec5SDimitry Andric 16470b57cec5SDimitry Andric --CurPtr; // Back up over the skipped character. 16480b57cec5SDimitry Andric 16490b57cec5SDimitry Andric // Fast path, no $,\,? in identifier found. '\' might be an escaped newline 16500b57cec5SDimitry Andric // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. 16510b57cec5SDimitry Andric // 16520b57cec5SDimitry Andric // TODO: Could merge these checks into an InfoTable flag to make the 16530b57cec5SDimitry Andric // comparison cheaper 16540b57cec5SDimitry Andric if (isASCII(C) && C != '\\' && C != '?' && 16550b57cec5SDimitry Andric (C != '$' || !LangOpts.DollarIdents)) { 16560b57cec5SDimitry Andric FinishIdentifier: 16570b57cec5SDimitry Andric const char *IdStart = BufferPtr; 16580b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::raw_identifier); 16590b57cec5SDimitry Andric Result.setRawIdentifierData(IdStart); 16600b57cec5SDimitry Andric 16610b57cec5SDimitry Andric // If we are in raw mode, return this identifier raw. There is no need to 16620b57cec5SDimitry Andric // look up identifier information or attempt to macro expand it. 16630b57cec5SDimitry Andric if (LexingRawMode) 16640b57cec5SDimitry Andric return true; 16650b57cec5SDimitry Andric 16660b57cec5SDimitry Andric // Fill in Result.IdentifierInfo and update the token kind, 16670b57cec5SDimitry Andric // looking up the identifier in the identifier table. 16680b57cec5SDimitry Andric IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 16690b57cec5SDimitry Andric // Note that we have to call PP->LookUpIdentifierInfo() even for code 16700b57cec5SDimitry Andric // completion, it writes IdentifierInfo into Result, and callers rely on it. 16710b57cec5SDimitry Andric 16720b57cec5SDimitry Andric // If the completion point is at the end of an identifier, we want to treat 16730b57cec5SDimitry Andric // the identifier as incomplete even if it resolves to a macro or a keyword. 16740b57cec5SDimitry Andric // This allows e.g. 'class^' to complete to 'classifier'. 16750b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr)) { 16760b57cec5SDimitry Andric // Return the code-completion token. 16770b57cec5SDimitry Andric Result.setKind(tok::code_completion); 16780b57cec5SDimitry Andric // Skip the code-completion char and all immediate identifier characters. 16790b57cec5SDimitry Andric // This ensures we get consistent behavior when completing at any point in 16800b57cec5SDimitry Andric // an identifier (i.e. at the start, in the middle, at the end). Note that 16810b57cec5SDimitry Andric // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code 16820b57cec5SDimitry Andric // simpler. 16830b57cec5SDimitry Andric assert(*CurPtr == 0 && "Completion character must be 0"); 16840b57cec5SDimitry Andric ++CurPtr; 16850b57cec5SDimitry Andric // Note that code completion token is not added as a separate character 16860b57cec5SDimitry Andric // when the completion point is at the end of the buffer. Therefore, we need 16870b57cec5SDimitry Andric // to check if the buffer has ended. 16880b57cec5SDimitry Andric if (CurPtr < BufferEnd) { 16890b57cec5SDimitry Andric while (isIdentifierBody(*CurPtr)) 16900b57cec5SDimitry Andric ++CurPtr; 16910b57cec5SDimitry Andric } 16920b57cec5SDimitry Andric BufferPtr = CurPtr; 16930b57cec5SDimitry Andric return true; 16940b57cec5SDimitry Andric } 16950b57cec5SDimitry Andric 16960b57cec5SDimitry Andric // Finally, now that we know we have an identifier, pass this off to the 16970b57cec5SDimitry Andric // preprocessor, which may macro expand it or something. 16980b57cec5SDimitry Andric if (II->isHandleIdentifierCase()) 16990b57cec5SDimitry Andric return PP->HandleIdentifier(Result); 17000b57cec5SDimitry Andric 17010b57cec5SDimitry Andric return true; 17020b57cec5SDimitry Andric } 17030b57cec5SDimitry Andric 17040b57cec5SDimitry Andric // Otherwise, $,\,? in identifier found. Enter slower path. 17050b57cec5SDimitry Andric 17060b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 17070b57cec5SDimitry Andric while (true) { 17080b57cec5SDimitry Andric if (C == '$') { 17090b57cec5SDimitry Andric // If we hit a $ and they are not supported in identifiers, we are done. 17100b57cec5SDimitry Andric if (!LangOpts.DollarIdents) goto FinishIdentifier; 17110b57cec5SDimitry Andric 17120b57cec5SDimitry Andric // Otherwise, emit a diagnostic and continue. 17130b57cec5SDimitry Andric if (!isLexingRawMode()) 17140b57cec5SDimitry Andric Diag(CurPtr, diag::ext_dollar_in_identifier); 17150b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 17160b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 17170b57cec5SDimitry Andric continue; 17180b57cec5SDimitry Andric } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { 17190b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 17200b57cec5SDimitry Andric continue; 17210b57cec5SDimitry Andric } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { 17220b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 17230b57cec5SDimitry Andric continue; 17240b57cec5SDimitry Andric } else if (!isIdentifierBody(C)) { 17250b57cec5SDimitry Andric goto FinishIdentifier; 17260b57cec5SDimitry Andric } 17270b57cec5SDimitry Andric 17280b57cec5SDimitry Andric // Otherwise, this character is good, consume it. 17290b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 17300b57cec5SDimitry Andric 17310b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 17320b57cec5SDimitry Andric while (isIdentifierBody(C)) { 17330b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 17340b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 17350b57cec5SDimitry Andric } 17360b57cec5SDimitry Andric } 17370b57cec5SDimitry Andric } 17380b57cec5SDimitry Andric 17390b57cec5SDimitry Andric /// isHexaLiteral - Return true if Start points to a hex constant. 17400b57cec5SDimitry Andric /// in microsoft mode (where this is supposed to be several different tokens). 17410b57cec5SDimitry Andric bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) { 17420b57cec5SDimitry Andric unsigned Size; 17430b57cec5SDimitry Andric char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts); 17440b57cec5SDimitry Andric if (C1 != '0') 17450b57cec5SDimitry Andric return false; 17460b57cec5SDimitry Andric char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts); 17470b57cec5SDimitry Andric return (C2 == 'x' || C2 == 'X'); 17480b57cec5SDimitry Andric } 17490b57cec5SDimitry Andric 17500b57cec5SDimitry Andric /// LexNumericConstant - Lex the remainder of a integer or floating point 17510b57cec5SDimitry Andric /// constant. From[-1] is the first character lexed. Return the end of the 17520b57cec5SDimitry Andric /// constant. 17530b57cec5SDimitry Andric bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 17540b57cec5SDimitry Andric unsigned Size; 17550b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size); 17560b57cec5SDimitry Andric char PrevCh = 0; 17570b57cec5SDimitry Andric while (isPreprocessingNumberBody(C)) { 17580b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 17590b57cec5SDimitry Andric PrevCh = C; 17600b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 17610b57cec5SDimitry Andric } 17620b57cec5SDimitry Andric 17630b57cec5SDimitry Andric // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 17640b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { 17650b57cec5SDimitry Andric // If we are in Microsoft mode, don't continue if the constant is hex. 17660b57cec5SDimitry Andric // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1 17670b57cec5SDimitry Andric if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts)) 17680b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 17690b57cec5SDimitry Andric } 17700b57cec5SDimitry Andric 17710b57cec5SDimitry Andric // If we have a hex FP constant, continue. 17720b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) { 17730b57cec5SDimitry Andric // Outside C99 and C++17, we accept hexadecimal floating point numbers as a 17740b57cec5SDimitry Andric // not-quite-conforming extension. Only do so if this looks like it's 17750b57cec5SDimitry Andric // actually meant to be a hexfloat, and not if it has a ud-suffix. 17760b57cec5SDimitry Andric bool IsHexFloat = true; 17770b57cec5SDimitry Andric if (!LangOpts.C99) { 17780b57cec5SDimitry Andric if (!isHexaLiteral(BufferPtr, LangOpts)) 17790b57cec5SDimitry Andric IsHexFloat = false; 17800b57cec5SDimitry Andric else if (!getLangOpts().CPlusPlus17 && 17810b57cec5SDimitry Andric std::find(BufferPtr, CurPtr, '_') != CurPtr) 17820b57cec5SDimitry Andric IsHexFloat = false; 17830b57cec5SDimitry Andric } 17840b57cec5SDimitry Andric if (IsHexFloat) 17850b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 17860b57cec5SDimitry Andric } 17870b57cec5SDimitry Andric 17880b57cec5SDimitry Andric // If we have a digit separator, continue. 17890b57cec5SDimitry Andric if (C == '\'' && getLangOpts().CPlusPlus14) { 17900b57cec5SDimitry Andric unsigned NextSize; 17910b57cec5SDimitry Andric char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts()); 17920b57cec5SDimitry Andric if (isIdentifierBody(Next)) { 17930b57cec5SDimitry Andric if (!isLexingRawMode()) 17940b57cec5SDimitry Andric Diag(CurPtr, diag::warn_cxx11_compat_digit_separator); 17950b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 17960b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, NextSize, Result); 17970b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 17980b57cec5SDimitry Andric } 17990b57cec5SDimitry Andric } 18000b57cec5SDimitry Andric 18010b57cec5SDimitry Andric // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. 18020b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 18030b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 18040b57cec5SDimitry Andric if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 18050b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 18060b57cec5SDimitry Andric 18070b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 18080b57cec5SDimitry Andric const char *TokStart = BufferPtr; 18090b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 18100b57cec5SDimitry Andric Result.setLiteralData(TokStart); 18110b57cec5SDimitry Andric return true; 18120b57cec5SDimitry Andric } 18130b57cec5SDimitry Andric 18140b57cec5SDimitry Andric /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 18150b57cec5SDimitry Andric /// in C++11, or warn on a ud-suffix in C++98. 18160b57cec5SDimitry Andric const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, 18170b57cec5SDimitry Andric bool IsStringLiteral) { 18180b57cec5SDimitry Andric assert(getLangOpts().CPlusPlus); 18190b57cec5SDimitry Andric 18200b57cec5SDimitry Andric // Maximally munch an identifier. 18210b57cec5SDimitry Andric unsigned Size; 18220b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size); 18230b57cec5SDimitry Andric bool Consumed = false; 18240b57cec5SDimitry Andric 18250b57cec5SDimitry Andric if (!isIdentifierHead(C)) { 18260b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 18270b57cec5SDimitry Andric Consumed = true; 18280b57cec5SDimitry Andric else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 18290b57cec5SDimitry Andric Consumed = true; 18300b57cec5SDimitry Andric else 18310b57cec5SDimitry Andric return CurPtr; 18320b57cec5SDimitry Andric } 18330b57cec5SDimitry Andric 18340b57cec5SDimitry Andric if (!getLangOpts().CPlusPlus11) { 18350b57cec5SDimitry Andric if (!isLexingRawMode()) 18360b57cec5SDimitry Andric Diag(CurPtr, 18370b57cec5SDimitry Andric C == '_' ? diag::warn_cxx11_compat_user_defined_literal 18380b57cec5SDimitry Andric : diag::warn_cxx11_compat_reserved_user_defined_literal) 18390b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 18400b57cec5SDimitry Andric return CurPtr; 18410b57cec5SDimitry Andric } 18420b57cec5SDimitry Andric 18430b57cec5SDimitry Andric // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix 18440b57cec5SDimitry Andric // that does not start with an underscore is ill-formed. As a conforming 18450b57cec5SDimitry Andric // extension, we treat all such suffixes as if they had whitespace before 18460b57cec5SDimitry Andric // them. We assume a suffix beginning with a UCN or UTF-8 character is more 18470b57cec5SDimitry Andric // likely to be a ud-suffix than a macro, however, and accept that. 18480b57cec5SDimitry Andric if (!Consumed) { 18490b57cec5SDimitry Andric bool IsUDSuffix = false; 18500b57cec5SDimitry Andric if (C == '_') 18510b57cec5SDimitry Andric IsUDSuffix = true; 18520b57cec5SDimitry Andric else if (IsStringLiteral && getLangOpts().CPlusPlus14) { 18530b57cec5SDimitry Andric // In C++1y, we need to look ahead a few characters to see if this is a 18540b57cec5SDimitry Andric // valid suffix for a string literal or a numeric literal (this could be 18550b57cec5SDimitry Andric // the 'operator""if' defining a numeric literal operator). 18560b57cec5SDimitry Andric const unsigned MaxStandardSuffixLength = 3; 18570b57cec5SDimitry Andric char Buffer[MaxStandardSuffixLength] = { C }; 18580b57cec5SDimitry Andric unsigned Consumed = Size; 18590b57cec5SDimitry Andric unsigned Chars = 1; 18600b57cec5SDimitry Andric while (true) { 18610b57cec5SDimitry Andric unsigned NextSize; 18620b57cec5SDimitry Andric char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, 18630b57cec5SDimitry Andric getLangOpts()); 18640b57cec5SDimitry Andric if (!isIdentifierBody(Next)) { 1865*5ffd83dbSDimitry Andric // End of suffix. Check whether this is on the allowed list. 18660b57cec5SDimitry Andric const StringRef CompleteSuffix(Buffer, Chars); 18670b57cec5SDimitry Andric IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(), 18680b57cec5SDimitry Andric CompleteSuffix); 18690b57cec5SDimitry Andric break; 18700b57cec5SDimitry Andric } 18710b57cec5SDimitry Andric 18720b57cec5SDimitry Andric if (Chars == MaxStandardSuffixLength) 18730b57cec5SDimitry Andric // Too long: can't be a standard suffix. 18740b57cec5SDimitry Andric break; 18750b57cec5SDimitry Andric 18760b57cec5SDimitry Andric Buffer[Chars++] = Next; 18770b57cec5SDimitry Andric Consumed += NextSize; 18780b57cec5SDimitry Andric } 18790b57cec5SDimitry Andric } 18800b57cec5SDimitry Andric 18810b57cec5SDimitry Andric if (!IsUDSuffix) { 18820b57cec5SDimitry Andric if (!isLexingRawMode()) 18830b57cec5SDimitry Andric Diag(CurPtr, getLangOpts().MSVCCompat 18840b57cec5SDimitry Andric ? diag::ext_ms_reserved_user_defined_literal 18850b57cec5SDimitry Andric : diag::ext_reserved_user_defined_literal) 18860b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 18870b57cec5SDimitry Andric return CurPtr; 18880b57cec5SDimitry Andric } 18890b57cec5SDimitry Andric 18900b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 18910b57cec5SDimitry Andric } 18920b57cec5SDimitry Andric 18930b57cec5SDimitry Andric Result.setFlag(Token::HasUDSuffix); 18940b57cec5SDimitry Andric while (true) { 18950b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 18960b57cec5SDimitry Andric if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); } 18970b57cec5SDimitry Andric else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {} 18980b57cec5SDimitry Andric else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {} 18990b57cec5SDimitry Andric else break; 19000b57cec5SDimitry Andric } 19010b57cec5SDimitry Andric 19020b57cec5SDimitry Andric return CurPtr; 19030b57cec5SDimitry Andric } 19040b57cec5SDimitry Andric 19050b57cec5SDimitry Andric /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 19060b57cec5SDimitry Andric /// either " or L" or u8" or u" or U". 19070b57cec5SDimitry Andric bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 19080b57cec5SDimitry Andric tok::TokenKind Kind) { 19090b57cec5SDimitry Andric const char *AfterQuote = CurPtr; 19100b57cec5SDimitry Andric // Does this string contain the \0 character? 19110b57cec5SDimitry Andric const char *NulCharacter = nullptr; 19120b57cec5SDimitry Andric 19130b57cec5SDimitry Andric if (!isLexingRawMode() && 19140b57cec5SDimitry Andric (Kind == tok::utf8_string_literal || 19150b57cec5SDimitry Andric Kind == tok::utf16_string_literal || 19160b57cec5SDimitry Andric Kind == tok::utf32_string_literal)) 19170b57cec5SDimitry Andric Diag(BufferPtr, getLangOpts().CPlusPlus 19180b57cec5SDimitry Andric ? diag::warn_cxx98_compat_unicode_literal 19190b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal); 19200b57cec5SDimitry Andric 19210b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 19220b57cec5SDimitry Andric while (C != '"') { 19230b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by 19240b57cec5SDimitry Andric // getAndAdvanceChar. 19250b57cec5SDimitry Andric if (C == '\\') 19260b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 19270b57cec5SDimitry Andric 19280b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 19290b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 19300b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 19310b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1; 19320b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 19330b57cec5SDimitry Andric return true; 19340b57cec5SDimitry Andric } 19350b57cec5SDimitry Andric 19360b57cec5SDimitry Andric if (C == 0) { 19370b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 19380b57cec5SDimitry Andric if (ParsingFilename) 19390b57cec5SDimitry Andric codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false); 19400b57cec5SDimitry Andric else 19410b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 19420b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 19430b57cec5SDimitry Andric cutOffLexing(); 19440b57cec5SDimitry Andric return true; 19450b57cec5SDimitry Andric } 19460b57cec5SDimitry Andric 19470b57cec5SDimitry Andric NulCharacter = CurPtr-1; 19480b57cec5SDimitry Andric } 19490b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 19500b57cec5SDimitry Andric } 19510b57cec5SDimitry Andric 19520b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 19530b57cec5SDimitry Andric if (getLangOpts().CPlusPlus) 19540b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true); 19550b57cec5SDimitry Andric 19560b57cec5SDimitry Andric // If a nul character existed in the string, warn about it. 19570b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 19580b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1; 19590b57cec5SDimitry Andric 19600b57cec5SDimitry Andric // Update the location of the token as well as the BufferPtr instance var. 19610b57cec5SDimitry Andric const char *TokStart = BufferPtr; 19620b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 19630b57cec5SDimitry Andric Result.setLiteralData(TokStart); 19640b57cec5SDimitry Andric return true; 19650b57cec5SDimitry Andric } 19660b57cec5SDimitry Andric 19670b57cec5SDimitry Andric /// LexRawStringLiteral - Lex the remainder of a raw string literal, after 19680b57cec5SDimitry Andric /// having lexed R", LR", u8R", uR", or UR". 19690b57cec5SDimitry Andric bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, 19700b57cec5SDimitry Andric tok::TokenKind Kind) { 19710b57cec5SDimitry Andric // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: 19720b57cec5SDimitry Andric // Between the initial and final double quote characters of the raw string, 19730b57cec5SDimitry Andric // any transformations performed in phases 1 and 2 (trigraphs, 19740b57cec5SDimitry Andric // universal-character-names, and line splicing) are reverted. 19750b57cec5SDimitry Andric 19760b57cec5SDimitry Andric if (!isLexingRawMode()) 19770b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal); 19780b57cec5SDimitry Andric 19790b57cec5SDimitry Andric unsigned PrefixLen = 0; 19800b57cec5SDimitry Andric 19810b57cec5SDimitry Andric while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) 19820b57cec5SDimitry Andric ++PrefixLen; 19830b57cec5SDimitry Andric 19840b57cec5SDimitry Andric // If the last character was not a '(', then we didn't lex a valid delimiter. 19850b57cec5SDimitry Andric if (CurPtr[PrefixLen] != '(') { 19860b57cec5SDimitry Andric if (!isLexingRawMode()) { 19870b57cec5SDimitry Andric const char *PrefixEnd = &CurPtr[PrefixLen]; 19880b57cec5SDimitry Andric if (PrefixLen == 16) { 19890b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_raw_delim_too_long); 19900b57cec5SDimitry Andric } else { 19910b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_invalid_char_raw_delim) 19920b57cec5SDimitry Andric << StringRef(PrefixEnd, 1); 19930b57cec5SDimitry Andric } 19940b57cec5SDimitry Andric } 19950b57cec5SDimitry Andric 19960b57cec5SDimitry Andric // Search for the next '"' in hopes of salvaging the lexer. Unfortunately, 19970b57cec5SDimitry Andric // it's possible the '"' was intended to be part of the raw string, but 19980b57cec5SDimitry Andric // there's not much we can do about that. 19990b57cec5SDimitry Andric while (true) { 20000b57cec5SDimitry Andric char C = *CurPtr++; 20010b57cec5SDimitry Andric 20020b57cec5SDimitry Andric if (C == '"') 20030b57cec5SDimitry Andric break; 20040b57cec5SDimitry Andric if (C == 0 && CurPtr-1 == BufferEnd) { 20050b57cec5SDimitry Andric --CurPtr; 20060b57cec5SDimitry Andric break; 20070b57cec5SDimitry Andric } 20080b57cec5SDimitry Andric } 20090b57cec5SDimitry Andric 20100b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 20110b57cec5SDimitry Andric return true; 20120b57cec5SDimitry Andric } 20130b57cec5SDimitry Andric 20140b57cec5SDimitry Andric // Save prefix and move CurPtr past it 20150b57cec5SDimitry Andric const char *Prefix = CurPtr; 20160b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '(' 20170b57cec5SDimitry Andric 20180b57cec5SDimitry Andric while (true) { 20190b57cec5SDimitry Andric char C = *CurPtr++; 20200b57cec5SDimitry Andric 20210b57cec5SDimitry Andric if (C == ')') { 20220b57cec5SDimitry Andric // Check for prefix match and closing quote. 20230b57cec5SDimitry Andric if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') { 20240b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '"' 20250b57cec5SDimitry Andric break; 20260b57cec5SDimitry Andric } 20270b57cec5SDimitry Andric } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. 20280b57cec5SDimitry Andric if (!isLexingRawMode()) 20290b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_raw_string) 20300b57cec5SDimitry Andric << StringRef(Prefix, PrefixLen); 20310b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 20320b57cec5SDimitry Andric return true; 20330b57cec5SDimitry Andric } 20340b57cec5SDimitry Andric } 20350b57cec5SDimitry Andric 20360b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 20370b57cec5SDimitry Andric if (getLangOpts().CPlusPlus) 20380b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true); 20390b57cec5SDimitry Andric 20400b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 20410b57cec5SDimitry Andric const char *TokStart = BufferPtr; 20420b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 20430b57cec5SDimitry Andric Result.setLiteralData(TokStart); 20440b57cec5SDimitry Andric return true; 20450b57cec5SDimitry Andric } 20460b57cec5SDimitry Andric 20470b57cec5SDimitry Andric /// LexAngledStringLiteral - Lex the remainder of an angled string literal, 20480b57cec5SDimitry Andric /// after having lexed the '<' character. This is used for #include filenames. 20490b57cec5SDimitry Andric bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 20500b57cec5SDimitry Andric // Does this string contain the \0 character? 20510b57cec5SDimitry Andric const char *NulCharacter = nullptr; 20520b57cec5SDimitry Andric const char *AfterLessPos = CurPtr; 20530b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 20540b57cec5SDimitry Andric while (C != '>') { 20550b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by 20560b57cec5SDimitry Andric // getAndAdvanceChar. 20570b57cec5SDimitry Andric if (C == '\\') 20580b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 20590b57cec5SDimitry Andric 20600b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 20610b57cec5SDimitry Andric (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file. 20620b57cec5SDimitry Andric // If the filename is unterminated, then it must just be a lone < 20630b57cec5SDimitry Andric // character. Return this as such. 20640b57cec5SDimitry Andric FormTokenWithChars(Result, AfterLessPos, tok::less); 20650b57cec5SDimitry Andric return true; 20660b57cec5SDimitry Andric } 20670b57cec5SDimitry Andric 20680b57cec5SDimitry Andric if (C == 0) { 20690b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr - 1)) { 20700b57cec5SDimitry Andric codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true); 20710b57cec5SDimitry Andric cutOffLexing(); 20720b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 20730b57cec5SDimitry Andric return true; 20740b57cec5SDimitry Andric } 20750b57cec5SDimitry Andric NulCharacter = CurPtr-1; 20760b57cec5SDimitry Andric } 20770b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 20780b57cec5SDimitry Andric } 20790b57cec5SDimitry Andric 20800b57cec5SDimitry Andric // If a nul character existed in the string, warn about it. 20810b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 20820b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1; 20830b57cec5SDimitry Andric 20840b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 20850b57cec5SDimitry Andric const char *TokStart = BufferPtr; 20860b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::header_name); 20870b57cec5SDimitry Andric Result.setLiteralData(TokStart); 20880b57cec5SDimitry Andric return true; 20890b57cec5SDimitry Andric } 20900b57cec5SDimitry Andric 20910b57cec5SDimitry Andric void Lexer::codeCompleteIncludedFile(const char *PathStart, 20920b57cec5SDimitry Andric const char *CompletionPoint, 20930b57cec5SDimitry Andric bool IsAngled) { 20940b57cec5SDimitry Andric // Completion only applies to the filename, after the last slash. 20950b57cec5SDimitry Andric StringRef PartialPath(PathStart, CompletionPoint - PathStart); 2096*5ffd83dbSDimitry Andric llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/"; 2097*5ffd83dbSDimitry Andric auto Slash = PartialPath.find_last_of(SlashChars); 20980b57cec5SDimitry Andric StringRef Dir = 20990b57cec5SDimitry Andric (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash); 21000b57cec5SDimitry Andric const char *StartOfFilename = 21010b57cec5SDimitry Andric (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1; 21020b57cec5SDimitry Andric // Code completion filter range is the filename only, up to completion point. 21030b57cec5SDimitry Andric PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get( 21040b57cec5SDimitry Andric StringRef(StartOfFilename, CompletionPoint - StartOfFilename))); 2105*5ffd83dbSDimitry Andric // We should replace the characters up to the closing quote or closest slash, 2106*5ffd83dbSDimitry Andric // if any. 21070b57cec5SDimitry Andric while (CompletionPoint < BufferEnd) { 21080b57cec5SDimitry Andric char Next = *(CompletionPoint + 1); 21090b57cec5SDimitry Andric if (Next == 0 || Next == '\r' || Next == '\n') 21100b57cec5SDimitry Andric break; 21110b57cec5SDimitry Andric ++CompletionPoint; 21120b57cec5SDimitry Andric if (Next == (IsAngled ? '>' : '"')) 21130b57cec5SDimitry Andric break; 2114*5ffd83dbSDimitry Andric if (llvm::is_contained(SlashChars, Next)) 2115*5ffd83dbSDimitry Andric break; 21160b57cec5SDimitry Andric } 2117*5ffd83dbSDimitry Andric 21180b57cec5SDimitry Andric PP->setCodeCompletionTokenRange( 21190b57cec5SDimitry Andric FileLoc.getLocWithOffset(StartOfFilename - BufferStart), 21200b57cec5SDimitry Andric FileLoc.getLocWithOffset(CompletionPoint - BufferStart)); 21210b57cec5SDimitry Andric PP->CodeCompleteIncludedFile(Dir, IsAngled); 21220b57cec5SDimitry Andric } 21230b57cec5SDimitry Andric 21240b57cec5SDimitry Andric /// LexCharConstant - Lex the remainder of a character constant, after having 21250b57cec5SDimitry Andric /// lexed either ' or L' or u8' or u' or U'. 21260b57cec5SDimitry Andric bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, 21270b57cec5SDimitry Andric tok::TokenKind Kind) { 21280b57cec5SDimitry Andric // Does this character contain the \0 character? 21290b57cec5SDimitry Andric const char *NulCharacter = nullptr; 21300b57cec5SDimitry Andric 21310b57cec5SDimitry Andric if (!isLexingRawMode()) { 21320b57cec5SDimitry Andric if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant) 21330b57cec5SDimitry Andric Diag(BufferPtr, getLangOpts().CPlusPlus 21340b57cec5SDimitry Andric ? diag::warn_cxx98_compat_unicode_literal 21350b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal); 21360b57cec5SDimitry Andric else if (Kind == tok::utf8_char_constant) 21370b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal); 21380b57cec5SDimitry Andric } 21390b57cec5SDimitry Andric 21400b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 21410b57cec5SDimitry Andric if (C == '\'') { 21420b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 21430b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_empty_character); 21440b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 21450b57cec5SDimitry Andric return true; 21460b57cec5SDimitry Andric } 21470b57cec5SDimitry Andric 21480b57cec5SDimitry Andric while (C != '\'') { 21490b57cec5SDimitry Andric // Skip escaped characters. 21500b57cec5SDimitry Andric if (C == '\\') 21510b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 21520b57cec5SDimitry Andric 21530b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 21540b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 21550b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 21560b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0; 21570b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 21580b57cec5SDimitry Andric return true; 21590b57cec5SDimitry Andric } 21600b57cec5SDimitry Andric 21610b57cec5SDimitry Andric if (C == 0) { 21620b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 21630b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 21640b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 21650b57cec5SDimitry Andric cutOffLexing(); 21660b57cec5SDimitry Andric return true; 21670b57cec5SDimitry Andric } 21680b57cec5SDimitry Andric 21690b57cec5SDimitry Andric NulCharacter = CurPtr-1; 21700b57cec5SDimitry Andric } 21710b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 21720b57cec5SDimitry Andric } 21730b57cec5SDimitry Andric 21740b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 21750b57cec5SDimitry Andric if (getLangOpts().CPlusPlus) 21760b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, false); 21770b57cec5SDimitry Andric 21780b57cec5SDimitry Andric // If a nul character existed in the character, warn about it. 21790b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 21800b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 0; 21810b57cec5SDimitry Andric 21820b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 21830b57cec5SDimitry Andric const char *TokStart = BufferPtr; 21840b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 21850b57cec5SDimitry Andric Result.setLiteralData(TokStart); 21860b57cec5SDimitry Andric return true; 21870b57cec5SDimitry Andric } 21880b57cec5SDimitry Andric 21890b57cec5SDimitry Andric /// SkipWhitespace - Efficiently skip over a series of whitespace characters. 21900b57cec5SDimitry Andric /// Update BufferPtr to point to the next non-whitespace character and return. 21910b57cec5SDimitry Andric /// 21920b57cec5SDimitry Andric /// This method forms a token and returns true if KeepWhitespaceMode is enabled. 21930b57cec5SDimitry Andric bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr, 21940b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 21950b57cec5SDimitry Andric // Whitespace - Skip it, then return the token after the whitespace. 21960b57cec5SDimitry Andric bool SawNewline = isVerticalWhitespace(CurPtr[-1]); 21970b57cec5SDimitry Andric 21980b57cec5SDimitry Andric unsigned char Char = *CurPtr; 21990b57cec5SDimitry Andric 22000b57cec5SDimitry Andric // Skip consecutive spaces efficiently. 22010b57cec5SDimitry Andric while (true) { 22020b57cec5SDimitry Andric // Skip horizontal whitespace very aggressively. 22030b57cec5SDimitry Andric while (isHorizontalWhitespace(Char)) 22040b57cec5SDimitry Andric Char = *++CurPtr; 22050b57cec5SDimitry Andric 22060b57cec5SDimitry Andric // Otherwise if we have something other than whitespace, we're done. 22070b57cec5SDimitry Andric if (!isVerticalWhitespace(Char)) 22080b57cec5SDimitry Andric break; 22090b57cec5SDimitry Andric 22100b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 22110b57cec5SDimitry Andric // End of preprocessor directive line, let LexTokenInternal handle this. 22120b57cec5SDimitry Andric BufferPtr = CurPtr; 22130b57cec5SDimitry Andric return false; 22140b57cec5SDimitry Andric } 22150b57cec5SDimitry Andric 22160b57cec5SDimitry Andric // OK, but handle newline. 22170b57cec5SDimitry Andric SawNewline = true; 22180b57cec5SDimitry Andric Char = *++CurPtr; 22190b57cec5SDimitry Andric } 22200b57cec5SDimitry Andric 22210b57cec5SDimitry Andric // If the client wants us to return whitespace, return it now. 22220b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 22230b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 22240b57cec5SDimitry Andric if (SawNewline) { 22250b57cec5SDimitry Andric IsAtStartOfLine = true; 22260b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 22270b57cec5SDimitry Andric } 22280b57cec5SDimitry Andric // FIXME: The next token will not have LeadingSpace set. 22290b57cec5SDimitry Andric return true; 22300b57cec5SDimitry Andric } 22310b57cec5SDimitry Andric 22320b57cec5SDimitry Andric // If this isn't immediately after a newline, there is leading space. 22330b57cec5SDimitry Andric char PrevChar = CurPtr[-1]; 22340b57cec5SDimitry Andric bool HasLeadingSpace = !isVerticalWhitespace(PrevChar); 22350b57cec5SDimitry Andric 22360b57cec5SDimitry Andric Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace); 22370b57cec5SDimitry Andric if (SawNewline) { 22380b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 22390b57cec5SDimitry Andric TokAtPhysicalStartOfLine = true; 22400b57cec5SDimitry Andric } 22410b57cec5SDimitry Andric 22420b57cec5SDimitry Andric BufferPtr = CurPtr; 22430b57cec5SDimitry Andric return false; 22440b57cec5SDimitry Andric } 22450b57cec5SDimitry Andric 22460b57cec5SDimitry Andric /// We have just read the // characters from input. Skip until we find the 22470b57cec5SDimitry Andric /// newline character that terminates the comment. Then update BufferPtr and 22480b57cec5SDimitry Andric /// return. 22490b57cec5SDimitry Andric /// 22500b57cec5SDimitry Andric /// If we're in KeepCommentMode or any CommentHandler has inserted 22510b57cec5SDimitry Andric /// some tokens, this will store the first token and return true. 22520b57cec5SDimitry Andric bool Lexer::SkipLineComment(Token &Result, const char *CurPtr, 22530b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 22540b57cec5SDimitry Andric // If Line comments aren't explicitly enabled for this language, emit an 22550b57cec5SDimitry Andric // extension warning. 22560b57cec5SDimitry Andric if (!LangOpts.LineComment && !isLexingRawMode()) { 22570b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_line_comment); 22580b57cec5SDimitry Andric 22590b57cec5SDimitry Andric // Mark them enabled so we only emit one warning for this translation 22600b57cec5SDimitry Andric // unit. 22610b57cec5SDimitry Andric LangOpts.LineComment = true; 22620b57cec5SDimitry Andric } 22630b57cec5SDimitry Andric 22640b57cec5SDimitry Andric // Scan over the body of the comment. The common case, when scanning, is that 22650b57cec5SDimitry Andric // the comment contains normal ascii characters with nothing interesting in 22660b57cec5SDimitry Andric // them. As such, optimize for this case with the inner loop. 22670b57cec5SDimitry Andric // 22680b57cec5SDimitry Andric // This loop terminates with CurPtr pointing at the newline (or end of buffer) 22690b57cec5SDimitry Andric // character that ends the line comment. 22700b57cec5SDimitry Andric char C; 22710b57cec5SDimitry Andric while (true) { 22720b57cec5SDimitry Andric C = *CurPtr; 22730b57cec5SDimitry Andric // Skip over characters in the fast loop. 22740b57cec5SDimitry Andric while (C != 0 && // Potentially EOF. 22750b57cec5SDimitry Andric C != '\n' && C != '\r') // Newline or DOS-style newline. 22760b57cec5SDimitry Andric C = *++CurPtr; 22770b57cec5SDimitry Andric 22780b57cec5SDimitry Andric const char *NextLine = CurPtr; 22790b57cec5SDimitry Andric if (C != 0) { 22800b57cec5SDimitry Andric // We found a newline, see if it's escaped. 22810b57cec5SDimitry Andric const char *EscapePtr = CurPtr-1; 22820b57cec5SDimitry Andric bool HasSpace = false; 22830b57cec5SDimitry Andric while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace. 22840b57cec5SDimitry Andric --EscapePtr; 22850b57cec5SDimitry Andric HasSpace = true; 22860b57cec5SDimitry Andric } 22870b57cec5SDimitry Andric 22880b57cec5SDimitry Andric if (*EscapePtr == '\\') 22890b57cec5SDimitry Andric // Escaped newline. 22900b57cec5SDimitry Andric CurPtr = EscapePtr; 22910b57cec5SDimitry Andric else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' && 22920b57cec5SDimitry Andric EscapePtr[-2] == '?' && LangOpts.Trigraphs) 22930b57cec5SDimitry Andric // Trigraph-escaped newline. 22940b57cec5SDimitry Andric CurPtr = EscapePtr-2; 22950b57cec5SDimitry Andric else 22960b57cec5SDimitry Andric break; // This is a newline, we're done. 22970b57cec5SDimitry Andric 22980b57cec5SDimitry Andric // If there was space between the backslash and newline, warn about it. 22990b57cec5SDimitry Andric if (HasSpace && !isLexingRawMode()) 23000b57cec5SDimitry Andric Diag(EscapePtr, diag::backslash_newline_space); 23010b57cec5SDimitry Andric } 23020b57cec5SDimitry Andric 23030b57cec5SDimitry Andric // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to 23040b57cec5SDimitry Andric // properly decode the character. Read it in raw mode to avoid emitting 23050b57cec5SDimitry Andric // diagnostics about things like trigraphs. If we see an escaped newline, 23060b57cec5SDimitry Andric // we'll handle it below. 23070b57cec5SDimitry Andric const char *OldPtr = CurPtr; 23080b57cec5SDimitry Andric bool OldRawMode = isLexingRawMode(); 23090b57cec5SDimitry Andric LexingRawMode = true; 23100b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 23110b57cec5SDimitry Andric LexingRawMode = OldRawMode; 23120b57cec5SDimitry Andric 23130b57cec5SDimitry Andric // If we only read only one character, then no special handling is needed. 23140b57cec5SDimitry Andric // We're done and can skip forward to the newline. 23150b57cec5SDimitry Andric if (C != 0 && CurPtr == OldPtr+1) { 23160b57cec5SDimitry Andric CurPtr = NextLine; 23170b57cec5SDimitry Andric break; 23180b57cec5SDimitry Andric } 23190b57cec5SDimitry Andric 23200b57cec5SDimitry Andric // If we read multiple characters, and one of those characters was a \r or 23210b57cec5SDimitry Andric // \n, then we had an escaped newline within the comment. Emit diagnostic 23220b57cec5SDimitry Andric // unless the next line is also a // comment. 23230b57cec5SDimitry Andric if (CurPtr != OldPtr + 1 && C != '/' && 23240b57cec5SDimitry Andric (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) { 23250b57cec5SDimitry Andric for (; OldPtr != CurPtr; ++OldPtr) 23260b57cec5SDimitry Andric if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 23270b57cec5SDimitry Andric // Okay, we found a // comment that ends in a newline, if the next 23280b57cec5SDimitry Andric // line is also a // comment, but has spaces, don't emit a diagnostic. 23290b57cec5SDimitry Andric if (isWhitespace(C)) { 23300b57cec5SDimitry Andric const char *ForwardPtr = CurPtr; 23310b57cec5SDimitry Andric while (isWhitespace(*ForwardPtr)) // Skip whitespace. 23320b57cec5SDimitry Andric ++ForwardPtr; 23330b57cec5SDimitry Andric if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 23340b57cec5SDimitry Andric break; 23350b57cec5SDimitry Andric } 23360b57cec5SDimitry Andric 23370b57cec5SDimitry Andric if (!isLexingRawMode()) 23380b57cec5SDimitry Andric Diag(OldPtr-1, diag::ext_multi_line_line_comment); 23390b57cec5SDimitry Andric break; 23400b57cec5SDimitry Andric } 23410b57cec5SDimitry Andric } 23420b57cec5SDimitry Andric 23430b57cec5SDimitry Andric if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) { 23440b57cec5SDimitry Andric --CurPtr; 23450b57cec5SDimitry Andric break; 23460b57cec5SDimitry Andric } 23470b57cec5SDimitry Andric 23480b57cec5SDimitry Andric if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 23490b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 23500b57cec5SDimitry Andric cutOffLexing(); 23510b57cec5SDimitry Andric return false; 23520b57cec5SDimitry Andric } 23530b57cec5SDimitry Andric } 23540b57cec5SDimitry Andric 23550b57cec5SDimitry Andric // Found but did not consume the newline. Notify comment handlers about the 23560b57cec5SDimitry Andric // comment unless we're in a #if 0 block. 23570b57cec5SDimitry Andric if (PP && !isLexingRawMode() && 23580b57cec5SDimitry Andric PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 23590b57cec5SDimitry Andric getSourceLocation(CurPtr)))) { 23600b57cec5SDimitry Andric BufferPtr = CurPtr; 23610b57cec5SDimitry Andric return true; // A token has to be returned. 23620b57cec5SDimitry Andric } 23630b57cec5SDimitry Andric 23640b57cec5SDimitry Andric // If we are returning comments as tokens, return this comment as a token. 23650b57cec5SDimitry Andric if (inKeepCommentMode()) 23660b57cec5SDimitry Andric return SaveLineComment(Result, CurPtr); 23670b57cec5SDimitry Andric 23680b57cec5SDimitry Andric // If we are inside a preprocessor directive and we see the end of line, 23690b57cec5SDimitry Andric // return immediately, so that the lexer can return this as an EOD token. 23700b57cec5SDimitry Andric if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 23710b57cec5SDimitry Andric BufferPtr = CurPtr; 23720b57cec5SDimitry Andric return false; 23730b57cec5SDimitry Andric } 23740b57cec5SDimitry Andric 23750b57cec5SDimitry Andric // Otherwise, eat the \n character. We don't care if this is a \n\r or 23760b57cec5SDimitry Andric // \r\n sequence. This is an efficiency hack (because we know the \n can't 23770b57cec5SDimitry Andric // contribute to another token), it isn't needed for correctness. Note that 23780b57cec5SDimitry Andric // this is ok even in KeepWhitespaceMode, because we would have returned the 23790b57cec5SDimitry Andric /// comment above in that mode. 23800b57cec5SDimitry Andric ++CurPtr; 23810b57cec5SDimitry Andric 23820b57cec5SDimitry Andric // The next returned token is at the start of the line. 23830b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 23840b57cec5SDimitry Andric TokAtPhysicalStartOfLine = true; 23850b57cec5SDimitry Andric // No leading whitespace seen so far. 23860b57cec5SDimitry Andric Result.clearFlag(Token::LeadingSpace); 23870b57cec5SDimitry Andric BufferPtr = CurPtr; 23880b57cec5SDimitry Andric return false; 23890b57cec5SDimitry Andric } 23900b57cec5SDimitry Andric 23910b57cec5SDimitry Andric /// If in save-comment mode, package up this Line comment in an appropriate 23920b57cec5SDimitry Andric /// way and return it. 23930b57cec5SDimitry Andric bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) { 23940b57cec5SDimitry Andric // If we're not in a preprocessor directive, just return the // comment 23950b57cec5SDimitry Andric // directly. 23960b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::comment); 23970b57cec5SDimitry Andric 23980b57cec5SDimitry Andric if (!ParsingPreprocessorDirective || LexingRawMode) 23990b57cec5SDimitry Andric return true; 24000b57cec5SDimitry Andric 24010b57cec5SDimitry Andric // If this Line-style comment is in a macro definition, transmogrify it into 24020b57cec5SDimitry Andric // a C-style block comment. 24030b57cec5SDimitry Andric bool Invalid = false; 24040b57cec5SDimitry Andric std::string Spelling = PP->getSpelling(Result, &Invalid); 24050b57cec5SDimitry Andric if (Invalid) 24060b57cec5SDimitry Andric return true; 24070b57cec5SDimitry Andric 24080b57cec5SDimitry Andric assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?"); 24090b57cec5SDimitry Andric Spelling[1] = '*'; // Change prefix to "/*". 24100b57cec5SDimitry Andric Spelling += "*/"; // add suffix. 24110b57cec5SDimitry Andric 24120b57cec5SDimitry Andric Result.setKind(tok::comment); 24130b57cec5SDimitry Andric PP->CreateString(Spelling, Result, 24140b57cec5SDimitry Andric Result.getLocation(), Result.getLocation()); 24150b57cec5SDimitry Andric return true; 24160b57cec5SDimitry Andric } 24170b57cec5SDimitry Andric 24180b57cec5SDimitry Andric /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline 24190b57cec5SDimitry Andric /// character (either \\n or \\r) is part of an escaped newline sequence. Issue 24200b57cec5SDimitry Andric /// a diagnostic if so. We know that the newline is inside of a block comment. 24210b57cec5SDimitry Andric static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, 24220b57cec5SDimitry Andric Lexer *L) { 24230b57cec5SDimitry Andric assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); 24240b57cec5SDimitry Andric 24250b57cec5SDimitry Andric // Back up off the newline. 24260b57cec5SDimitry Andric --CurPtr; 24270b57cec5SDimitry Andric 24280b57cec5SDimitry Andric // If this is a two-character newline sequence, skip the other character. 24290b57cec5SDimitry Andric if (CurPtr[0] == '\n' || CurPtr[0] == '\r') { 24300b57cec5SDimitry Andric // \n\n or \r\r -> not escaped newline. 24310b57cec5SDimitry Andric if (CurPtr[0] == CurPtr[1]) 24320b57cec5SDimitry Andric return false; 24330b57cec5SDimitry Andric // \n\r or \r\n -> skip the newline. 24340b57cec5SDimitry Andric --CurPtr; 24350b57cec5SDimitry Andric } 24360b57cec5SDimitry Andric 24370b57cec5SDimitry Andric // If we have horizontal whitespace, skip over it. We allow whitespace 24380b57cec5SDimitry Andric // between the slash and newline. 24390b57cec5SDimitry Andric bool HasSpace = false; 24400b57cec5SDimitry Andric while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 24410b57cec5SDimitry Andric --CurPtr; 24420b57cec5SDimitry Andric HasSpace = true; 24430b57cec5SDimitry Andric } 24440b57cec5SDimitry Andric 24450b57cec5SDimitry Andric // If we have a slash, we know this is an escaped newline. 24460b57cec5SDimitry Andric if (*CurPtr == '\\') { 24470b57cec5SDimitry Andric if (CurPtr[-1] != '*') return false; 24480b57cec5SDimitry Andric } else { 24490b57cec5SDimitry Andric // It isn't a slash, is it the ?? / trigraph? 24500b57cec5SDimitry Andric if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' || 24510b57cec5SDimitry Andric CurPtr[-3] != '*') 24520b57cec5SDimitry Andric return false; 24530b57cec5SDimitry Andric 24540b57cec5SDimitry Andric // This is the trigraph ending the comment. Emit a stern warning! 24550b57cec5SDimitry Andric CurPtr -= 2; 24560b57cec5SDimitry Andric 24570b57cec5SDimitry Andric // If no trigraphs are enabled, warn that we ignored this trigraph and 24580b57cec5SDimitry Andric // ignore this * character. 24590b57cec5SDimitry Andric if (!L->getLangOpts().Trigraphs) { 24600b57cec5SDimitry Andric if (!L->isLexingRawMode()) 24610b57cec5SDimitry Andric L->Diag(CurPtr, diag::trigraph_ignored_block_comment); 24620b57cec5SDimitry Andric return false; 24630b57cec5SDimitry Andric } 24640b57cec5SDimitry Andric if (!L->isLexingRawMode()) 24650b57cec5SDimitry Andric L->Diag(CurPtr, diag::trigraph_ends_block_comment); 24660b57cec5SDimitry Andric } 24670b57cec5SDimitry Andric 24680b57cec5SDimitry Andric // Warn about having an escaped newline between the */ characters. 24690b57cec5SDimitry Andric if (!L->isLexingRawMode()) 24700b57cec5SDimitry Andric L->Diag(CurPtr, diag::escaped_newline_block_comment_end); 24710b57cec5SDimitry Andric 24720b57cec5SDimitry Andric // If there was space between the backslash and newline, warn about it. 24730b57cec5SDimitry Andric if (HasSpace && !L->isLexingRawMode()) 24740b57cec5SDimitry Andric L->Diag(CurPtr, diag::backslash_newline_space); 24750b57cec5SDimitry Andric 24760b57cec5SDimitry Andric return true; 24770b57cec5SDimitry Andric } 24780b57cec5SDimitry Andric 24790b57cec5SDimitry Andric #ifdef __SSE2__ 24800b57cec5SDimitry Andric #include <emmintrin.h> 24810b57cec5SDimitry Andric #elif __ALTIVEC__ 24820b57cec5SDimitry Andric #include <altivec.h> 24830b57cec5SDimitry Andric #undef bool 24840b57cec5SDimitry Andric #endif 24850b57cec5SDimitry Andric 24860b57cec5SDimitry Andric /// We have just read from input the / and * characters that started a comment. 24870b57cec5SDimitry Andric /// Read until we find the * and / characters that terminate the comment. 24880b57cec5SDimitry Andric /// Note that we don't bother decoding trigraphs or escaped newlines in block 24890b57cec5SDimitry Andric /// comments, because they cannot cause the comment to end. The only thing 24900b57cec5SDimitry Andric /// that can happen is the comment could end with an escaped newline between 24910b57cec5SDimitry Andric /// the terminating * and /. 24920b57cec5SDimitry Andric /// 24930b57cec5SDimitry Andric /// If we're in KeepCommentMode or any CommentHandler has inserted 24940b57cec5SDimitry Andric /// some tokens, this will store the first token and return true. 24950b57cec5SDimitry Andric bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, 24960b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 24970b57cec5SDimitry Andric // Scan one character past where we should, looking for a '/' character. Once 24980b57cec5SDimitry Andric // we find it, check to see if it was preceded by a *. This common 24990b57cec5SDimitry Andric // optimization helps people who like to put a lot of * characters in their 25000b57cec5SDimitry Andric // comments. 25010b57cec5SDimitry Andric 25020b57cec5SDimitry Andric // The first character we get with newlines and trigraphs skipped to handle 25030b57cec5SDimitry Andric // the degenerate /*/ case below correctly if the * has an escaped newline 25040b57cec5SDimitry Andric // after it. 25050b57cec5SDimitry Andric unsigned CharSize; 25060b57cec5SDimitry Andric unsigned char C = getCharAndSize(CurPtr, CharSize); 25070b57cec5SDimitry Andric CurPtr += CharSize; 25080b57cec5SDimitry Andric if (C == 0 && CurPtr == BufferEnd+1) { 25090b57cec5SDimitry Andric if (!isLexingRawMode()) 25100b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment); 25110b57cec5SDimitry Andric --CurPtr; 25120b57cec5SDimitry Andric 25130b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. Since 25140b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token. 25150b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 25160b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 25170b57cec5SDimitry Andric return true; 25180b57cec5SDimitry Andric } 25190b57cec5SDimitry Andric 25200b57cec5SDimitry Andric BufferPtr = CurPtr; 25210b57cec5SDimitry Andric return false; 25220b57cec5SDimitry Andric } 25230b57cec5SDimitry Andric 25240b57cec5SDimitry Andric // Check to see if the first character after the '/*' is another /. If so, 25250b57cec5SDimitry Andric // then this slash does not end the block comment, it is part of it. 25260b57cec5SDimitry Andric if (C == '/') 25270b57cec5SDimitry Andric C = *CurPtr++; 25280b57cec5SDimitry Andric 25290b57cec5SDimitry Andric while (true) { 25300b57cec5SDimitry Andric // Skip over all non-interesting characters until we find end of buffer or a 25310b57cec5SDimitry Andric // (probably ending) '/' character. 25320b57cec5SDimitry Andric if (CurPtr + 24 < BufferEnd && 25330b57cec5SDimitry Andric // If there is a code-completion point avoid the fast scan because it 25340b57cec5SDimitry Andric // doesn't check for '\0'. 25350b57cec5SDimitry Andric !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { 25360b57cec5SDimitry Andric // While not aligned to a 16-byte boundary. 25370b57cec5SDimitry Andric while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) 25380b57cec5SDimitry Andric C = *CurPtr++; 25390b57cec5SDimitry Andric 25400b57cec5SDimitry Andric if (C == '/') goto FoundSlash; 25410b57cec5SDimitry Andric 25420b57cec5SDimitry Andric #ifdef __SSE2__ 25430b57cec5SDimitry Andric __m128i Slashes = _mm_set1_epi8('/'); 25440b57cec5SDimitry Andric while (CurPtr+16 <= BufferEnd) { 25450b57cec5SDimitry Andric int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr, 25460b57cec5SDimitry Andric Slashes)); 25470b57cec5SDimitry Andric if (cmp != 0) { 25480b57cec5SDimitry Andric // Adjust the pointer to point directly after the first slash. It's 25490b57cec5SDimitry Andric // not necessary to set C here, it will be overwritten at the end of 25500b57cec5SDimitry Andric // the outer loop. 25510b57cec5SDimitry Andric CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1; 25520b57cec5SDimitry Andric goto FoundSlash; 25530b57cec5SDimitry Andric } 25540b57cec5SDimitry Andric CurPtr += 16; 25550b57cec5SDimitry Andric } 25560b57cec5SDimitry Andric #elif __ALTIVEC__ 25570b57cec5SDimitry Andric __vector unsigned char Slashes = { 25580b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/', 25590b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/' 25600b57cec5SDimitry Andric }; 25610b57cec5SDimitry Andric while (CurPtr + 16 <= BufferEnd && 256213138422SDimitry Andric !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) 25630b57cec5SDimitry Andric CurPtr += 16; 25640b57cec5SDimitry Andric #else 25650b57cec5SDimitry Andric // Scan for '/' quickly. Many block comments are very large. 25660b57cec5SDimitry Andric while (CurPtr[0] != '/' && 25670b57cec5SDimitry Andric CurPtr[1] != '/' && 25680b57cec5SDimitry Andric CurPtr[2] != '/' && 25690b57cec5SDimitry Andric CurPtr[3] != '/' && 25700b57cec5SDimitry Andric CurPtr+4 < BufferEnd) { 25710b57cec5SDimitry Andric CurPtr += 4; 25720b57cec5SDimitry Andric } 25730b57cec5SDimitry Andric #endif 25740b57cec5SDimitry Andric 25750b57cec5SDimitry Andric // It has to be one of the bytes scanned, increment to it and read one. 25760b57cec5SDimitry Andric C = *CurPtr++; 25770b57cec5SDimitry Andric } 25780b57cec5SDimitry Andric 25790b57cec5SDimitry Andric // Loop to scan the remainder. 25800b57cec5SDimitry Andric while (C != '/' && C != '\0') 25810b57cec5SDimitry Andric C = *CurPtr++; 25820b57cec5SDimitry Andric 25830b57cec5SDimitry Andric if (C == '/') { 25840b57cec5SDimitry Andric FoundSlash: 25850b57cec5SDimitry Andric if (CurPtr[-2] == '*') // We found the final */. We're done! 25860b57cec5SDimitry Andric break; 25870b57cec5SDimitry Andric 25880b57cec5SDimitry Andric if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 25890b57cec5SDimitry Andric if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { 25900b57cec5SDimitry Andric // We found the final */, though it had an escaped newline between the 25910b57cec5SDimitry Andric // * and /. We're done! 25920b57cec5SDimitry Andric break; 25930b57cec5SDimitry Andric } 25940b57cec5SDimitry Andric } 25950b57cec5SDimitry Andric if (CurPtr[0] == '*' && CurPtr[1] != '/') { 25960b57cec5SDimitry Andric // If this is a /* inside of the comment, emit a warning. Don't do this 25970b57cec5SDimitry Andric // if this is a /*/, which will end the comment. This misses cases with 25980b57cec5SDimitry Andric // embedded escaped newlines, but oh well. 25990b57cec5SDimitry Andric if (!isLexingRawMode()) 26000b57cec5SDimitry Andric Diag(CurPtr-1, diag::warn_nested_block_comment); 26010b57cec5SDimitry Andric } 26020b57cec5SDimitry Andric } else if (C == 0 && CurPtr == BufferEnd+1) { 26030b57cec5SDimitry Andric if (!isLexingRawMode()) 26040b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment); 26050b57cec5SDimitry Andric // Note: the user probably forgot a */. We could continue immediately 26060b57cec5SDimitry Andric // after the /*, but this would involve lexing a lot of what really is the 26070b57cec5SDimitry Andric // comment, which surely would confuse the parser. 26080b57cec5SDimitry Andric --CurPtr; 26090b57cec5SDimitry Andric 26100b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. Since 26110b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token. 26120b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 26130b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 26140b57cec5SDimitry Andric return true; 26150b57cec5SDimitry Andric } 26160b57cec5SDimitry Andric 26170b57cec5SDimitry Andric BufferPtr = CurPtr; 26180b57cec5SDimitry Andric return false; 26190b57cec5SDimitry Andric } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 26200b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 26210b57cec5SDimitry Andric cutOffLexing(); 26220b57cec5SDimitry Andric return false; 26230b57cec5SDimitry Andric } 26240b57cec5SDimitry Andric 26250b57cec5SDimitry Andric C = *CurPtr++; 26260b57cec5SDimitry Andric } 26270b57cec5SDimitry Andric 26280b57cec5SDimitry Andric // Notify comment handlers about the comment unless we're in a #if 0 block. 26290b57cec5SDimitry Andric if (PP && !isLexingRawMode() && 26300b57cec5SDimitry Andric PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 26310b57cec5SDimitry Andric getSourceLocation(CurPtr)))) { 26320b57cec5SDimitry Andric BufferPtr = CurPtr; 26330b57cec5SDimitry Andric return true; // A token has to be returned. 26340b57cec5SDimitry Andric } 26350b57cec5SDimitry Andric 26360b57cec5SDimitry Andric // If we are returning comments as tokens, return this comment as a token. 26370b57cec5SDimitry Andric if (inKeepCommentMode()) { 26380b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::comment); 26390b57cec5SDimitry Andric return true; 26400b57cec5SDimitry Andric } 26410b57cec5SDimitry Andric 26420b57cec5SDimitry Andric // It is common for the tokens immediately after a /**/ comment to be 26430b57cec5SDimitry Andric // whitespace. Instead of going through the big switch, handle it 26440b57cec5SDimitry Andric // efficiently now. This is safe even in KeepWhitespaceMode because we would 26450b57cec5SDimitry Andric // have already returned above with the comment as a token. 26460b57cec5SDimitry Andric if (isHorizontalWhitespace(*CurPtr)) { 26470b57cec5SDimitry Andric SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine); 26480b57cec5SDimitry Andric return false; 26490b57cec5SDimitry Andric } 26500b57cec5SDimitry Andric 26510b57cec5SDimitry Andric // Otherwise, just return so that the next character will be lexed as a token. 26520b57cec5SDimitry Andric BufferPtr = CurPtr; 26530b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 26540b57cec5SDimitry Andric return false; 26550b57cec5SDimitry Andric } 26560b57cec5SDimitry Andric 26570b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 26580b57cec5SDimitry Andric // Primary Lexing Entry Points 26590b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 26600b57cec5SDimitry Andric 26610b57cec5SDimitry Andric /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 26620b57cec5SDimitry Andric /// uninterpreted string. This switches the lexer out of directive mode. 26630b57cec5SDimitry Andric void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) { 26640b57cec5SDimitry Andric assert(ParsingPreprocessorDirective && ParsingFilename == false && 26650b57cec5SDimitry Andric "Must be in a preprocessing directive!"); 26660b57cec5SDimitry Andric Token Tmp; 2667480093f4SDimitry Andric Tmp.startToken(); 26680b57cec5SDimitry Andric 26690b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable. 26700b57cec5SDimitry Andric const char *CurPtr = BufferPtr; 26710b57cec5SDimitry Andric while (true) { 26720b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Tmp); 26730b57cec5SDimitry Andric switch (Char) { 26740b57cec5SDimitry Andric default: 26750b57cec5SDimitry Andric if (Result) 26760b57cec5SDimitry Andric Result->push_back(Char); 26770b57cec5SDimitry Andric break; 26780b57cec5SDimitry Andric case 0: // Null. 26790b57cec5SDimitry Andric // Found end of file? 26800b57cec5SDimitry Andric if (CurPtr-1 != BufferEnd) { 26810b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 26820b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 26830b57cec5SDimitry Andric cutOffLexing(); 26840b57cec5SDimitry Andric return; 26850b57cec5SDimitry Andric } 26860b57cec5SDimitry Andric 26870b57cec5SDimitry Andric // Nope, normal character, continue. 26880b57cec5SDimitry Andric if (Result) 26890b57cec5SDimitry Andric Result->push_back(Char); 26900b57cec5SDimitry Andric break; 26910b57cec5SDimitry Andric } 26920b57cec5SDimitry Andric // FALL THROUGH. 26930b57cec5SDimitry Andric LLVM_FALLTHROUGH; 26940b57cec5SDimitry Andric case '\r': 26950b57cec5SDimitry Andric case '\n': 26960b57cec5SDimitry Andric // Okay, we found the end of the line. First, back up past the \0, \r, \n. 26970b57cec5SDimitry Andric assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 26980b57cec5SDimitry Andric BufferPtr = CurPtr-1; 26990b57cec5SDimitry Andric 27000b57cec5SDimitry Andric // Next, lex the character, which should handle the EOD transition. 27010b57cec5SDimitry Andric Lex(Tmp); 27020b57cec5SDimitry Andric if (Tmp.is(tok::code_completion)) { 27030b57cec5SDimitry Andric if (PP) 27040b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 27050b57cec5SDimitry Andric Lex(Tmp); 27060b57cec5SDimitry Andric } 27070b57cec5SDimitry Andric assert(Tmp.is(tok::eod) && "Unexpected token!"); 27080b57cec5SDimitry Andric 27090b57cec5SDimitry Andric // Finally, we're done; 27100b57cec5SDimitry Andric return; 27110b57cec5SDimitry Andric } 27120b57cec5SDimitry Andric } 27130b57cec5SDimitry Andric } 27140b57cec5SDimitry Andric 27150b57cec5SDimitry Andric /// LexEndOfFile - CurPtr points to the end of this file. Handle this 27160b57cec5SDimitry Andric /// condition, reporting diagnostics and handling other edge cases as required. 27170b57cec5SDimitry Andric /// This returns true if Result contains a token, false if PP.Lex should be 27180b57cec5SDimitry Andric /// called again. 27190b57cec5SDimitry Andric bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 27200b57cec5SDimitry Andric // If we hit the end of the file while parsing a preprocessor directive, 27210b57cec5SDimitry Andric // end the preprocessor directive first. The next token returned will 27220b57cec5SDimitry Andric // then be the end of file. 27230b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 27240b57cec5SDimitry Andric // Done parsing the "line". 27250b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 27260b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 27270b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::eod); 27280b57cec5SDimitry Andric 27290b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive. 27300b57cec5SDimitry Andric if (PP) 27310b57cec5SDimitry Andric resetExtendedTokenMode(); 27320b57cec5SDimitry Andric return true; // Have a token. 27330b57cec5SDimitry Andric } 27340b57cec5SDimitry Andric 27350b57cec5SDimitry Andric // If we are in raw mode, return this event as an EOF token. Let the caller 27360b57cec5SDimitry Andric // that put us in raw mode handle the event. 27370b57cec5SDimitry Andric if (isLexingRawMode()) { 27380b57cec5SDimitry Andric Result.startToken(); 27390b57cec5SDimitry Andric BufferPtr = BufferEnd; 27400b57cec5SDimitry Andric FormTokenWithChars(Result, BufferEnd, tok::eof); 27410b57cec5SDimitry Andric return true; 27420b57cec5SDimitry Andric } 27430b57cec5SDimitry Andric 27440b57cec5SDimitry Andric if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { 27450b57cec5SDimitry Andric PP->setRecordedPreambleConditionalStack(ConditionalStack); 27460b57cec5SDimitry Andric ConditionalStack.clear(); 27470b57cec5SDimitry Andric } 27480b57cec5SDimitry Andric 27490b57cec5SDimitry Andric // Issue diagnostics for unterminated #if and missing newline. 27500b57cec5SDimitry Andric 27510b57cec5SDimitry Andric // If we are in a #if directive, emit an error. 27520b57cec5SDimitry Andric while (!ConditionalStack.empty()) { 27530b57cec5SDimitry Andric if (PP->getCodeCompletionFileLoc() != FileLoc) 27540b57cec5SDimitry Andric PP->Diag(ConditionalStack.back().IfLoc, 27550b57cec5SDimitry Andric diag::err_pp_unterminated_conditional); 27560b57cec5SDimitry Andric ConditionalStack.pop_back(); 27570b57cec5SDimitry Andric } 27580b57cec5SDimitry Andric 27590b57cec5SDimitry Andric // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 27600b57cec5SDimitry Andric // a pedwarn. 27610b57cec5SDimitry Andric if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { 27620b57cec5SDimitry Andric DiagnosticsEngine &Diags = PP->getDiagnostics(); 27630b57cec5SDimitry Andric SourceLocation EndLoc = getSourceLocation(BufferEnd); 27640b57cec5SDimitry Andric unsigned DiagID; 27650b57cec5SDimitry Andric 27660b57cec5SDimitry Andric if (LangOpts.CPlusPlus11) { 27670b57cec5SDimitry Andric // C++11 [lex.phases] 2.2 p2 27680b57cec5SDimitry Andric // Prefer the C++98 pedantic compatibility warning over the generic, 27690b57cec5SDimitry Andric // non-extension, user-requested "missing newline at EOF" warning. 27700b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) { 27710b57cec5SDimitry Andric DiagID = diag::warn_cxx98_compat_no_newline_eof; 27720b57cec5SDimitry Andric } else { 27730b57cec5SDimitry Andric DiagID = diag::warn_no_newline_eof; 27740b57cec5SDimitry Andric } 27750b57cec5SDimitry Andric } else { 27760b57cec5SDimitry Andric DiagID = diag::ext_no_newline_eof; 27770b57cec5SDimitry Andric } 27780b57cec5SDimitry Andric 27790b57cec5SDimitry Andric Diag(BufferEnd, DiagID) 27800b57cec5SDimitry Andric << FixItHint::CreateInsertion(EndLoc, "\n"); 27810b57cec5SDimitry Andric } 27820b57cec5SDimitry Andric 27830b57cec5SDimitry Andric BufferPtr = CurPtr; 27840b57cec5SDimitry Andric 27850b57cec5SDimitry Andric // Finally, let the preprocessor handle this. 27860b57cec5SDimitry Andric return PP->HandleEndOfFile(Result, isPragmaLexer()); 27870b57cec5SDimitry Andric } 27880b57cec5SDimitry Andric 27890b57cec5SDimitry Andric /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 27900b57cec5SDimitry Andric /// the specified lexer will return a tok::l_paren token, 0 if it is something 27910b57cec5SDimitry Andric /// else and 2 if there are no more tokens in the buffer controlled by the 27920b57cec5SDimitry Andric /// lexer. 27930b57cec5SDimitry Andric unsigned Lexer::isNextPPTokenLParen() { 27940b57cec5SDimitry Andric assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 27950b57cec5SDimitry Andric 27960b57cec5SDimitry Andric // Switch to 'skipping' mode. This will ensure that we can lex a token 27970b57cec5SDimitry Andric // without emitting diagnostics, disables macro expansion, and will cause EOF 27980b57cec5SDimitry Andric // to return an EOF token instead of popping the include stack. 27990b57cec5SDimitry Andric LexingRawMode = true; 28000b57cec5SDimitry Andric 28010b57cec5SDimitry Andric // Save state that can be changed while lexing so that we can restore it. 28020b57cec5SDimitry Andric const char *TmpBufferPtr = BufferPtr; 28030b57cec5SDimitry Andric bool inPPDirectiveMode = ParsingPreprocessorDirective; 28040b57cec5SDimitry Andric bool atStartOfLine = IsAtStartOfLine; 28050b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 28060b57cec5SDimitry Andric bool leadingSpace = HasLeadingSpace; 28070b57cec5SDimitry Andric 28080b57cec5SDimitry Andric Token Tok; 28090b57cec5SDimitry Andric Lex(Tok); 28100b57cec5SDimitry Andric 28110b57cec5SDimitry Andric // Restore state that may have changed. 28120b57cec5SDimitry Andric BufferPtr = TmpBufferPtr; 28130b57cec5SDimitry Andric ParsingPreprocessorDirective = inPPDirectiveMode; 28140b57cec5SDimitry Andric HasLeadingSpace = leadingSpace; 28150b57cec5SDimitry Andric IsAtStartOfLine = atStartOfLine; 28160b57cec5SDimitry Andric IsAtPhysicalStartOfLine = atPhysicalStartOfLine; 28170b57cec5SDimitry Andric 28180b57cec5SDimitry Andric // Restore the lexer back to non-skipping mode. 28190b57cec5SDimitry Andric LexingRawMode = false; 28200b57cec5SDimitry Andric 28210b57cec5SDimitry Andric if (Tok.is(tok::eof)) 28220b57cec5SDimitry Andric return 2; 28230b57cec5SDimitry Andric return Tok.is(tok::l_paren); 28240b57cec5SDimitry Andric } 28250b57cec5SDimitry Andric 28260b57cec5SDimitry Andric /// Find the end of a version control conflict marker. 28270b57cec5SDimitry Andric static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 28280b57cec5SDimitry Andric ConflictMarkerKind CMK) { 28290b57cec5SDimitry Andric const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 28300b57cec5SDimitry Andric size_t TermLen = CMK == CMK_Perforce ? 5 : 7; 28310b57cec5SDimitry Andric auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen); 28320b57cec5SDimitry Andric size_t Pos = RestOfBuffer.find(Terminator); 28330b57cec5SDimitry Andric while (Pos != StringRef::npos) { 28340b57cec5SDimitry Andric // Must occur at start of line. 28350b57cec5SDimitry Andric if (Pos == 0 || 28360b57cec5SDimitry Andric (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) { 28370b57cec5SDimitry Andric RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 28380b57cec5SDimitry Andric Pos = RestOfBuffer.find(Terminator); 28390b57cec5SDimitry Andric continue; 28400b57cec5SDimitry Andric } 28410b57cec5SDimitry Andric return RestOfBuffer.data()+Pos; 28420b57cec5SDimitry Andric } 28430b57cec5SDimitry Andric return nullptr; 28440b57cec5SDimitry Andric } 28450b57cec5SDimitry Andric 28460b57cec5SDimitry Andric /// IsStartOfConflictMarker - If the specified pointer is the start of a version 28470b57cec5SDimitry Andric /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 28480b57cec5SDimitry Andric /// and recover nicely. This returns true if it is a conflict marker and false 28490b57cec5SDimitry Andric /// if not. 28500b57cec5SDimitry Andric bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 28510b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line. 28520b57cec5SDimitry Andric if (CurPtr != BufferStart && 28530b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 28540b57cec5SDimitry Andric return false; 28550b57cec5SDimitry Andric 28560b57cec5SDimitry Andric // Check to see if we have <<<<<<< or >>>>. 28570b57cec5SDimitry Andric if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") && 28580b57cec5SDimitry Andric !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> ")) 28590b57cec5SDimitry Andric return false; 28600b57cec5SDimitry Andric 28610b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore 28620b57cec5SDimitry Andric // it. 28630b57cec5SDimitry Andric if (CurrentConflictMarkerState || isLexingRawMode()) 28640b57cec5SDimitry Andric return false; 28650b57cec5SDimitry Andric 28660b57cec5SDimitry Andric ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 28670b57cec5SDimitry Andric 28680b57cec5SDimitry Andric // Check to see if there is an ending marker somewhere in the buffer at the 28690b57cec5SDimitry Andric // start of a line to terminate this conflict marker. 28700b57cec5SDimitry Andric if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 28710b57cec5SDimitry Andric // We found a match. We are really in a conflict marker. 28720b57cec5SDimitry Andric // Diagnose this, and ignore to the end of line. 28730b57cec5SDimitry Andric Diag(CurPtr, diag::err_conflict_marker); 28740b57cec5SDimitry Andric CurrentConflictMarkerState = Kind; 28750b57cec5SDimitry Andric 28760b57cec5SDimitry Andric // Skip ahead to the end of line. We know this exists because the 28770b57cec5SDimitry Andric // end-of-conflict marker starts with \r or \n. 28780b57cec5SDimitry Andric while (*CurPtr != '\r' && *CurPtr != '\n') { 28790b57cec5SDimitry Andric assert(CurPtr != BufferEnd && "Didn't find end of line"); 28800b57cec5SDimitry Andric ++CurPtr; 28810b57cec5SDimitry Andric } 28820b57cec5SDimitry Andric BufferPtr = CurPtr; 28830b57cec5SDimitry Andric return true; 28840b57cec5SDimitry Andric } 28850b57cec5SDimitry Andric 28860b57cec5SDimitry Andric // No end of conflict marker found. 28870b57cec5SDimitry Andric return false; 28880b57cec5SDimitry Andric } 28890b57cec5SDimitry Andric 28900b57cec5SDimitry Andric /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 28910b57cec5SDimitry Andric /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 28920b57cec5SDimitry Andric /// is the end of a conflict marker. Handle it by ignoring up until the end of 28930b57cec5SDimitry Andric /// the line. This returns true if it is a conflict marker and false if not. 28940b57cec5SDimitry Andric bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 28950b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line. 28960b57cec5SDimitry Andric if (CurPtr != BufferStart && 28970b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 28980b57cec5SDimitry Andric return false; 28990b57cec5SDimitry Andric 29000b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore 29010b57cec5SDimitry Andric // it. 29020b57cec5SDimitry Andric if (!CurrentConflictMarkerState || isLexingRawMode()) 29030b57cec5SDimitry Andric return false; 29040b57cec5SDimitry Andric 29050b57cec5SDimitry Andric // Check to see if we have the marker (4 characters in a row). 29060b57cec5SDimitry Andric for (unsigned i = 1; i != 4; ++i) 29070b57cec5SDimitry Andric if (CurPtr[i] != CurPtr[0]) 29080b57cec5SDimitry Andric return false; 29090b57cec5SDimitry Andric 29100b57cec5SDimitry Andric // If we do have it, search for the end of the conflict marker. This could 29110b57cec5SDimitry Andric // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 29120b57cec5SDimitry Andric // be the end of conflict marker. 29130b57cec5SDimitry Andric if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 29140b57cec5SDimitry Andric CurrentConflictMarkerState)) { 29150b57cec5SDimitry Andric CurPtr = End; 29160b57cec5SDimitry Andric 29170b57cec5SDimitry Andric // Skip ahead to the end of line. 29180b57cec5SDimitry Andric while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 29190b57cec5SDimitry Andric ++CurPtr; 29200b57cec5SDimitry Andric 29210b57cec5SDimitry Andric BufferPtr = CurPtr; 29220b57cec5SDimitry Andric 29230b57cec5SDimitry Andric // No longer in the conflict marker. 29240b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None; 29250b57cec5SDimitry Andric return true; 29260b57cec5SDimitry Andric } 29270b57cec5SDimitry Andric 29280b57cec5SDimitry Andric return false; 29290b57cec5SDimitry Andric } 29300b57cec5SDimitry Andric 29310b57cec5SDimitry Andric static const char *findPlaceholderEnd(const char *CurPtr, 29320b57cec5SDimitry Andric const char *BufferEnd) { 29330b57cec5SDimitry Andric if (CurPtr == BufferEnd) 29340b57cec5SDimitry Andric return nullptr; 29350b57cec5SDimitry Andric BufferEnd -= 1; // Scan until the second last character. 29360b57cec5SDimitry Andric for (; CurPtr != BufferEnd; ++CurPtr) { 29370b57cec5SDimitry Andric if (CurPtr[0] == '#' && CurPtr[1] == '>') 29380b57cec5SDimitry Andric return CurPtr + 2; 29390b57cec5SDimitry Andric } 29400b57cec5SDimitry Andric return nullptr; 29410b57cec5SDimitry Andric } 29420b57cec5SDimitry Andric 29430b57cec5SDimitry Andric bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) { 29440b57cec5SDimitry Andric assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!"); 29450b57cec5SDimitry Andric if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode) 29460b57cec5SDimitry Andric return false; 29470b57cec5SDimitry Andric const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd); 29480b57cec5SDimitry Andric if (!End) 29490b57cec5SDimitry Andric return false; 29500b57cec5SDimitry Andric const char *Start = CurPtr - 1; 29510b57cec5SDimitry Andric if (!LangOpts.AllowEditorPlaceholders) 29520b57cec5SDimitry Andric Diag(Start, diag::err_placeholder_in_source); 29530b57cec5SDimitry Andric Result.startToken(); 29540b57cec5SDimitry Andric FormTokenWithChars(Result, End, tok::raw_identifier); 29550b57cec5SDimitry Andric Result.setRawIdentifierData(Start); 29560b57cec5SDimitry Andric PP->LookUpIdentifierInfo(Result); 29570b57cec5SDimitry Andric Result.setFlag(Token::IsEditorPlaceholder); 29580b57cec5SDimitry Andric BufferPtr = End; 29590b57cec5SDimitry Andric return true; 29600b57cec5SDimitry Andric } 29610b57cec5SDimitry Andric 29620b57cec5SDimitry Andric bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 29630b57cec5SDimitry Andric if (PP && PP->isCodeCompletionEnabled()) { 29640b57cec5SDimitry Andric SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 29650b57cec5SDimitry Andric return Loc == PP->getCodeCompletionLoc(); 29660b57cec5SDimitry Andric } 29670b57cec5SDimitry Andric 29680b57cec5SDimitry Andric return false; 29690b57cec5SDimitry Andric } 29700b57cec5SDimitry Andric 29710b57cec5SDimitry Andric uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, 29720b57cec5SDimitry Andric Token *Result) { 29730b57cec5SDimitry Andric unsigned CharSize; 29740b57cec5SDimitry Andric char Kind = getCharAndSize(StartPtr, CharSize); 29750b57cec5SDimitry Andric 29760b57cec5SDimitry Andric unsigned NumHexDigits; 29770b57cec5SDimitry Andric if (Kind == 'u') 29780b57cec5SDimitry Andric NumHexDigits = 4; 29790b57cec5SDimitry Andric else if (Kind == 'U') 29800b57cec5SDimitry Andric NumHexDigits = 8; 29810b57cec5SDimitry Andric else 29820b57cec5SDimitry Andric return 0; 29830b57cec5SDimitry Andric 29840b57cec5SDimitry Andric if (!LangOpts.CPlusPlus && !LangOpts.C99) { 29850b57cec5SDimitry Andric if (Result && !isLexingRawMode()) 29860b57cec5SDimitry Andric Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); 29870b57cec5SDimitry Andric return 0; 29880b57cec5SDimitry Andric } 29890b57cec5SDimitry Andric 29900b57cec5SDimitry Andric const char *CurPtr = StartPtr + CharSize; 29910b57cec5SDimitry Andric const char *KindLoc = &CurPtr[-1]; 29920b57cec5SDimitry Andric 29930b57cec5SDimitry Andric uint32_t CodePoint = 0; 29940b57cec5SDimitry Andric for (unsigned i = 0; i < NumHexDigits; ++i) { 29950b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, CharSize); 29960b57cec5SDimitry Andric 29970b57cec5SDimitry Andric unsigned Value = llvm::hexDigitValue(C); 29980b57cec5SDimitry Andric if (Value == -1U) { 29990b57cec5SDimitry Andric if (Result && !isLexingRawMode()) { 30000b57cec5SDimitry Andric if (i == 0) { 30010b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_ucn_escape_no_digits) 30020b57cec5SDimitry Andric << StringRef(KindLoc, 1); 30030b57cec5SDimitry Andric } else { 30040b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_ucn_escape_incomplete); 30050b57cec5SDimitry Andric 30060b57cec5SDimitry Andric // If the user wrote \U1234, suggest a fixit to \u. 30070b57cec5SDimitry Andric if (i == 4 && NumHexDigits == 8) { 30080b57cec5SDimitry Andric CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); 30090b57cec5SDimitry Andric Diag(KindLoc, diag::note_ucn_four_not_eight) 30100b57cec5SDimitry Andric << FixItHint::CreateReplacement(URange, "u"); 30110b57cec5SDimitry Andric } 30120b57cec5SDimitry Andric } 30130b57cec5SDimitry Andric } 30140b57cec5SDimitry Andric 30150b57cec5SDimitry Andric return 0; 30160b57cec5SDimitry Andric } 30170b57cec5SDimitry Andric 30180b57cec5SDimitry Andric CodePoint <<= 4; 30190b57cec5SDimitry Andric CodePoint += Value; 30200b57cec5SDimitry Andric 30210b57cec5SDimitry Andric CurPtr += CharSize; 30220b57cec5SDimitry Andric } 30230b57cec5SDimitry Andric 30240b57cec5SDimitry Andric if (Result) { 30250b57cec5SDimitry Andric Result->setFlag(Token::HasUCN); 30260b57cec5SDimitry Andric if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2) 30270b57cec5SDimitry Andric StartPtr = CurPtr; 30280b57cec5SDimitry Andric else 30290b57cec5SDimitry Andric while (StartPtr != CurPtr) 30300b57cec5SDimitry Andric (void)getAndAdvanceChar(StartPtr, *Result); 30310b57cec5SDimitry Andric } else { 30320b57cec5SDimitry Andric StartPtr = CurPtr; 30330b57cec5SDimitry Andric } 30340b57cec5SDimitry Andric 30350b57cec5SDimitry Andric // Don't apply C family restrictions to UCNs in assembly mode 30360b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) 30370b57cec5SDimitry Andric return CodePoint; 30380b57cec5SDimitry Andric 30390b57cec5SDimitry Andric // C99 6.4.3p2: A universal character name shall not specify a character whose 30400b57cec5SDimitry Andric // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or 30410b57cec5SDimitry Andric // 0060 (`), nor one in the range D800 through DFFF inclusive.) 30420b57cec5SDimitry Andric // C++11 [lex.charset]p2: If the hexadecimal value for a 30430b57cec5SDimitry Andric // universal-character-name corresponds to a surrogate code point (in the 30440b57cec5SDimitry Andric // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, 30450b57cec5SDimitry Andric // if the hexadecimal value for a universal-character-name outside the 30460b57cec5SDimitry Andric // c-char-sequence, s-char-sequence, or r-char-sequence of a character or 30470b57cec5SDimitry Andric // string literal corresponds to a control character (in either of the 30480b57cec5SDimitry Andric // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the 30490b57cec5SDimitry Andric // basic source character set, the program is ill-formed. 30500b57cec5SDimitry Andric if (CodePoint < 0xA0) { 30510b57cec5SDimitry Andric if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60) 30520b57cec5SDimitry Andric return CodePoint; 30530b57cec5SDimitry Andric 30540b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to warn about bad 30550b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block. 30560b57cec5SDimitry Andric if (Result && PP) { 30570b57cec5SDimitry Andric if (CodePoint < 0x20 || CodePoint >= 0x7F) 30580b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_control_character); 30590b57cec5SDimitry Andric else { 30600b57cec5SDimitry Andric char C = static_cast<char>(CodePoint); 30610b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 30620b57cec5SDimitry Andric } 30630b57cec5SDimitry Andric } 30640b57cec5SDimitry Andric 30650b57cec5SDimitry Andric return 0; 30660b57cec5SDimitry Andric } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { 30670b57cec5SDimitry Andric // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 30680b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to diagnose bad 30690b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block. 30700b57cec5SDimitry Andric if (Result && PP) { 30710b57cec5SDimitry Andric if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11) 30720b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_ucn_escape_surrogate); 30730b57cec5SDimitry Andric else 30740b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_invalid); 30750b57cec5SDimitry Andric } 30760b57cec5SDimitry Andric return 0; 30770b57cec5SDimitry Andric } 30780b57cec5SDimitry Andric 30790b57cec5SDimitry Andric return CodePoint; 30800b57cec5SDimitry Andric } 30810b57cec5SDimitry Andric 30820b57cec5SDimitry Andric bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, 30830b57cec5SDimitry Andric const char *CurPtr) { 30840b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( 30850b57cec5SDimitry Andric UnicodeWhitespaceCharRanges); 30860b57cec5SDimitry Andric if (!isLexingRawMode() && !PP->isPreprocessedOutput() && 30870b57cec5SDimitry Andric UnicodeWhitespaceChars.contains(C)) { 30880b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unicode_whitespace) 30890b57cec5SDimitry Andric << makeCharRange(*this, BufferPtr, CurPtr); 30900b57cec5SDimitry Andric 30910b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 30920b57cec5SDimitry Andric return true; 30930b57cec5SDimitry Andric } 30940b57cec5SDimitry Andric return false; 30950b57cec5SDimitry Andric } 30960b57cec5SDimitry Andric 30970b57cec5SDimitry Andric bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { 30980b57cec5SDimitry Andric if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) { 30990b57cec5SDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 31000b57cec5SDimitry Andric !PP->isPreprocessedOutput()) { 31010b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, 31020b57cec5SDimitry Andric makeCharRange(*this, BufferPtr, CurPtr), 31030b57cec5SDimitry Andric /*IsFirst=*/true); 31040b57cec5SDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, 31050b57cec5SDimitry Andric makeCharRange(*this, BufferPtr, CurPtr)); 31060b57cec5SDimitry Andric } 31070b57cec5SDimitry Andric 31080b57cec5SDimitry Andric MIOpt.ReadToken(); 31090b57cec5SDimitry Andric return LexIdentifier(Result, CurPtr); 31100b57cec5SDimitry Andric } 31110b57cec5SDimitry Andric 31120b57cec5SDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 31130b57cec5SDimitry Andric !PP->isPreprocessedOutput() && 31140b57cec5SDimitry Andric !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) { 31150b57cec5SDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally. 31160b57cec5SDimitry Andric // Instead of letting the parser complain about the unknown token, 31170b57cec5SDimitry Andric // just drop the character. 31180b57cec5SDimitry Andric // Note that we can /only/ do this when the non-ASCII character is actually 31190b57cec5SDimitry Andric // spelled as Unicode, not written as a UCN. The standard requires that 31200b57cec5SDimitry Andric // we not throw away any possible preprocessor tokens, but there's a 31210b57cec5SDimitry Andric // loophole in the mapping of Unicode characters to basic character set 31220b57cec5SDimitry Andric // characters that allows us to map these particular characters to, say, 31230b57cec5SDimitry Andric // whitespace. 31240b57cec5SDimitry Andric Diag(BufferPtr, diag::err_non_ascii) 31250b57cec5SDimitry Andric << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr)); 31260b57cec5SDimitry Andric 31270b57cec5SDimitry Andric BufferPtr = CurPtr; 31280b57cec5SDimitry Andric return false; 31290b57cec5SDimitry Andric } 31300b57cec5SDimitry Andric 31310b57cec5SDimitry Andric // Otherwise, we have an explicit UCN or a character that's unlikely to show 31320b57cec5SDimitry Andric // up by accident. 31330b57cec5SDimitry Andric MIOpt.ReadToken(); 31340b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 31350b57cec5SDimitry Andric return true; 31360b57cec5SDimitry Andric } 31370b57cec5SDimitry Andric 31380b57cec5SDimitry Andric void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { 31390b57cec5SDimitry Andric IsAtStartOfLine = Result.isAtStartOfLine(); 31400b57cec5SDimitry Andric HasLeadingSpace = Result.hasLeadingSpace(); 31410b57cec5SDimitry Andric HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); 31420b57cec5SDimitry Andric // Note that this doesn't affect IsAtPhysicalStartOfLine. 31430b57cec5SDimitry Andric } 31440b57cec5SDimitry Andric 31450b57cec5SDimitry Andric bool Lexer::Lex(Token &Result) { 31460b57cec5SDimitry Andric // Start a new token. 31470b57cec5SDimitry Andric Result.startToken(); 31480b57cec5SDimitry Andric 31490b57cec5SDimitry Andric // Set up misc whitespace flags for LexTokenInternal. 31500b57cec5SDimitry Andric if (IsAtStartOfLine) { 31510b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 31520b57cec5SDimitry Andric IsAtStartOfLine = false; 31530b57cec5SDimitry Andric } 31540b57cec5SDimitry Andric 31550b57cec5SDimitry Andric if (HasLeadingSpace) { 31560b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 31570b57cec5SDimitry Andric HasLeadingSpace = false; 31580b57cec5SDimitry Andric } 31590b57cec5SDimitry Andric 31600b57cec5SDimitry Andric if (HasLeadingEmptyMacro) { 31610b57cec5SDimitry Andric Result.setFlag(Token::LeadingEmptyMacro); 31620b57cec5SDimitry Andric HasLeadingEmptyMacro = false; 31630b57cec5SDimitry Andric } 31640b57cec5SDimitry Andric 31650b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 31660b57cec5SDimitry Andric IsAtPhysicalStartOfLine = false; 31670b57cec5SDimitry Andric bool isRawLex = isLexingRawMode(); 31680b57cec5SDimitry Andric (void) isRawLex; 31690b57cec5SDimitry Andric bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine); 31700b57cec5SDimitry Andric // (After the LexTokenInternal call, the lexer might be destroyed.) 31710b57cec5SDimitry Andric assert((returnedToken || !isRawLex) && "Raw lex must succeed"); 31720b57cec5SDimitry Andric return returnedToken; 31730b57cec5SDimitry Andric } 31740b57cec5SDimitry Andric 31750b57cec5SDimitry Andric /// LexTokenInternal - This implements a simple C family lexer. It is an 31760b57cec5SDimitry Andric /// extremely performance critical piece of code. This assumes that the buffer 31770b57cec5SDimitry Andric /// has a null character at the end of the file. This returns a preprocessing 31780b57cec5SDimitry Andric /// token, not a normal token, as such, it is an internal interface. It assumes 31790b57cec5SDimitry Andric /// that the Flags of result have been cleared before calling this. 31800b57cec5SDimitry Andric bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { 31810b57cec5SDimitry Andric LexNextToken: 31820b57cec5SDimitry Andric // New token, can't need cleaning yet. 31830b57cec5SDimitry Andric Result.clearFlag(Token::NeedsCleaning); 31840b57cec5SDimitry Andric Result.setIdentifierInfo(nullptr); 31850b57cec5SDimitry Andric 31860b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable. 31870b57cec5SDimitry Andric const char *CurPtr = BufferPtr; 31880b57cec5SDimitry Andric 31890b57cec5SDimitry Andric // Small amounts of horizontal whitespace is very common between tokens. 31900b57cec5SDimitry Andric if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 31910b57cec5SDimitry Andric ++CurPtr; 31920b57cec5SDimitry Andric while ((*CurPtr == ' ') || (*CurPtr == '\t')) 31930b57cec5SDimitry Andric ++CurPtr; 31940b57cec5SDimitry Andric 31950b57cec5SDimitry Andric // If we are keeping whitespace and other tokens, just return what we just 31960b57cec5SDimitry Andric // skipped. The next lexer invocation will return the token after the 31970b57cec5SDimitry Andric // whitespace. 31980b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 31990b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 32000b57cec5SDimitry Andric // FIXME: The next token will not have LeadingSpace set. 32010b57cec5SDimitry Andric return true; 32020b57cec5SDimitry Andric } 32030b57cec5SDimitry Andric 32040b57cec5SDimitry Andric BufferPtr = CurPtr; 32050b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 32060b57cec5SDimitry Andric } 32070b57cec5SDimitry Andric 32080b57cec5SDimitry Andric unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 32090b57cec5SDimitry Andric 32100b57cec5SDimitry Andric // Read a character, advancing over it. 32110b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Result); 32120b57cec5SDimitry Andric tok::TokenKind Kind; 32130b57cec5SDimitry Andric 32140b57cec5SDimitry Andric switch (Char) { 32150b57cec5SDimitry Andric case 0: // Null. 32160b57cec5SDimitry Andric // Found end of file? 32170b57cec5SDimitry Andric if (CurPtr-1 == BufferEnd) 32180b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1); 32190b57cec5SDimitry Andric 32200b57cec5SDimitry Andric // Check if we are performing code completion. 32210b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 32220b57cec5SDimitry Andric // Return the code-completion token. 32230b57cec5SDimitry Andric Result.startToken(); 32240b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::code_completion); 32250b57cec5SDimitry Andric return true; 32260b57cec5SDimitry Andric } 32270b57cec5SDimitry Andric 32280b57cec5SDimitry Andric if (!isLexingRawMode()) 32290b57cec5SDimitry Andric Diag(CurPtr-1, diag::null_in_file); 32300b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 32310b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 32320b57cec5SDimitry Andric return true; // KeepWhitespaceMode 32330b57cec5SDimitry Andric 32340b57cec5SDimitry Andric // We know the lexer hasn't changed, so just try again with this lexer. 32350b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 32360b57cec5SDimitry Andric goto LexNextToken; 32370b57cec5SDimitry Andric 32380b57cec5SDimitry Andric case 26: // DOS & CP/M EOF: "^Z". 32390b57cec5SDimitry Andric // If we're in Microsoft extensions mode, treat this as end of file. 32400b57cec5SDimitry Andric if (LangOpts.MicrosoftExt) { 32410b57cec5SDimitry Andric if (!isLexingRawMode()) 32420b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft); 32430b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1); 32440b57cec5SDimitry Andric } 32450b57cec5SDimitry Andric 32460b57cec5SDimitry Andric // If Microsoft extensions are disabled, this is just random garbage. 32470b57cec5SDimitry Andric Kind = tok::unknown; 32480b57cec5SDimitry Andric break; 32490b57cec5SDimitry Andric 32500b57cec5SDimitry Andric case '\r': 32510b57cec5SDimitry Andric if (CurPtr[0] == '\n') 32520b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result); 32530b57cec5SDimitry Andric LLVM_FALLTHROUGH; 32540b57cec5SDimitry Andric case '\n': 32550b57cec5SDimitry Andric // If we are inside a preprocessor directive and we see the end of line, 32560b57cec5SDimitry Andric // we know we are done with the directive, so return an EOD token. 32570b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 32580b57cec5SDimitry Andric // Done parsing the "line". 32590b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 32600b57cec5SDimitry Andric 32610b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive. 32620b57cec5SDimitry Andric if (PP) 32630b57cec5SDimitry Andric resetExtendedTokenMode(); 32640b57cec5SDimitry Andric 32650b57cec5SDimitry Andric // Since we consumed a newline, we are back at the start of a line. 32660b57cec5SDimitry Andric IsAtStartOfLine = true; 32670b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 32680b57cec5SDimitry Andric 32690b57cec5SDimitry Andric Kind = tok::eod; 32700b57cec5SDimitry Andric break; 32710b57cec5SDimitry Andric } 32720b57cec5SDimitry Andric 32730b57cec5SDimitry Andric // No leading whitespace seen so far. 32740b57cec5SDimitry Andric Result.clearFlag(Token::LeadingSpace); 32750b57cec5SDimitry Andric 32760b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 32770b57cec5SDimitry Andric return true; // KeepWhitespaceMode 32780b57cec5SDimitry Andric 32790b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 32800b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 32810b57cec5SDimitry Andric goto LexNextToken; 32820b57cec5SDimitry Andric case ' ': 32830b57cec5SDimitry Andric case '\t': 32840b57cec5SDimitry Andric case '\f': 32850b57cec5SDimitry Andric case '\v': 32860b57cec5SDimitry Andric SkipHorizontalWhitespace: 32870b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 32880b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 32890b57cec5SDimitry Andric return true; // KeepWhitespaceMode 32900b57cec5SDimitry Andric 32910b57cec5SDimitry Andric SkipIgnoredUnits: 32920b57cec5SDimitry Andric CurPtr = BufferPtr; 32930b57cec5SDimitry Andric 32940b57cec5SDimitry Andric // If the next token is obviously a // or /* */ comment, skip it efficiently 32950b57cec5SDimitry Andric // too (without going through the big switch stmt). 32960b57cec5SDimitry Andric if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 32970b57cec5SDimitry Andric LangOpts.LineComment && 32980b57cec5SDimitry Andric (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) { 32990b57cec5SDimitry Andric if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 33000b57cec5SDimitry Andric return true; // There is a token to return. 33010b57cec5SDimitry Andric goto SkipIgnoredUnits; 33020b57cec5SDimitry Andric } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 33030b57cec5SDimitry Andric if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 33040b57cec5SDimitry Andric return true; // There is a token to return. 33050b57cec5SDimitry Andric goto SkipIgnoredUnits; 33060b57cec5SDimitry Andric } else if (isHorizontalWhitespace(*CurPtr)) { 33070b57cec5SDimitry Andric goto SkipHorizontalWhitespace; 33080b57cec5SDimitry Andric } 33090b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 33100b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 33110b57cec5SDimitry Andric goto LexNextToken; 33120b57cec5SDimitry Andric 33130b57cec5SDimitry Andric // C99 6.4.4.1: Integer Constants. 33140b57cec5SDimitry Andric // C99 6.4.4.2: Floating Constants. 33150b57cec5SDimitry Andric case '0': case '1': case '2': case '3': case '4': 33160b57cec5SDimitry Andric case '5': case '6': case '7': case '8': case '9': 33170b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 33180b57cec5SDimitry Andric MIOpt.ReadToken(); 33190b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 33200b57cec5SDimitry Andric 33210b57cec5SDimitry Andric case 'u': // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal 33220b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 33230b57cec5SDimitry Andric MIOpt.ReadToken(); 33240b57cec5SDimitry Andric 33250b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) { 33260b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 33270b57cec5SDimitry Andric 33280b57cec5SDimitry Andric // UTF-16 string literal 33290b57cec5SDimitry Andric if (Char == '"') 33300b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 33310b57cec5SDimitry Andric tok::utf16_string_literal); 33320b57cec5SDimitry Andric 33330b57cec5SDimitry Andric // UTF-16 character constant 33340b57cec5SDimitry Andric if (Char == '\'') 33350b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 33360b57cec5SDimitry Andric tok::utf16_char_constant); 33370b57cec5SDimitry Andric 33380b57cec5SDimitry Andric // UTF-16 raw string literal 33390b57cec5SDimitry Andric if (Char == 'R' && LangOpts.CPlusPlus11 && 33400b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 33410b57cec5SDimitry Andric return LexRawStringLiteral(Result, 33420b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 33430b57cec5SDimitry Andric SizeTmp2, Result), 33440b57cec5SDimitry Andric tok::utf16_string_literal); 33450b57cec5SDimitry Andric 33460b57cec5SDimitry Andric if (Char == '8') { 33470b57cec5SDimitry Andric char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 33480b57cec5SDimitry Andric 33490b57cec5SDimitry Andric // UTF-8 string literal 33500b57cec5SDimitry Andric if (Char2 == '"') 33510b57cec5SDimitry Andric return LexStringLiteral(Result, 33520b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 33530b57cec5SDimitry Andric SizeTmp2, Result), 33540b57cec5SDimitry Andric tok::utf8_string_literal); 33550b57cec5SDimitry Andric if (Char2 == '\'' && LangOpts.CPlusPlus17) 33560b57cec5SDimitry Andric return LexCharConstant( 33570b57cec5SDimitry Andric Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 33580b57cec5SDimitry Andric SizeTmp2, Result), 33590b57cec5SDimitry Andric tok::utf8_char_constant); 33600b57cec5SDimitry Andric 33610b57cec5SDimitry Andric if (Char2 == 'R' && LangOpts.CPlusPlus11) { 33620b57cec5SDimitry Andric unsigned SizeTmp3; 33630b57cec5SDimitry Andric char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 33640b57cec5SDimitry Andric // UTF-8 raw string literal 33650b57cec5SDimitry Andric if (Char3 == '"') { 33660b57cec5SDimitry Andric return LexRawStringLiteral(Result, 33670b57cec5SDimitry Andric ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 33680b57cec5SDimitry Andric SizeTmp2, Result), 33690b57cec5SDimitry Andric SizeTmp3, Result), 33700b57cec5SDimitry Andric tok::utf8_string_literal); 33710b57cec5SDimitry Andric } 33720b57cec5SDimitry Andric } 33730b57cec5SDimitry Andric } 33740b57cec5SDimitry Andric } 33750b57cec5SDimitry Andric 33760b57cec5SDimitry Andric // treat u like the start of an identifier. 33770b57cec5SDimitry Andric return LexIdentifier(Result, CurPtr); 33780b57cec5SDimitry Andric 33790b57cec5SDimitry Andric case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal 33800b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 33810b57cec5SDimitry Andric MIOpt.ReadToken(); 33820b57cec5SDimitry Andric 33830b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) { 33840b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 33850b57cec5SDimitry Andric 33860b57cec5SDimitry Andric // UTF-32 string literal 33870b57cec5SDimitry Andric if (Char == '"') 33880b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 33890b57cec5SDimitry Andric tok::utf32_string_literal); 33900b57cec5SDimitry Andric 33910b57cec5SDimitry Andric // UTF-32 character constant 33920b57cec5SDimitry Andric if (Char == '\'') 33930b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 33940b57cec5SDimitry Andric tok::utf32_char_constant); 33950b57cec5SDimitry Andric 33960b57cec5SDimitry Andric // UTF-32 raw string literal 33970b57cec5SDimitry Andric if (Char == 'R' && LangOpts.CPlusPlus11 && 33980b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 33990b57cec5SDimitry Andric return LexRawStringLiteral(Result, 34000b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 34010b57cec5SDimitry Andric SizeTmp2, Result), 34020b57cec5SDimitry Andric tok::utf32_string_literal); 34030b57cec5SDimitry Andric } 34040b57cec5SDimitry Andric 34050b57cec5SDimitry Andric // treat U like the start of an identifier. 34060b57cec5SDimitry Andric return LexIdentifier(Result, CurPtr); 34070b57cec5SDimitry Andric 34080b57cec5SDimitry Andric case 'R': // Identifier or C++0x raw string literal 34090b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 34100b57cec5SDimitry Andric MIOpt.ReadToken(); 34110b57cec5SDimitry Andric 34120b57cec5SDimitry Andric if (LangOpts.CPlusPlus11) { 34130b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 34140b57cec5SDimitry Andric 34150b57cec5SDimitry Andric if (Char == '"') 34160b57cec5SDimitry Andric return LexRawStringLiteral(Result, 34170b57cec5SDimitry Andric ConsumeChar(CurPtr, SizeTmp, Result), 34180b57cec5SDimitry Andric tok::string_literal); 34190b57cec5SDimitry Andric } 34200b57cec5SDimitry Andric 34210b57cec5SDimitry Andric // treat R like the start of an identifier. 34220b57cec5SDimitry Andric return LexIdentifier(Result, CurPtr); 34230b57cec5SDimitry Andric 34240b57cec5SDimitry Andric case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 34250b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 34260b57cec5SDimitry Andric MIOpt.ReadToken(); 34270b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 34280b57cec5SDimitry Andric 34290b57cec5SDimitry Andric // Wide string literal. 34300b57cec5SDimitry Andric if (Char == '"') 34310b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 34320b57cec5SDimitry Andric tok::wide_string_literal); 34330b57cec5SDimitry Andric 34340b57cec5SDimitry Andric // Wide raw string literal. 34350b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 && Char == 'R' && 34360b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 34370b57cec5SDimitry Andric return LexRawStringLiteral(Result, 34380b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 34390b57cec5SDimitry Andric SizeTmp2, Result), 34400b57cec5SDimitry Andric tok::wide_string_literal); 34410b57cec5SDimitry Andric 34420b57cec5SDimitry Andric // Wide character constant. 34430b57cec5SDimitry Andric if (Char == '\'') 34440b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 34450b57cec5SDimitry Andric tok::wide_char_constant); 34460b57cec5SDimitry Andric // FALL THROUGH, treating L like the start of an identifier. 34470b57cec5SDimitry Andric LLVM_FALLTHROUGH; 34480b57cec5SDimitry Andric 34490b57cec5SDimitry Andric // C99 6.4.2: Identifiers. 34500b57cec5SDimitry Andric case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 34510b57cec5SDimitry Andric case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 34520b57cec5SDimitry Andric case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 34530b57cec5SDimitry Andric case 'V': case 'W': case 'X': case 'Y': case 'Z': 34540b57cec5SDimitry Andric case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 34550b57cec5SDimitry Andric case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 34560b57cec5SDimitry Andric case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 34570b57cec5SDimitry Andric case 'v': case 'w': case 'x': case 'y': case 'z': 34580b57cec5SDimitry Andric case '_': 34590b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 34600b57cec5SDimitry Andric MIOpt.ReadToken(); 34610b57cec5SDimitry Andric return LexIdentifier(Result, CurPtr); 34620b57cec5SDimitry Andric 34630b57cec5SDimitry Andric case '$': // $ in identifiers. 34640b57cec5SDimitry Andric if (LangOpts.DollarIdents) { 34650b57cec5SDimitry Andric if (!isLexingRawMode()) 34660b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_dollar_in_identifier); 34670b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 34680b57cec5SDimitry Andric MIOpt.ReadToken(); 34690b57cec5SDimitry Andric return LexIdentifier(Result, CurPtr); 34700b57cec5SDimitry Andric } 34710b57cec5SDimitry Andric 34720b57cec5SDimitry Andric Kind = tok::unknown; 34730b57cec5SDimitry Andric break; 34740b57cec5SDimitry Andric 34750b57cec5SDimitry Andric // C99 6.4.4: Character Constants. 34760b57cec5SDimitry Andric case '\'': 34770b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 34780b57cec5SDimitry Andric MIOpt.ReadToken(); 34790b57cec5SDimitry Andric return LexCharConstant(Result, CurPtr, tok::char_constant); 34800b57cec5SDimitry Andric 34810b57cec5SDimitry Andric // C99 6.4.5: String Literals. 34820b57cec5SDimitry Andric case '"': 34830b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 34840b57cec5SDimitry Andric MIOpt.ReadToken(); 34850b57cec5SDimitry Andric return LexStringLiteral(Result, CurPtr, 34860b57cec5SDimitry Andric ParsingFilename ? tok::header_name 34870b57cec5SDimitry Andric : tok::string_literal); 34880b57cec5SDimitry Andric 34890b57cec5SDimitry Andric // C99 6.4.6: Punctuators. 34900b57cec5SDimitry Andric case '?': 34910b57cec5SDimitry Andric Kind = tok::question; 34920b57cec5SDimitry Andric break; 34930b57cec5SDimitry Andric case '[': 34940b57cec5SDimitry Andric Kind = tok::l_square; 34950b57cec5SDimitry Andric break; 34960b57cec5SDimitry Andric case ']': 34970b57cec5SDimitry Andric Kind = tok::r_square; 34980b57cec5SDimitry Andric break; 34990b57cec5SDimitry Andric case '(': 35000b57cec5SDimitry Andric Kind = tok::l_paren; 35010b57cec5SDimitry Andric break; 35020b57cec5SDimitry Andric case ')': 35030b57cec5SDimitry Andric Kind = tok::r_paren; 35040b57cec5SDimitry Andric break; 35050b57cec5SDimitry Andric case '{': 35060b57cec5SDimitry Andric Kind = tok::l_brace; 35070b57cec5SDimitry Andric break; 35080b57cec5SDimitry Andric case '}': 35090b57cec5SDimitry Andric Kind = tok::r_brace; 35100b57cec5SDimitry Andric break; 35110b57cec5SDimitry Andric case '.': 35120b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 35130b57cec5SDimitry Andric if (Char >= '0' && Char <= '9') { 35140b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 35150b57cec5SDimitry Andric MIOpt.ReadToken(); 35160b57cec5SDimitry Andric 35170b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 35180b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus && Char == '*') { 35190b57cec5SDimitry Andric Kind = tok::periodstar; 35200b57cec5SDimitry Andric CurPtr += SizeTmp; 35210b57cec5SDimitry Andric } else if (Char == '.' && 35220b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 35230b57cec5SDimitry Andric Kind = tok::ellipsis; 35240b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 35250b57cec5SDimitry Andric SizeTmp2, Result); 35260b57cec5SDimitry Andric } else { 35270b57cec5SDimitry Andric Kind = tok::period; 35280b57cec5SDimitry Andric } 35290b57cec5SDimitry Andric break; 35300b57cec5SDimitry Andric case '&': 35310b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 35320b57cec5SDimitry Andric if (Char == '&') { 35330b57cec5SDimitry Andric Kind = tok::ampamp; 35340b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 35350b57cec5SDimitry Andric } else if (Char == '=') { 35360b57cec5SDimitry Andric Kind = tok::ampequal; 35370b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 35380b57cec5SDimitry Andric } else { 35390b57cec5SDimitry Andric Kind = tok::amp; 35400b57cec5SDimitry Andric } 35410b57cec5SDimitry Andric break; 35420b57cec5SDimitry Andric case '*': 35430b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') { 35440b57cec5SDimitry Andric Kind = tok::starequal; 35450b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 35460b57cec5SDimitry Andric } else { 35470b57cec5SDimitry Andric Kind = tok::star; 35480b57cec5SDimitry Andric } 35490b57cec5SDimitry Andric break; 35500b57cec5SDimitry Andric case '+': 35510b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 35520b57cec5SDimitry Andric if (Char == '+') { 35530b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 35540b57cec5SDimitry Andric Kind = tok::plusplus; 35550b57cec5SDimitry Andric } else if (Char == '=') { 35560b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 35570b57cec5SDimitry Andric Kind = tok::plusequal; 35580b57cec5SDimitry Andric } else { 35590b57cec5SDimitry Andric Kind = tok::plus; 35600b57cec5SDimitry Andric } 35610b57cec5SDimitry Andric break; 35620b57cec5SDimitry Andric case '-': 35630b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 35640b57cec5SDimitry Andric if (Char == '-') { // -- 35650b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 35660b57cec5SDimitry Andric Kind = tok::minusminus; 35670b57cec5SDimitry Andric } else if (Char == '>' && LangOpts.CPlusPlus && 35680b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 35690b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 35700b57cec5SDimitry Andric SizeTmp2, Result); 35710b57cec5SDimitry Andric Kind = tok::arrowstar; 35720b57cec5SDimitry Andric } else if (Char == '>') { // -> 35730b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 35740b57cec5SDimitry Andric Kind = tok::arrow; 35750b57cec5SDimitry Andric } else if (Char == '=') { // -= 35760b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 35770b57cec5SDimitry Andric Kind = tok::minusequal; 35780b57cec5SDimitry Andric } else { 35790b57cec5SDimitry Andric Kind = tok::minus; 35800b57cec5SDimitry Andric } 35810b57cec5SDimitry Andric break; 35820b57cec5SDimitry Andric case '~': 35830b57cec5SDimitry Andric Kind = tok::tilde; 35840b57cec5SDimitry Andric break; 35850b57cec5SDimitry Andric case '!': 35860b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') { 35870b57cec5SDimitry Andric Kind = tok::exclaimequal; 35880b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 35890b57cec5SDimitry Andric } else { 35900b57cec5SDimitry Andric Kind = tok::exclaim; 35910b57cec5SDimitry Andric } 35920b57cec5SDimitry Andric break; 35930b57cec5SDimitry Andric case '/': 35940b57cec5SDimitry Andric // 6.4.9: Comments 35950b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 35960b57cec5SDimitry Andric if (Char == '/') { // Line comment. 35970b57cec5SDimitry Andric // Even if Line comments are disabled (e.g. in C89 mode), we generally 35980b57cec5SDimitry Andric // want to lex this as a comment. There is one problem with this though, 35990b57cec5SDimitry Andric // that in one particular corner case, this can change the behavior of the 36000b57cec5SDimitry Andric // resultant program. For example, In "foo //**/ bar", C89 would lex 36010b57cec5SDimitry Andric // this as "foo / bar" and languages with Line comments would lex it as 36020b57cec5SDimitry Andric // "foo". Check to see if the character after the second slash is a '*'. 36030b57cec5SDimitry Andric // If so, we will lex that as a "/" instead of the start of a comment. 36040b57cec5SDimitry Andric // However, we never do this if we are just preprocessing. 36050b57cec5SDimitry Andric bool TreatAsComment = LangOpts.LineComment && 36060b57cec5SDimitry Andric (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP); 36070b57cec5SDimitry Andric if (!TreatAsComment) 36080b57cec5SDimitry Andric if (!(PP && PP->isPreprocessedOutput())) 36090b57cec5SDimitry Andric TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*'; 36100b57cec5SDimitry Andric 36110b57cec5SDimitry Andric if (TreatAsComment) { 36120b57cec5SDimitry Andric if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 36130b57cec5SDimitry Andric TokAtPhysicalStartOfLine)) 36140b57cec5SDimitry Andric return true; // There is a token to return. 36150b57cec5SDimitry Andric 36160b57cec5SDimitry Andric // It is common for the tokens immediately after a // comment to be 36170b57cec5SDimitry Andric // whitespace (indentation for the next line). Instead of going through 36180b57cec5SDimitry Andric // the big switch, handle it efficiently now. 36190b57cec5SDimitry Andric goto SkipIgnoredUnits; 36200b57cec5SDimitry Andric } 36210b57cec5SDimitry Andric } 36220b57cec5SDimitry Andric 36230b57cec5SDimitry Andric if (Char == '*') { // /**/ comment. 36240b57cec5SDimitry Andric if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 36250b57cec5SDimitry Andric TokAtPhysicalStartOfLine)) 36260b57cec5SDimitry Andric return true; // There is a token to return. 36270b57cec5SDimitry Andric 36280b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 36290b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 36300b57cec5SDimitry Andric goto LexNextToken; 36310b57cec5SDimitry Andric } 36320b57cec5SDimitry Andric 36330b57cec5SDimitry Andric if (Char == '=') { 36340b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 36350b57cec5SDimitry Andric Kind = tok::slashequal; 36360b57cec5SDimitry Andric } else { 36370b57cec5SDimitry Andric Kind = tok::slash; 36380b57cec5SDimitry Andric } 36390b57cec5SDimitry Andric break; 36400b57cec5SDimitry Andric case '%': 36410b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 36420b57cec5SDimitry Andric if (Char == '=') { 36430b57cec5SDimitry Andric Kind = tok::percentequal; 36440b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 36450b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '>') { 36460b57cec5SDimitry Andric Kind = tok::r_brace; // '%>' -> '}' 36470b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 36480b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') { 36490b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 36500b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 36510b57cec5SDimitry Andric if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 36520b57cec5SDimitry Andric Kind = tok::hashhash; // '%:%:' -> '##' 36530b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 36540b57cec5SDimitry Andric SizeTmp2, Result); 36550b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize 36560b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 36570b57cec5SDimitry Andric if (!isLexingRawMode()) 36580b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft); 36590b57cec5SDimitry Andric Kind = tok::hashat; 36600b57cec5SDimitry Andric } else { // '%:' -> '#' 36610b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line, 36620b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to 36630b57cec5SDimitry Andric // the preprocessor to handle it. 36640b57cec5SDimitry Andric // TODO: -fpreprocessed mode?? 36650b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 36660b57cec5SDimitry Andric goto HandleDirective; 36670b57cec5SDimitry Andric 36680b57cec5SDimitry Andric Kind = tok::hash; 36690b57cec5SDimitry Andric } 36700b57cec5SDimitry Andric } else { 36710b57cec5SDimitry Andric Kind = tok::percent; 36720b57cec5SDimitry Andric } 36730b57cec5SDimitry Andric break; 36740b57cec5SDimitry Andric case '<': 36750b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 36760b57cec5SDimitry Andric if (ParsingFilename) { 36770b57cec5SDimitry Andric return LexAngledStringLiteral(Result, CurPtr); 36780b57cec5SDimitry Andric } else if (Char == '<') { 36790b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 36800b57cec5SDimitry Andric if (After == '=') { 36810b57cec5SDimitry Andric Kind = tok::lesslessequal; 36820b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 36830b57cec5SDimitry Andric SizeTmp2, Result); 36840b57cec5SDimitry Andric } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 36850b57cec5SDimitry Andric // If this is actually a '<<<<<<<' version control conflict marker, 36860b57cec5SDimitry Andric // recognize it as such and recover nicely. 36870b57cec5SDimitry Andric goto LexNextToken; 36880b57cec5SDimitry Andric } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 36890b57cec5SDimitry Andric // If this is '<<<<' and we're in a Perforce-style conflict marker, 36900b57cec5SDimitry Andric // ignore it. 36910b57cec5SDimitry Andric goto LexNextToken; 36920b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '<') { 36930b57cec5SDimitry Andric Kind = tok::lesslessless; 36940b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 36950b57cec5SDimitry Andric SizeTmp2, Result); 36960b57cec5SDimitry Andric } else { 36970b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 36980b57cec5SDimitry Andric Kind = tok::lessless; 36990b57cec5SDimitry Andric } 37000b57cec5SDimitry Andric } else if (Char == '=') { 37010b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 37020b57cec5SDimitry Andric if (After == '>') { 3703*5ffd83dbSDimitry Andric if (getLangOpts().CPlusPlus20) { 37040b57cec5SDimitry Andric if (!isLexingRawMode()) 37050b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx17_compat_spaceship); 37060b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 37070b57cec5SDimitry Andric SizeTmp2, Result); 37080b57cec5SDimitry Andric Kind = tok::spaceship; 37090b57cec5SDimitry Andric break; 37100b57cec5SDimitry Andric } 37110b57cec5SDimitry Andric // Suggest adding a space between the '<=' and the '>' to avoid a 37120b57cec5SDimitry Andric // change in semantics if this turns up in C++ <=17 mode. 37130b57cec5SDimitry Andric if (getLangOpts().CPlusPlus && !isLexingRawMode()) { 3714*5ffd83dbSDimitry Andric Diag(BufferPtr, diag::warn_cxx20_compat_spaceship) 37150b57cec5SDimitry Andric << FixItHint::CreateInsertion( 37160b57cec5SDimitry Andric getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " "); 37170b57cec5SDimitry Andric } 37180b57cec5SDimitry Andric } 37190b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37200b57cec5SDimitry Andric Kind = tok::lessequal; 37210b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' 37220b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 && 37230b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 37240b57cec5SDimitry Andric // C++0x [lex.pptoken]p3: 37250b57cec5SDimitry Andric // Otherwise, if the next three characters are <:: and the subsequent 37260b57cec5SDimitry Andric // character is neither : nor >, the < is treated as a preprocessor 37270b57cec5SDimitry Andric // token by itself and not as the first character of the alternative 37280b57cec5SDimitry Andric // token <:. 37290b57cec5SDimitry Andric unsigned SizeTmp3; 37300b57cec5SDimitry Andric char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 37310b57cec5SDimitry Andric if (After != ':' && After != '>') { 37320b57cec5SDimitry Andric Kind = tok::less; 37330b57cec5SDimitry Andric if (!isLexingRawMode()) 37340b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 37350b57cec5SDimitry Andric break; 37360b57cec5SDimitry Andric } 37370b57cec5SDimitry Andric } 37380b57cec5SDimitry Andric 37390b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37400b57cec5SDimitry Andric Kind = tok::l_square; 37410b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' 37420b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37430b57cec5SDimitry Andric Kind = tok::l_brace; 37440b57cec5SDimitry Andric } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 && 37450b57cec5SDimitry Andric lexEditorPlaceholder(Result, CurPtr)) { 37460b57cec5SDimitry Andric return true; 37470b57cec5SDimitry Andric } else { 37480b57cec5SDimitry Andric Kind = tok::less; 37490b57cec5SDimitry Andric } 37500b57cec5SDimitry Andric break; 37510b57cec5SDimitry Andric case '>': 37520b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 37530b57cec5SDimitry Andric if (Char == '=') { 37540b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37550b57cec5SDimitry Andric Kind = tok::greaterequal; 37560b57cec5SDimitry Andric } else if (Char == '>') { 37570b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 37580b57cec5SDimitry Andric if (After == '=') { 37590b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 37600b57cec5SDimitry Andric SizeTmp2, Result); 37610b57cec5SDimitry Andric Kind = tok::greatergreaterequal; 37620b57cec5SDimitry Andric } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 37630b57cec5SDimitry Andric // If this is actually a '>>>>' conflict marker, recognize it as such 37640b57cec5SDimitry Andric // and recover nicely. 37650b57cec5SDimitry Andric goto LexNextToken; 37660b57cec5SDimitry Andric } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 37670b57cec5SDimitry Andric // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 37680b57cec5SDimitry Andric goto LexNextToken; 37690b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '>') { 37700b57cec5SDimitry Andric Kind = tok::greatergreatergreater; 37710b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 37720b57cec5SDimitry Andric SizeTmp2, Result); 37730b57cec5SDimitry Andric } else { 37740b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37750b57cec5SDimitry Andric Kind = tok::greatergreater; 37760b57cec5SDimitry Andric } 37770b57cec5SDimitry Andric } else { 37780b57cec5SDimitry Andric Kind = tok::greater; 37790b57cec5SDimitry Andric } 37800b57cec5SDimitry Andric break; 37810b57cec5SDimitry Andric case '^': 37820b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 37830b57cec5SDimitry Andric if (Char == '=') { 37840b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37850b57cec5SDimitry Andric Kind = tok::caretequal; 37860b57cec5SDimitry Andric } else if (LangOpts.OpenCL && Char == '^') { 37870b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37880b57cec5SDimitry Andric Kind = tok::caretcaret; 37890b57cec5SDimitry Andric } else { 37900b57cec5SDimitry Andric Kind = tok::caret; 37910b57cec5SDimitry Andric } 37920b57cec5SDimitry Andric break; 37930b57cec5SDimitry Andric case '|': 37940b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 37950b57cec5SDimitry Andric if (Char == '=') { 37960b57cec5SDimitry Andric Kind = tok::pipeequal; 37970b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37980b57cec5SDimitry Andric } else if (Char == '|') { 37990b57cec5SDimitry Andric // If this is '|||||||' and we're in a conflict marker, ignore it. 38000b57cec5SDimitry Andric if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 38010b57cec5SDimitry Andric goto LexNextToken; 38020b57cec5SDimitry Andric Kind = tok::pipepipe; 38030b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38040b57cec5SDimitry Andric } else { 38050b57cec5SDimitry Andric Kind = tok::pipe; 38060b57cec5SDimitry Andric } 38070b57cec5SDimitry Andric break; 38080b57cec5SDimitry Andric case ':': 38090b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 38100b57cec5SDimitry Andric if (LangOpts.Digraphs && Char == '>') { 38110b57cec5SDimitry Andric Kind = tok::r_square; // ':>' -> ']' 38120b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38130b57cec5SDimitry Andric } else if ((LangOpts.CPlusPlus || 38140b57cec5SDimitry Andric LangOpts.DoubleSquareBracketAttributes) && 38150b57cec5SDimitry Andric Char == ':') { 38160b57cec5SDimitry Andric Kind = tok::coloncolon; 38170b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38180b57cec5SDimitry Andric } else { 38190b57cec5SDimitry Andric Kind = tok::colon; 38200b57cec5SDimitry Andric } 38210b57cec5SDimitry Andric break; 38220b57cec5SDimitry Andric case ';': 38230b57cec5SDimitry Andric Kind = tok::semi; 38240b57cec5SDimitry Andric break; 38250b57cec5SDimitry Andric case '=': 38260b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 38270b57cec5SDimitry Andric if (Char == '=') { 38280b57cec5SDimitry Andric // If this is '====' and we're in a conflict marker, ignore it. 38290b57cec5SDimitry Andric if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 38300b57cec5SDimitry Andric goto LexNextToken; 38310b57cec5SDimitry Andric 38320b57cec5SDimitry Andric Kind = tok::equalequal; 38330b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38340b57cec5SDimitry Andric } else { 38350b57cec5SDimitry Andric Kind = tok::equal; 38360b57cec5SDimitry Andric } 38370b57cec5SDimitry Andric break; 38380b57cec5SDimitry Andric case ',': 38390b57cec5SDimitry Andric Kind = tok::comma; 38400b57cec5SDimitry Andric break; 38410b57cec5SDimitry Andric case '#': 38420b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 38430b57cec5SDimitry Andric if (Char == '#') { 38440b57cec5SDimitry Andric Kind = tok::hashhash; 38450b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38460b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize 38470b57cec5SDimitry Andric Kind = tok::hashat; 38480b57cec5SDimitry Andric if (!isLexingRawMode()) 38490b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft); 38500b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38510b57cec5SDimitry Andric } else { 38520b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line, 38530b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to 38540b57cec5SDimitry Andric // the preprocessor to handle it. 38550b57cec5SDimitry Andric // TODO: -fpreprocessed mode?? 38560b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 38570b57cec5SDimitry Andric goto HandleDirective; 38580b57cec5SDimitry Andric 38590b57cec5SDimitry Andric Kind = tok::hash; 38600b57cec5SDimitry Andric } 38610b57cec5SDimitry Andric break; 38620b57cec5SDimitry Andric 38630b57cec5SDimitry Andric case '@': 38640b57cec5SDimitry Andric // Objective C support. 38650b57cec5SDimitry Andric if (CurPtr[-1] == '@' && LangOpts.ObjC) 38660b57cec5SDimitry Andric Kind = tok::at; 38670b57cec5SDimitry Andric else 38680b57cec5SDimitry Andric Kind = tok::unknown; 38690b57cec5SDimitry Andric break; 38700b57cec5SDimitry Andric 38710b57cec5SDimitry Andric // UCNs (C99 6.4.3, C++11 [lex.charset]p2) 38720b57cec5SDimitry Andric case '\\': 38730b57cec5SDimitry Andric if (!LangOpts.AsmPreprocessor) { 38740b57cec5SDimitry Andric if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) { 38750b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 38760b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 38770b57cec5SDimitry Andric return true; // KeepWhitespaceMode 38780b57cec5SDimitry Andric 38790b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 38800b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 38810b57cec5SDimitry Andric goto LexNextToken; 38820b57cec5SDimitry Andric } 38830b57cec5SDimitry Andric 38840b57cec5SDimitry Andric return LexUnicode(Result, CodePoint, CurPtr); 38850b57cec5SDimitry Andric } 38860b57cec5SDimitry Andric } 38870b57cec5SDimitry Andric 38880b57cec5SDimitry Andric Kind = tok::unknown; 38890b57cec5SDimitry Andric break; 38900b57cec5SDimitry Andric 38910b57cec5SDimitry Andric default: { 38920b57cec5SDimitry Andric if (isASCII(Char)) { 38930b57cec5SDimitry Andric Kind = tok::unknown; 38940b57cec5SDimitry Andric break; 38950b57cec5SDimitry Andric } 38960b57cec5SDimitry Andric 38970b57cec5SDimitry Andric llvm::UTF32 CodePoint; 38980b57cec5SDimitry Andric 38990b57cec5SDimitry Andric // We can't just reset CurPtr to BufferPtr because BufferPtr may point to 39000b57cec5SDimitry Andric // an escaped newline. 39010b57cec5SDimitry Andric --CurPtr; 39020b57cec5SDimitry Andric llvm::ConversionResult Status = 39030b57cec5SDimitry Andric llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, 39040b57cec5SDimitry Andric (const llvm::UTF8 *)BufferEnd, 39050b57cec5SDimitry Andric &CodePoint, 39060b57cec5SDimitry Andric llvm::strictConversion); 39070b57cec5SDimitry Andric if (Status == llvm::conversionOK) { 39080b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 39090b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 39100b57cec5SDimitry Andric return true; // KeepWhitespaceMode 39110b57cec5SDimitry Andric 39120b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 39130b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 39140b57cec5SDimitry Andric goto LexNextToken; 39150b57cec5SDimitry Andric } 39160b57cec5SDimitry Andric return LexUnicode(Result, CodePoint, CurPtr); 39170b57cec5SDimitry Andric } 39180b57cec5SDimitry Andric 39190b57cec5SDimitry Andric if (isLexingRawMode() || ParsingPreprocessorDirective || 39200b57cec5SDimitry Andric PP->isPreprocessedOutput()) { 39210b57cec5SDimitry Andric ++CurPtr; 39220b57cec5SDimitry Andric Kind = tok::unknown; 39230b57cec5SDimitry Andric break; 39240b57cec5SDimitry Andric } 39250b57cec5SDimitry Andric 39260b57cec5SDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally. 39270b57cec5SDimitry Andric // Instead of letting the parser complain about the unknown token, 39280b57cec5SDimitry Andric // just diagnose the invalid UTF-8, then drop the character. 39290b57cec5SDimitry Andric Diag(CurPtr, diag::err_invalid_utf8); 39300b57cec5SDimitry Andric 39310b57cec5SDimitry Andric BufferPtr = CurPtr+1; 39320b57cec5SDimitry Andric // We're pretending the character didn't exist, so just try again with 39330b57cec5SDimitry Andric // this lexer. 39340b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 39350b57cec5SDimitry Andric goto LexNextToken; 39360b57cec5SDimitry Andric } 39370b57cec5SDimitry Andric } 39380b57cec5SDimitry Andric 39390b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 39400b57cec5SDimitry Andric MIOpt.ReadToken(); 39410b57cec5SDimitry Andric 39420b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 39430b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 39440b57cec5SDimitry Andric return true; 39450b57cec5SDimitry Andric 39460b57cec5SDimitry Andric HandleDirective: 39470b57cec5SDimitry Andric // We parsed a # character and it's the start of a preprocessing directive. 39480b57cec5SDimitry Andric 39490b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::hash); 39500b57cec5SDimitry Andric PP->HandleDirective(Result); 39510b57cec5SDimitry Andric 39520b57cec5SDimitry Andric if (PP->hadModuleLoaderFatalFailure()) { 39530b57cec5SDimitry Andric // With a fatal failure in the module loader, we abort parsing. 39540b57cec5SDimitry Andric assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof"); 39550b57cec5SDimitry Andric return true; 39560b57cec5SDimitry Andric } 39570b57cec5SDimitry Andric 39580b57cec5SDimitry Andric // We parsed the directive; lex a token with the new state. 39590b57cec5SDimitry Andric return false; 39600b57cec5SDimitry Andric } 3961