10b57cec5SDimitry Andric //===- Lexer.cpp - C Language Family Lexer --------------------------------===// 20b57cec5SDimitry Andric // 30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 60b57cec5SDimitry Andric // 70b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 80b57cec5SDimitry Andric // 90b57cec5SDimitry Andric // This file implements the Lexer and Token interfaces. 100b57cec5SDimitry Andric // 110b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 120b57cec5SDimitry Andric 130b57cec5SDimitry Andric #include "clang/Lex/Lexer.h" 140b57cec5SDimitry Andric #include "UnicodeCharSets.h" 150b57cec5SDimitry Andric #include "clang/Basic/CharInfo.h" 16e8d8bef9SDimitry Andric #include "clang/Basic/Diagnostic.h" 170b57cec5SDimitry Andric #include "clang/Basic/IdentifierTable.h" 18e8d8bef9SDimitry Andric #include "clang/Basic/LLVM.h" 190b57cec5SDimitry Andric #include "clang/Basic/LangOptions.h" 200b57cec5SDimitry Andric #include "clang/Basic/SourceLocation.h" 210b57cec5SDimitry Andric #include "clang/Basic/SourceManager.h" 220b57cec5SDimitry Andric #include "clang/Basic/TokenKinds.h" 230b57cec5SDimitry Andric #include "clang/Lex/LexDiagnostic.h" 240b57cec5SDimitry Andric #include "clang/Lex/LiteralSupport.h" 250b57cec5SDimitry Andric #include "clang/Lex/MultipleIncludeOpt.h" 260b57cec5SDimitry Andric #include "clang/Lex/Preprocessor.h" 270b57cec5SDimitry Andric #include "clang/Lex/PreprocessorOptions.h" 280b57cec5SDimitry Andric #include "clang/Lex/Token.h" 290b57cec5SDimitry Andric #include "llvm/ADT/None.h" 300b57cec5SDimitry Andric #include "llvm/ADT/Optional.h" 315ffd83dbSDimitry Andric #include "llvm/ADT/STLExtras.h" 320b57cec5SDimitry Andric #include "llvm/ADT/StringExtras.h" 330b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h" 34e8d8bef9SDimitry Andric #include "llvm/ADT/StringSwitch.h" 350b57cec5SDimitry Andric #include "llvm/Support/Compiler.h" 360b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h" 370b57cec5SDimitry Andric #include "llvm/Support/MathExtras.h" 38e8d8bef9SDimitry Andric #include "llvm/Support/MemoryBufferRef.h" 390b57cec5SDimitry Andric #include "llvm/Support/NativeFormatting.h" 40*81ad6265SDimitry Andric #include "llvm/Support/Unicode.h" 410b57cec5SDimitry Andric #include "llvm/Support/UnicodeCharRanges.h" 420b57cec5SDimitry Andric #include <algorithm> 430b57cec5SDimitry Andric #include <cassert> 440b57cec5SDimitry Andric #include <cstddef> 450b57cec5SDimitry Andric #include <cstdint> 460b57cec5SDimitry Andric #include <cstring> 470b57cec5SDimitry Andric #include <string> 480b57cec5SDimitry Andric #include <tuple> 490b57cec5SDimitry Andric #include <utility> 500b57cec5SDimitry Andric 510b57cec5SDimitry Andric using namespace clang; 520b57cec5SDimitry Andric 530b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 540b57cec5SDimitry Andric // Token Class Implementation 550b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 560b57cec5SDimitry Andric 570b57cec5SDimitry Andric /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 580b57cec5SDimitry Andric bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 590b57cec5SDimitry Andric if (isAnnotation()) 600b57cec5SDimitry Andric return false; 610b57cec5SDimitry Andric if (IdentifierInfo *II = getIdentifierInfo()) 620b57cec5SDimitry Andric return II->getObjCKeywordID() == objcKey; 630b57cec5SDimitry Andric return false; 640b57cec5SDimitry Andric } 650b57cec5SDimitry Andric 660b57cec5SDimitry Andric /// getObjCKeywordID - Return the ObjC keyword kind. 670b57cec5SDimitry Andric tok::ObjCKeywordKind Token::getObjCKeywordID() const { 680b57cec5SDimitry Andric if (isAnnotation()) 690b57cec5SDimitry Andric return tok::objc_not_keyword; 700b57cec5SDimitry Andric IdentifierInfo *specId = getIdentifierInfo(); 710b57cec5SDimitry Andric return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 720b57cec5SDimitry Andric } 730b57cec5SDimitry Andric 740b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 750b57cec5SDimitry Andric // Lexer Class Implementation 760b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 770b57cec5SDimitry Andric 780b57cec5SDimitry Andric void Lexer::anchor() {} 790b57cec5SDimitry Andric 800b57cec5SDimitry Andric void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 810b57cec5SDimitry Andric const char *BufEnd) { 820b57cec5SDimitry Andric BufferStart = BufStart; 830b57cec5SDimitry Andric BufferPtr = BufPtr; 840b57cec5SDimitry Andric BufferEnd = BufEnd; 850b57cec5SDimitry Andric 860b57cec5SDimitry Andric assert(BufEnd[0] == 0 && 870b57cec5SDimitry Andric "We assume that the input buffer has a null character at the end" 880b57cec5SDimitry Andric " to simplify lexing!"); 890b57cec5SDimitry Andric 900b57cec5SDimitry Andric // Check whether we have a BOM in the beginning of the buffer. If yes - act 910b57cec5SDimitry Andric // accordingly. Right now we support only UTF-8 with and without BOM, so, just 920b57cec5SDimitry Andric // skip the UTF-8 BOM if it's present. 930b57cec5SDimitry Andric if (BufferStart == BufferPtr) { 940b57cec5SDimitry Andric // Determine the size of the BOM. 950b57cec5SDimitry Andric StringRef Buf(BufferStart, BufferEnd - BufferStart); 960b57cec5SDimitry Andric size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 970b57cec5SDimitry Andric .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 980b57cec5SDimitry Andric .Default(0); 990b57cec5SDimitry Andric 1000b57cec5SDimitry Andric // Skip the BOM. 1010b57cec5SDimitry Andric BufferPtr += BOMLength; 1020b57cec5SDimitry Andric } 1030b57cec5SDimitry Andric 1040b57cec5SDimitry Andric Is_PragmaLexer = false; 1050b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None; 1060b57cec5SDimitry Andric 1070b57cec5SDimitry Andric // Start of the file is a start of line. 1080b57cec5SDimitry Andric IsAtStartOfLine = true; 1090b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 1100b57cec5SDimitry Andric 1110b57cec5SDimitry Andric HasLeadingSpace = false; 1120b57cec5SDimitry Andric HasLeadingEmptyMacro = false; 1130b57cec5SDimitry Andric 1140b57cec5SDimitry Andric // We are not after parsing a #. 1150b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 1160b57cec5SDimitry Andric 1170b57cec5SDimitry Andric // We are not after parsing #include. 1180b57cec5SDimitry Andric ParsingFilename = false; 1190b57cec5SDimitry Andric 1200b57cec5SDimitry Andric // We are not in raw mode. Raw mode disables diagnostics and interpretation 1210b57cec5SDimitry Andric // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 1220b57cec5SDimitry Andric // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 1230b57cec5SDimitry Andric // or otherwise skipping over tokens. 1240b57cec5SDimitry Andric LexingRawMode = false; 1250b57cec5SDimitry Andric 1260b57cec5SDimitry Andric // Default to not keeping comments. 1270b57cec5SDimitry Andric ExtendedTokenMode = 0; 128e8d8bef9SDimitry Andric 129e8d8bef9SDimitry Andric NewLinePtr = nullptr; 1300b57cec5SDimitry Andric } 1310b57cec5SDimitry Andric 1320b57cec5SDimitry Andric /// Lexer constructor - Create a new lexer object for the specified buffer 1330b57cec5SDimitry Andric /// with the specified preprocessor managing the lexing process. This lexer 1340b57cec5SDimitry Andric /// assumes that the associated file buffer and Preprocessor objects will 1350b57cec5SDimitry Andric /// outlive it, so it doesn't take ownership of either of them. 136e8d8bef9SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, 137349cc55cSDimitry Andric Preprocessor &PP, bool IsFirstIncludeOfFile) 1380b57cec5SDimitry Andric : PreprocessorLexer(&PP, FID), 1390b57cec5SDimitry Andric FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 140*81ad6265SDimitry Andric LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment), 141*81ad6265SDimitry Andric IsFirstTimeLexingFile(IsFirstIncludeOfFile) { 142e8d8bef9SDimitry Andric InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(), 143e8d8bef9SDimitry Andric InputFile.getBufferEnd()); 1440b57cec5SDimitry Andric 1450b57cec5SDimitry Andric resetExtendedTokenMode(); 1460b57cec5SDimitry Andric } 1470b57cec5SDimitry Andric 1480b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only 1490b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 1500b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it. 1510b57cec5SDimitry Andric Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, 152349cc55cSDimitry Andric const char *BufStart, const char *BufPtr, const char *BufEnd, 153349cc55cSDimitry Andric bool IsFirstIncludeOfFile) 154*81ad6265SDimitry Andric : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment), 155349cc55cSDimitry Andric IsFirstTimeLexingFile(IsFirstIncludeOfFile) { 1560b57cec5SDimitry Andric InitLexer(BufStart, BufPtr, BufEnd); 1570b57cec5SDimitry Andric 1580b57cec5SDimitry Andric // We *are* in raw mode. 1590b57cec5SDimitry Andric LexingRawMode = true; 1600b57cec5SDimitry Andric } 1610b57cec5SDimitry Andric 1620b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only 1630b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 1640b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it. 165e8d8bef9SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile, 166349cc55cSDimitry Andric const SourceManager &SM, const LangOptions &langOpts, 167349cc55cSDimitry Andric bool IsFirstIncludeOfFile) 168e8d8bef9SDimitry Andric : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(), 169349cc55cSDimitry Andric FromFile.getBufferStart(), FromFile.getBufferEnd(), 170349cc55cSDimitry Andric IsFirstIncludeOfFile) {} 1710b57cec5SDimitry Andric 1720b57cec5SDimitry Andric void Lexer::resetExtendedTokenMode() { 1730b57cec5SDimitry Andric assert(PP && "Cannot reset token mode without a preprocessor"); 1740b57cec5SDimitry Andric if (LangOpts.TraditionalCPP) 1750b57cec5SDimitry Andric SetKeepWhitespaceMode(true); 1760b57cec5SDimitry Andric else 1770b57cec5SDimitry Andric SetCommentRetentionState(PP->getCommentRetentionState()); 1780b57cec5SDimitry Andric } 1790b57cec5SDimitry Andric 1800b57cec5SDimitry Andric /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 1810b57cec5SDimitry Andric /// _Pragma expansion. This has a variety of magic semantics that this method 1820b57cec5SDimitry Andric /// sets up. It returns a new'd Lexer that must be delete'd when done. 1830b57cec5SDimitry Andric /// 1840b57cec5SDimitry Andric /// On entrance to this routine, TokStartLoc is a macro location which has a 1850b57cec5SDimitry Andric /// spelling loc that indicates the bytes to be lexed for the token and an 1860b57cec5SDimitry Andric /// expansion location that indicates where all lexed tokens should be 1870b57cec5SDimitry Andric /// "expanded from". 1880b57cec5SDimitry Andric /// 1890b57cec5SDimitry Andric /// TODO: It would really be nice to make _Pragma just be a wrapper around a 1900b57cec5SDimitry Andric /// normal lexer that remaps tokens as they fly by. This would require making 1910b57cec5SDimitry Andric /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 1920b57cec5SDimitry Andric /// interface that could handle this stuff. This would pull GetMappedTokenLoc 1930b57cec5SDimitry Andric /// out of the critical path of the lexer! 1940b57cec5SDimitry Andric /// 1950b57cec5SDimitry Andric Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 1960b57cec5SDimitry Andric SourceLocation ExpansionLocStart, 1970b57cec5SDimitry Andric SourceLocation ExpansionLocEnd, 1980b57cec5SDimitry Andric unsigned TokLen, Preprocessor &PP) { 1990b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager(); 2000b57cec5SDimitry Andric 2010b57cec5SDimitry Andric // Create the lexer as if we were going to lex the file normally. 2020b57cec5SDimitry Andric FileID SpellingFID = SM.getFileID(SpellingLoc); 203e8d8bef9SDimitry Andric llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID); 2040b57cec5SDimitry Andric Lexer *L = new Lexer(SpellingFID, InputFile, PP); 2050b57cec5SDimitry Andric 2060b57cec5SDimitry Andric // Now that the lexer is created, change the start/end locations so that we 2070b57cec5SDimitry Andric // just lex the subsection of the file that we want. This is lexing from a 2080b57cec5SDimitry Andric // scratch buffer. 2090b57cec5SDimitry Andric const char *StrData = SM.getCharacterData(SpellingLoc); 2100b57cec5SDimitry Andric 2110b57cec5SDimitry Andric L->BufferPtr = StrData; 2120b57cec5SDimitry Andric L->BufferEnd = StrData+TokLen; 2130b57cec5SDimitry Andric assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 2140b57cec5SDimitry Andric 2150b57cec5SDimitry Andric // Set the SourceLocation with the remapping information. This ensures that 2160b57cec5SDimitry Andric // GetMappedTokenLoc will remap the tokens as they are lexed. 2170b57cec5SDimitry Andric L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 2180b57cec5SDimitry Andric ExpansionLocStart, 2190b57cec5SDimitry Andric ExpansionLocEnd, TokLen); 2200b57cec5SDimitry Andric 2210b57cec5SDimitry Andric // Ensure that the lexer thinks it is inside a directive, so that end \n will 2220b57cec5SDimitry Andric // return an EOD token. 2230b57cec5SDimitry Andric L->ParsingPreprocessorDirective = true; 2240b57cec5SDimitry Andric 2250b57cec5SDimitry Andric // This lexer really is for _Pragma. 2260b57cec5SDimitry Andric L->Is_PragmaLexer = true; 2270b57cec5SDimitry Andric return L; 2280b57cec5SDimitry Andric } 2290b57cec5SDimitry Andric 230*81ad6265SDimitry Andric void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) { 231*81ad6265SDimitry Andric this->IsAtPhysicalStartOfLine = IsAtStartOfLine; 232*81ad6265SDimitry Andric this->IsAtStartOfLine = IsAtStartOfLine; 233*81ad6265SDimitry Andric assert((BufferStart + Offset) <= BufferEnd); 234*81ad6265SDimitry Andric BufferPtr = BufferStart + Offset; 235a7dea167SDimitry Andric } 236a7dea167SDimitry Andric 2370b57cec5SDimitry Andric template <typename T> static void StringifyImpl(T &Str, char Quote) { 2380b57cec5SDimitry Andric typename T::size_type i = 0, e = Str.size(); 2390b57cec5SDimitry Andric while (i < e) { 2400b57cec5SDimitry Andric if (Str[i] == '\\' || Str[i] == Quote) { 2410b57cec5SDimitry Andric Str.insert(Str.begin() + i, '\\'); 2420b57cec5SDimitry Andric i += 2; 2430b57cec5SDimitry Andric ++e; 2440b57cec5SDimitry Andric } else if (Str[i] == '\n' || Str[i] == '\r') { 2450b57cec5SDimitry Andric // Replace '\r\n' and '\n\r' to '\\' followed by 'n'. 2460b57cec5SDimitry Andric if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && 2470b57cec5SDimitry Andric Str[i] != Str[i + 1]) { 2480b57cec5SDimitry Andric Str[i] = '\\'; 2490b57cec5SDimitry Andric Str[i + 1] = 'n'; 2500b57cec5SDimitry Andric } else { 2510b57cec5SDimitry Andric // Replace '\n' and '\r' to '\\' followed by 'n'. 2520b57cec5SDimitry Andric Str[i] = '\\'; 2530b57cec5SDimitry Andric Str.insert(Str.begin() + i + 1, 'n'); 2540b57cec5SDimitry Andric ++e; 2550b57cec5SDimitry Andric } 2560b57cec5SDimitry Andric i += 2; 2570b57cec5SDimitry Andric } else 2580b57cec5SDimitry Andric ++i; 2590b57cec5SDimitry Andric } 2600b57cec5SDimitry Andric } 2610b57cec5SDimitry Andric 2620b57cec5SDimitry Andric std::string Lexer::Stringify(StringRef Str, bool Charify) { 2635ffd83dbSDimitry Andric std::string Result = std::string(Str); 2640b57cec5SDimitry Andric char Quote = Charify ? '\'' : '"'; 2650b57cec5SDimitry Andric StringifyImpl(Result, Quote); 2660b57cec5SDimitry Andric return Result; 2670b57cec5SDimitry Andric } 2680b57cec5SDimitry Andric 2690b57cec5SDimitry Andric void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); } 2700b57cec5SDimitry Andric 2710b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 2720b57cec5SDimitry Andric // Token Spelling 2730b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 2740b57cec5SDimitry Andric 2750b57cec5SDimitry Andric /// Slow case of getSpelling. Extract the characters comprising the 2760b57cec5SDimitry Andric /// spelling of this token from the provided input buffer. 2770b57cec5SDimitry Andric static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, 2780b57cec5SDimitry Andric const LangOptions &LangOpts, char *Spelling) { 2790b57cec5SDimitry Andric assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); 2800b57cec5SDimitry Andric 2810b57cec5SDimitry Andric size_t Length = 0; 2820b57cec5SDimitry Andric const char *BufEnd = BufPtr + Tok.getLength(); 2830b57cec5SDimitry Andric 2840b57cec5SDimitry Andric if (tok::isStringLiteral(Tok.getKind())) { 2850b57cec5SDimitry Andric // Munch the encoding-prefix and opening double-quote. 2860b57cec5SDimitry Andric while (BufPtr < BufEnd) { 2870b57cec5SDimitry Andric unsigned Size; 2880b57cec5SDimitry Andric Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 2890b57cec5SDimitry Andric BufPtr += Size; 2900b57cec5SDimitry Andric 2910b57cec5SDimitry Andric if (Spelling[Length - 1] == '"') 2920b57cec5SDimitry Andric break; 2930b57cec5SDimitry Andric } 2940b57cec5SDimitry Andric 2950b57cec5SDimitry Andric // Raw string literals need special handling; trigraph expansion and line 2960b57cec5SDimitry Andric // splicing do not occur within their d-char-sequence nor within their 2970b57cec5SDimitry Andric // r-char-sequence. 2980b57cec5SDimitry Andric if (Length >= 2 && 2990b57cec5SDimitry Andric Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { 3000b57cec5SDimitry Andric // Search backwards from the end of the token to find the matching closing 3010b57cec5SDimitry Andric // quote. 3020b57cec5SDimitry Andric const char *RawEnd = BufEnd; 3030b57cec5SDimitry Andric do --RawEnd; while (*RawEnd != '"'); 3040b57cec5SDimitry Andric size_t RawLength = RawEnd - BufPtr + 1; 3050b57cec5SDimitry Andric 3060b57cec5SDimitry Andric // Everything between the quotes is included verbatim in the spelling. 3070b57cec5SDimitry Andric memcpy(Spelling + Length, BufPtr, RawLength); 3080b57cec5SDimitry Andric Length += RawLength; 3090b57cec5SDimitry Andric BufPtr += RawLength; 3100b57cec5SDimitry Andric 3110b57cec5SDimitry Andric // The rest of the token is lexed normally. 3120b57cec5SDimitry Andric } 3130b57cec5SDimitry Andric } 3140b57cec5SDimitry Andric 3150b57cec5SDimitry Andric while (BufPtr < BufEnd) { 3160b57cec5SDimitry Andric unsigned Size; 3170b57cec5SDimitry Andric Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 3180b57cec5SDimitry Andric BufPtr += Size; 3190b57cec5SDimitry Andric } 3200b57cec5SDimitry Andric 3210b57cec5SDimitry Andric assert(Length < Tok.getLength() && 3220b57cec5SDimitry Andric "NeedsCleaning flag set on token that didn't need cleaning!"); 3230b57cec5SDimitry Andric return Length; 3240b57cec5SDimitry Andric } 3250b57cec5SDimitry Andric 3260b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a 3270b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file 3280b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this 3290b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs 3300b57cec5SDimitry Andric /// UCNs, etc. 3310b57cec5SDimitry Andric StringRef Lexer::getSpelling(SourceLocation loc, 3320b57cec5SDimitry Andric SmallVectorImpl<char> &buffer, 3330b57cec5SDimitry Andric const SourceManager &SM, 3340b57cec5SDimitry Andric const LangOptions &options, 3350b57cec5SDimitry Andric bool *invalid) { 3360b57cec5SDimitry Andric // Break down the source location. 3370b57cec5SDimitry Andric std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 3380b57cec5SDimitry Andric 3390b57cec5SDimitry Andric // Try to the load the file buffer. 3400b57cec5SDimitry Andric bool invalidTemp = false; 3410b57cec5SDimitry Andric StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 3420b57cec5SDimitry Andric if (invalidTemp) { 3430b57cec5SDimitry Andric if (invalid) *invalid = true; 3440b57cec5SDimitry Andric return {}; 3450b57cec5SDimitry Andric } 3460b57cec5SDimitry Andric 3470b57cec5SDimitry Andric const char *tokenBegin = file.data() + locInfo.second; 3480b57cec5SDimitry Andric 3490b57cec5SDimitry Andric // Lex from the start of the given location. 3500b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 3510b57cec5SDimitry Andric file.begin(), tokenBegin, file.end()); 3520b57cec5SDimitry Andric Token token; 3530b57cec5SDimitry Andric lexer.LexFromRawLexer(token); 3540b57cec5SDimitry Andric 3550b57cec5SDimitry Andric unsigned length = token.getLength(); 3560b57cec5SDimitry Andric 3570b57cec5SDimitry Andric // Common case: no need for cleaning. 3580b57cec5SDimitry Andric if (!token.needsCleaning()) 3590b57cec5SDimitry Andric return StringRef(tokenBegin, length); 3600b57cec5SDimitry Andric 3610b57cec5SDimitry Andric // Hard case, we need to relex the characters into the string. 3620b57cec5SDimitry Andric buffer.resize(length); 3630b57cec5SDimitry Andric buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); 3640b57cec5SDimitry Andric return StringRef(buffer.data(), buffer.size()); 3650b57cec5SDimitry Andric } 3660b57cec5SDimitry Andric 3670b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a 3680b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file 3690b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this 3700b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs 3710b57cec5SDimitry Andric /// UCNs, etc. 3720b57cec5SDimitry Andric std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 3730b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) { 3740b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 3750b57cec5SDimitry Andric 3760b57cec5SDimitry Andric bool CharDataInvalid = false; 3770b57cec5SDimitry Andric const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 3780b57cec5SDimitry Andric &CharDataInvalid); 3790b57cec5SDimitry Andric if (Invalid) 3800b57cec5SDimitry Andric *Invalid = CharDataInvalid; 3810b57cec5SDimitry Andric if (CharDataInvalid) 3820b57cec5SDimitry Andric return {}; 3830b57cec5SDimitry Andric 3840b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly. 3850b57cec5SDimitry Andric if (!Tok.needsCleaning()) 3860b57cec5SDimitry Andric return std::string(TokStart, TokStart + Tok.getLength()); 3870b57cec5SDimitry Andric 3880b57cec5SDimitry Andric std::string Result; 3890b57cec5SDimitry Andric Result.resize(Tok.getLength()); 3900b57cec5SDimitry Andric Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); 3910b57cec5SDimitry Andric return Result; 3920b57cec5SDimitry Andric } 3930b57cec5SDimitry Andric 3940b57cec5SDimitry Andric /// getSpelling - This method is used to get the spelling of a token into a 3950b57cec5SDimitry Andric /// preallocated buffer, instead of as an std::string. The caller is required 3960b57cec5SDimitry Andric /// to allocate enough space for the token, which is guaranteed to be at least 3970b57cec5SDimitry Andric /// Tok.getLength() bytes long. The actual length of the token is returned. 3980b57cec5SDimitry Andric /// 3990b57cec5SDimitry Andric /// Note that this method may do two possible things: it may either fill in 4000b57cec5SDimitry Andric /// the buffer specified with characters, or it may *change the input pointer* 4010b57cec5SDimitry Andric /// to point to a constant buffer with the data already in it (avoiding a 4020b57cec5SDimitry Andric /// copy). The caller is not allowed to modify the returned buffer pointer 4030b57cec5SDimitry Andric /// if an internal buffer is returned. 4040b57cec5SDimitry Andric unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 4050b57cec5SDimitry Andric const SourceManager &SourceMgr, 4060b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) { 4070b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 4080b57cec5SDimitry Andric 4090b57cec5SDimitry Andric const char *TokStart = nullptr; 4100b57cec5SDimitry Andric // NOTE: this has to be checked *before* testing for an IdentifierInfo. 4110b57cec5SDimitry Andric if (Tok.is(tok::raw_identifier)) 4120b57cec5SDimitry Andric TokStart = Tok.getRawIdentifier().data(); 4130b57cec5SDimitry Andric else if (!Tok.hasUCN()) { 4140b57cec5SDimitry Andric if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 4150b57cec5SDimitry Andric // Just return the string from the identifier table, which is very quick. 4160b57cec5SDimitry Andric Buffer = II->getNameStart(); 4170b57cec5SDimitry Andric return II->getLength(); 4180b57cec5SDimitry Andric } 4190b57cec5SDimitry Andric } 4200b57cec5SDimitry Andric 4210b57cec5SDimitry Andric // NOTE: this can be checked even after testing for an IdentifierInfo. 4220b57cec5SDimitry Andric if (Tok.isLiteral()) 4230b57cec5SDimitry Andric TokStart = Tok.getLiteralData(); 4240b57cec5SDimitry Andric 4250b57cec5SDimitry Andric if (!TokStart) { 4260b57cec5SDimitry Andric // Compute the start of the token in the input lexer buffer. 4270b57cec5SDimitry Andric bool CharDataInvalid = false; 4280b57cec5SDimitry Andric TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 4290b57cec5SDimitry Andric if (Invalid) 4300b57cec5SDimitry Andric *Invalid = CharDataInvalid; 4310b57cec5SDimitry Andric if (CharDataInvalid) { 4320b57cec5SDimitry Andric Buffer = ""; 4330b57cec5SDimitry Andric return 0; 4340b57cec5SDimitry Andric } 4350b57cec5SDimitry Andric } 4360b57cec5SDimitry Andric 4370b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly. 4380b57cec5SDimitry Andric if (!Tok.needsCleaning()) { 4390b57cec5SDimitry Andric Buffer = TokStart; 4400b57cec5SDimitry Andric return Tok.getLength(); 4410b57cec5SDimitry Andric } 4420b57cec5SDimitry Andric 4430b57cec5SDimitry Andric // Otherwise, hard case, relex the characters into the string. 4440b57cec5SDimitry Andric return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); 4450b57cec5SDimitry Andric } 4460b57cec5SDimitry Andric 4470b57cec5SDimitry Andric /// MeasureTokenLength - Relex the token at the specified location and return 4480b57cec5SDimitry Andric /// its length in bytes in the input file. If the token needs cleaning (e.g. 4490b57cec5SDimitry Andric /// includes a trigraph or an escaped newline) then this count includes bytes 4500b57cec5SDimitry Andric /// that are part of that. 4510b57cec5SDimitry Andric unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 4520b57cec5SDimitry Andric const SourceManager &SM, 4530b57cec5SDimitry Andric const LangOptions &LangOpts) { 4540b57cec5SDimitry Andric Token TheTok; 4550b57cec5SDimitry Andric if (getRawToken(Loc, TheTok, SM, LangOpts)) 4560b57cec5SDimitry Andric return 0; 4570b57cec5SDimitry Andric return TheTok.getLength(); 4580b57cec5SDimitry Andric } 4590b57cec5SDimitry Andric 4600b57cec5SDimitry Andric /// Relex the token at the specified location. 4610b57cec5SDimitry Andric /// \returns true if there was a failure, false on success. 4620b57cec5SDimitry Andric bool Lexer::getRawToken(SourceLocation Loc, Token &Result, 4630b57cec5SDimitry Andric const SourceManager &SM, 4640b57cec5SDimitry Andric const LangOptions &LangOpts, 4650b57cec5SDimitry Andric bool IgnoreWhiteSpace) { 4660b57cec5SDimitry Andric // TODO: this could be special cased for common tokens like identifiers, ')', 4670b57cec5SDimitry Andric // etc to make this faster, if it mattered. Just look at StrData[0] to handle 4680b57cec5SDimitry Andric // all obviously single-char tokens. This could use 4690b57cec5SDimitry Andric // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 4700b57cec5SDimitry Andric // something. 4710b57cec5SDimitry Andric 4720b57cec5SDimitry Andric // If this comes from a macro expansion, we really do want the macro name, not 4730b57cec5SDimitry Andric // the token this macro expanded to. 4740b57cec5SDimitry Andric Loc = SM.getExpansionLoc(Loc); 4750b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 4760b57cec5SDimitry Andric bool Invalid = false; 4770b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 4780b57cec5SDimitry Andric if (Invalid) 4790b57cec5SDimitry Andric return true; 4800b57cec5SDimitry Andric 4810b57cec5SDimitry Andric const char *StrData = Buffer.data()+LocInfo.second; 4820b57cec5SDimitry Andric 4830b57cec5SDimitry Andric if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) 4840b57cec5SDimitry Andric return true; 4850b57cec5SDimitry Andric 4860b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token. 4870b57cec5SDimitry Andric Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 4880b57cec5SDimitry Andric Buffer.begin(), StrData, Buffer.end()); 4890b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 4900b57cec5SDimitry Andric TheLexer.LexFromRawLexer(Result); 4910b57cec5SDimitry Andric return false; 4920b57cec5SDimitry Andric } 4930b57cec5SDimitry Andric 4940b57cec5SDimitry Andric /// Returns the pointer that points to the beginning of line that contains 4950b57cec5SDimitry Andric /// the given offset, or null if the offset if invalid. 4960b57cec5SDimitry Andric static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) { 4970b57cec5SDimitry Andric const char *BufStart = Buffer.data(); 4980b57cec5SDimitry Andric if (Offset >= Buffer.size()) 4990b57cec5SDimitry Andric return nullptr; 5000b57cec5SDimitry Andric 5010b57cec5SDimitry Andric const char *LexStart = BufStart + Offset; 5020b57cec5SDimitry Andric for (; LexStart != BufStart; --LexStart) { 5030b57cec5SDimitry Andric if (isVerticalWhitespace(LexStart[0]) && 5040b57cec5SDimitry Andric !Lexer::isNewLineEscaped(BufStart, LexStart)) { 5050b57cec5SDimitry Andric // LexStart should point at first character of logical line. 5060b57cec5SDimitry Andric ++LexStart; 5070b57cec5SDimitry Andric break; 5080b57cec5SDimitry Andric } 5090b57cec5SDimitry Andric } 5100b57cec5SDimitry Andric return LexStart; 5110b57cec5SDimitry Andric } 5120b57cec5SDimitry Andric 5130b57cec5SDimitry Andric static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 5140b57cec5SDimitry Andric const SourceManager &SM, 5150b57cec5SDimitry Andric const LangOptions &LangOpts) { 5160b57cec5SDimitry Andric assert(Loc.isFileID()); 5170b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 5180b57cec5SDimitry Andric if (LocInfo.first.isInvalid()) 5190b57cec5SDimitry Andric return Loc; 5200b57cec5SDimitry Andric 5210b57cec5SDimitry Andric bool Invalid = false; 5220b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 5230b57cec5SDimitry Andric if (Invalid) 5240b57cec5SDimitry Andric return Loc; 5250b57cec5SDimitry Andric 5260b57cec5SDimitry Andric // Back up from the current location until we hit the beginning of a line 5270b57cec5SDimitry Andric // (or the buffer). We'll relex from that point. 5280b57cec5SDimitry Andric const char *StrData = Buffer.data() + LocInfo.second; 5290b57cec5SDimitry Andric const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second); 5300b57cec5SDimitry Andric if (!LexStart || LexStart == StrData) 5310b57cec5SDimitry Andric return Loc; 5320b57cec5SDimitry Andric 5330b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token. 5340b57cec5SDimitry Andric SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 5350b57cec5SDimitry Andric Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart, 5360b57cec5SDimitry Andric Buffer.end()); 5370b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 5380b57cec5SDimitry Andric 5390b57cec5SDimitry Andric // Lex tokens until we find the token that contains the source location. 5400b57cec5SDimitry Andric Token TheTok; 5410b57cec5SDimitry Andric do { 5420b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 5430b57cec5SDimitry Andric 5440b57cec5SDimitry Andric if (TheLexer.getBufferLocation() > StrData) { 5450b57cec5SDimitry Andric // Lexing this token has taken the lexer past the source location we're 5460b57cec5SDimitry Andric // looking for. If the current token encompasses our source location, 5470b57cec5SDimitry Andric // return the beginning of that token. 5480b57cec5SDimitry Andric if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 5490b57cec5SDimitry Andric return TheTok.getLocation(); 5500b57cec5SDimitry Andric 5510b57cec5SDimitry Andric // We ended up skipping over the source location entirely, which means 5520b57cec5SDimitry Andric // that it points into whitespace. We're done here. 5530b57cec5SDimitry Andric break; 5540b57cec5SDimitry Andric } 5550b57cec5SDimitry Andric } while (TheTok.getKind() != tok::eof); 5560b57cec5SDimitry Andric 5570b57cec5SDimitry Andric // We've passed our source location; just return the original source location. 5580b57cec5SDimitry Andric return Loc; 5590b57cec5SDimitry Andric } 5600b57cec5SDimitry Andric 5610b57cec5SDimitry Andric SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 5620b57cec5SDimitry Andric const SourceManager &SM, 5630b57cec5SDimitry Andric const LangOptions &LangOpts) { 5640b57cec5SDimitry Andric if (Loc.isFileID()) 5650b57cec5SDimitry Andric return getBeginningOfFileToken(Loc, SM, LangOpts); 5660b57cec5SDimitry Andric 5670b57cec5SDimitry Andric if (!SM.isMacroArgExpansion(Loc)) 5680b57cec5SDimitry Andric return Loc; 5690b57cec5SDimitry Andric 5700b57cec5SDimitry Andric SourceLocation FileLoc = SM.getSpellingLoc(Loc); 5710b57cec5SDimitry Andric SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 5720b57cec5SDimitry Andric std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 5730b57cec5SDimitry Andric std::pair<FileID, unsigned> BeginFileLocInfo = 5740b57cec5SDimitry Andric SM.getDecomposedLoc(BeginFileLoc); 5750b57cec5SDimitry Andric assert(FileLocInfo.first == BeginFileLocInfo.first && 5760b57cec5SDimitry Andric FileLocInfo.second >= BeginFileLocInfo.second); 5770b57cec5SDimitry Andric return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); 5780b57cec5SDimitry Andric } 5790b57cec5SDimitry Andric 5800b57cec5SDimitry Andric namespace { 5810b57cec5SDimitry Andric 5820b57cec5SDimitry Andric enum PreambleDirectiveKind { 5830b57cec5SDimitry Andric PDK_Skipped, 5840b57cec5SDimitry Andric PDK_Unknown 5850b57cec5SDimitry Andric }; 5860b57cec5SDimitry Andric 5870b57cec5SDimitry Andric } // namespace 5880b57cec5SDimitry Andric 5890b57cec5SDimitry Andric PreambleBounds Lexer::ComputePreamble(StringRef Buffer, 5900b57cec5SDimitry Andric const LangOptions &LangOpts, 5910b57cec5SDimitry Andric unsigned MaxLines) { 5920b57cec5SDimitry Andric // Create a lexer starting at the beginning of the file. Note that we use a 5930b57cec5SDimitry Andric // "fake" file source location at offset 1 so that the lexer will track our 5940b57cec5SDimitry Andric // position within the file. 595fe6060f1SDimitry Andric const SourceLocation::UIntTy StartOffset = 1; 5960b57cec5SDimitry Andric SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset); 5970b57cec5SDimitry Andric Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(), 5980b57cec5SDimitry Andric Buffer.end()); 5990b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 6000b57cec5SDimitry Andric 6010b57cec5SDimitry Andric bool InPreprocessorDirective = false; 6020b57cec5SDimitry Andric Token TheTok; 6030b57cec5SDimitry Andric SourceLocation ActiveCommentLoc; 6040b57cec5SDimitry Andric 6050b57cec5SDimitry Andric unsigned MaxLineOffset = 0; 6060b57cec5SDimitry Andric if (MaxLines) { 6070b57cec5SDimitry Andric const char *CurPtr = Buffer.begin(); 6080b57cec5SDimitry Andric unsigned CurLine = 0; 6090b57cec5SDimitry Andric while (CurPtr != Buffer.end()) { 6100b57cec5SDimitry Andric char ch = *CurPtr++; 6110b57cec5SDimitry Andric if (ch == '\n') { 6120b57cec5SDimitry Andric ++CurLine; 6130b57cec5SDimitry Andric if (CurLine == MaxLines) 6140b57cec5SDimitry Andric break; 6150b57cec5SDimitry Andric } 6160b57cec5SDimitry Andric } 6170b57cec5SDimitry Andric if (CurPtr != Buffer.end()) 6180b57cec5SDimitry Andric MaxLineOffset = CurPtr - Buffer.begin(); 6190b57cec5SDimitry Andric } 6200b57cec5SDimitry Andric 6210b57cec5SDimitry Andric do { 6220b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 6230b57cec5SDimitry Andric 6240b57cec5SDimitry Andric if (InPreprocessorDirective) { 6250b57cec5SDimitry Andric // If we've hit the end of the file, we're done. 6260b57cec5SDimitry Andric if (TheTok.getKind() == tok::eof) { 6270b57cec5SDimitry Andric break; 6280b57cec5SDimitry Andric } 6290b57cec5SDimitry Andric 6300b57cec5SDimitry Andric // If we haven't hit the end of the preprocessor directive, skip this 6310b57cec5SDimitry Andric // token. 6320b57cec5SDimitry Andric if (!TheTok.isAtStartOfLine()) 6330b57cec5SDimitry Andric continue; 6340b57cec5SDimitry Andric 6350b57cec5SDimitry Andric // We've passed the end of the preprocessor directive, and will look 6360b57cec5SDimitry Andric // at this token again below. 6370b57cec5SDimitry Andric InPreprocessorDirective = false; 6380b57cec5SDimitry Andric } 6390b57cec5SDimitry Andric 6400b57cec5SDimitry Andric // Keep track of the # of lines in the preamble. 6410b57cec5SDimitry Andric if (TheTok.isAtStartOfLine()) { 6420b57cec5SDimitry Andric unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 6430b57cec5SDimitry Andric 6440b57cec5SDimitry Andric // If we were asked to limit the number of lines in the preamble, 6450b57cec5SDimitry Andric // and we're about to exceed that limit, we're done. 6460b57cec5SDimitry Andric if (MaxLineOffset && TokOffset >= MaxLineOffset) 6470b57cec5SDimitry Andric break; 6480b57cec5SDimitry Andric } 6490b57cec5SDimitry Andric 6500b57cec5SDimitry Andric // Comments are okay; skip over them. 6510b57cec5SDimitry Andric if (TheTok.getKind() == tok::comment) { 6520b57cec5SDimitry Andric if (ActiveCommentLoc.isInvalid()) 6530b57cec5SDimitry Andric ActiveCommentLoc = TheTok.getLocation(); 6540b57cec5SDimitry Andric continue; 6550b57cec5SDimitry Andric } 6560b57cec5SDimitry Andric 6570b57cec5SDimitry Andric if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 6580b57cec5SDimitry Andric // This is the start of a preprocessor directive. 6590b57cec5SDimitry Andric Token HashTok = TheTok; 6600b57cec5SDimitry Andric InPreprocessorDirective = true; 6610b57cec5SDimitry Andric ActiveCommentLoc = SourceLocation(); 6620b57cec5SDimitry Andric 6630b57cec5SDimitry Andric // Figure out which directive this is. Since we're lexing raw tokens, 6640b57cec5SDimitry Andric // we don't have an identifier table available. Instead, just look at 6650b57cec5SDimitry Andric // the raw identifier to recognize and categorize preprocessor directives. 6660b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 6670b57cec5SDimitry Andric if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 6680b57cec5SDimitry Andric StringRef Keyword = TheTok.getRawIdentifier(); 6690b57cec5SDimitry Andric PreambleDirectiveKind PDK 6700b57cec5SDimitry Andric = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 6710b57cec5SDimitry Andric .Case("include", PDK_Skipped) 6720b57cec5SDimitry Andric .Case("__include_macros", PDK_Skipped) 6730b57cec5SDimitry Andric .Case("define", PDK_Skipped) 6740b57cec5SDimitry Andric .Case("undef", PDK_Skipped) 6750b57cec5SDimitry Andric .Case("line", PDK_Skipped) 6760b57cec5SDimitry Andric .Case("error", PDK_Skipped) 6770b57cec5SDimitry Andric .Case("pragma", PDK_Skipped) 6780b57cec5SDimitry Andric .Case("import", PDK_Skipped) 6790b57cec5SDimitry Andric .Case("include_next", PDK_Skipped) 6800b57cec5SDimitry Andric .Case("warning", PDK_Skipped) 6810b57cec5SDimitry Andric .Case("ident", PDK_Skipped) 6820b57cec5SDimitry Andric .Case("sccs", PDK_Skipped) 6830b57cec5SDimitry Andric .Case("assert", PDK_Skipped) 6840b57cec5SDimitry Andric .Case("unassert", PDK_Skipped) 6850b57cec5SDimitry Andric .Case("if", PDK_Skipped) 6860b57cec5SDimitry Andric .Case("ifdef", PDK_Skipped) 6870b57cec5SDimitry Andric .Case("ifndef", PDK_Skipped) 6880b57cec5SDimitry Andric .Case("elif", PDK_Skipped) 689fe6060f1SDimitry Andric .Case("elifdef", PDK_Skipped) 690fe6060f1SDimitry Andric .Case("elifndef", PDK_Skipped) 6910b57cec5SDimitry Andric .Case("else", PDK_Skipped) 6920b57cec5SDimitry Andric .Case("endif", PDK_Skipped) 6930b57cec5SDimitry Andric .Default(PDK_Unknown); 6940b57cec5SDimitry Andric 6950b57cec5SDimitry Andric switch (PDK) { 6960b57cec5SDimitry Andric case PDK_Skipped: 6970b57cec5SDimitry Andric continue; 6980b57cec5SDimitry Andric 6990b57cec5SDimitry Andric case PDK_Unknown: 7000b57cec5SDimitry Andric // We don't know what this directive is; stop at the '#'. 7010b57cec5SDimitry Andric break; 7020b57cec5SDimitry Andric } 7030b57cec5SDimitry Andric } 7040b57cec5SDimitry Andric 7050b57cec5SDimitry Andric // We only end up here if we didn't recognize the preprocessor 7060b57cec5SDimitry Andric // directive or it was one that can't occur in the preamble at this 7070b57cec5SDimitry Andric // point. Roll back the current token to the location of the '#'. 7080b57cec5SDimitry Andric TheTok = HashTok; 7090b57cec5SDimitry Andric } 7100b57cec5SDimitry Andric 7110b57cec5SDimitry Andric // We hit a token that we don't recognize as being in the 7120b57cec5SDimitry Andric // "preprocessing only" part of the file, so we're no longer in 7130b57cec5SDimitry Andric // the preamble. 7140b57cec5SDimitry Andric break; 7150b57cec5SDimitry Andric } while (true); 7160b57cec5SDimitry Andric 7170b57cec5SDimitry Andric SourceLocation End; 7180b57cec5SDimitry Andric if (ActiveCommentLoc.isValid()) 7190b57cec5SDimitry Andric End = ActiveCommentLoc; // don't truncate a decl comment. 7200b57cec5SDimitry Andric else 7210b57cec5SDimitry Andric End = TheTok.getLocation(); 7220b57cec5SDimitry Andric 7230b57cec5SDimitry Andric return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(), 7240b57cec5SDimitry Andric TheTok.isAtStartOfLine()); 7250b57cec5SDimitry Andric } 7260b57cec5SDimitry Andric 7270b57cec5SDimitry Andric unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, 7280b57cec5SDimitry Andric const SourceManager &SM, 7290b57cec5SDimitry Andric const LangOptions &LangOpts) { 7300b57cec5SDimitry Andric // Figure out how many physical characters away the specified expansion 7310b57cec5SDimitry Andric // character is. This needs to take into consideration newlines and 7320b57cec5SDimitry Andric // trigraphs. 7330b57cec5SDimitry Andric bool Invalid = false; 7340b57cec5SDimitry Andric const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 7350b57cec5SDimitry Andric 7360b57cec5SDimitry Andric // If they request the first char of the token, we're trivially done. 7370b57cec5SDimitry Andric if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 7380b57cec5SDimitry Andric return 0; 7390b57cec5SDimitry Andric 7400b57cec5SDimitry Andric unsigned PhysOffset = 0; 7410b57cec5SDimitry Andric 7420b57cec5SDimitry Andric // The usual case is that tokens don't contain anything interesting. Skip 7430b57cec5SDimitry Andric // over the uninteresting characters. If a token only consists of simple 7440b57cec5SDimitry Andric // chars, this method is extremely fast. 7450b57cec5SDimitry Andric while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 7460b57cec5SDimitry Andric if (CharNo == 0) 7470b57cec5SDimitry Andric return PhysOffset; 7480b57cec5SDimitry Andric ++TokPtr; 7490b57cec5SDimitry Andric --CharNo; 7500b57cec5SDimitry Andric ++PhysOffset; 7510b57cec5SDimitry Andric } 7520b57cec5SDimitry Andric 7530b57cec5SDimitry Andric // If we have a character that may be a trigraph or escaped newline, use a 7540b57cec5SDimitry Andric // lexer to parse it correctly. 7550b57cec5SDimitry Andric for (; CharNo; --CharNo) { 7560b57cec5SDimitry Andric unsigned Size; 7570b57cec5SDimitry Andric Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts); 7580b57cec5SDimitry Andric TokPtr += Size; 7590b57cec5SDimitry Andric PhysOffset += Size; 7600b57cec5SDimitry Andric } 7610b57cec5SDimitry Andric 7620b57cec5SDimitry Andric // Final detail: if we end up on an escaped newline, we want to return the 7630b57cec5SDimitry Andric // location of the actual byte of the token. For example foo\<newline>bar 7640b57cec5SDimitry Andric // advanced by 3 should return the location of b, not of \\. One compounding 7650b57cec5SDimitry Andric // detail of this is that the escape may be made by a trigraph. 7660b57cec5SDimitry Andric if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 7670b57cec5SDimitry Andric PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 7680b57cec5SDimitry Andric 7690b57cec5SDimitry Andric return PhysOffset; 7700b57cec5SDimitry Andric } 7710b57cec5SDimitry Andric 7720b57cec5SDimitry Andric /// Computes the source location just past the end of the 7730b57cec5SDimitry Andric /// token at this source location. 7740b57cec5SDimitry Andric /// 7750b57cec5SDimitry Andric /// This routine can be used to produce a source location that 7760b57cec5SDimitry Andric /// points just past the end of the token referenced by \p Loc, and 7770b57cec5SDimitry Andric /// is generally used when a diagnostic needs to point just after a 7780b57cec5SDimitry Andric /// token where it expected something different that it received. If 7790b57cec5SDimitry Andric /// the returned source location would not be meaningful (e.g., if 7800b57cec5SDimitry Andric /// it points into a macro), this routine returns an invalid 7810b57cec5SDimitry Andric /// source location. 7820b57cec5SDimitry Andric /// 7830b57cec5SDimitry Andric /// \param Offset an offset from the end of the token, where the source 7840b57cec5SDimitry Andric /// location should refer to. The default offset (0) produces a source 7850b57cec5SDimitry Andric /// location pointing just past the end of the token; an offset of 1 produces 7860b57cec5SDimitry Andric /// a source location pointing to the last character in the token, etc. 7870b57cec5SDimitry Andric SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 7880b57cec5SDimitry Andric const SourceManager &SM, 7890b57cec5SDimitry Andric const LangOptions &LangOpts) { 7900b57cec5SDimitry Andric if (Loc.isInvalid()) 7910b57cec5SDimitry Andric return {}; 7920b57cec5SDimitry Andric 7930b57cec5SDimitry Andric if (Loc.isMacroID()) { 7940b57cec5SDimitry Andric if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 7950b57cec5SDimitry Andric return {}; // Points inside the macro expansion. 7960b57cec5SDimitry Andric } 7970b57cec5SDimitry Andric 7980b57cec5SDimitry Andric unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 7990b57cec5SDimitry Andric if (Len > Offset) 8000b57cec5SDimitry Andric Len = Len - Offset; 8010b57cec5SDimitry Andric else 8020b57cec5SDimitry Andric return Loc; 8030b57cec5SDimitry Andric 8040b57cec5SDimitry Andric return Loc.getLocWithOffset(Len); 8050b57cec5SDimitry Andric } 8060b57cec5SDimitry Andric 8070b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the first 8080b57cec5SDimitry Andric /// token of the macro expansion. 8090b57cec5SDimitry Andric bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 8100b57cec5SDimitry Andric const SourceManager &SM, 8110b57cec5SDimitry Andric const LangOptions &LangOpts, 8120b57cec5SDimitry Andric SourceLocation *MacroBegin) { 8130b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 8140b57cec5SDimitry Andric 8150b57cec5SDimitry Andric SourceLocation expansionLoc; 8160b57cec5SDimitry Andric if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc)) 8170b57cec5SDimitry Andric return false; 8180b57cec5SDimitry Andric 8190b57cec5SDimitry Andric if (expansionLoc.isFileID()) { 8200b57cec5SDimitry Andric // No other macro expansions, this is the first. 8210b57cec5SDimitry Andric if (MacroBegin) 8220b57cec5SDimitry Andric *MacroBegin = expansionLoc; 8230b57cec5SDimitry Andric return true; 8240b57cec5SDimitry Andric } 8250b57cec5SDimitry Andric 8260b57cec5SDimitry Andric return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); 8270b57cec5SDimitry Andric } 8280b57cec5SDimitry Andric 8290b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the last 8300b57cec5SDimitry Andric /// token of the macro expansion. 8310b57cec5SDimitry Andric bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 8320b57cec5SDimitry Andric const SourceManager &SM, 8330b57cec5SDimitry Andric const LangOptions &LangOpts, 8340b57cec5SDimitry Andric SourceLocation *MacroEnd) { 8350b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 8360b57cec5SDimitry Andric 8370b57cec5SDimitry Andric SourceLocation spellLoc = SM.getSpellingLoc(loc); 8380b57cec5SDimitry Andric unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 8390b57cec5SDimitry Andric if (tokLen == 0) 8400b57cec5SDimitry Andric return false; 8410b57cec5SDimitry Andric 8420b57cec5SDimitry Andric SourceLocation afterLoc = loc.getLocWithOffset(tokLen); 8430b57cec5SDimitry Andric SourceLocation expansionLoc; 8440b57cec5SDimitry Andric if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) 8450b57cec5SDimitry Andric return false; 8460b57cec5SDimitry Andric 8470b57cec5SDimitry Andric if (expansionLoc.isFileID()) { 8480b57cec5SDimitry Andric // No other macro expansions. 8490b57cec5SDimitry Andric if (MacroEnd) 8500b57cec5SDimitry Andric *MacroEnd = expansionLoc; 8510b57cec5SDimitry Andric return true; 8520b57cec5SDimitry Andric } 8530b57cec5SDimitry Andric 8540b57cec5SDimitry Andric return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 8550b57cec5SDimitry Andric } 8560b57cec5SDimitry Andric 8570b57cec5SDimitry Andric static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 8580b57cec5SDimitry Andric const SourceManager &SM, 8590b57cec5SDimitry Andric const LangOptions &LangOpts) { 8600b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin(); 8610b57cec5SDimitry Andric SourceLocation End = Range.getEnd(); 8620b57cec5SDimitry Andric assert(Begin.isFileID() && End.isFileID()); 8630b57cec5SDimitry Andric if (Range.isTokenRange()) { 8640b57cec5SDimitry Andric End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 8650b57cec5SDimitry Andric if (End.isInvalid()) 8660b57cec5SDimitry Andric return {}; 8670b57cec5SDimitry Andric } 8680b57cec5SDimitry Andric 8690b57cec5SDimitry Andric // Break down the source locations. 8700b57cec5SDimitry Andric FileID FID; 8710b57cec5SDimitry Andric unsigned BeginOffs; 8720b57cec5SDimitry Andric std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 8730b57cec5SDimitry Andric if (FID.isInvalid()) 8740b57cec5SDimitry Andric return {}; 8750b57cec5SDimitry Andric 8760b57cec5SDimitry Andric unsigned EndOffs; 8770b57cec5SDimitry Andric if (!SM.isInFileID(End, FID, &EndOffs) || 8780b57cec5SDimitry Andric BeginOffs > EndOffs) 8790b57cec5SDimitry Andric return {}; 8800b57cec5SDimitry Andric 8810b57cec5SDimitry Andric return CharSourceRange::getCharRange(Begin, End); 8820b57cec5SDimitry Andric } 8830b57cec5SDimitry Andric 884fe6060f1SDimitry Andric // Assumes that `Loc` is in an expansion. 885fe6060f1SDimitry Andric static bool isInExpansionTokenRange(const SourceLocation Loc, 886fe6060f1SDimitry Andric const SourceManager &SM) { 887fe6060f1SDimitry Andric return SM.getSLocEntry(SM.getFileID(Loc)) 888fe6060f1SDimitry Andric .getExpansion() 889fe6060f1SDimitry Andric .isExpansionTokenRange(); 890fe6060f1SDimitry Andric } 891fe6060f1SDimitry Andric 8920b57cec5SDimitry Andric CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, 8930b57cec5SDimitry Andric const SourceManager &SM, 8940b57cec5SDimitry Andric const LangOptions &LangOpts) { 8950b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin(); 8960b57cec5SDimitry Andric SourceLocation End = Range.getEnd(); 8970b57cec5SDimitry Andric if (Begin.isInvalid() || End.isInvalid()) 8980b57cec5SDimitry Andric return {}; 8990b57cec5SDimitry Andric 9000b57cec5SDimitry Andric if (Begin.isFileID() && End.isFileID()) 9010b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9020b57cec5SDimitry Andric 9030b57cec5SDimitry Andric if (Begin.isMacroID() && End.isFileID()) { 9040b57cec5SDimitry Andric if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) 9050b57cec5SDimitry Andric return {}; 9060b57cec5SDimitry Andric Range.setBegin(Begin); 9070b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9080b57cec5SDimitry Andric } 9090b57cec5SDimitry Andric 9100b57cec5SDimitry Andric if (Begin.isFileID() && End.isMacroID()) { 911fe6060f1SDimitry Andric if (Range.isTokenRange()) { 912fe6060f1SDimitry Andric if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End)) 913fe6060f1SDimitry Andric return {}; 914fe6060f1SDimitry Andric // Use the *original* end, not the expanded one in `End`. 915fe6060f1SDimitry Andric Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM)); 916fe6060f1SDimitry Andric } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End)) 9170b57cec5SDimitry Andric return {}; 9180b57cec5SDimitry Andric Range.setEnd(End); 9190b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9200b57cec5SDimitry Andric } 9210b57cec5SDimitry Andric 9220b57cec5SDimitry Andric assert(Begin.isMacroID() && End.isMacroID()); 9230b57cec5SDimitry Andric SourceLocation MacroBegin, MacroEnd; 9240b57cec5SDimitry Andric if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) && 9250b57cec5SDimitry Andric ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts, 9260b57cec5SDimitry Andric &MacroEnd)) || 9270b57cec5SDimitry Andric (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts, 9280b57cec5SDimitry Andric &MacroEnd)))) { 9290b57cec5SDimitry Andric Range.setBegin(MacroBegin); 9300b57cec5SDimitry Andric Range.setEnd(MacroEnd); 931fe6060f1SDimitry Andric // Use the *original* `End`, not the expanded one in `MacroEnd`. 932fe6060f1SDimitry Andric if (Range.isTokenRange()) 933fe6060f1SDimitry Andric Range.setTokenRange(isInExpansionTokenRange(End, SM)); 9340b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9350b57cec5SDimitry Andric } 9360b57cec5SDimitry Andric 9370b57cec5SDimitry Andric bool Invalid = false; 9380b57cec5SDimitry Andric const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin), 9390b57cec5SDimitry Andric &Invalid); 9400b57cec5SDimitry Andric if (Invalid) 9410b57cec5SDimitry Andric return {}; 9420b57cec5SDimitry Andric 9430b57cec5SDimitry Andric if (BeginEntry.getExpansion().isMacroArgExpansion()) { 9440b57cec5SDimitry Andric const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End), 9450b57cec5SDimitry Andric &Invalid); 9460b57cec5SDimitry Andric if (Invalid) 9470b57cec5SDimitry Andric return {}; 9480b57cec5SDimitry Andric 9490b57cec5SDimitry Andric if (EndEntry.getExpansion().isMacroArgExpansion() && 9500b57cec5SDimitry Andric BeginEntry.getExpansion().getExpansionLocStart() == 9510b57cec5SDimitry Andric EndEntry.getExpansion().getExpansionLocStart()) { 9520b57cec5SDimitry Andric Range.setBegin(SM.getImmediateSpellingLoc(Begin)); 9530b57cec5SDimitry Andric Range.setEnd(SM.getImmediateSpellingLoc(End)); 9540b57cec5SDimitry Andric return makeFileCharRange(Range, SM, LangOpts); 9550b57cec5SDimitry Andric } 9560b57cec5SDimitry Andric } 9570b57cec5SDimitry Andric 9580b57cec5SDimitry Andric return {}; 9590b57cec5SDimitry Andric } 9600b57cec5SDimitry Andric 9610b57cec5SDimitry Andric StringRef Lexer::getSourceText(CharSourceRange Range, 9620b57cec5SDimitry Andric const SourceManager &SM, 9630b57cec5SDimitry Andric const LangOptions &LangOpts, 9640b57cec5SDimitry Andric bool *Invalid) { 9650b57cec5SDimitry Andric Range = makeFileCharRange(Range, SM, LangOpts); 9660b57cec5SDimitry Andric if (Range.isInvalid()) { 9670b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9680b57cec5SDimitry Andric return {}; 9690b57cec5SDimitry Andric } 9700b57cec5SDimitry Andric 9710b57cec5SDimitry Andric // Break down the source location. 9720b57cec5SDimitry Andric std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin()); 9730b57cec5SDimitry Andric if (beginInfo.first.isInvalid()) { 9740b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9750b57cec5SDimitry Andric return {}; 9760b57cec5SDimitry Andric } 9770b57cec5SDimitry Andric 9780b57cec5SDimitry Andric unsigned EndOffs; 9790b57cec5SDimitry Andric if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) || 9800b57cec5SDimitry Andric beginInfo.second > EndOffs) { 9810b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9820b57cec5SDimitry Andric return {}; 9830b57cec5SDimitry Andric } 9840b57cec5SDimitry Andric 9850b57cec5SDimitry Andric // Try to the load the file buffer. 9860b57cec5SDimitry Andric bool invalidTemp = false; 9870b57cec5SDimitry Andric StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp); 9880b57cec5SDimitry Andric if (invalidTemp) { 9890b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9900b57cec5SDimitry Andric return {}; 9910b57cec5SDimitry Andric } 9920b57cec5SDimitry Andric 9930b57cec5SDimitry Andric if (Invalid) *Invalid = false; 9940b57cec5SDimitry Andric return file.substr(beginInfo.second, EndOffs - beginInfo.second); 9950b57cec5SDimitry Andric } 9960b57cec5SDimitry Andric 9970b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroName(SourceLocation Loc, 9980b57cec5SDimitry Andric const SourceManager &SM, 9990b57cec5SDimitry Andric const LangOptions &LangOpts) { 10000b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 10010b57cec5SDimitry Andric 10020b57cec5SDimitry Andric // Find the location of the immediate macro expansion. 10030b57cec5SDimitry Andric while (true) { 10040b57cec5SDimitry Andric FileID FID = SM.getFileID(Loc); 10050b57cec5SDimitry Andric const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); 10060b57cec5SDimitry Andric const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); 10070b57cec5SDimitry Andric Loc = Expansion.getExpansionLocStart(); 10080b57cec5SDimitry Andric if (!Expansion.isMacroArgExpansion()) 10090b57cec5SDimitry Andric break; 10100b57cec5SDimitry Andric 10110b57cec5SDimitry Andric // For macro arguments we need to check that the argument did not come 10120b57cec5SDimitry Andric // from an inner macro, e.g: "MAC1( MAC2(foo) )" 10130b57cec5SDimitry Andric 10140b57cec5SDimitry Andric // Loc points to the argument id of the macro definition, move to the 10150b57cec5SDimitry Andric // macro expansion. 10160b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 10170b57cec5SDimitry Andric SourceLocation SpellLoc = Expansion.getSpellingLoc(); 10180b57cec5SDimitry Andric if (SpellLoc.isFileID()) 10190b57cec5SDimitry Andric break; // No inner macro. 10200b57cec5SDimitry Andric 10210b57cec5SDimitry Andric // If spelling location resides in the same FileID as macro expansion 10220b57cec5SDimitry Andric // location, it means there is no inner macro. 10230b57cec5SDimitry Andric FileID MacroFID = SM.getFileID(Loc); 10240b57cec5SDimitry Andric if (SM.isInFileID(SpellLoc, MacroFID)) 10250b57cec5SDimitry Andric break; 10260b57cec5SDimitry Andric 10270b57cec5SDimitry Andric // Argument came from inner macro. 10280b57cec5SDimitry Andric Loc = SpellLoc; 10290b57cec5SDimitry Andric } 10300b57cec5SDimitry Andric 10310b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion 10320b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin 10330b57cec5SDimitry Andric // expanding this macro. 10340b57cec5SDimitry Andric Loc = SM.getSpellingLoc(Loc); 10350b57cec5SDimitry Andric 10360b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the 10370b57cec5SDimitry Andric // name so that we can render it into the expansion note. 10380b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 10390b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 10400b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 10410b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 10420b57cec5SDimitry Andric } 10430b57cec5SDimitry Andric 10440b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroNameForDiagnostics( 10450b57cec5SDimitry Andric SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { 10460b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 10470b57cec5SDimitry Andric // Walk past macro argument expansions. 10480b57cec5SDimitry Andric while (SM.isMacroArgExpansion(Loc)) 10490b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 10500b57cec5SDimitry Andric 10510b57cec5SDimitry Andric // If the macro's spelling has no FileID, then it's actually a token paste 10520b57cec5SDimitry Andric // or stringization (or similar) and not a macro at all. 10530b57cec5SDimitry Andric if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc)))) 10540b57cec5SDimitry Andric return {}; 10550b57cec5SDimitry Andric 10560b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion 10570b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin 10580b57cec5SDimitry Andric // expanding this macro. 10590b57cec5SDimitry Andric Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin()); 10600b57cec5SDimitry Andric 10610b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the 10620b57cec5SDimitry Andric // name so that we can render it into the expansion note. 10630b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 10640b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 10650b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 10660b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 10670b57cec5SDimitry Andric } 10680b57cec5SDimitry Andric 1069349cc55cSDimitry Andric bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) { 1070349cc55cSDimitry Andric return isAsciiIdentifierContinue(c, LangOpts.DollarIdents); 10710b57cec5SDimitry Andric } 10720b57cec5SDimitry Andric 10730b57cec5SDimitry Andric bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { 10740b57cec5SDimitry Andric assert(isVerticalWhitespace(Str[0])); 10750b57cec5SDimitry Andric if (Str - 1 < BufferStart) 10760b57cec5SDimitry Andric return false; 10770b57cec5SDimitry Andric 10780b57cec5SDimitry Andric if ((Str[0] == '\n' && Str[-1] == '\r') || 10790b57cec5SDimitry Andric (Str[0] == '\r' && Str[-1] == '\n')) { 10800b57cec5SDimitry Andric if (Str - 2 < BufferStart) 10810b57cec5SDimitry Andric return false; 10820b57cec5SDimitry Andric --Str; 10830b57cec5SDimitry Andric } 10840b57cec5SDimitry Andric --Str; 10850b57cec5SDimitry Andric 10860b57cec5SDimitry Andric // Rewind to first non-space character: 10870b57cec5SDimitry Andric while (Str > BufferStart && isHorizontalWhitespace(*Str)) 10880b57cec5SDimitry Andric --Str; 10890b57cec5SDimitry Andric 10900b57cec5SDimitry Andric return *Str == '\\'; 10910b57cec5SDimitry Andric } 10920b57cec5SDimitry Andric 10930b57cec5SDimitry Andric StringRef Lexer::getIndentationForLine(SourceLocation Loc, 10940b57cec5SDimitry Andric const SourceManager &SM) { 10950b57cec5SDimitry Andric if (Loc.isInvalid() || Loc.isMacroID()) 10960b57cec5SDimitry Andric return {}; 10970b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 10980b57cec5SDimitry Andric if (LocInfo.first.isInvalid()) 10990b57cec5SDimitry Andric return {}; 11000b57cec5SDimitry Andric bool Invalid = false; 11010b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 11020b57cec5SDimitry Andric if (Invalid) 11030b57cec5SDimitry Andric return {}; 11040b57cec5SDimitry Andric const char *Line = findBeginningOfLine(Buffer, LocInfo.second); 11050b57cec5SDimitry Andric if (!Line) 11060b57cec5SDimitry Andric return {}; 11070b57cec5SDimitry Andric StringRef Rest = Buffer.substr(Line - Buffer.data()); 11080b57cec5SDimitry Andric size_t NumWhitespaceChars = Rest.find_first_not_of(" \t"); 11090b57cec5SDimitry Andric return NumWhitespaceChars == StringRef::npos 11100b57cec5SDimitry Andric ? "" 11110b57cec5SDimitry Andric : Rest.take_front(NumWhitespaceChars); 11120b57cec5SDimitry Andric } 11130b57cec5SDimitry Andric 11140b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11150b57cec5SDimitry Andric // Diagnostics forwarding code. 11160b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11170b57cec5SDimitry Andric 11180b57cec5SDimitry Andric /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 11190b57cec5SDimitry Andric /// lexer buffer was all expanded at a single point, perform the mapping. 11200b57cec5SDimitry Andric /// This is currently only used for _Pragma implementation, so it is the slow 11210b57cec5SDimitry Andric /// path of the hot getSourceLocation method. Do not allow it to be inlined. 11220b57cec5SDimitry Andric static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 11230b57cec5SDimitry Andric Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 11240b57cec5SDimitry Andric static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 11250b57cec5SDimitry Andric SourceLocation FileLoc, 11260b57cec5SDimitry Andric unsigned CharNo, unsigned TokLen) { 11270b57cec5SDimitry Andric assert(FileLoc.isMacroID() && "Must be a macro expansion"); 11280b57cec5SDimitry Andric 11290b57cec5SDimitry Andric // Otherwise, we're lexing "mapped tokens". This is used for things like 11300b57cec5SDimitry Andric // _Pragma handling. Combine the expansion location of FileLoc with the 11310b57cec5SDimitry Andric // spelling location. 11320b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager(); 11330b57cec5SDimitry Andric 11340b57cec5SDimitry Andric // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 11350b57cec5SDimitry Andric // characters come from spelling(FileLoc)+Offset. 11360b57cec5SDimitry Andric SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 11370b57cec5SDimitry Andric SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 11380b57cec5SDimitry Andric 11390b57cec5SDimitry Andric // Figure out the expansion loc range, which is the range covered by the 11400b57cec5SDimitry Andric // original _Pragma(...) sequence. 11410b57cec5SDimitry Andric CharSourceRange II = SM.getImmediateExpansionRange(FileLoc); 11420b57cec5SDimitry Andric 11430b57cec5SDimitry Andric return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen); 11440b57cec5SDimitry Andric } 11450b57cec5SDimitry Andric 11460b57cec5SDimitry Andric /// getSourceLocation - Return a source location identifier for the specified 11470b57cec5SDimitry Andric /// offset in the current file. 11480b57cec5SDimitry Andric SourceLocation Lexer::getSourceLocation(const char *Loc, 11490b57cec5SDimitry Andric unsigned TokLen) const { 11500b57cec5SDimitry Andric assert(Loc >= BufferStart && Loc <= BufferEnd && 11510b57cec5SDimitry Andric "Location out of range for this buffer!"); 11520b57cec5SDimitry Andric 11530b57cec5SDimitry Andric // In the normal case, we're just lexing from a simple file buffer, return 11540b57cec5SDimitry Andric // the file id from FileLoc with the offset specified. 11550b57cec5SDimitry Andric unsigned CharNo = Loc-BufferStart; 11560b57cec5SDimitry Andric if (FileLoc.isFileID()) 11570b57cec5SDimitry Andric return FileLoc.getLocWithOffset(CharNo); 11580b57cec5SDimitry Andric 11590b57cec5SDimitry Andric // Otherwise, this is the _Pragma lexer case, which pretends that all of the 11600b57cec5SDimitry Andric // tokens are lexed from where the _Pragma was defined. 11610b57cec5SDimitry Andric assert(PP && "This doesn't work on raw lexers"); 11620b57cec5SDimitry Andric return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 11630b57cec5SDimitry Andric } 11640b57cec5SDimitry Andric 11650b57cec5SDimitry Andric /// Diag - Forwarding function for diagnostics. This translate a source 11660b57cec5SDimitry Andric /// position in the current buffer into a SourceLocation object for rendering. 11670b57cec5SDimitry Andric DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 11680b57cec5SDimitry Andric return PP->Diag(getSourceLocation(Loc), DiagID); 11690b57cec5SDimitry Andric } 11700b57cec5SDimitry Andric 11710b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11720b57cec5SDimitry Andric // Trigraph and Escaped Newline Handling Code. 11730b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11740b57cec5SDimitry Andric 11750b57cec5SDimitry Andric /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 11760b57cec5SDimitry Andric /// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 11770b57cec5SDimitry Andric static char GetTrigraphCharForLetter(char Letter) { 11780b57cec5SDimitry Andric switch (Letter) { 11790b57cec5SDimitry Andric default: return 0; 11800b57cec5SDimitry Andric case '=': return '#'; 11810b57cec5SDimitry Andric case ')': return ']'; 11820b57cec5SDimitry Andric case '(': return '['; 11830b57cec5SDimitry Andric case '!': return '|'; 11840b57cec5SDimitry Andric case '\'': return '^'; 11850b57cec5SDimitry Andric case '>': return '}'; 11860b57cec5SDimitry Andric case '/': return '\\'; 11870b57cec5SDimitry Andric case '<': return '{'; 11880b57cec5SDimitry Andric case '-': return '~'; 11890b57cec5SDimitry Andric } 11900b57cec5SDimitry Andric } 11910b57cec5SDimitry Andric 11920b57cec5SDimitry Andric /// DecodeTrigraphChar - If the specified character is a legal trigraph when 11930b57cec5SDimitry Andric /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 11940b57cec5SDimitry Andric /// return the result character. Finally, emit a warning about trigraph use 11950b57cec5SDimitry Andric /// whether trigraphs are enabled or not. 1196*81ad6265SDimitry Andric static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) { 11970b57cec5SDimitry Andric char Res = GetTrigraphCharForLetter(*CP); 11980b57cec5SDimitry Andric if (!Res || !L) return Res; 11990b57cec5SDimitry Andric 1200*81ad6265SDimitry Andric if (!Trigraphs) { 12010b57cec5SDimitry Andric if (!L->isLexingRawMode()) 12020b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_ignored); 12030b57cec5SDimitry Andric return 0; 12040b57cec5SDimitry Andric } 12050b57cec5SDimitry Andric 12060b57cec5SDimitry Andric if (!L->isLexingRawMode()) 12070b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 12080b57cec5SDimitry Andric return Res; 12090b57cec5SDimitry Andric } 12100b57cec5SDimitry Andric 12110b57cec5SDimitry Andric /// getEscapedNewLineSize - Return the size of the specified escaped newline, 12120b57cec5SDimitry Andric /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 12130b57cec5SDimitry Andric /// trigraph equivalent on entry to this function. 12140b57cec5SDimitry Andric unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 12150b57cec5SDimitry Andric unsigned Size = 0; 12160b57cec5SDimitry Andric while (isWhitespace(Ptr[Size])) { 12170b57cec5SDimitry Andric ++Size; 12180b57cec5SDimitry Andric 12190b57cec5SDimitry Andric if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 12200b57cec5SDimitry Andric continue; 12210b57cec5SDimitry Andric 12220b57cec5SDimitry Andric // If this is a \r\n or \n\r, skip the other half. 12230b57cec5SDimitry Andric if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 12240b57cec5SDimitry Andric Ptr[Size-1] != Ptr[Size]) 12250b57cec5SDimitry Andric ++Size; 12260b57cec5SDimitry Andric 12270b57cec5SDimitry Andric return Size; 12280b57cec5SDimitry Andric } 12290b57cec5SDimitry Andric 12300b57cec5SDimitry Andric // Not an escaped newline, must be a \t or something else. 12310b57cec5SDimitry Andric return 0; 12320b57cec5SDimitry Andric } 12330b57cec5SDimitry Andric 12340b57cec5SDimitry Andric /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 12350b57cec5SDimitry Andric /// them), skip over them and return the first non-escaped-newline found, 12360b57cec5SDimitry Andric /// otherwise return P. 12370b57cec5SDimitry Andric const char *Lexer::SkipEscapedNewLines(const char *P) { 12380b57cec5SDimitry Andric while (true) { 12390b57cec5SDimitry Andric const char *AfterEscape; 12400b57cec5SDimitry Andric if (*P == '\\') { 12410b57cec5SDimitry Andric AfterEscape = P+1; 12420b57cec5SDimitry Andric } else if (*P == '?') { 12430b57cec5SDimitry Andric // If not a trigraph for escape, bail out. 12440b57cec5SDimitry Andric if (P[1] != '?' || P[2] != '/') 12450b57cec5SDimitry Andric return P; 12460b57cec5SDimitry Andric // FIXME: Take LangOpts into account; the language might not 12470b57cec5SDimitry Andric // support trigraphs. 12480b57cec5SDimitry Andric AfterEscape = P+3; 12490b57cec5SDimitry Andric } else { 12500b57cec5SDimitry Andric return P; 12510b57cec5SDimitry Andric } 12520b57cec5SDimitry Andric 12530b57cec5SDimitry Andric unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 12540b57cec5SDimitry Andric if (NewLineSize == 0) return P; 12550b57cec5SDimitry Andric P = AfterEscape+NewLineSize; 12560b57cec5SDimitry Andric } 12570b57cec5SDimitry Andric } 12580b57cec5SDimitry Andric 12590b57cec5SDimitry Andric Optional<Token> Lexer::findNextToken(SourceLocation Loc, 12600b57cec5SDimitry Andric const SourceManager &SM, 12610b57cec5SDimitry Andric const LangOptions &LangOpts) { 12620b57cec5SDimitry Andric if (Loc.isMacroID()) { 12630b57cec5SDimitry Andric if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 12640b57cec5SDimitry Andric return None; 12650b57cec5SDimitry Andric } 12660b57cec5SDimitry Andric Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); 12670b57cec5SDimitry Andric 12680b57cec5SDimitry Andric // Break down the source location. 12690b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 12700b57cec5SDimitry Andric 12710b57cec5SDimitry Andric // Try to load the file buffer. 12720b57cec5SDimitry Andric bool InvalidTemp = false; 12730b57cec5SDimitry Andric StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); 12740b57cec5SDimitry Andric if (InvalidTemp) 12750b57cec5SDimitry Andric return None; 12760b57cec5SDimitry Andric 12770b57cec5SDimitry Andric const char *TokenBegin = File.data() + LocInfo.second; 12780b57cec5SDimitry Andric 12790b57cec5SDimitry Andric // Lex from the start of the given location. 12800b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), 12810b57cec5SDimitry Andric TokenBegin, File.end()); 12820b57cec5SDimitry Andric // Find the token. 12830b57cec5SDimitry Andric Token Tok; 12840b57cec5SDimitry Andric lexer.LexFromRawLexer(Tok); 12850b57cec5SDimitry Andric return Tok; 12860b57cec5SDimitry Andric } 12870b57cec5SDimitry Andric 12880b57cec5SDimitry Andric /// Checks that the given token is the first token that occurs after the 12890b57cec5SDimitry Andric /// given location (this excludes comments and whitespace). Returns the location 12900b57cec5SDimitry Andric /// immediately after the specified token. If the token is not found or the 12910b57cec5SDimitry Andric /// location is inside a macro, the returned source location will be invalid. 12920b57cec5SDimitry Andric SourceLocation Lexer::findLocationAfterToken( 12930b57cec5SDimitry Andric SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, 12940b57cec5SDimitry Andric const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { 12950b57cec5SDimitry Andric Optional<Token> Tok = findNextToken(Loc, SM, LangOpts); 12960b57cec5SDimitry Andric if (!Tok || Tok->isNot(TKind)) 12970b57cec5SDimitry Andric return {}; 12980b57cec5SDimitry Andric SourceLocation TokenLoc = Tok->getLocation(); 12990b57cec5SDimitry Andric 13000b57cec5SDimitry Andric // Calculate how much whitespace needs to be skipped if any. 13010b57cec5SDimitry Andric unsigned NumWhitespaceChars = 0; 13020b57cec5SDimitry Andric if (SkipTrailingWhitespaceAndNewLine) { 13030b57cec5SDimitry Andric const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); 13040b57cec5SDimitry Andric unsigned char C = *TokenEnd; 13050b57cec5SDimitry Andric while (isHorizontalWhitespace(C)) { 13060b57cec5SDimitry Andric C = *(++TokenEnd); 13070b57cec5SDimitry Andric NumWhitespaceChars++; 13080b57cec5SDimitry Andric } 13090b57cec5SDimitry Andric 13100b57cec5SDimitry Andric // Skip \r, \n, \r\n, or \n\r 13110b57cec5SDimitry Andric if (C == '\n' || C == '\r') { 13120b57cec5SDimitry Andric char PrevC = C; 13130b57cec5SDimitry Andric C = *(++TokenEnd); 13140b57cec5SDimitry Andric NumWhitespaceChars++; 13150b57cec5SDimitry Andric if ((C == '\n' || C == '\r') && C != PrevC) 13160b57cec5SDimitry Andric NumWhitespaceChars++; 13170b57cec5SDimitry Andric } 13180b57cec5SDimitry Andric } 13190b57cec5SDimitry Andric 13200b57cec5SDimitry Andric return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); 13210b57cec5SDimitry Andric } 13220b57cec5SDimitry Andric 13230b57cec5SDimitry Andric /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 13240b57cec5SDimitry Andric /// get its size, and return it. This is tricky in several cases: 13250b57cec5SDimitry Andric /// 1. If currently at the start of a trigraph, we warn about the trigraph, 13260b57cec5SDimitry Andric /// then either return the trigraph (skipping 3 chars) or the '?', 13270b57cec5SDimitry Andric /// depending on whether trigraphs are enabled or not. 13280b57cec5SDimitry Andric /// 2. If this is an escaped newline (potentially with whitespace between 13290b57cec5SDimitry Andric /// the backslash and newline), implicitly skip the newline and return 13300b57cec5SDimitry Andric /// the char after it. 13310b57cec5SDimitry Andric /// 13320b57cec5SDimitry Andric /// This handles the slow/uncommon case of the getCharAndSize method. Here we 13330b57cec5SDimitry Andric /// know that we can accumulate into Size, and that we have already incremented 13340b57cec5SDimitry Andric /// Ptr by Size bytes. 13350b57cec5SDimitry Andric /// 13360b57cec5SDimitry Andric /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 13370b57cec5SDimitry Andric /// be updated to match. 13380b57cec5SDimitry Andric char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 13390b57cec5SDimitry Andric Token *Tok) { 13400b57cec5SDimitry Andric // If we have a slash, look for an escaped newline. 13410b57cec5SDimitry Andric if (Ptr[0] == '\\') { 13420b57cec5SDimitry Andric ++Size; 13430b57cec5SDimitry Andric ++Ptr; 13440b57cec5SDimitry Andric Slash: 13450b57cec5SDimitry Andric // Common case, backslash-char where the char is not whitespace. 13460b57cec5SDimitry Andric if (!isWhitespace(Ptr[0])) return '\\'; 13470b57cec5SDimitry Andric 13480b57cec5SDimitry Andric // See if we have optional whitespace characters between the slash and 13490b57cec5SDimitry Andric // newline. 13500b57cec5SDimitry Andric if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 13510b57cec5SDimitry Andric // Remember that this token needs to be cleaned. 13520b57cec5SDimitry Andric if (Tok) Tok->setFlag(Token::NeedsCleaning); 13530b57cec5SDimitry Andric 13540b57cec5SDimitry Andric // Warn if there was whitespace between the backslash and newline. 13550b57cec5SDimitry Andric if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 13560b57cec5SDimitry Andric Diag(Ptr, diag::backslash_newline_space); 13570b57cec5SDimitry Andric 13580b57cec5SDimitry Andric // Found backslash<whitespace><newline>. Parse the char after it. 13590b57cec5SDimitry Andric Size += EscapedNewLineSize; 13600b57cec5SDimitry Andric Ptr += EscapedNewLineSize; 13610b57cec5SDimitry Andric 13620b57cec5SDimitry Andric // Use slow version to accumulate a correct size field. 13630b57cec5SDimitry Andric return getCharAndSizeSlow(Ptr, Size, Tok); 13640b57cec5SDimitry Andric } 13650b57cec5SDimitry Andric 13660b57cec5SDimitry Andric // Otherwise, this is not an escaped newline, just return the slash. 13670b57cec5SDimitry Andric return '\\'; 13680b57cec5SDimitry Andric } 13690b57cec5SDimitry Andric 13700b57cec5SDimitry Andric // If this is a trigraph, process it. 13710b57cec5SDimitry Andric if (Ptr[0] == '?' && Ptr[1] == '?') { 13720b57cec5SDimitry Andric // If this is actually a legal trigraph (not something like "??x"), emit 13730b57cec5SDimitry Andric // a trigraph warning. If so, and if trigraphs are enabled, return it. 1374*81ad6265SDimitry Andric if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr, 1375*81ad6265SDimitry Andric LangOpts.Trigraphs)) { 13760b57cec5SDimitry Andric // Remember that this token needs to be cleaned. 13770b57cec5SDimitry Andric if (Tok) Tok->setFlag(Token::NeedsCleaning); 13780b57cec5SDimitry Andric 13790b57cec5SDimitry Andric Ptr += 3; 13800b57cec5SDimitry Andric Size += 3; 13810b57cec5SDimitry Andric if (C == '\\') goto Slash; 13820b57cec5SDimitry Andric return C; 13830b57cec5SDimitry Andric } 13840b57cec5SDimitry Andric } 13850b57cec5SDimitry Andric 13860b57cec5SDimitry Andric // If this is neither, return a single character. 13870b57cec5SDimitry Andric ++Size; 13880b57cec5SDimitry Andric return *Ptr; 13890b57cec5SDimitry Andric } 13900b57cec5SDimitry Andric 13910b57cec5SDimitry Andric /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 13920b57cec5SDimitry Andric /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 13930b57cec5SDimitry Andric /// and that we have already incremented Ptr by Size bytes. 13940b57cec5SDimitry Andric /// 13950b57cec5SDimitry Andric /// NOTE: When this method is updated, getCharAndSizeSlow (above) should 13960b57cec5SDimitry Andric /// be updated to match. 13970b57cec5SDimitry Andric char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 13980b57cec5SDimitry Andric const LangOptions &LangOpts) { 13990b57cec5SDimitry Andric // If we have a slash, look for an escaped newline. 14000b57cec5SDimitry Andric if (Ptr[0] == '\\') { 14010b57cec5SDimitry Andric ++Size; 14020b57cec5SDimitry Andric ++Ptr; 14030b57cec5SDimitry Andric Slash: 14040b57cec5SDimitry Andric // Common case, backslash-char where the char is not whitespace. 14050b57cec5SDimitry Andric if (!isWhitespace(Ptr[0])) return '\\'; 14060b57cec5SDimitry Andric 14070b57cec5SDimitry Andric // See if we have optional whitespace characters followed by a newline. 14080b57cec5SDimitry Andric if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 14090b57cec5SDimitry Andric // Found backslash<whitespace><newline>. Parse the char after it. 14100b57cec5SDimitry Andric Size += EscapedNewLineSize; 14110b57cec5SDimitry Andric Ptr += EscapedNewLineSize; 14120b57cec5SDimitry Andric 14130b57cec5SDimitry Andric // Use slow version to accumulate a correct size field. 14140b57cec5SDimitry Andric return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); 14150b57cec5SDimitry Andric } 14160b57cec5SDimitry Andric 14170b57cec5SDimitry Andric // Otherwise, this is not an escaped newline, just return the slash. 14180b57cec5SDimitry Andric return '\\'; 14190b57cec5SDimitry Andric } 14200b57cec5SDimitry Andric 14210b57cec5SDimitry Andric // If this is a trigraph, process it. 14220b57cec5SDimitry Andric if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 14230b57cec5SDimitry Andric // If this is actually a legal trigraph (not something like "??x"), return 14240b57cec5SDimitry Andric // it. 14250b57cec5SDimitry Andric if (char C = GetTrigraphCharForLetter(Ptr[2])) { 14260b57cec5SDimitry Andric Ptr += 3; 14270b57cec5SDimitry Andric Size += 3; 14280b57cec5SDimitry Andric if (C == '\\') goto Slash; 14290b57cec5SDimitry Andric return C; 14300b57cec5SDimitry Andric } 14310b57cec5SDimitry Andric } 14320b57cec5SDimitry Andric 14330b57cec5SDimitry Andric // If this is neither, return a single character. 14340b57cec5SDimitry Andric ++Size; 14350b57cec5SDimitry Andric return *Ptr; 14360b57cec5SDimitry Andric } 14370b57cec5SDimitry Andric 14380b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 14390b57cec5SDimitry Andric // Helper methods for lexing. 14400b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 14410b57cec5SDimitry Andric 14420b57cec5SDimitry Andric /// Routine that indiscriminately sets the offset into the source file. 14430b57cec5SDimitry Andric void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { 14440b57cec5SDimitry Andric BufferPtr = BufferStart + Offset; 14450b57cec5SDimitry Andric if (BufferPtr > BufferEnd) 14460b57cec5SDimitry Andric BufferPtr = BufferEnd; 14470b57cec5SDimitry Andric // FIXME: What exactly does the StartOfLine bit mean? There are two 14480b57cec5SDimitry Andric // possible meanings for the "start" of the line: the first token on the 14490b57cec5SDimitry Andric // unexpanded line, or the first token on the expanded line. 14500b57cec5SDimitry Andric IsAtStartOfLine = StartOfLine; 14510b57cec5SDimitry Andric IsAtPhysicalStartOfLine = StartOfLine; 14520b57cec5SDimitry Andric } 14530b57cec5SDimitry Andric 1454349cc55cSDimitry Andric static bool isUnicodeWhitespace(uint32_t Codepoint) { 1455349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( 1456349cc55cSDimitry Andric UnicodeWhitespaceCharRanges); 1457349cc55cSDimitry Andric return UnicodeWhitespaceChars.contains(Codepoint); 1458349cc55cSDimitry Andric } 1459349cc55cSDimitry Andric 14600b57cec5SDimitry Andric static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { 14610b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) { 14620b57cec5SDimitry Andric return false; 1463480093f4SDimitry Andric } else if (LangOpts.DollarIdents && '$' == C) { 1464480093f4SDimitry Andric return true; 1465349cc55cSDimitry Andric } else if (LangOpts.CPlusPlus) { 1466349cc55cSDimitry Andric // A non-leading codepoint must have the XID_Continue property. 1467349cc55cSDimitry Andric // XIDContinueRanges doesn't contains characters also in XIDStartRanges, 1468349cc55cSDimitry Andric // so we need to check both tables. 1469349cc55cSDimitry Andric // '_' doesn't have the XID_Continue property but is allowed in C++. 1470349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1471349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges); 1472349cc55cSDimitry Andric return C == '_' || XIDStartChars.contains(C) || 1473349cc55cSDimitry Andric XIDContinueChars.contains(C); 1474349cc55cSDimitry Andric } else if (LangOpts.C11) { 14750b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11AllowedIDChars( 14760b57cec5SDimitry Andric C11AllowedIDCharRanges); 14770b57cec5SDimitry Andric return C11AllowedIDChars.contains(C); 14780b57cec5SDimitry Andric } else { 14790b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 14800b57cec5SDimitry Andric C99AllowedIDCharRanges); 14810b57cec5SDimitry Andric return C99AllowedIDChars.contains(C); 14820b57cec5SDimitry Andric } 14830b57cec5SDimitry Andric } 14840b57cec5SDimitry Andric 14850b57cec5SDimitry Andric static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) { 14860b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) { 14870b57cec5SDimitry Andric return false; 1488349cc55cSDimitry Andric } 1489349cc55cSDimitry Andric if (LangOpts.CPlusPlus) { 1490349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1491349cc55cSDimitry Andric // '_' doesn't have the XID_Start property but is allowed in C++. 1492349cc55cSDimitry Andric return C == '_' || XIDStartChars.contains(C); 1493349cc55cSDimitry Andric } 1494349cc55cSDimitry Andric if (!isAllowedIDChar(C, LangOpts)) 1495349cc55cSDimitry Andric return false; 1496349cc55cSDimitry Andric if (LangOpts.C11) { 14970b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( 14980b57cec5SDimitry Andric C11DisallowedInitialIDCharRanges); 14990b57cec5SDimitry Andric return !C11DisallowedInitialIDChars.contains(C); 1500349cc55cSDimitry Andric } 15010b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 15020b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges); 15030b57cec5SDimitry Andric return !C99DisallowedInitialIDChars.contains(C); 15040b57cec5SDimitry Andric } 15050b57cec5SDimitry Andric 15060b57cec5SDimitry Andric static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, 15070b57cec5SDimitry Andric const char *End) { 15080b57cec5SDimitry Andric return CharSourceRange::getCharRange(L.getSourceLocation(Begin), 15090b57cec5SDimitry Andric L.getSourceLocation(End)); 15100b57cec5SDimitry Andric } 15110b57cec5SDimitry Andric 15120b57cec5SDimitry Andric static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, 15130b57cec5SDimitry Andric CharSourceRange Range, bool IsFirst) { 15140b57cec5SDimitry Andric // Check C99 compatibility. 15150b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) { 15160b57cec5SDimitry Andric enum { 15170b57cec5SDimitry Andric CannotAppearInIdentifier = 0, 15180b57cec5SDimitry Andric CannotStartIdentifier 15190b57cec5SDimitry Andric }; 15200b57cec5SDimitry Andric 15210b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 15220b57cec5SDimitry Andric C99AllowedIDCharRanges); 15230b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 15240b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges); 15250b57cec5SDimitry Andric if (!C99AllowedIDChars.contains(C)) { 15260b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 15270b57cec5SDimitry Andric << Range 15280b57cec5SDimitry Andric << CannotAppearInIdentifier; 15290b57cec5SDimitry Andric } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) { 15300b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 15310b57cec5SDimitry Andric << Range 15320b57cec5SDimitry Andric << CannotStartIdentifier; 15330b57cec5SDimitry Andric } 15340b57cec5SDimitry Andric } 15350b57cec5SDimitry Andric } 15360b57cec5SDimitry Andric 15370b57cec5SDimitry Andric /// After encountering UTF-8 character C and interpreting it as an identifier 15380b57cec5SDimitry Andric /// character, check whether it's a homoglyph for a common non-identifier 15390b57cec5SDimitry Andric /// source character that is unlikely to be an intentional identifier 15400b57cec5SDimitry Andric /// character and warn if so. 15410b57cec5SDimitry Andric static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, 15420b57cec5SDimitry Andric CharSourceRange Range) { 15430b57cec5SDimitry Andric // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 15440b57cec5SDimitry Andric struct HomoglyphPair { 15450b57cec5SDimitry Andric uint32_t Character; 15460b57cec5SDimitry Andric char LooksLike; 15470b57cec5SDimitry Andric bool operator<(HomoglyphPair R) const { return Character < R.Character; } 15480b57cec5SDimitry Andric }; 15490b57cec5SDimitry Andric static constexpr HomoglyphPair SortedHomoglyphs[] = { 15500b57cec5SDimitry Andric {U'\u00ad', 0}, // SOFT HYPHEN 15510b57cec5SDimitry Andric {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK 15520b57cec5SDimitry Andric {U'\u037e', ';'}, // GREEK QUESTION MARK 15530b57cec5SDimitry Andric {U'\u200b', 0}, // ZERO WIDTH SPACE 15540b57cec5SDimitry Andric {U'\u200c', 0}, // ZERO WIDTH NON-JOINER 15550b57cec5SDimitry Andric {U'\u200d', 0}, // ZERO WIDTH JOINER 15560b57cec5SDimitry Andric {U'\u2060', 0}, // WORD JOINER 15570b57cec5SDimitry Andric {U'\u2061', 0}, // FUNCTION APPLICATION 15580b57cec5SDimitry Andric {U'\u2062', 0}, // INVISIBLE TIMES 15590b57cec5SDimitry Andric {U'\u2063', 0}, // INVISIBLE SEPARATOR 15600b57cec5SDimitry Andric {U'\u2064', 0}, // INVISIBLE PLUS 15610b57cec5SDimitry Andric {U'\u2212', '-'}, // MINUS SIGN 15620b57cec5SDimitry Andric {U'\u2215', '/'}, // DIVISION SLASH 15630b57cec5SDimitry Andric {U'\u2216', '\\'}, // SET MINUS 15640b57cec5SDimitry Andric {U'\u2217', '*'}, // ASTERISK OPERATOR 15650b57cec5SDimitry Andric {U'\u2223', '|'}, // DIVIDES 15660b57cec5SDimitry Andric {U'\u2227', '^'}, // LOGICAL AND 15670b57cec5SDimitry Andric {U'\u2236', ':'}, // RATIO 15680b57cec5SDimitry Andric {U'\u223c', '~'}, // TILDE OPERATOR 15690b57cec5SDimitry Andric {U'\ua789', ':'}, // MODIFIER LETTER COLON 15700b57cec5SDimitry Andric {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE 15710b57cec5SDimitry Andric {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK 15720b57cec5SDimitry Andric {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN 15730b57cec5SDimitry Andric {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN 15740b57cec5SDimitry Andric {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN 15750b57cec5SDimitry Andric {U'\uff06', '&'}, // FULLWIDTH AMPERSAND 15760b57cec5SDimitry Andric {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS 15770b57cec5SDimitry Andric {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS 15780b57cec5SDimitry Andric {U'\uff0a', '*'}, // FULLWIDTH ASTERISK 15790b57cec5SDimitry Andric {U'\uff0b', '+'}, // FULLWIDTH ASTERISK 15800b57cec5SDimitry Andric {U'\uff0c', ','}, // FULLWIDTH COMMA 15810b57cec5SDimitry Andric {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS 15820b57cec5SDimitry Andric {U'\uff0e', '.'}, // FULLWIDTH FULL STOP 15830b57cec5SDimitry Andric {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS 15840b57cec5SDimitry Andric {U'\uff1a', ':'}, // FULLWIDTH COLON 15850b57cec5SDimitry Andric {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON 15860b57cec5SDimitry Andric {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN 15870b57cec5SDimitry Andric {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN 15880b57cec5SDimitry Andric {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN 15890b57cec5SDimitry Andric {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK 15900b57cec5SDimitry Andric {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT 15910b57cec5SDimitry Andric {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET 15920b57cec5SDimitry Andric {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS 15930b57cec5SDimitry Andric {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET 15940b57cec5SDimitry Andric {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT 15950b57cec5SDimitry Andric {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET 15960b57cec5SDimitry Andric {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE 15970b57cec5SDimitry Andric {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET 15980b57cec5SDimitry Andric {U'\uff5e', '~'}, // FULLWIDTH TILDE 15990b57cec5SDimitry Andric {0, 0} 16000b57cec5SDimitry Andric }; 16010b57cec5SDimitry Andric auto Homoglyph = 16020b57cec5SDimitry Andric std::lower_bound(std::begin(SortedHomoglyphs), 16030b57cec5SDimitry Andric std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'}); 16040b57cec5SDimitry Andric if (Homoglyph->Character == C) { 16050b57cec5SDimitry Andric llvm::SmallString<5> CharBuf; 16060b57cec5SDimitry Andric { 16070b57cec5SDimitry Andric llvm::raw_svector_ostream CharOS(CharBuf); 16080b57cec5SDimitry Andric llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); 16090b57cec5SDimitry Andric } 16100b57cec5SDimitry Andric if (Homoglyph->LooksLike) { 16110b57cec5SDimitry Andric const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; 16120b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) 16130b57cec5SDimitry Andric << Range << CharBuf << LooksLikeStr; 16140b57cec5SDimitry Andric } else { 16150b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width) 16160b57cec5SDimitry Andric << Range << CharBuf; 16170b57cec5SDimitry Andric } 16180b57cec5SDimitry Andric } 16190b57cec5SDimitry Andric } 16200b57cec5SDimitry Andric 1621349cc55cSDimitry Andric static void diagnoseInvalidUnicodeCodepointInIdentifier( 1622349cc55cSDimitry Andric DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, 1623349cc55cSDimitry Andric CharSourceRange Range, bool IsFirst) { 1624349cc55cSDimitry Andric if (isASCII(CodePoint)) 1625349cc55cSDimitry Andric return; 1626349cc55cSDimitry Andric 1627349cc55cSDimitry Andric bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts); 1628349cc55cSDimitry Andric bool IsIDContinue = IsIDStart || isAllowedIDChar(CodePoint, LangOpts); 1629349cc55cSDimitry Andric 1630349cc55cSDimitry Andric if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue)) 1631349cc55cSDimitry Andric return; 1632349cc55cSDimitry Andric 1633349cc55cSDimitry Andric bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue; 1634349cc55cSDimitry Andric 1635349cc55cSDimitry Andric llvm::SmallString<5> CharBuf; 1636349cc55cSDimitry Andric llvm::raw_svector_ostream CharOS(CharBuf); 1637349cc55cSDimitry Andric llvm::write_hex(CharOS, CodePoint, llvm::HexPrintStyle::Upper, 4); 1638349cc55cSDimitry Andric 1639349cc55cSDimitry Andric if (!IsFirst || InvalidOnlyAtStart) { 1640349cc55cSDimitry Andric Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier) 1641349cc55cSDimitry Andric << Range << CharBuf << int(InvalidOnlyAtStart) 1642349cc55cSDimitry Andric << FixItHint::CreateRemoval(Range); 1643349cc55cSDimitry Andric } else { 1644349cc55cSDimitry Andric Diags.Report(Range.getBegin(), diag::err_character_not_allowed) 1645349cc55cSDimitry Andric << Range << CharBuf << FixItHint::CreateRemoval(Range); 1646349cc55cSDimitry Andric } 1647349cc55cSDimitry Andric } 1648349cc55cSDimitry Andric 16490b57cec5SDimitry Andric bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, 16500b57cec5SDimitry Andric Token &Result) { 16510b57cec5SDimitry Andric const char *UCNPtr = CurPtr + Size; 16520b57cec5SDimitry Andric uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr); 1653349cc55cSDimitry Andric if (CodePoint == 0) { 16540b57cec5SDimitry Andric return false; 1655349cc55cSDimitry Andric } 16560b57cec5SDimitry Andric 1657349cc55cSDimitry Andric if (!isAllowedIDChar(CodePoint, LangOpts)) { 1658349cc55cSDimitry Andric if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) 1659349cc55cSDimitry Andric return false; 1660349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1661349cc55cSDimitry Andric !PP->isPreprocessedOutput()) 1662349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier( 1663349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, CodePoint, 1664349cc55cSDimitry Andric makeCharRange(*this, CurPtr, UCNPtr), 1665349cc55cSDimitry Andric /*IsFirst=*/false); 1666349cc55cSDimitry Andric 1667349cc55cSDimitry Andric // We got a unicode codepoint that is neither a space nor a 1668349cc55cSDimitry Andric // a valid identifier part. 1669349cc55cSDimitry Andric // Carry on as if the codepoint was valid for recovery purposes. 1670349cc55cSDimitry Andric } else if (!isLexingRawMode()) 16710b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 16720b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UCNPtr), 16730b57cec5SDimitry Andric /*IsFirst=*/false); 16740b57cec5SDimitry Andric 16750b57cec5SDimitry Andric Result.setFlag(Token::HasUCN); 16760b57cec5SDimitry Andric if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || 16770b57cec5SDimitry Andric (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U')) 16780b57cec5SDimitry Andric CurPtr = UCNPtr; 16790b57cec5SDimitry Andric else 16800b57cec5SDimitry Andric while (CurPtr != UCNPtr) 16810b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result); 16820b57cec5SDimitry Andric return true; 16830b57cec5SDimitry Andric } 16840b57cec5SDimitry Andric 16850b57cec5SDimitry Andric bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { 16860b57cec5SDimitry Andric const char *UnicodePtr = CurPtr; 16870b57cec5SDimitry Andric llvm::UTF32 CodePoint; 16880b57cec5SDimitry Andric llvm::ConversionResult Result = 16890b57cec5SDimitry Andric llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr, 16900b57cec5SDimitry Andric (const llvm::UTF8 *)BufferEnd, 16910b57cec5SDimitry Andric &CodePoint, 16920b57cec5SDimitry Andric llvm::strictConversion); 1693349cc55cSDimitry Andric if (Result != llvm::conversionOK) 16940b57cec5SDimitry Andric return false; 16950b57cec5SDimitry Andric 1696349cc55cSDimitry Andric if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) { 1697349cc55cSDimitry Andric if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) 1698349cc55cSDimitry Andric return false; 1699349cc55cSDimitry Andric 1700349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1701349cc55cSDimitry Andric !PP->isPreprocessedOutput()) 1702349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier( 1703349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, CodePoint, 1704349cc55cSDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false); 1705349cc55cSDimitry Andric // We got a unicode codepoint that is neither a space nor a 1706349cc55cSDimitry Andric // a valid identifier part. Carry on as if the codepoint was 1707349cc55cSDimitry Andric // valid for recovery purposes. 1708349cc55cSDimitry Andric } else if (!isLexingRawMode()) { 17090b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 17100b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr), 17110b57cec5SDimitry Andric /*IsFirst=*/false); 17120b57cec5SDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, 17130b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr)); 17140b57cec5SDimitry Andric } 17150b57cec5SDimitry Andric 17160b57cec5SDimitry Andric CurPtr = UnicodePtr; 17170b57cec5SDimitry Andric return true; 17180b57cec5SDimitry Andric } 17190b57cec5SDimitry Andric 1720349cc55cSDimitry Andric bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C, 1721349cc55cSDimitry Andric const char *CurPtr) { 1722349cc55cSDimitry Andric if (isAllowedInitiallyIDChar(C, LangOpts)) { 1723349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1724349cc55cSDimitry Andric !PP->isPreprocessedOutput()) { 1725349cc55cSDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, 1726349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr), 1727349cc55cSDimitry Andric /*IsFirst=*/true); 1728349cc55cSDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, 1729349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr)); 1730349cc55cSDimitry Andric } 1731349cc55cSDimitry Andric 1732349cc55cSDimitry Andric MIOpt.ReadToken(); 1733349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 1734349cc55cSDimitry Andric } 1735349cc55cSDimitry Andric 1736349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1737349cc55cSDimitry Andric !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && 1738349cc55cSDimitry Andric !isAllowedInitiallyIDChar(C, LangOpts) && !isUnicodeWhitespace(C)) { 1739349cc55cSDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally. 1740349cc55cSDimitry Andric // Instead of letting the parser complain about the unknown token, 1741349cc55cSDimitry Andric // just drop the character. 1742349cc55cSDimitry Andric // Note that we can /only/ do this when the non-ASCII character is actually 1743349cc55cSDimitry Andric // spelled as Unicode, not written as a UCN. The standard requires that 1744349cc55cSDimitry Andric // we not throw away any possible preprocessor tokens, but there's a 1745349cc55cSDimitry Andric // loophole in the mapping of Unicode characters to basic character set 1746349cc55cSDimitry Andric // characters that allows us to map these particular characters to, say, 1747349cc55cSDimitry Andric // whitespace. 1748349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier( 1749349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, C, 1750349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true); 1751349cc55cSDimitry Andric BufferPtr = CurPtr; 1752349cc55cSDimitry Andric return false; 1753349cc55cSDimitry Andric } 1754349cc55cSDimitry Andric 1755349cc55cSDimitry Andric // Otherwise, we have an explicit UCN or a character that's unlikely to show 1756349cc55cSDimitry Andric // up by accident. 1757349cc55cSDimitry Andric MIOpt.ReadToken(); 1758349cc55cSDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 1759349cc55cSDimitry Andric return true; 1760349cc55cSDimitry Andric } 1761349cc55cSDimitry Andric 1762349cc55cSDimitry Andric bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) { 1763349cc55cSDimitry Andric // Match [_A-Za-z0-9]*, we have already matched an identifier start. 1764349cc55cSDimitry Andric while (true) { 1765349cc55cSDimitry Andric unsigned char C = *CurPtr; 1766349cc55cSDimitry Andric // Fast path. 1767349cc55cSDimitry Andric if (isAsciiIdentifierContinue(C)) { 1768349cc55cSDimitry Andric ++CurPtr; 1769349cc55cSDimitry Andric continue; 1770349cc55cSDimitry Andric } 1771349cc55cSDimitry Andric 17720b57cec5SDimitry Andric unsigned Size; 1773349cc55cSDimitry Andric // Slow path: handle trigraph, unicode codepoints, UCNs. 1774349cc55cSDimitry Andric C = getCharAndSize(CurPtr, Size); 1775349cc55cSDimitry Andric if (isAsciiIdentifierContinue(C)) { 1776349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1777349cc55cSDimitry Andric continue; 1778349cc55cSDimitry Andric } 1779349cc55cSDimitry Andric if (C == '$') { 1780349cc55cSDimitry Andric // If we hit a $ and they are not supported in identifiers, we are done. 1781349cc55cSDimitry Andric if (!LangOpts.DollarIdents) 1782349cc55cSDimitry Andric break; 1783349cc55cSDimitry Andric // Otherwise, emit a diagnostic and continue. 1784349cc55cSDimitry Andric if (!isLexingRawMode()) 1785349cc55cSDimitry Andric Diag(CurPtr, diag::ext_dollar_in_identifier); 1786349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1787349cc55cSDimitry Andric continue; 1788349cc55cSDimitry Andric } 1789349cc55cSDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 1790349cc55cSDimitry Andric continue; 1791349cc55cSDimitry Andric if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 1792349cc55cSDimitry Andric continue; 1793349cc55cSDimitry Andric // Neither an expected Unicode codepoint nor a UCN. 1794349cc55cSDimitry Andric break; 1795349cc55cSDimitry Andric } 17960b57cec5SDimitry Andric 17970b57cec5SDimitry Andric const char *IdStart = BufferPtr; 17980b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::raw_identifier); 17990b57cec5SDimitry Andric Result.setRawIdentifierData(IdStart); 18000b57cec5SDimitry Andric 18010b57cec5SDimitry Andric // If we are in raw mode, return this identifier raw. There is no need to 18020b57cec5SDimitry Andric // look up identifier information or attempt to macro expand it. 18030b57cec5SDimitry Andric if (LexingRawMode) 18040b57cec5SDimitry Andric return true; 18050b57cec5SDimitry Andric 18060b57cec5SDimitry Andric // Fill in Result.IdentifierInfo and update the token kind, 18070b57cec5SDimitry Andric // looking up the identifier in the identifier table. 18080b57cec5SDimitry Andric IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 18090b57cec5SDimitry Andric // Note that we have to call PP->LookUpIdentifierInfo() even for code 18100b57cec5SDimitry Andric // completion, it writes IdentifierInfo into Result, and callers rely on it. 18110b57cec5SDimitry Andric 18120b57cec5SDimitry Andric // If the completion point is at the end of an identifier, we want to treat 18130b57cec5SDimitry Andric // the identifier as incomplete even if it resolves to a macro or a keyword. 18140b57cec5SDimitry Andric // This allows e.g. 'class^' to complete to 'classifier'. 18150b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr)) { 18160b57cec5SDimitry Andric // Return the code-completion token. 18170b57cec5SDimitry Andric Result.setKind(tok::code_completion); 18180b57cec5SDimitry Andric // Skip the code-completion char and all immediate identifier characters. 18190b57cec5SDimitry Andric // This ensures we get consistent behavior when completing at any point in 18200b57cec5SDimitry Andric // an identifier (i.e. at the start, in the middle, at the end). Note that 18210b57cec5SDimitry Andric // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code 18220b57cec5SDimitry Andric // simpler. 18230b57cec5SDimitry Andric assert(*CurPtr == 0 && "Completion character must be 0"); 18240b57cec5SDimitry Andric ++CurPtr; 18250b57cec5SDimitry Andric // Note that code completion token is not added as a separate character 18260b57cec5SDimitry Andric // when the completion point is at the end of the buffer. Therefore, we need 18270b57cec5SDimitry Andric // to check if the buffer has ended. 18280b57cec5SDimitry Andric if (CurPtr < BufferEnd) { 1829349cc55cSDimitry Andric while (isAsciiIdentifierContinue(*CurPtr)) 18300b57cec5SDimitry Andric ++CurPtr; 18310b57cec5SDimitry Andric } 18320b57cec5SDimitry Andric BufferPtr = CurPtr; 18330b57cec5SDimitry Andric return true; 18340b57cec5SDimitry Andric } 18350b57cec5SDimitry Andric 18360b57cec5SDimitry Andric // Finally, now that we know we have an identifier, pass this off to the 18370b57cec5SDimitry Andric // preprocessor, which may macro expand it or something. 18380b57cec5SDimitry Andric if (II->isHandleIdentifierCase()) 18390b57cec5SDimitry Andric return PP->HandleIdentifier(Result); 18400b57cec5SDimitry Andric 18410b57cec5SDimitry Andric return true; 18420b57cec5SDimitry Andric } 18430b57cec5SDimitry Andric 18440b57cec5SDimitry Andric /// isHexaLiteral - Return true if Start points to a hex constant. 18450b57cec5SDimitry Andric /// in microsoft mode (where this is supposed to be several different tokens). 18460b57cec5SDimitry Andric bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) { 18470b57cec5SDimitry Andric unsigned Size; 18480b57cec5SDimitry Andric char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts); 18490b57cec5SDimitry Andric if (C1 != '0') 18500b57cec5SDimitry Andric return false; 18510b57cec5SDimitry Andric char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts); 18520b57cec5SDimitry Andric return (C2 == 'x' || C2 == 'X'); 18530b57cec5SDimitry Andric } 18540b57cec5SDimitry Andric 18550b57cec5SDimitry Andric /// LexNumericConstant - Lex the remainder of a integer or floating point 18560b57cec5SDimitry Andric /// constant. From[-1] is the first character lexed. Return the end of the 18570b57cec5SDimitry Andric /// constant. 18580b57cec5SDimitry Andric bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 18590b57cec5SDimitry Andric unsigned Size; 18600b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size); 18610b57cec5SDimitry Andric char PrevCh = 0; 18620b57cec5SDimitry Andric while (isPreprocessingNumberBody(C)) { 18630b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 18640b57cec5SDimitry Andric PrevCh = C; 18650b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 18660b57cec5SDimitry Andric } 18670b57cec5SDimitry Andric 18680b57cec5SDimitry Andric // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 18690b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { 18700b57cec5SDimitry Andric // If we are in Microsoft mode, don't continue if the constant is hex. 18710b57cec5SDimitry Andric // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1 18720b57cec5SDimitry Andric if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts)) 18730b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 18740b57cec5SDimitry Andric } 18750b57cec5SDimitry Andric 18760b57cec5SDimitry Andric // If we have a hex FP constant, continue. 18770b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) { 18780b57cec5SDimitry Andric // Outside C99 and C++17, we accept hexadecimal floating point numbers as a 18790b57cec5SDimitry Andric // not-quite-conforming extension. Only do so if this looks like it's 18800b57cec5SDimitry Andric // actually meant to be a hexfloat, and not if it has a ud-suffix. 18810b57cec5SDimitry Andric bool IsHexFloat = true; 18820b57cec5SDimitry Andric if (!LangOpts.C99) { 18830b57cec5SDimitry Andric if (!isHexaLiteral(BufferPtr, LangOpts)) 18840b57cec5SDimitry Andric IsHexFloat = false; 1885*81ad6265SDimitry Andric else if (!LangOpts.CPlusPlus17 && 18860b57cec5SDimitry Andric std::find(BufferPtr, CurPtr, '_') != CurPtr) 18870b57cec5SDimitry Andric IsHexFloat = false; 18880b57cec5SDimitry Andric } 18890b57cec5SDimitry Andric if (IsHexFloat) 18900b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 18910b57cec5SDimitry Andric } 18920b57cec5SDimitry Andric 18930b57cec5SDimitry Andric // If we have a digit separator, continue. 1894*81ad6265SDimitry Andric if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C2x)) { 18950b57cec5SDimitry Andric unsigned NextSize; 1896*81ad6265SDimitry Andric char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts); 1897349cc55cSDimitry Andric if (isAsciiIdentifierContinue(Next)) { 18980b57cec5SDimitry Andric if (!isLexingRawMode()) 1899*81ad6265SDimitry Andric Diag(CurPtr, LangOpts.CPlusPlus 1900fe6060f1SDimitry Andric ? diag::warn_cxx11_compat_digit_separator 1901fe6060f1SDimitry Andric : diag::warn_c2x_compat_digit_separator); 19020b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 19030b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, NextSize, Result); 19040b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 19050b57cec5SDimitry Andric } 19060b57cec5SDimitry Andric } 19070b57cec5SDimitry Andric 19080b57cec5SDimitry Andric // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. 19090b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 19100b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 19110b57cec5SDimitry Andric if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 19120b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 19130b57cec5SDimitry Andric 19140b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 19150b57cec5SDimitry Andric const char *TokStart = BufferPtr; 19160b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 19170b57cec5SDimitry Andric Result.setLiteralData(TokStart); 19180b57cec5SDimitry Andric return true; 19190b57cec5SDimitry Andric } 19200b57cec5SDimitry Andric 19210b57cec5SDimitry Andric /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 19220b57cec5SDimitry Andric /// in C++11, or warn on a ud-suffix in C++98. 19230b57cec5SDimitry Andric const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, 19240b57cec5SDimitry Andric bool IsStringLiteral) { 1925*81ad6265SDimitry Andric assert(LangOpts.CPlusPlus); 19260b57cec5SDimitry Andric 19270b57cec5SDimitry Andric // Maximally munch an identifier. 19280b57cec5SDimitry Andric unsigned Size; 19290b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size); 19300b57cec5SDimitry Andric bool Consumed = false; 19310b57cec5SDimitry Andric 1932349cc55cSDimitry Andric if (!isAsciiIdentifierStart(C)) { 19330b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 19340b57cec5SDimitry Andric Consumed = true; 19350b57cec5SDimitry Andric else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 19360b57cec5SDimitry Andric Consumed = true; 19370b57cec5SDimitry Andric else 19380b57cec5SDimitry Andric return CurPtr; 19390b57cec5SDimitry Andric } 19400b57cec5SDimitry Andric 1941*81ad6265SDimitry Andric if (!LangOpts.CPlusPlus11) { 19420b57cec5SDimitry Andric if (!isLexingRawMode()) 19430b57cec5SDimitry Andric Diag(CurPtr, 19440b57cec5SDimitry Andric C == '_' ? diag::warn_cxx11_compat_user_defined_literal 19450b57cec5SDimitry Andric : diag::warn_cxx11_compat_reserved_user_defined_literal) 19460b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 19470b57cec5SDimitry Andric return CurPtr; 19480b57cec5SDimitry Andric } 19490b57cec5SDimitry Andric 19500b57cec5SDimitry Andric // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix 19510b57cec5SDimitry Andric // that does not start with an underscore is ill-formed. As a conforming 19520b57cec5SDimitry Andric // extension, we treat all such suffixes as if they had whitespace before 19530b57cec5SDimitry Andric // them. We assume a suffix beginning with a UCN or UTF-8 character is more 19540b57cec5SDimitry Andric // likely to be a ud-suffix than a macro, however, and accept that. 19550b57cec5SDimitry Andric if (!Consumed) { 19560b57cec5SDimitry Andric bool IsUDSuffix = false; 19570b57cec5SDimitry Andric if (C == '_') 19580b57cec5SDimitry Andric IsUDSuffix = true; 1959*81ad6265SDimitry Andric else if (IsStringLiteral && LangOpts.CPlusPlus14) { 19600b57cec5SDimitry Andric // In C++1y, we need to look ahead a few characters to see if this is a 19610b57cec5SDimitry Andric // valid suffix for a string literal or a numeric literal (this could be 19620b57cec5SDimitry Andric // the 'operator""if' defining a numeric literal operator). 19630b57cec5SDimitry Andric const unsigned MaxStandardSuffixLength = 3; 19640b57cec5SDimitry Andric char Buffer[MaxStandardSuffixLength] = { C }; 19650b57cec5SDimitry Andric unsigned Consumed = Size; 19660b57cec5SDimitry Andric unsigned Chars = 1; 19670b57cec5SDimitry Andric while (true) { 19680b57cec5SDimitry Andric unsigned NextSize; 1969*81ad6265SDimitry Andric char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts); 1970349cc55cSDimitry Andric if (!isAsciiIdentifierContinue(Next)) { 19715ffd83dbSDimitry Andric // End of suffix. Check whether this is on the allowed list. 19720b57cec5SDimitry Andric const StringRef CompleteSuffix(Buffer, Chars); 1973*81ad6265SDimitry Andric IsUDSuffix = 1974*81ad6265SDimitry Andric StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix); 19750b57cec5SDimitry Andric break; 19760b57cec5SDimitry Andric } 19770b57cec5SDimitry Andric 19780b57cec5SDimitry Andric if (Chars == MaxStandardSuffixLength) 19790b57cec5SDimitry Andric // Too long: can't be a standard suffix. 19800b57cec5SDimitry Andric break; 19810b57cec5SDimitry Andric 19820b57cec5SDimitry Andric Buffer[Chars++] = Next; 19830b57cec5SDimitry Andric Consumed += NextSize; 19840b57cec5SDimitry Andric } 19850b57cec5SDimitry Andric } 19860b57cec5SDimitry Andric 19870b57cec5SDimitry Andric if (!IsUDSuffix) { 19880b57cec5SDimitry Andric if (!isLexingRawMode()) 1989*81ad6265SDimitry Andric Diag(CurPtr, LangOpts.MSVCCompat 19900b57cec5SDimitry Andric ? diag::ext_ms_reserved_user_defined_literal 19910b57cec5SDimitry Andric : diag::ext_reserved_user_defined_literal) 19920b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 19930b57cec5SDimitry Andric return CurPtr; 19940b57cec5SDimitry Andric } 19950b57cec5SDimitry Andric 19960b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 19970b57cec5SDimitry Andric } 19980b57cec5SDimitry Andric 19990b57cec5SDimitry Andric Result.setFlag(Token::HasUDSuffix); 20000b57cec5SDimitry Andric while (true) { 20010b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 2002349cc55cSDimitry Andric if (isAsciiIdentifierContinue(C)) { 2003349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 2004349cc55cSDimitry Andric } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { 2005349cc55cSDimitry Andric } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { 2006349cc55cSDimitry Andric } else 2007349cc55cSDimitry Andric break; 20080b57cec5SDimitry Andric } 20090b57cec5SDimitry Andric 20100b57cec5SDimitry Andric return CurPtr; 20110b57cec5SDimitry Andric } 20120b57cec5SDimitry Andric 20130b57cec5SDimitry Andric /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 20140b57cec5SDimitry Andric /// either " or L" or u8" or u" or U". 20150b57cec5SDimitry Andric bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 20160b57cec5SDimitry Andric tok::TokenKind Kind) { 20170b57cec5SDimitry Andric const char *AfterQuote = CurPtr; 20180b57cec5SDimitry Andric // Does this string contain the \0 character? 20190b57cec5SDimitry Andric const char *NulCharacter = nullptr; 20200b57cec5SDimitry Andric 20210b57cec5SDimitry Andric if (!isLexingRawMode() && 20220b57cec5SDimitry Andric (Kind == tok::utf8_string_literal || 20230b57cec5SDimitry Andric Kind == tok::utf16_string_literal || 20240b57cec5SDimitry Andric Kind == tok::utf32_string_literal)) 2025*81ad6265SDimitry Andric Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal 20260b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal); 20270b57cec5SDimitry Andric 20280b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 20290b57cec5SDimitry Andric while (C != '"') { 20300b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by 20310b57cec5SDimitry Andric // getAndAdvanceChar. 20320b57cec5SDimitry Andric if (C == '\\') 20330b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 20340b57cec5SDimitry Andric 20350b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 20360b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 20370b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 20380b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1; 20390b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 20400b57cec5SDimitry Andric return true; 20410b57cec5SDimitry Andric } 20420b57cec5SDimitry Andric 20430b57cec5SDimitry Andric if (C == 0) { 20440b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 20450b57cec5SDimitry Andric if (ParsingFilename) 20460b57cec5SDimitry Andric codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false); 20470b57cec5SDimitry Andric else 20480b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 20490b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 20500b57cec5SDimitry Andric cutOffLexing(); 20510b57cec5SDimitry Andric return true; 20520b57cec5SDimitry Andric } 20530b57cec5SDimitry Andric 20540b57cec5SDimitry Andric NulCharacter = CurPtr-1; 20550b57cec5SDimitry Andric } 20560b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 20570b57cec5SDimitry Andric } 20580b57cec5SDimitry Andric 20590b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 2060*81ad6265SDimitry Andric if (LangOpts.CPlusPlus) 20610b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true); 20620b57cec5SDimitry Andric 20630b57cec5SDimitry Andric // If a nul character existed in the string, warn about it. 20640b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 20650b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1; 20660b57cec5SDimitry Andric 20670b57cec5SDimitry Andric // Update the location of the token as well as the BufferPtr instance var. 20680b57cec5SDimitry Andric const char *TokStart = BufferPtr; 20690b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 20700b57cec5SDimitry Andric Result.setLiteralData(TokStart); 20710b57cec5SDimitry Andric return true; 20720b57cec5SDimitry Andric } 20730b57cec5SDimitry Andric 20740b57cec5SDimitry Andric /// LexRawStringLiteral - Lex the remainder of a raw string literal, after 20750b57cec5SDimitry Andric /// having lexed R", LR", u8R", uR", or UR". 20760b57cec5SDimitry Andric bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, 20770b57cec5SDimitry Andric tok::TokenKind Kind) { 20780b57cec5SDimitry Andric // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: 20790b57cec5SDimitry Andric // Between the initial and final double quote characters of the raw string, 20800b57cec5SDimitry Andric // any transformations performed in phases 1 and 2 (trigraphs, 20810b57cec5SDimitry Andric // universal-character-names, and line splicing) are reverted. 20820b57cec5SDimitry Andric 20830b57cec5SDimitry Andric if (!isLexingRawMode()) 20840b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal); 20850b57cec5SDimitry Andric 20860b57cec5SDimitry Andric unsigned PrefixLen = 0; 20870b57cec5SDimitry Andric 20880b57cec5SDimitry Andric while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) 20890b57cec5SDimitry Andric ++PrefixLen; 20900b57cec5SDimitry Andric 20910b57cec5SDimitry Andric // If the last character was not a '(', then we didn't lex a valid delimiter. 20920b57cec5SDimitry Andric if (CurPtr[PrefixLen] != '(') { 20930b57cec5SDimitry Andric if (!isLexingRawMode()) { 20940b57cec5SDimitry Andric const char *PrefixEnd = &CurPtr[PrefixLen]; 20950b57cec5SDimitry Andric if (PrefixLen == 16) { 20960b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_raw_delim_too_long); 20970b57cec5SDimitry Andric } else { 20980b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_invalid_char_raw_delim) 20990b57cec5SDimitry Andric << StringRef(PrefixEnd, 1); 21000b57cec5SDimitry Andric } 21010b57cec5SDimitry Andric } 21020b57cec5SDimitry Andric 21030b57cec5SDimitry Andric // Search for the next '"' in hopes of salvaging the lexer. Unfortunately, 21040b57cec5SDimitry Andric // it's possible the '"' was intended to be part of the raw string, but 21050b57cec5SDimitry Andric // there's not much we can do about that. 21060b57cec5SDimitry Andric while (true) { 21070b57cec5SDimitry Andric char C = *CurPtr++; 21080b57cec5SDimitry Andric 21090b57cec5SDimitry Andric if (C == '"') 21100b57cec5SDimitry Andric break; 21110b57cec5SDimitry Andric if (C == 0 && CurPtr-1 == BufferEnd) { 21120b57cec5SDimitry Andric --CurPtr; 21130b57cec5SDimitry Andric break; 21140b57cec5SDimitry Andric } 21150b57cec5SDimitry Andric } 21160b57cec5SDimitry Andric 21170b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 21180b57cec5SDimitry Andric return true; 21190b57cec5SDimitry Andric } 21200b57cec5SDimitry Andric 21210b57cec5SDimitry Andric // Save prefix and move CurPtr past it 21220b57cec5SDimitry Andric const char *Prefix = CurPtr; 21230b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '(' 21240b57cec5SDimitry Andric 21250b57cec5SDimitry Andric while (true) { 21260b57cec5SDimitry Andric char C = *CurPtr++; 21270b57cec5SDimitry Andric 21280b57cec5SDimitry Andric if (C == ')') { 21290b57cec5SDimitry Andric // Check for prefix match and closing quote. 21300b57cec5SDimitry Andric if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') { 21310b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '"' 21320b57cec5SDimitry Andric break; 21330b57cec5SDimitry Andric } 21340b57cec5SDimitry Andric } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. 21350b57cec5SDimitry Andric if (!isLexingRawMode()) 21360b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_raw_string) 21370b57cec5SDimitry Andric << StringRef(Prefix, PrefixLen); 21380b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 21390b57cec5SDimitry Andric return true; 21400b57cec5SDimitry Andric } 21410b57cec5SDimitry Andric } 21420b57cec5SDimitry Andric 21430b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 2144*81ad6265SDimitry Andric if (LangOpts.CPlusPlus) 21450b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true); 21460b57cec5SDimitry Andric 21470b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 21480b57cec5SDimitry Andric const char *TokStart = BufferPtr; 21490b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 21500b57cec5SDimitry Andric Result.setLiteralData(TokStart); 21510b57cec5SDimitry Andric return true; 21520b57cec5SDimitry Andric } 21530b57cec5SDimitry Andric 21540b57cec5SDimitry Andric /// LexAngledStringLiteral - Lex the remainder of an angled string literal, 21550b57cec5SDimitry Andric /// after having lexed the '<' character. This is used for #include filenames. 21560b57cec5SDimitry Andric bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 21570b57cec5SDimitry Andric // Does this string contain the \0 character? 21580b57cec5SDimitry Andric const char *NulCharacter = nullptr; 21590b57cec5SDimitry Andric const char *AfterLessPos = CurPtr; 21600b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 21610b57cec5SDimitry Andric while (C != '>') { 21620b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by 21630b57cec5SDimitry Andric // getAndAdvanceChar. 21640b57cec5SDimitry Andric if (C == '\\') 21650b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 21660b57cec5SDimitry Andric 2167fe6060f1SDimitry Andric if (isVerticalWhitespace(C) || // Newline. 21680b57cec5SDimitry Andric (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file. 21690b57cec5SDimitry Andric // If the filename is unterminated, then it must just be a lone < 21700b57cec5SDimitry Andric // character. Return this as such. 21710b57cec5SDimitry Andric FormTokenWithChars(Result, AfterLessPos, tok::less); 21720b57cec5SDimitry Andric return true; 21730b57cec5SDimitry Andric } 21740b57cec5SDimitry Andric 21750b57cec5SDimitry Andric if (C == 0) { 21760b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr - 1)) { 21770b57cec5SDimitry Andric codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true); 21780b57cec5SDimitry Andric cutOffLexing(); 21790b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 21800b57cec5SDimitry Andric return true; 21810b57cec5SDimitry Andric } 21820b57cec5SDimitry Andric NulCharacter = CurPtr-1; 21830b57cec5SDimitry Andric } 21840b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 21850b57cec5SDimitry Andric } 21860b57cec5SDimitry Andric 21870b57cec5SDimitry Andric // If a nul character existed in the string, warn about it. 21880b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 21890b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1; 21900b57cec5SDimitry Andric 21910b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 21920b57cec5SDimitry Andric const char *TokStart = BufferPtr; 21930b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::header_name); 21940b57cec5SDimitry Andric Result.setLiteralData(TokStart); 21950b57cec5SDimitry Andric return true; 21960b57cec5SDimitry Andric } 21970b57cec5SDimitry Andric 21980b57cec5SDimitry Andric void Lexer::codeCompleteIncludedFile(const char *PathStart, 21990b57cec5SDimitry Andric const char *CompletionPoint, 22000b57cec5SDimitry Andric bool IsAngled) { 22010b57cec5SDimitry Andric // Completion only applies to the filename, after the last slash. 22020b57cec5SDimitry Andric StringRef PartialPath(PathStart, CompletionPoint - PathStart); 22035ffd83dbSDimitry Andric llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/"; 22045ffd83dbSDimitry Andric auto Slash = PartialPath.find_last_of(SlashChars); 22050b57cec5SDimitry Andric StringRef Dir = 22060b57cec5SDimitry Andric (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash); 22070b57cec5SDimitry Andric const char *StartOfFilename = 22080b57cec5SDimitry Andric (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1; 22090b57cec5SDimitry Andric // Code completion filter range is the filename only, up to completion point. 22100b57cec5SDimitry Andric PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get( 22110b57cec5SDimitry Andric StringRef(StartOfFilename, CompletionPoint - StartOfFilename))); 22125ffd83dbSDimitry Andric // We should replace the characters up to the closing quote or closest slash, 22135ffd83dbSDimitry Andric // if any. 22140b57cec5SDimitry Andric while (CompletionPoint < BufferEnd) { 22150b57cec5SDimitry Andric char Next = *(CompletionPoint + 1); 22160b57cec5SDimitry Andric if (Next == 0 || Next == '\r' || Next == '\n') 22170b57cec5SDimitry Andric break; 22180b57cec5SDimitry Andric ++CompletionPoint; 22190b57cec5SDimitry Andric if (Next == (IsAngled ? '>' : '"')) 22200b57cec5SDimitry Andric break; 22215ffd83dbSDimitry Andric if (llvm::is_contained(SlashChars, Next)) 22225ffd83dbSDimitry Andric break; 22230b57cec5SDimitry Andric } 22245ffd83dbSDimitry Andric 22250b57cec5SDimitry Andric PP->setCodeCompletionTokenRange( 22260b57cec5SDimitry Andric FileLoc.getLocWithOffset(StartOfFilename - BufferStart), 22270b57cec5SDimitry Andric FileLoc.getLocWithOffset(CompletionPoint - BufferStart)); 22280b57cec5SDimitry Andric PP->CodeCompleteIncludedFile(Dir, IsAngled); 22290b57cec5SDimitry Andric } 22300b57cec5SDimitry Andric 22310b57cec5SDimitry Andric /// LexCharConstant - Lex the remainder of a character constant, after having 22320b57cec5SDimitry Andric /// lexed either ' or L' or u8' or u' or U'. 22330b57cec5SDimitry Andric bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, 22340b57cec5SDimitry Andric tok::TokenKind Kind) { 22350b57cec5SDimitry Andric // Does this character contain the \0 character? 22360b57cec5SDimitry Andric const char *NulCharacter = nullptr; 22370b57cec5SDimitry Andric 22380b57cec5SDimitry Andric if (!isLexingRawMode()) { 22390b57cec5SDimitry Andric if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant) 2240*81ad6265SDimitry Andric Diag(BufferPtr, LangOpts.CPlusPlus 22410b57cec5SDimitry Andric ? diag::warn_cxx98_compat_unicode_literal 22420b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal); 22430b57cec5SDimitry Andric else if (Kind == tok::utf8_char_constant) 22440b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal); 22450b57cec5SDimitry Andric } 22460b57cec5SDimitry Andric 22470b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 22480b57cec5SDimitry Andric if (C == '\'') { 22490b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 22500b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_empty_character); 22510b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 22520b57cec5SDimitry Andric return true; 22530b57cec5SDimitry Andric } 22540b57cec5SDimitry Andric 22550b57cec5SDimitry Andric while (C != '\'') { 22560b57cec5SDimitry Andric // Skip escaped characters. 22570b57cec5SDimitry Andric if (C == '\\') 22580b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 22590b57cec5SDimitry Andric 22600b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 22610b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 22620b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 22630b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0; 22640b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 22650b57cec5SDimitry Andric return true; 22660b57cec5SDimitry Andric } 22670b57cec5SDimitry Andric 22680b57cec5SDimitry Andric if (C == 0) { 22690b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 22700b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 22710b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 22720b57cec5SDimitry Andric cutOffLexing(); 22730b57cec5SDimitry Andric return true; 22740b57cec5SDimitry Andric } 22750b57cec5SDimitry Andric 22760b57cec5SDimitry Andric NulCharacter = CurPtr-1; 22770b57cec5SDimitry Andric } 22780b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 22790b57cec5SDimitry Andric } 22800b57cec5SDimitry Andric 22810b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 2282*81ad6265SDimitry Andric if (LangOpts.CPlusPlus) 22830b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, false); 22840b57cec5SDimitry Andric 22850b57cec5SDimitry Andric // If a nul character existed in the character, warn about it. 22860b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 22870b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 0; 22880b57cec5SDimitry Andric 22890b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 22900b57cec5SDimitry Andric const char *TokStart = BufferPtr; 22910b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 22920b57cec5SDimitry Andric Result.setLiteralData(TokStart); 22930b57cec5SDimitry Andric return true; 22940b57cec5SDimitry Andric } 22950b57cec5SDimitry Andric 22960b57cec5SDimitry Andric /// SkipWhitespace - Efficiently skip over a series of whitespace characters. 22970b57cec5SDimitry Andric /// Update BufferPtr to point to the next non-whitespace character and return. 22980b57cec5SDimitry Andric /// 22990b57cec5SDimitry Andric /// This method forms a token and returns true if KeepWhitespaceMode is enabled. 23000b57cec5SDimitry Andric bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr, 23010b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 23020b57cec5SDimitry Andric // Whitespace - Skip it, then return the token after the whitespace. 23030b57cec5SDimitry Andric bool SawNewline = isVerticalWhitespace(CurPtr[-1]); 23040b57cec5SDimitry Andric 23050b57cec5SDimitry Andric unsigned char Char = *CurPtr; 23060b57cec5SDimitry Andric 2307e8d8bef9SDimitry Andric const char *lastNewLine = nullptr; 2308e8d8bef9SDimitry Andric auto setLastNewLine = [&](const char *Ptr) { 2309e8d8bef9SDimitry Andric lastNewLine = Ptr; 2310e8d8bef9SDimitry Andric if (!NewLinePtr) 2311e8d8bef9SDimitry Andric NewLinePtr = Ptr; 2312e8d8bef9SDimitry Andric }; 2313e8d8bef9SDimitry Andric if (SawNewline) 2314e8d8bef9SDimitry Andric setLastNewLine(CurPtr - 1); 2315e8d8bef9SDimitry Andric 23160b57cec5SDimitry Andric // Skip consecutive spaces efficiently. 23170b57cec5SDimitry Andric while (true) { 23180b57cec5SDimitry Andric // Skip horizontal whitespace very aggressively. 23190b57cec5SDimitry Andric while (isHorizontalWhitespace(Char)) 23200b57cec5SDimitry Andric Char = *++CurPtr; 23210b57cec5SDimitry Andric 23220b57cec5SDimitry Andric // Otherwise if we have something other than whitespace, we're done. 23230b57cec5SDimitry Andric if (!isVerticalWhitespace(Char)) 23240b57cec5SDimitry Andric break; 23250b57cec5SDimitry Andric 23260b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 23270b57cec5SDimitry Andric // End of preprocessor directive line, let LexTokenInternal handle this. 23280b57cec5SDimitry Andric BufferPtr = CurPtr; 23290b57cec5SDimitry Andric return false; 23300b57cec5SDimitry Andric } 23310b57cec5SDimitry Andric 23320b57cec5SDimitry Andric // OK, but handle newline. 2333e8d8bef9SDimitry Andric if (*CurPtr == '\n') 2334e8d8bef9SDimitry Andric setLastNewLine(CurPtr); 23350b57cec5SDimitry Andric SawNewline = true; 23360b57cec5SDimitry Andric Char = *++CurPtr; 23370b57cec5SDimitry Andric } 23380b57cec5SDimitry Andric 23390b57cec5SDimitry Andric // If the client wants us to return whitespace, return it now. 23400b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 23410b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 23420b57cec5SDimitry Andric if (SawNewline) { 23430b57cec5SDimitry Andric IsAtStartOfLine = true; 23440b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 23450b57cec5SDimitry Andric } 23460b57cec5SDimitry Andric // FIXME: The next token will not have LeadingSpace set. 23470b57cec5SDimitry Andric return true; 23480b57cec5SDimitry Andric } 23490b57cec5SDimitry Andric 23500b57cec5SDimitry Andric // If this isn't immediately after a newline, there is leading space. 23510b57cec5SDimitry Andric char PrevChar = CurPtr[-1]; 23520b57cec5SDimitry Andric bool HasLeadingSpace = !isVerticalWhitespace(PrevChar); 23530b57cec5SDimitry Andric 23540b57cec5SDimitry Andric Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace); 23550b57cec5SDimitry Andric if (SawNewline) { 23560b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 23570b57cec5SDimitry Andric TokAtPhysicalStartOfLine = true; 2358e8d8bef9SDimitry Andric 2359e8d8bef9SDimitry Andric if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) { 2360e8d8bef9SDimitry Andric if (auto *Handler = PP->getEmptylineHandler()) 2361e8d8bef9SDimitry Andric Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1), 2362e8d8bef9SDimitry Andric getSourceLocation(lastNewLine))); 2363e8d8bef9SDimitry Andric } 23640b57cec5SDimitry Andric } 23650b57cec5SDimitry Andric 23660b57cec5SDimitry Andric BufferPtr = CurPtr; 23670b57cec5SDimitry Andric return false; 23680b57cec5SDimitry Andric } 23690b57cec5SDimitry Andric 23700b57cec5SDimitry Andric /// We have just read the // characters from input. Skip until we find the 23710b57cec5SDimitry Andric /// newline character that terminates the comment. Then update BufferPtr and 23720b57cec5SDimitry Andric /// return. 23730b57cec5SDimitry Andric /// 23740b57cec5SDimitry Andric /// If we're in KeepCommentMode or any CommentHandler has inserted 23750b57cec5SDimitry Andric /// some tokens, this will store the first token and return true. 23760b57cec5SDimitry Andric bool Lexer::SkipLineComment(Token &Result, const char *CurPtr, 23770b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 23780b57cec5SDimitry Andric // If Line comments aren't explicitly enabled for this language, emit an 23790b57cec5SDimitry Andric // extension warning. 2380*81ad6265SDimitry Andric if (!LineComment) { 23811fd87a68SDimitry Andric if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags. 23820b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_line_comment); 23830b57cec5SDimitry Andric 23840b57cec5SDimitry Andric // Mark them enabled so we only emit one warning for this translation 23850b57cec5SDimitry Andric // unit. 2386*81ad6265SDimitry Andric LineComment = true; 23870b57cec5SDimitry Andric } 23880b57cec5SDimitry Andric 23890b57cec5SDimitry Andric // Scan over the body of the comment. The common case, when scanning, is that 23900b57cec5SDimitry Andric // the comment contains normal ascii characters with nothing interesting in 23910b57cec5SDimitry Andric // them. As such, optimize for this case with the inner loop. 23920b57cec5SDimitry Andric // 23930b57cec5SDimitry Andric // This loop terminates with CurPtr pointing at the newline (or end of buffer) 23940b57cec5SDimitry Andric // character that ends the line comment. 23950b57cec5SDimitry Andric char C; 23960b57cec5SDimitry Andric while (true) { 23970b57cec5SDimitry Andric C = *CurPtr; 23980b57cec5SDimitry Andric // Skip over characters in the fast loop. 23990b57cec5SDimitry Andric while (C != 0 && // Potentially EOF. 24000b57cec5SDimitry Andric C != '\n' && C != '\r') // Newline or DOS-style newline. 24010b57cec5SDimitry Andric C = *++CurPtr; 24020b57cec5SDimitry Andric 24030b57cec5SDimitry Andric const char *NextLine = CurPtr; 24040b57cec5SDimitry Andric if (C != 0) { 24050b57cec5SDimitry Andric // We found a newline, see if it's escaped. 24060b57cec5SDimitry Andric const char *EscapePtr = CurPtr-1; 24070b57cec5SDimitry Andric bool HasSpace = false; 24080b57cec5SDimitry Andric while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace. 24090b57cec5SDimitry Andric --EscapePtr; 24100b57cec5SDimitry Andric HasSpace = true; 24110b57cec5SDimitry Andric } 24120b57cec5SDimitry Andric 24130b57cec5SDimitry Andric if (*EscapePtr == '\\') 24140b57cec5SDimitry Andric // Escaped newline. 24150b57cec5SDimitry Andric CurPtr = EscapePtr; 24160b57cec5SDimitry Andric else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' && 24170b57cec5SDimitry Andric EscapePtr[-2] == '?' && LangOpts.Trigraphs) 24180b57cec5SDimitry Andric // Trigraph-escaped newline. 24190b57cec5SDimitry Andric CurPtr = EscapePtr-2; 24200b57cec5SDimitry Andric else 24210b57cec5SDimitry Andric break; // This is a newline, we're done. 24220b57cec5SDimitry Andric 24230b57cec5SDimitry Andric // If there was space between the backslash and newline, warn about it. 24240b57cec5SDimitry Andric if (HasSpace && !isLexingRawMode()) 24250b57cec5SDimitry Andric Diag(EscapePtr, diag::backslash_newline_space); 24260b57cec5SDimitry Andric } 24270b57cec5SDimitry Andric 24280b57cec5SDimitry Andric // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to 24290b57cec5SDimitry Andric // properly decode the character. Read it in raw mode to avoid emitting 24300b57cec5SDimitry Andric // diagnostics about things like trigraphs. If we see an escaped newline, 24310b57cec5SDimitry Andric // we'll handle it below. 24320b57cec5SDimitry Andric const char *OldPtr = CurPtr; 24330b57cec5SDimitry Andric bool OldRawMode = isLexingRawMode(); 24340b57cec5SDimitry Andric LexingRawMode = true; 24350b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 24360b57cec5SDimitry Andric LexingRawMode = OldRawMode; 24370b57cec5SDimitry Andric 24380b57cec5SDimitry Andric // If we only read only one character, then no special handling is needed. 24390b57cec5SDimitry Andric // We're done and can skip forward to the newline. 24400b57cec5SDimitry Andric if (C != 0 && CurPtr == OldPtr+1) { 24410b57cec5SDimitry Andric CurPtr = NextLine; 24420b57cec5SDimitry Andric break; 24430b57cec5SDimitry Andric } 24440b57cec5SDimitry Andric 24450b57cec5SDimitry Andric // If we read multiple characters, and one of those characters was a \r or 24460b57cec5SDimitry Andric // \n, then we had an escaped newline within the comment. Emit diagnostic 24470b57cec5SDimitry Andric // unless the next line is also a // comment. 24480b57cec5SDimitry Andric if (CurPtr != OldPtr + 1 && C != '/' && 24490b57cec5SDimitry Andric (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) { 24500b57cec5SDimitry Andric for (; OldPtr != CurPtr; ++OldPtr) 24510b57cec5SDimitry Andric if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 24520b57cec5SDimitry Andric // Okay, we found a // comment that ends in a newline, if the next 24530b57cec5SDimitry Andric // line is also a // comment, but has spaces, don't emit a diagnostic. 24540b57cec5SDimitry Andric if (isWhitespace(C)) { 24550b57cec5SDimitry Andric const char *ForwardPtr = CurPtr; 24560b57cec5SDimitry Andric while (isWhitespace(*ForwardPtr)) // Skip whitespace. 24570b57cec5SDimitry Andric ++ForwardPtr; 24580b57cec5SDimitry Andric if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 24590b57cec5SDimitry Andric break; 24600b57cec5SDimitry Andric } 24610b57cec5SDimitry Andric 24620b57cec5SDimitry Andric if (!isLexingRawMode()) 24630b57cec5SDimitry Andric Diag(OldPtr-1, diag::ext_multi_line_line_comment); 24640b57cec5SDimitry Andric break; 24650b57cec5SDimitry Andric } 24660b57cec5SDimitry Andric } 24670b57cec5SDimitry Andric 24680b57cec5SDimitry Andric if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) { 24690b57cec5SDimitry Andric --CurPtr; 24700b57cec5SDimitry Andric break; 24710b57cec5SDimitry Andric } 24720b57cec5SDimitry Andric 24730b57cec5SDimitry Andric if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 24740b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 24750b57cec5SDimitry Andric cutOffLexing(); 24760b57cec5SDimitry Andric return false; 24770b57cec5SDimitry Andric } 24780b57cec5SDimitry Andric } 24790b57cec5SDimitry Andric 24800b57cec5SDimitry Andric // Found but did not consume the newline. Notify comment handlers about the 24810b57cec5SDimitry Andric // comment unless we're in a #if 0 block. 24820b57cec5SDimitry Andric if (PP && !isLexingRawMode() && 24830b57cec5SDimitry Andric PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 24840b57cec5SDimitry Andric getSourceLocation(CurPtr)))) { 24850b57cec5SDimitry Andric BufferPtr = CurPtr; 24860b57cec5SDimitry Andric return true; // A token has to be returned. 24870b57cec5SDimitry Andric } 24880b57cec5SDimitry Andric 24890b57cec5SDimitry Andric // If we are returning comments as tokens, return this comment as a token. 24900b57cec5SDimitry Andric if (inKeepCommentMode()) 24910b57cec5SDimitry Andric return SaveLineComment(Result, CurPtr); 24920b57cec5SDimitry Andric 24930b57cec5SDimitry Andric // If we are inside a preprocessor directive and we see the end of line, 24940b57cec5SDimitry Andric // return immediately, so that the lexer can return this as an EOD token. 24950b57cec5SDimitry Andric if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 24960b57cec5SDimitry Andric BufferPtr = CurPtr; 24970b57cec5SDimitry Andric return false; 24980b57cec5SDimitry Andric } 24990b57cec5SDimitry Andric 25000b57cec5SDimitry Andric // Otherwise, eat the \n character. We don't care if this is a \n\r or 25010b57cec5SDimitry Andric // \r\n sequence. This is an efficiency hack (because we know the \n can't 25020b57cec5SDimitry Andric // contribute to another token), it isn't needed for correctness. Note that 25030b57cec5SDimitry Andric // this is ok even in KeepWhitespaceMode, because we would have returned the 25040b57cec5SDimitry Andric /// comment above in that mode. 2505e8d8bef9SDimitry Andric NewLinePtr = CurPtr++; 25060b57cec5SDimitry Andric 25070b57cec5SDimitry Andric // The next returned token is at the start of the line. 25080b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 25090b57cec5SDimitry Andric TokAtPhysicalStartOfLine = true; 25100b57cec5SDimitry Andric // No leading whitespace seen so far. 25110b57cec5SDimitry Andric Result.clearFlag(Token::LeadingSpace); 25120b57cec5SDimitry Andric BufferPtr = CurPtr; 25130b57cec5SDimitry Andric return false; 25140b57cec5SDimitry Andric } 25150b57cec5SDimitry Andric 25160b57cec5SDimitry Andric /// If in save-comment mode, package up this Line comment in an appropriate 25170b57cec5SDimitry Andric /// way and return it. 25180b57cec5SDimitry Andric bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) { 25190b57cec5SDimitry Andric // If we're not in a preprocessor directive, just return the // comment 25200b57cec5SDimitry Andric // directly. 25210b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::comment); 25220b57cec5SDimitry Andric 25230b57cec5SDimitry Andric if (!ParsingPreprocessorDirective || LexingRawMode) 25240b57cec5SDimitry Andric return true; 25250b57cec5SDimitry Andric 25260b57cec5SDimitry Andric // If this Line-style comment is in a macro definition, transmogrify it into 25270b57cec5SDimitry Andric // a C-style block comment. 25280b57cec5SDimitry Andric bool Invalid = false; 25290b57cec5SDimitry Andric std::string Spelling = PP->getSpelling(Result, &Invalid); 25300b57cec5SDimitry Andric if (Invalid) 25310b57cec5SDimitry Andric return true; 25320b57cec5SDimitry Andric 25330b57cec5SDimitry Andric assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?"); 25340b57cec5SDimitry Andric Spelling[1] = '*'; // Change prefix to "/*". 25350b57cec5SDimitry Andric Spelling += "*/"; // add suffix. 25360b57cec5SDimitry Andric 25370b57cec5SDimitry Andric Result.setKind(tok::comment); 25380b57cec5SDimitry Andric PP->CreateString(Spelling, Result, 25390b57cec5SDimitry Andric Result.getLocation(), Result.getLocation()); 25400b57cec5SDimitry Andric return true; 25410b57cec5SDimitry Andric } 25420b57cec5SDimitry Andric 25430b57cec5SDimitry Andric /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline 25440b57cec5SDimitry Andric /// character (either \\n or \\r) is part of an escaped newline sequence. Issue 25450b57cec5SDimitry Andric /// a diagnostic if so. We know that the newline is inside of a block comment. 2546*81ad6265SDimitry Andric static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, 2547*81ad6265SDimitry Andric bool Trigraphs) { 25480b57cec5SDimitry Andric assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); 25490b57cec5SDimitry Andric 2550fe6060f1SDimitry Andric // Position of the first trigraph in the ending sequence. 255104eeddc0SDimitry Andric const char *TrigraphPos = nullptr; 2552fe6060f1SDimitry Andric // Position of the first whitespace after a '\' in the ending sequence. 255304eeddc0SDimitry Andric const char *SpacePos = nullptr; 2554fe6060f1SDimitry Andric 2555fe6060f1SDimitry Andric while (true) { 25560b57cec5SDimitry Andric // Back up off the newline. 25570b57cec5SDimitry Andric --CurPtr; 25580b57cec5SDimitry Andric 25590b57cec5SDimitry Andric // If this is a two-character newline sequence, skip the other character. 25600b57cec5SDimitry Andric if (CurPtr[0] == '\n' || CurPtr[0] == '\r') { 25610b57cec5SDimitry Andric // \n\n or \r\r -> not escaped newline. 25620b57cec5SDimitry Andric if (CurPtr[0] == CurPtr[1]) 25630b57cec5SDimitry Andric return false; 25640b57cec5SDimitry Andric // \n\r or \r\n -> skip the newline. 25650b57cec5SDimitry Andric --CurPtr; 25660b57cec5SDimitry Andric } 25670b57cec5SDimitry Andric 25680b57cec5SDimitry Andric // If we have horizontal whitespace, skip over it. We allow whitespace 25690b57cec5SDimitry Andric // between the slash and newline. 25700b57cec5SDimitry Andric while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 2571fe6060f1SDimitry Andric SpacePos = CurPtr; 25720b57cec5SDimitry Andric --CurPtr; 25730b57cec5SDimitry Andric } 25740b57cec5SDimitry Andric 2575fe6060f1SDimitry Andric // If we have a slash, this is an escaped newline. 25760b57cec5SDimitry Andric if (*CurPtr == '\\') { 2577fe6060f1SDimitry Andric --CurPtr; 2578fe6060f1SDimitry Andric } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') { 2579fe6060f1SDimitry Andric // This is a trigraph encoding of a slash. 2580fe6060f1SDimitry Andric TrigraphPos = CurPtr - 2; 2581fe6060f1SDimitry Andric CurPtr -= 3; 25820b57cec5SDimitry Andric } else { 25830b57cec5SDimitry Andric return false; 2584fe6060f1SDimitry Andric } 25850b57cec5SDimitry Andric 2586fe6060f1SDimitry Andric // If the character preceding the escaped newline is a '*', then after line 2587fe6060f1SDimitry Andric // splicing we have a '*/' ending the comment. 2588fe6060f1SDimitry Andric if (*CurPtr == '*') 2589fe6060f1SDimitry Andric break; 25900b57cec5SDimitry Andric 2591fe6060f1SDimitry Andric if (*CurPtr != '\n' && *CurPtr != '\r') 2592fe6060f1SDimitry Andric return false; 2593fe6060f1SDimitry Andric } 2594fe6060f1SDimitry Andric 2595fe6060f1SDimitry Andric if (TrigraphPos) { 25960b57cec5SDimitry Andric // If no trigraphs are enabled, warn that we ignored this trigraph and 25970b57cec5SDimitry Andric // ignore this * character. 2598*81ad6265SDimitry Andric if (!Trigraphs) { 25990b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2600fe6060f1SDimitry Andric L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment); 26010b57cec5SDimitry Andric return false; 26020b57cec5SDimitry Andric } 26030b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2604fe6060f1SDimitry Andric L->Diag(TrigraphPos, diag::trigraph_ends_block_comment); 26050b57cec5SDimitry Andric } 26060b57cec5SDimitry Andric 26070b57cec5SDimitry Andric // Warn about having an escaped newline between the */ characters. 26080b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2609fe6060f1SDimitry Andric L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end); 26100b57cec5SDimitry Andric 26110b57cec5SDimitry Andric // If there was space between the backslash and newline, warn about it. 2612fe6060f1SDimitry Andric if (SpacePos && !L->isLexingRawMode()) 2613fe6060f1SDimitry Andric L->Diag(SpacePos, diag::backslash_newline_space); 26140b57cec5SDimitry Andric 26150b57cec5SDimitry Andric return true; 26160b57cec5SDimitry Andric } 26170b57cec5SDimitry Andric 26180b57cec5SDimitry Andric #ifdef __SSE2__ 26190b57cec5SDimitry Andric #include <emmintrin.h> 26200b57cec5SDimitry Andric #elif __ALTIVEC__ 26210b57cec5SDimitry Andric #include <altivec.h> 26220b57cec5SDimitry Andric #undef bool 26230b57cec5SDimitry Andric #endif 26240b57cec5SDimitry Andric 26250b57cec5SDimitry Andric /// We have just read from input the / and * characters that started a comment. 26260b57cec5SDimitry Andric /// Read until we find the * and / characters that terminate the comment. 26270b57cec5SDimitry Andric /// Note that we don't bother decoding trigraphs or escaped newlines in block 26280b57cec5SDimitry Andric /// comments, because they cannot cause the comment to end. The only thing 26290b57cec5SDimitry Andric /// that can happen is the comment could end with an escaped newline between 26300b57cec5SDimitry Andric /// the terminating * and /. 26310b57cec5SDimitry Andric /// 26320b57cec5SDimitry Andric /// If we're in KeepCommentMode or any CommentHandler has inserted 26330b57cec5SDimitry Andric /// some tokens, this will store the first token and return true. 26340b57cec5SDimitry Andric bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, 26350b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 26360b57cec5SDimitry Andric // Scan one character past where we should, looking for a '/' character. Once 26370b57cec5SDimitry Andric // we find it, check to see if it was preceded by a *. This common 26380b57cec5SDimitry Andric // optimization helps people who like to put a lot of * characters in their 26390b57cec5SDimitry Andric // comments. 26400b57cec5SDimitry Andric 26410b57cec5SDimitry Andric // The first character we get with newlines and trigraphs skipped to handle 26420b57cec5SDimitry Andric // the degenerate /*/ case below correctly if the * has an escaped newline 26430b57cec5SDimitry Andric // after it. 26440b57cec5SDimitry Andric unsigned CharSize; 26450b57cec5SDimitry Andric unsigned char C = getCharAndSize(CurPtr, CharSize); 26460b57cec5SDimitry Andric CurPtr += CharSize; 26470b57cec5SDimitry Andric if (C == 0 && CurPtr == BufferEnd+1) { 26480b57cec5SDimitry Andric if (!isLexingRawMode()) 26490b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment); 26500b57cec5SDimitry Andric --CurPtr; 26510b57cec5SDimitry Andric 26520b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. Since 26530b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token. 26540b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 26550b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 26560b57cec5SDimitry Andric return true; 26570b57cec5SDimitry Andric } 26580b57cec5SDimitry Andric 26590b57cec5SDimitry Andric BufferPtr = CurPtr; 26600b57cec5SDimitry Andric return false; 26610b57cec5SDimitry Andric } 26620b57cec5SDimitry Andric 26630b57cec5SDimitry Andric // Check to see if the first character after the '/*' is another /. If so, 26640b57cec5SDimitry Andric // then this slash does not end the block comment, it is part of it. 26650b57cec5SDimitry Andric if (C == '/') 26660b57cec5SDimitry Andric C = *CurPtr++; 26670b57cec5SDimitry Andric 26680b57cec5SDimitry Andric while (true) { 26690b57cec5SDimitry Andric // Skip over all non-interesting characters until we find end of buffer or a 26700b57cec5SDimitry Andric // (probably ending) '/' character. 26710b57cec5SDimitry Andric if (CurPtr + 24 < BufferEnd && 26720b57cec5SDimitry Andric // If there is a code-completion point avoid the fast scan because it 26730b57cec5SDimitry Andric // doesn't check for '\0'. 26740b57cec5SDimitry Andric !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { 26750b57cec5SDimitry Andric // While not aligned to a 16-byte boundary. 26760b57cec5SDimitry Andric while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) 26770b57cec5SDimitry Andric C = *CurPtr++; 26780b57cec5SDimitry Andric 26790b57cec5SDimitry Andric if (C == '/') goto FoundSlash; 26800b57cec5SDimitry Andric 26810b57cec5SDimitry Andric #ifdef __SSE2__ 26820b57cec5SDimitry Andric __m128i Slashes = _mm_set1_epi8('/'); 26830b57cec5SDimitry Andric while (CurPtr+16 <= BufferEnd) { 26840b57cec5SDimitry Andric int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr, 26850b57cec5SDimitry Andric Slashes)); 26860b57cec5SDimitry Andric if (cmp != 0) { 26870b57cec5SDimitry Andric // Adjust the pointer to point directly after the first slash. It's 26880b57cec5SDimitry Andric // not necessary to set C here, it will be overwritten at the end of 26890b57cec5SDimitry Andric // the outer loop. 26900b57cec5SDimitry Andric CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1; 26910b57cec5SDimitry Andric goto FoundSlash; 26920b57cec5SDimitry Andric } 26930b57cec5SDimitry Andric CurPtr += 16; 26940b57cec5SDimitry Andric } 26950b57cec5SDimitry Andric #elif __ALTIVEC__ 26960b57cec5SDimitry Andric __vector unsigned char Slashes = { 26970b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/', 26980b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/' 26990b57cec5SDimitry Andric }; 27000b57cec5SDimitry Andric while (CurPtr + 16 <= BufferEnd && 270113138422SDimitry Andric !vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) 27020b57cec5SDimitry Andric CurPtr += 16; 27030b57cec5SDimitry Andric #else 27040b57cec5SDimitry Andric // Scan for '/' quickly. Many block comments are very large. 27050b57cec5SDimitry Andric while (CurPtr[0] != '/' && 27060b57cec5SDimitry Andric CurPtr[1] != '/' && 27070b57cec5SDimitry Andric CurPtr[2] != '/' && 27080b57cec5SDimitry Andric CurPtr[3] != '/' && 27090b57cec5SDimitry Andric CurPtr+4 < BufferEnd) { 27100b57cec5SDimitry Andric CurPtr += 4; 27110b57cec5SDimitry Andric } 27120b57cec5SDimitry Andric #endif 27130b57cec5SDimitry Andric 27140b57cec5SDimitry Andric // It has to be one of the bytes scanned, increment to it and read one. 27150b57cec5SDimitry Andric C = *CurPtr++; 27160b57cec5SDimitry Andric } 27170b57cec5SDimitry Andric 27180b57cec5SDimitry Andric // Loop to scan the remainder. 27190b57cec5SDimitry Andric while (C != '/' && C != '\0') 27200b57cec5SDimitry Andric C = *CurPtr++; 27210b57cec5SDimitry Andric 27220b57cec5SDimitry Andric if (C == '/') { 27230b57cec5SDimitry Andric FoundSlash: 27240b57cec5SDimitry Andric if (CurPtr[-2] == '*') // We found the final */. We're done! 27250b57cec5SDimitry Andric break; 27260b57cec5SDimitry Andric 27270b57cec5SDimitry Andric if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 2728*81ad6265SDimitry Andric if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this, 2729*81ad6265SDimitry Andric LangOpts.Trigraphs)) { 27300b57cec5SDimitry Andric // We found the final */, though it had an escaped newline between the 27310b57cec5SDimitry Andric // * and /. We're done! 27320b57cec5SDimitry Andric break; 27330b57cec5SDimitry Andric } 27340b57cec5SDimitry Andric } 27350b57cec5SDimitry Andric if (CurPtr[0] == '*' && CurPtr[1] != '/') { 27360b57cec5SDimitry Andric // If this is a /* inside of the comment, emit a warning. Don't do this 27370b57cec5SDimitry Andric // if this is a /*/, which will end the comment. This misses cases with 27380b57cec5SDimitry Andric // embedded escaped newlines, but oh well. 27390b57cec5SDimitry Andric if (!isLexingRawMode()) 27400b57cec5SDimitry Andric Diag(CurPtr-1, diag::warn_nested_block_comment); 27410b57cec5SDimitry Andric } 27420b57cec5SDimitry Andric } else if (C == 0 && CurPtr == BufferEnd+1) { 27430b57cec5SDimitry Andric if (!isLexingRawMode()) 27440b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment); 27450b57cec5SDimitry Andric // Note: the user probably forgot a */. We could continue immediately 27460b57cec5SDimitry Andric // after the /*, but this would involve lexing a lot of what really is the 27470b57cec5SDimitry Andric // comment, which surely would confuse the parser. 27480b57cec5SDimitry Andric --CurPtr; 27490b57cec5SDimitry Andric 27500b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. Since 27510b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token. 27520b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 27530b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 27540b57cec5SDimitry Andric return true; 27550b57cec5SDimitry Andric } 27560b57cec5SDimitry Andric 27570b57cec5SDimitry Andric BufferPtr = CurPtr; 27580b57cec5SDimitry Andric return false; 27590b57cec5SDimitry Andric } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 27600b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 27610b57cec5SDimitry Andric cutOffLexing(); 27620b57cec5SDimitry Andric return false; 27630b57cec5SDimitry Andric } 27640b57cec5SDimitry Andric 27650b57cec5SDimitry Andric C = *CurPtr++; 27660b57cec5SDimitry Andric } 27670b57cec5SDimitry Andric 27680b57cec5SDimitry Andric // Notify comment handlers about the comment unless we're in a #if 0 block. 27690b57cec5SDimitry Andric if (PP && !isLexingRawMode() && 27700b57cec5SDimitry Andric PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 27710b57cec5SDimitry Andric getSourceLocation(CurPtr)))) { 27720b57cec5SDimitry Andric BufferPtr = CurPtr; 27730b57cec5SDimitry Andric return true; // A token has to be returned. 27740b57cec5SDimitry Andric } 27750b57cec5SDimitry Andric 27760b57cec5SDimitry Andric // If we are returning comments as tokens, return this comment as a token. 27770b57cec5SDimitry Andric if (inKeepCommentMode()) { 27780b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::comment); 27790b57cec5SDimitry Andric return true; 27800b57cec5SDimitry Andric } 27810b57cec5SDimitry Andric 27820b57cec5SDimitry Andric // It is common for the tokens immediately after a /**/ comment to be 27830b57cec5SDimitry Andric // whitespace. Instead of going through the big switch, handle it 27840b57cec5SDimitry Andric // efficiently now. This is safe even in KeepWhitespaceMode because we would 27850b57cec5SDimitry Andric // have already returned above with the comment as a token. 27860b57cec5SDimitry Andric if (isHorizontalWhitespace(*CurPtr)) { 27870b57cec5SDimitry Andric SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine); 27880b57cec5SDimitry Andric return false; 27890b57cec5SDimitry Andric } 27900b57cec5SDimitry Andric 27910b57cec5SDimitry Andric // Otherwise, just return so that the next character will be lexed as a token. 27920b57cec5SDimitry Andric BufferPtr = CurPtr; 27930b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 27940b57cec5SDimitry Andric return false; 27950b57cec5SDimitry Andric } 27960b57cec5SDimitry Andric 27970b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 27980b57cec5SDimitry Andric // Primary Lexing Entry Points 27990b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 28000b57cec5SDimitry Andric 28010b57cec5SDimitry Andric /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 28020b57cec5SDimitry Andric /// uninterpreted string. This switches the lexer out of directive mode. 28030b57cec5SDimitry Andric void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) { 28040b57cec5SDimitry Andric assert(ParsingPreprocessorDirective && ParsingFilename == false && 28050b57cec5SDimitry Andric "Must be in a preprocessing directive!"); 28060b57cec5SDimitry Andric Token Tmp; 2807480093f4SDimitry Andric Tmp.startToken(); 28080b57cec5SDimitry Andric 28090b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable. 28100b57cec5SDimitry Andric const char *CurPtr = BufferPtr; 28110b57cec5SDimitry Andric while (true) { 28120b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Tmp); 28130b57cec5SDimitry Andric switch (Char) { 28140b57cec5SDimitry Andric default: 28150b57cec5SDimitry Andric if (Result) 28160b57cec5SDimitry Andric Result->push_back(Char); 28170b57cec5SDimitry Andric break; 28180b57cec5SDimitry Andric case 0: // Null. 28190b57cec5SDimitry Andric // Found end of file? 28200b57cec5SDimitry Andric if (CurPtr-1 != BufferEnd) { 28210b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 28220b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 28230b57cec5SDimitry Andric cutOffLexing(); 28240b57cec5SDimitry Andric return; 28250b57cec5SDimitry Andric } 28260b57cec5SDimitry Andric 28270b57cec5SDimitry Andric // Nope, normal character, continue. 28280b57cec5SDimitry Andric if (Result) 28290b57cec5SDimitry Andric Result->push_back(Char); 28300b57cec5SDimitry Andric break; 28310b57cec5SDimitry Andric } 28320b57cec5SDimitry Andric // FALL THROUGH. 28330b57cec5SDimitry Andric LLVM_FALLTHROUGH; 28340b57cec5SDimitry Andric case '\r': 28350b57cec5SDimitry Andric case '\n': 28360b57cec5SDimitry Andric // Okay, we found the end of the line. First, back up past the \0, \r, \n. 28370b57cec5SDimitry Andric assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 28380b57cec5SDimitry Andric BufferPtr = CurPtr-1; 28390b57cec5SDimitry Andric 28400b57cec5SDimitry Andric // Next, lex the character, which should handle the EOD transition. 28410b57cec5SDimitry Andric Lex(Tmp); 28420b57cec5SDimitry Andric if (Tmp.is(tok::code_completion)) { 28430b57cec5SDimitry Andric if (PP) 28440b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 28450b57cec5SDimitry Andric Lex(Tmp); 28460b57cec5SDimitry Andric } 28470b57cec5SDimitry Andric assert(Tmp.is(tok::eod) && "Unexpected token!"); 28480b57cec5SDimitry Andric 28490b57cec5SDimitry Andric // Finally, we're done; 28500b57cec5SDimitry Andric return; 28510b57cec5SDimitry Andric } 28520b57cec5SDimitry Andric } 28530b57cec5SDimitry Andric } 28540b57cec5SDimitry Andric 28550b57cec5SDimitry Andric /// LexEndOfFile - CurPtr points to the end of this file. Handle this 28560b57cec5SDimitry Andric /// condition, reporting diagnostics and handling other edge cases as required. 28570b57cec5SDimitry Andric /// This returns true if Result contains a token, false if PP.Lex should be 28580b57cec5SDimitry Andric /// called again. 28590b57cec5SDimitry Andric bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 28600b57cec5SDimitry Andric // If we hit the end of the file while parsing a preprocessor directive, 28610b57cec5SDimitry Andric // end the preprocessor directive first. The next token returned will 28620b57cec5SDimitry Andric // then be the end of file. 28630b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 28640b57cec5SDimitry Andric // Done parsing the "line". 28650b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 28660b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 28670b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::eod); 28680b57cec5SDimitry Andric 28690b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive. 28700b57cec5SDimitry Andric if (PP) 28710b57cec5SDimitry Andric resetExtendedTokenMode(); 28720b57cec5SDimitry Andric return true; // Have a token. 28730b57cec5SDimitry Andric } 28740b57cec5SDimitry Andric 28750b57cec5SDimitry Andric // If we are in raw mode, return this event as an EOF token. Let the caller 28760b57cec5SDimitry Andric // that put us in raw mode handle the event. 28770b57cec5SDimitry Andric if (isLexingRawMode()) { 28780b57cec5SDimitry Andric Result.startToken(); 28790b57cec5SDimitry Andric BufferPtr = BufferEnd; 28800b57cec5SDimitry Andric FormTokenWithChars(Result, BufferEnd, tok::eof); 28810b57cec5SDimitry Andric return true; 28820b57cec5SDimitry Andric } 28830b57cec5SDimitry Andric 28840b57cec5SDimitry Andric if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { 28850b57cec5SDimitry Andric PP->setRecordedPreambleConditionalStack(ConditionalStack); 2886fe6060f1SDimitry Andric // If the preamble cuts off the end of a header guard, consider it guarded. 2887fe6060f1SDimitry Andric // The guard is valid for the preamble content itself, and for tools the 2888fe6060f1SDimitry Andric // most useful answer is "yes, this file has a header guard". 2889fe6060f1SDimitry Andric if (!ConditionalStack.empty()) 2890fe6060f1SDimitry Andric MIOpt.ExitTopLevelConditional(); 28910b57cec5SDimitry Andric ConditionalStack.clear(); 28920b57cec5SDimitry Andric } 28930b57cec5SDimitry Andric 28940b57cec5SDimitry Andric // Issue diagnostics for unterminated #if and missing newline. 28950b57cec5SDimitry Andric 28960b57cec5SDimitry Andric // If we are in a #if directive, emit an error. 28970b57cec5SDimitry Andric while (!ConditionalStack.empty()) { 28980b57cec5SDimitry Andric if (PP->getCodeCompletionFileLoc() != FileLoc) 28990b57cec5SDimitry Andric PP->Diag(ConditionalStack.back().IfLoc, 29000b57cec5SDimitry Andric diag::err_pp_unterminated_conditional); 29010b57cec5SDimitry Andric ConditionalStack.pop_back(); 29020b57cec5SDimitry Andric } 29030b57cec5SDimitry Andric 29040b57cec5SDimitry Andric // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 29050b57cec5SDimitry Andric // a pedwarn. 29060b57cec5SDimitry Andric if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { 29070b57cec5SDimitry Andric DiagnosticsEngine &Diags = PP->getDiagnostics(); 2908*81ad6265SDimitry Andric SourceLocation EndLoc = getSourceLocation(BufferEnd); 29090b57cec5SDimitry Andric unsigned DiagID; 29100b57cec5SDimitry Andric 29110b57cec5SDimitry Andric if (LangOpts.CPlusPlus11) { 29120b57cec5SDimitry Andric // C++11 [lex.phases] 2.2 p2 29130b57cec5SDimitry Andric // Prefer the C++98 pedantic compatibility warning over the generic, 29140b57cec5SDimitry Andric // non-extension, user-requested "missing newline at EOF" warning. 29150b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) { 29160b57cec5SDimitry Andric DiagID = diag::warn_cxx98_compat_no_newline_eof; 29170b57cec5SDimitry Andric } else { 29180b57cec5SDimitry Andric DiagID = diag::warn_no_newline_eof; 29190b57cec5SDimitry Andric } 29200b57cec5SDimitry Andric } else { 29210b57cec5SDimitry Andric DiagID = diag::ext_no_newline_eof; 29220b57cec5SDimitry Andric } 29230b57cec5SDimitry Andric 29240b57cec5SDimitry Andric Diag(BufferEnd, DiagID) 29250b57cec5SDimitry Andric << FixItHint::CreateInsertion(EndLoc, "\n"); 29260b57cec5SDimitry Andric } 29270b57cec5SDimitry Andric 29280b57cec5SDimitry Andric BufferPtr = CurPtr; 29290b57cec5SDimitry Andric 29300b57cec5SDimitry Andric // Finally, let the preprocessor handle this. 2931*81ad6265SDimitry Andric return PP->HandleEndOfFile(Result, isPragmaLexer()); 29320b57cec5SDimitry Andric } 29330b57cec5SDimitry Andric 29340b57cec5SDimitry Andric /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 29350b57cec5SDimitry Andric /// the specified lexer will return a tok::l_paren token, 0 if it is something 29360b57cec5SDimitry Andric /// else and 2 if there are no more tokens in the buffer controlled by the 29370b57cec5SDimitry Andric /// lexer. 29380b57cec5SDimitry Andric unsigned Lexer::isNextPPTokenLParen() { 29390b57cec5SDimitry Andric assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 29400b57cec5SDimitry Andric 2941*81ad6265SDimitry Andric if (isDependencyDirectivesLexer()) { 2942*81ad6265SDimitry Andric if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) 2943*81ad6265SDimitry Andric return 2; 2944*81ad6265SDimitry Andric return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( 2945*81ad6265SDimitry Andric tok::l_paren); 2946*81ad6265SDimitry Andric } 2947*81ad6265SDimitry Andric 29480b57cec5SDimitry Andric // Switch to 'skipping' mode. This will ensure that we can lex a token 29490b57cec5SDimitry Andric // without emitting diagnostics, disables macro expansion, and will cause EOF 29500b57cec5SDimitry Andric // to return an EOF token instead of popping the include stack. 29510b57cec5SDimitry Andric LexingRawMode = true; 29520b57cec5SDimitry Andric 29530b57cec5SDimitry Andric // Save state that can be changed while lexing so that we can restore it. 29540b57cec5SDimitry Andric const char *TmpBufferPtr = BufferPtr; 29550b57cec5SDimitry Andric bool inPPDirectiveMode = ParsingPreprocessorDirective; 29560b57cec5SDimitry Andric bool atStartOfLine = IsAtStartOfLine; 29570b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 29580b57cec5SDimitry Andric bool leadingSpace = HasLeadingSpace; 29590b57cec5SDimitry Andric 29600b57cec5SDimitry Andric Token Tok; 29610b57cec5SDimitry Andric Lex(Tok); 29620b57cec5SDimitry Andric 29630b57cec5SDimitry Andric // Restore state that may have changed. 29640b57cec5SDimitry Andric BufferPtr = TmpBufferPtr; 29650b57cec5SDimitry Andric ParsingPreprocessorDirective = inPPDirectiveMode; 29660b57cec5SDimitry Andric HasLeadingSpace = leadingSpace; 29670b57cec5SDimitry Andric IsAtStartOfLine = atStartOfLine; 29680b57cec5SDimitry Andric IsAtPhysicalStartOfLine = atPhysicalStartOfLine; 29690b57cec5SDimitry Andric 29700b57cec5SDimitry Andric // Restore the lexer back to non-skipping mode. 29710b57cec5SDimitry Andric LexingRawMode = false; 29720b57cec5SDimitry Andric 29730b57cec5SDimitry Andric if (Tok.is(tok::eof)) 29740b57cec5SDimitry Andric return 2; 29750b57cec5SDimitry Andric return Tok.is(tok::l_paren); 29760b57cec5SDimitry Andric } 29770b57cec5SDimitry Andric 29780b57cec5SDimitry Andric /// Find the end of a version control conflict marker. 29790b57cec5SDimitry Andric static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 29800b57cec5SDimitry Andric ConflictMarkerKind CMK) { 29810b57cec5SDimitry Andric const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 29820b57cec5SDimitry Andric size_t TermLen = CMK == CMK_Perforce ? 5 : 7; 29830b57cec5SDimitry Andric auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen); 29840b57cec5SDimitry Andric size_t Pos = RestOfBuffer.find(Terminator); 29850b57cec5SDimitry Andric while (Pos != StringRef::npos) { 29860b57cec5SDimitry Andric // Must occur at start of line. 29870b57cec5SDimitry Andric if (Pos == 0 || 29880b57cec5SDimitry Andric (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) { 29890b57cec5SDimitry Andric RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 29900b57cec5SDimitry Andric Pos = RestOfBuffer.find(Terminator); 29910b57cec5SDimitry Andric continue; 29920b57cec5SDimitry Andric } 29930b57cec5SDimitry Andric return RestOfBuffer.data()+Pos; 29940b57cec5SDimitry Andric } 29950b57cec5SDimitry Andric return nullptr; 29960b57cec5SDimitry Andric } 29970b57cec5SDimitry Andric 29980b57cec5SDimitry Andric /// IsStartOfConflictMarker - If the specified pointer is the start of a version 29990b57cec5SDimitry Andric /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 30000b57cec5SDimitry Andric /// and recover nicely. This returns true if it is a conflict marker and false 30010b57cec5SDimitry Andric /// if not. 30020b57cec5SDimitry Andric bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 30030b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line. 30040b57cec5SDimitry Andric if (CurPtr != BufferStart && 30050b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 30060b57cec5SDimitry Andric return false; 30070b57cec5SDimitry Andric 30080b57cec5SDimitry Andric // Check to see if we have <<<<<<< or >>>>. 30090b57cec5SDimitry Andric if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") && 30100b57cec5SDimitry Andric !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> ")) 30110b57cec5SDimitry Andric return false; 30120b57cec5SDimitry Andric 30130b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore 30140b57cec5SDimitry Andric // it. 30150b57cec5SDimitry Andric if (CurrentConflictMarkerState || isLexingRawMode()) 30160b57cec5SDimitry Andric return false; 30170b57cec5SDimitry Andric 30180b57cec5SDimitry Andric ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 30190b57cec5SDimitry Andric 30200b57cec5SDimitry Andric // Check to see if there is an ending marker somewhere in the buffer at the 30210b57cec5SDimitry Andric // start of a line to terminate this conflict marker. 30220b57cec5SDimitry Andric if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 30230b57cec5SDimitry Andric // We found a match. We are really in a conflict marker. 30240b57cec5SDimitry Andric // Diagnose this, and ignore to the end of line. 30250b57cec5SDimitry Andric Diag(CurPtr, diag::err_conflict_marker); 30260b57cec5SDimitry Andric CurrentConflictMarkerState = Kind; 30270b57cec5SDimitry Andric 30280b57cec5SDimitry Andric // Skip ahead to the end of line. We know this exists because the 30290b57cec5SDimitry Andric // end-of-conflict marker starts with \r or \n. 30300b57cec5SDimitry Andric while (*CurPtr != '\r' && *CurPtr != '\n') { 30310b57cec5SDimitry Andric assert(CurPtr != BufferEnd && "Didn't find end of line"); 30320b57cec5SDimitry Andric ++CurPtr; 30330b57cec5SDimitry Andric } 30340b57cec5SDimitry Andric BufferPtr = CurPtr; 30350b57cec5SDimitry Andric return true; 30360b57cec5SDimitry Andric } 30370b57cec5SDimitry Andric 30380b57cec5SDimitry Andric // No end of conflict marker found. 30390b57cec5SDimitry Andric return false; 30400b57cec5SDimitry Andric } 30410b57cec5SDimitry Andric 30420b57cec5SDimitry Andric /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 30430b57cec5SDimitry Andric /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 30440b57cec5SDimitry Andric /// is the end of a conflict marker. Handle it by ignoring up until the end of 30450b57cec5SDimitry Andric /// the line. This returns true if it is a conflict marker and false if not. 30460b57cec5SDimitry Andric bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 30470b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line. 30480b57cec5SDimitry Andric if (CurPtr != BufferStart && 30490b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 30500b57cec5SDimitry Andric return false; 30510b57cec5SDimitry Andric 30520b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore 30530b57cec5SDimitry Andric // it. 30540b57cec5SDimitry Andric if (!CurrentConflictMarkerState || isLexingRawMode()) 30550b57cec5SDimitry Andric return false; 30560b57cec5SDimitry Andric 30570b57cec5SDimitry Andric // Check to see if we have the marker (4 characters in a row). 30580b57cec5SDimitry Andric for (unsigned i = 1; i != 4; ++i) 30590b57cec5SDimitry Andric if (CurPtr[i] != CurPtr[0]) 30600b57cec5SDimitry Andric return false; 30610b57cec5SDimitry Andric 30620b57cec5SDimitry Andric // If we do have it, search for the end of the conflict marker. This could 30630b57cec5SDimitry Andric // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 30640b57cec5SDimitry Andric // be the end of conflict marker. 30650b57cec5SDimitry Andric if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 30660b57cec5SDimitry Andric CurrentConflictMarkerState)) { 30670b57cec5SDimitry Andric CurPtr = End; 30680b57cec5SDimitry Andric 30690b57cec5SDimitry Andric // Skip ahead to the end of line. 30700b57cec5SDimitry Andric while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 30710b57cec5SDimitry Andric ++CurPtr; 30720b57cec5SDimitry Andric 30730b57cec5SDimitry Andric BufferPtr = CurPtr; 30740b57cec5SDimitry Andric 30750b57cec5SDimitry Andric // No longer in the conflict marker. 30760b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None; 30770b57cec5SDimitry Andric return true; 30780b57cec5SDimitry Andric } 30790b57cec5SDimitry Andric 30800b57cec5SDimitry Andric return false; 30810b57cec5SDimitry Andric } 30820b57cec5SDimitry Andric 30830b57cec5SDimitry Andric static const char *findPlaceholderEnd(const char *CurPtr, 30840b57cec5SDimitry Andric const char *BufferEnd) { 30850b57cec5SDimitry Andric if (CurPtr == BufferEnd) 30860b57cec5SDimitry Andric return nullptr; 30870b57cec5SDimitry Andric BufferEnd -= 1; // Scan until the second last character. 30880b57cec5SDimitry Andric for (; CurPtr != BufferEnd; ++CurPtr) { 30890b57cec5SDimitry Andric if (CurPtr[0] == '#' && CurPtr[1] == '>') 30900b57cec5SDimitry Andric return CurPtr + 2; 30910b57cec5SDimitry Andric } 30920b57cec5SDimitry Andric return nullptr; 30930b57cec5SDimitry Andric } 30940b57cec5SDimitry Andric 30950b57cec5SDimitry Andric bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) { 30960b57cec5SDimitry Andric assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!"); 30970b57cec5SDimitry Andric if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode) 30980b57cec5SDimitry Andric return false; 30990b57cec5SDimitry Andric const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd); 31000b57cec5SDimitry Andric if (!End) 31010b57cec5SDimitry Andric return false; 31020b57cec5SDimitry Andric const char *Start = CurPtr - 1; 31030b57cec5SDimitry Andric if (!LangOpts.AllowEditorPlaceholders) 31040b57cec5SDimitry Andric Diag(Start, diag::err_placeholder_in_source); 31050b57cec5SDimitry Andric Result.startToken(); 31060b57cec5SDimitry Andric FormTokenWithChars(Result, End, tok::raw_identifier); 31070b57cec5SDimitry Andric Result.setRawIdentifierData(Start); 31080b57cec5SDimitry Andric PP->LookUpIdentifierInfo(Result); 31090b57cec5SDimitry Andric Result.setFlag(Token::IsEditorPlaceholder); 31100b57cec5SDimitry Andric BufferPtr = End; 31110b57cec5SDimitry Andric return true; 31120b57cec5SDimitry Andric } 31130b57cec5SDimitry Andric 31140b57cec5SDimitry Andric bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 31150b57cec5SDimitry Andric if (PP && PP->isCodeCompletionEnabled()) { 31160b57cec5SDimitry Andric SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 31170b57cec5SDimitry Andric return Loc == PP->getCodeCompletionLoc(); 31180b57cec5SDimitry Andric } 31190b57cec5SDimitry Andric 31200b57cec5SDimitry Andric return false; 31210b57cec5SDimitry Andric } 31220b57cec5SDimitry Andric 3123*81ad6265SDimitry Andric llvm::Optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr, 3124*81ad6265SDimitry Andric const char *SlashLoc, 31250b57cec5SDimitry Andric Token *Result) { 31260b57cec5SDimitry Andric unsigned CharSize; 31270b57cec5SDimitry Andric char Kind = getCharAndSize(StartPtr, CharSize); 3128*81ad6265SDimitry Andric assert((Kind == 'u' || Kind == 'U') && "expected a UCN"); 31290b57cec5SDimitry Andric 31300b57cec5SDimitry Andric unsigned NumHexDigits; 31310b57cec5SDimitry Andric if (Kind == 'u') 31320b57cec5SDimitry Andric NumHexDigits = 4; 31330b57cec5SDimitry Andric else if (Kind == 'U') 31340b57cec5SDimitry Andric NumHexDigits = 8; 3135*81ad6265SDimitry Andric 3136*81ad6265SDimitry Andric bool Delimited = false; 3137*81ad6265SDimitry Andric bool FoundEndDelimiter = false; 3138*81ad6265SDimitry Andric unsigned Count = 0; 3139*81ad6265SDimitry Andric bool Diagnose = Result && !isLexingRawMode(); 31400b57cec5SDimitry Andric 31410b57cec5SDimitry Andric if (!LangOpts.CPlusPlus && !LangOpts.C99) { 3142349cc55cSDimitry Andric if (Diagnose) 31430b57cec5SDimitry Andric Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); 3144*81ad6265SDimitry Andric return llvm::None; 31450b57cec5SDimitry Andric } 31460b57cec5SDimitry Andric 31470b57cec5SDimitry Andric const char *CurPtr = StartPtr + CharSize; 31480b57cec5SDimitry Andric const char *KindLoc = &CurPtr[-1]; 31490b57cec5SDimitry Andric 31500b57cec5SDimitry Andric uint32_t CodePoint = 0; 3151349cc55cSDimitry Andric while (Count != NumHexDigits || Delimited) { 31520b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, CharSize); 3153349cc55cSDimitry Andric if (!Delimited && C == '{') { 3154349cc55cSDimitry Andric Delimited = true; 3155349cc55cSDimitry Andric CurPtr += CharSize; 3156349cc55cSDimitry Andric continue; 3157349cc55cSDimitry Andric } 3158349cc55cSDimitry Andric 3159349cc55cSDimitry Andric if (Delimited && C == '}') { 3160349cc55cSDimitry Andric CurPtr += CharSize; 3161349cc55cSDimitry Andric FoundEndDelimiter = true; 3162349cc55cSDimitry Andric break; 3163349cc55cSDimitry Andric } 31640b57cec5SDimitry Andric 31650b57cec5SDimitry Andric unsigned Value = llvm::hexDigitValue(C); 31660b57cec5SDimitry Andric if (Value == -1U) { 3167349cc55cSDimitry Andric if (!Delimited) 3168349cc55cSDimitry Andric break; 3169349cc55cSDimitry Andric if (Diagnose) 3170349cc55cSDimitry Andric Diag(BufferPtr, diag::warn_delimited_ucn_incomplete) 3171*81ad6265SDimitry Andric << StringRef(KindLoc, 1); 3172*81ad6265SDimitry Andric return llvm::None; 3173349cc55cSDimitry Andric } 31740b57cec5SDimitry Andric 3175349cc55cSDimitry Andric if (CodePoint & 0xF000'0000) { 3176349cc55cSDimitry Andric if (Diagnose) 3177349cc55cSDimitry Andric Diag(KindLoc, diag::err_escape_too_large) << 0; 3178*81ad6265SDimitry Andric return llvm::None; 3179349cc55cSDimitry Andric } 3180349cc55cSDimitry Andric 3181349cc55cSDimitry Andric CodePoint <<= 4; 3182349cc55cSDimitry Andric CodePoint |= Value; 3183349cc55cSDimitry Andric CurPtr += CharSize; 3184349cc55cSDimitry Andric Count++; 3185349cc55cSDimitry Andric } 3186349cc55cSDimitry Andric 3187349cc55cSDimitry Andric if (Count == 0) { 3188349cc55cSDimitry Andric if (Diagnose) 3189349cc55cSDimitry Andric Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty 3190349cc55cSDimitry Andric : diag::warn_ucn_escape_no_digits) 3191349cc55cSDimitry Andric << StringRef(KindLoc, 1); 3192*81ad6265SDimitry Andric return llvm::None; 3193*81ad6265SDimitry Andric } 3194*81ad6265SDimitry Andric 3195*81ad6265SDimitry Andric if (Delimited && Kind == 'U') { 3196*81ad6265SDimitry Andric if (Diagnose) 3197*81ad6265SDimitry Andric Diag(StartPtr, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1); 3198*81ad6265SDimitry Andric return llvm::None; 3199349cc55cSDimitry Andric } 3200349cc55cSDimitry Andric 3201349cc55cSDimitry Andric if (!Delimited && Count != NumHexDigits) { 3202349cc55cSDimitry Andric if (Diagnose) { 3203349cc55cSDimitry Andric Diag(BufferPtr, diag::warn_ucn_escape_incomplete); 32040b57cec5SDimitry Andric // If the user wrote \U1234, suggest a fixit to \u. 3205349cc55cSDimitry Andric if (Count == 4 && NumHexDigits == 8) { 32060b57cec5SDimitry Andric CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); 32070b57cec5SDimitry Andric Diag(KindLoc, diag::note_ucn_four_not_eight) 32080b57cec5SDimitry Andric << FixItHint::CreateReplacement(URange, "u"); 32090b57cec5SDimitry Andric } 32100b57cec5SDimitry Andric } 3211*81ad6265SDimitry Andric return llvm::None; 32120b57cec5SDimitry Andric } 32130b57cec5SDimitry Andric 3214349cc55cSDimitry Andric if (Delimited && PP) { 3215*81ad6265SDimitry Andric Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*delimited*/ 0; 32160b57cec5SDimitry Andric } 32170b57cec5SDimitry Andric 32180b57cec5SDimitry Andric if (Result) { 32190b57cec5SDimitry Andric Result->setFlag(Token::HasUCN); 3220349cc55cSDimitry Andric if (CurPtr - StartPtr == (ptrdiff_t)(Count + 2 + (Delimited ? 2 : 0))) 32210b57cec5SDimitry Andric StartPtr = CurPtr; 32220b57cec5SDimitry Andric else 32230b57cec5SDimitry Andric while (StartPtr != CurPtr) 32240b57cec5SDimitry Andric (void)getAndAdvanceChar(StartPtr, *Result); 32250b57cec5SDimitry Andric } else { 32260b57cec5SDimitry Andric StartPtr = CurPtr; 32270b57cec5SDimitry Andric } 3228*81ad6265SDimitry Andric return CodePoint; 3229*81ad6265SDimitry Andric } 3230*81ad6265SDimitry Andric 3231*81ad6265SDimitry Andric llvm::Optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr, 3232*81ad6265SDimitry Andric Token *Result) { 3233*81ad6265SDimitry Andric unsigned CharSize; 3234*81ad6265SDimitry Andric bool Diagnose = Result && !isLexingRawMode(); 3235*81ad6265SDimitry Andric 3236*81ad6265SDimitry Andric char C = getCharAndSize(StartPtr, CharSize); 3237*81ad6265SDimitry Andric assert(C == 'N' && "expected \\N{...}"); 3238*81ad6265SDimitry Andric 3239*81ad6265SDimitry Andric const char *CurPtr = StartPtr + CharSize; 3240*81ad6265SDimitry Andric const char *KindLoc = &CurPtr[-1]; 3241*81ad6265SDimitry Andric 3242*81ad6265SDimitry Andric C = getCharAndSize(CurPtr, CharSize); 3243*81ad6265SDimitry Andric if (C != '{') { 3244*81ad6265SDimitry Andric if (Diagnose) 3245*81ad6265SDimitry Andric Diag(StartPtr, diag::warn_ucn_escape_incomplete); 3246*81ad6265SDimitry Andric return llvm::None; 3247*81ad6265SDimitry Andric } 3248*81ad6265SDimitry Andric CurPtr += CharSize; 3249*81ad6265SDimitry Andric const char *StartName = CurPtr; 3250*81ad6265SDimitry Andric bool FoundEndDelimiter = false; 3251*81ad6265SDimitry Andric llvm::SmallVector<char, 30> Buffer; 3252*81ad6265SDimitry Andric while (C) { 3253*81ad6265SDimitry Andric C = getCharAndSize(CurPtr, CharSize); 3254*81ad6265SDimitry Andric CurPtr += CharSize; 3255*81ad6265SDimitry Andric if (C == '}') { 3256*81ad6265SDimitry Andric FoundEndDelimiter = true; 3257*81ad6265SDimitry Andric break; 3258*81ad6265SDimitry Andric } 3259*81ad6265SDimitry Andric 3260*81ad6265SDimitry Andric if (!isAlphanumeric(C) && C != '_' && C != '-' && C != ' ') 3261*81ad6265SDimitry Andric break; 3262*81ad6265SDimitry Andric Buffer.push_back(C); 3263*81ad6265SDimitry Andric } 3264*81ad6265SDimitry Andric 3265*81ad6265SDimitry Andric if (!FoundEndDelimiter || Buffer.empty()) { 3266*81ad6265SDimitry Andric if (Diagnose) 3267*81ad6265SDimitry Andric Diag(StartPtr, FoundEndDelimiter ? diag::warn_delimited_ucn_empty 3268*81ad6265SDimitry Andric : diag::warn_delimited_ucn_incomplete) 3269*81ad6265SDimitry Andric << StringRef(KindLoc, 1); 3270*81ad6265SDimitry Andric return llvm::None; 3271*81ad6265SDimitry Andric } 3272*81ad6265SDimitry Andric 3273*81ad6265SDimitry Andric StringRef Name(Buffer.data(), Buffer.size()); 3274*81ad6265SDimitry Andric llvm::Optional<char32_t> Res = 3275*81ad6265SDimitry Andric llvm::sys::unicode::nameToCodepointStrict(Name); 3276*81ad6265SDimitry Andric llvm::Optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch; 3277*81ad6265SDimitry Andric if (!Res) { 3278*81ad6265SDimitry Andric if (!isLexingRawMode()) { 3279*81ad6265SDimitry Andric Diag(StartPtr, diag::err_invalid_ucn_name) 3280*81ad6265SDimitry Andric << StringRef(Buffer.data(), Buffer.size()); 3281*81ad6265SDimitry Andric LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name); 3282*81ad6265SDimitry Andric if (LooseMatch) { 3283*81ad6265SDimitry Andric Diag(StartName, diag::note_invalid_ucn_name_loose_matching) 3284*81ad6265SDimitry Andric << FixItHint::CreateReplacement( 3285*81ad6265SDimitry Andric makeCharRange(*this, StartName, CurPtr - CharSize), 3286*81ad6265SDimitry Andric LooseMatch->Name); 3287*81ad6265SDimitry Andric } 3288*81ad6265SDimitry Andric } 3289*81ad6265SDimitry Andric // When finding a match using Unicode loose matching rules 3290*81ad6265SDimitry Andric // recover after having emitted a diagnostic. 3291*81ad6265SDimitry Andric if (!LooseMatch) 3292*81ad6265SDimitry Andric return llvm::None; 3293*81ad6265SDimitry Andric // We do not offer missspelled character names suggestions here 3294*81ad6265SDimitry Andric // as the set of what would be a valid suggestion depends on context, 3295*81ad6265SDimitry Andric // and we should not make invalid suggestions. 3296*81ad6265SDimitry Andric } 3297*81ad6265SDimitry Andric 3298*81ad6265SDimitry Andric if (Diagnose && PP && !LooseMatch) 3299*81ad6265SDimitry Andric Diag(BufferPtr, diag::ext_delimited_escape_sequence) << /*named*/ 1; 3300*81ad6265SDimitry Andric 3301*81ad6265SDimitry Andric if (LooseMatch) 3302*81ad6265SDimitry Andric Res = LooseMatch->CodePoint; 3303*81ad6265SDimitry Andric 3304*81ad6265SDimitry Andric if (Result) { 3305*81ad6265SDimitry Andric Result->setFlag(Token::HasUCN); 3306*81ad6265SDimitry Andric if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 4)) 3307*81ad6265SDimitry Andric StartPtr = CurPtr; 3308*81ad6265SDimitry Andric else 3309*81ad6265SDimitry Andric while (StartPtr != CurPtr) 3310*81ad6265SDimitry Andric (void)getAndAdvanceChar(StartPtr, *Result); 3311*81ad6265SDimitry Andric } else { 3312*81ad6265SDimitry Andric StartPtr = CurPtr; 3313*81ad6265SDimitry Andric } 3314*81ad6265SDimitry Andric return *Res; 3315*81ad6265SDimitry Andric } 3316*81ad6265SDimitry Andric 3317*81ad6265SDimitry Andric uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, 3318*81ad6265SDimitry Andric Token *Result) { 3319*81ad6265SDimitry Andric 3320*81ad6265SDimitry Andric unsigned CharSize; 3321*81ad6265SDimitry Andric llvm::Optional<uint32_t> CodePointOpt; 3322*81ad6265SDimitry Andric char Kind = getCharAndSize(StartPtr, CharSize); 3323*81ad6265SDimitry Andric if (Kind == 'u' || Kind == 'U') 3324*81ad6265SDimitry Andric CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result); 3325*81ad6265SDimitry Andric else if (Kind == 'N') 3326*81ad6265SDimitry Andric CodePointOpt = tryReadNamedUCN(StartPtr, Result); 3327*81ad6265SDimitry Andric 3328*81ad6265SDimitry Andric if (!CodePointOpt) 3329*81ad6265SDimitry Andric return 0; 3330*81ad6265SDimitry Andric 3331*81ad6265SDimitry Andric uint32_t CodePoint = *CodePointOpt; 33320b57cec5SDimitry Andric 33330b57cec5SDimitry Andric // Don't apply C family restrictions to UCNs in assembly mode 33340b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) 33350b57cec5SDimitry Andric return CodePoint; 33360b57cec5SDimitry Andric 33370b57cec5SDimitry Andric // C99 6.4.3p2: A universal character name shall not specify a character whose 33380b57cec5SDimitry Andric // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or 33390b57cec5SDimitry Andric // 0060 (`), nor one in the range D800 through DFFF inclusive.) 33400b57cec5SDimitry Andric // C++11 [lex.charset]p2: If the hexadecimal value for a 33410b57cec5SDimitry Andric // universal-character-name corresponds to a surrogate code point (in the 33420b57cec5SDimitry Andric // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, 33430b57cec5SDimitry Andric // if the hexadecimal value for a universal-character-name outside the 33440b57cec5SDimitry Andric // c-char-sequence, s-char-sequence, or r-char-sequence of a character or 33450b57cec5SDimitry Andric // string literal corresponds to a control character (in either of the 33460b57cec5SDimitry Andric // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the 33470b57cec5SDimitry Andric // basic source character set, the program is ill-formed. 33480b57cec5SDimitry Andric if (CodePoint < 0xA0) { 33490b57cec5SDimitry Andric if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60) 33500b57cec5SDimitry Andric return CodePoint; 33510b57cec5SDimitry Andric 33520b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to warn about bad 33530b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block. 33540b57cec5SDimitry Andric if (Result && PP) { 33550b57cec5SDimitry Andric if (CodePoint < 0x20 || CodePoint >= 0x7F) 33560b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_control_character); 33570b57cec5SDimitry Andric else { 33580b57cec5SDimitry Andric char C = static_cast<char>(CodePoint); 33590b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 33600b57cec5SDimitry Andric } 33610b57cec5SDimitry Andric } 33620b57cec5SDimitry Andric 33630b57cec5SDimitry Andric return 0; 33640b57cec5SDimitry Andric } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { 33650b57cec5SDimitry Andric // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 33660b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to diagnose bad 33670b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block. 33680b57cec5SDimitry Andric if (Result && PP) { 33690b57cec5SDimitry Andric if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11) 33700b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_ucn_escape_surrogate); 33710b57cec5SDimitry Andric else 33720b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_invalid); 33730b57cec5SDimitry Andric } 33740b57cec5SDimitry Andric return 0; 33750b57cec5SDimitry Andric } 33760b57cec5SDimitry Andric 33770b57cec5SDimitry Andric return CodePoint; 33780b57cec5SDimitry Andric } 33790b57cec5SDimitry Andric 33800b57cec5SDimitry Andric bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, 33810b57cec5SDimitry Andric const char *CurPtr) { 33820b57cec5SDimitry Andric if (!isLexingRawMode() && !PP->isPreprocessedOutput() && 3383349cc55cSDimitry Andric isUnicodeWhitespace(C)) { 33840b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unicode_whitespace) 33850b57cec5SDimitry Andric << makeCharRange(*this, BufferPtr, CurPtr); 33860b57cec5SDimitry Andric 33870b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 33880b57cec5SDimitry Andric return true; 33890b57cec5SDimitry Andric } 33900b57cec5SDimitry Andric return false; 33910b57cec5SDimitry Andric } 33920b57cec5SDimitry Andric 33930b57cec5SDimitry Andric void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { 33940b57cec5SDimitry Andric IsAtStartOfLine = Result.isAtStartOfLine(); 33950b57cec5SDimitry Andric HasLeadingSpace = Result.hasLeadingSpace(); 33960b57cec5SDimitry Andric HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); 33970b57cec5SDimitry Andric // Note that this doesn't affect IsAtPhysicalStartOfLine. 33980b57cec5SDimitry Andric } 33990b57cec5SDimitry Andric 34000b57cec5SDimitry Andric bool Lexer::Lex(Token &Result) { 3401*81ad6265SDimitry Andric assert(!isDependencyDirectivesLexer()); 3402*81ad6265SDimitry Andric 34030b57cec5SDimitry Andric // Start a new token. 34040b57cec5SDimitry Andric Result.startToken(); 34050b57cec5SDimitry Andric 34060b57cec5SDimitry Andric // Set up misc whitespace flags for LexTokenInternal. 34070b57cec5SDimitry Andric if (IsAtStartOfLine) { 34080b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 34090b57cec5SDimitry Andric IsAtStartOfLine = false; 34100b57cec5SDimitry Andric } 34110b57cec5SDimitry Andric 34120b57cec5SDimitry Andric if (HasLeadingSpace) { 34130b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 34140b57cec5SDimitry Andric HasLeadingSpace = false; 34150b57cec5SDimitry Andric } 34160b57cec5SDimitry Andric 34170b57cec5SDimitry Andric if (HasLeadingEmptyMacro) { 34180b57cec5SDimitry Andric Result.setFlag(Token::LeadingEmptyMacro); 34190b57cec5SDimitry Andric HasLeadingEmptyMacro = false; 34200b57cec5SDimitry Andric } 34210b57cec5SDimitry Andric 34220b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 34230b57cec5SDimitry Andric IsAtPhysicalStartOfLine = false; 34240b57cec5SDimitry Andric bool isRawLex = isLexingRawMode(); 34250b57cec5SDimitry Andric (void) isRawLex; 34260b57cec5SDimitry Andric bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine); 34270b57cec5SDimitry Andric // (After the LexTokenInternal call, the lexer might be destroyed.) 34280b57cec5SDimitry Andric assert((returnedToken || !isRawLex) && "Raw lex must succeed"); 34290b57cec5SDimitry Andric return returnedToken; 34300b57cec5SDimitry Andric } 34310b57cec5SDimitry Andric 34320b57cec5SDimitry Andric /// LexTokenInternal - This implements a simple C family lexer. It is an 34330b57cec5SDimitry Andric /// extremely performance critical piece of code. This assumes that the buffer 34340b57cec5SDimitry Andric /// has a null character at the end of the file. This returns a preprocessing 34350b57cec5SDimitry Andric /// token, not a normal token, as such, it is an internal interface. It assumes 34360b57cec5SDimitry Andric /// that the Flags of result have been cleared before calling this. 34370b57cec5SDimitry Andric bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { 34380b57cec5SDimitry Andric LexNextToken: 34390b57cec5SDimitry Andric // New token, can't need cleaning yet. 34400b57cec5SDimitry Andric Result.clearFlag(Token::NeedsCleaning); 34410b57cec5SDimitry Andric Result.setIdentifierInfo(nullptr); 34420b57cec5SDimitry Andric 34430b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable. 34440b57cec5SDimitry Andric const char *CurPtr = BufferPtr; 34450b57cec5SDimitry Andric 34460b57cec5SDimitry Andric // Small amounts of horizontal whitespace is very common between tokens. 3447fe6060f1SDimitry Andric if (isHorizontalWhitespace(*CurPtr)) { 3448fe6060f1SDimitry Andric do { 34490b57cec5SDimitry Andric ++CurPtr; 3450fe6060f1SDimitry Andric } while (isHorizontalWhitespace(*CurPtr)); 34510b57cec5SDimitry Andric 34520b57cec5SDimitry Andric // If we are keeping whitespace and other tokens, just return what we just 34530b57cec5SDimitry Andric // skipped. The next lexer invocation will return the token after the 34540b57cec5SDimitry Andric // whitespace. 34550b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 34560b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 34570b57cec5SDimitry Andric // FIXME: The next token will not have LeadingSpace set. 34580b57cec5SDimitry Andric return true; 34590b57cec5SDimitry Andric } 34600b57cec5SDimitry Andric 34610b57cec5SDimitry Andric BufferPtr = CurPtr; 34620b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 34630b57cec5SDimitry Andric } 34640b57cec5SDimitry Andric 34650b57cec5SDimitry Andric unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 34660b57cec5SDimitry Andric 34670b57cec5SDimitry Andric // Read a character, advancing over it. 34680b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Result); 34690b57cec5SDimitry Andric tok::TokenKind Kind; 34700b57cec5SDimitry Andric 3471e8d8bef9SDimitry Andric if (!isVerticalWhitespace(Char)) 3472e8d8bef9SDimitry Andric NewLinePtr = nullptr; 3473e8d8bef9SDimitry Andric 34740b57cec5SDimitry Andric switch (Char) { 34750b57cec5SDimitry Andric case 0: // Null. 34760b57cec5SDimitry Andric // Found end of file? 34770b57cec5SDimitry Andric if (CurPtr-1 == BufferEnd) 34780b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1); 34790b57cec5SDimitry Andric 34800b57cec5SDimitry Andric // Check if we are performing code completion. 34810b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 34820b57cec5SDimitry Andric // Return the code-completion token. 34830b57cec5SDimitry Andric Result.startToken(); 34840b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::code_completion); 34850b57cec5SDimitry Andric return true; 34860b57cec5SDimitry Andric } 34870b57cec5SDimitry Andric 34880b57cec5SDimitry Andric if (!isLexingRawMode()) 34890b57cec5SDimitry Andric Diag(CurPtr-1, diag::null_in_file); 34900b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 34910b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 34920b57cec5SDimitry Andric return true; // KeepWhitespaceMode 34930b57cec5SDimitry Andric 34940b57cec5SDimitry Andric // We know the lexer hasn't changed, so just try again with this lexer. 34950b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 34960b57cec5SDimitry Andric goto LexNextToken; 34970b57cec5SDimitry Andric 34980b57cec5SDimitry Andric case 26: // DOS & CP/M EOF: "^Z". 34990b57cec5SDimitry Andric // If we're in Microsoft extensions mode, treat this as end of file. 35000b57cec5SDimitry Andric if (LangOpts.MicrosoftExt) { 35010b57cec5SDimitry Andric if (!isLexingRawMode()) 35020b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft); 35030b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1); 35040b57cec5SDimitry Andric } 35050b57cec5SDimitry Andric 35060b57cec5SDimitry Andric // If Microsoft extensions are disabled, this is just random garbage. 35070b57cec5SDimitry Andric Kind = tok::unknown; 35080b57cec5SDimitry Andric break; 35090b57cec5SDimitry Andric 35100b57cec5SDimitry Andric case '\r': 35110b57cec5SDimitry Andric if (CurPtr[0] == '\n') 35120b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result); 35130b57cec5SDimitry Andric LLVM_FALLTHROUGH; 35140b57cec5SDimitry Andric case '\n': 35150b57cec5SDimitry Andric // If we are inside a preprocessor directive and we see the end of line, 35160b57cec5SDimitry Andric // we know we are done with the directive, so return an EOD token. 35170b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 35180b57cec5SDimitry Andric // Done parsing the "line". 35190b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 35200b57cec5SDimitry Andric 35210b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive. 35220b57cec5SDimitry Andric if (PP) 35230b57cec5SDimitry Andric resetExtendedTokenMode(); 35240b57cec5SDimitry Andric 35250b57cec5SDimitry Andric // Since we consumed a newline, we are back at the start of a line. 35260b57cec5SDimitry Andric IsAtStartOfLine = true; 35270b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 3528e8d8bef9SDimitry Andric NewLinePtr = CurPtr - 1; 35290b57cec5SDimitry Andric 35300b57cec5SDimitry Andric Kind = tok::eod; 35310b57cec5SDimitry Andric break; 35320b57cec5SDimitry Andric } 35330b57cec5SDimitry Andric 35340b57cec5SDimitry Andric // No leading whitespace seen so far. 35350b57cec5SDimitry Andric Result.clearFlag(Token::LeadingSpace); 35360b57cec5SDimitry Andric 35370b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 35380b57cec5SDimitry Andric return true; // KeepWhitespaceMode 35390b57cec5SDimitry Andric 35400b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 35410b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 35420b57cec5SDimitry Andric goto LexNextToken; 35430b57cec5SDimitry Andric case ' ': 35440b57cec5SDimitry Andric case '\t': 35450b57cec5SDimitry Andric case '\f': 35460b57cec5SDimitry Andric case '\v': 35470b57cec5SDimitry Andric SkipHorizontalWhitespace: 35480b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 35490b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 35500b57cec5SDimitry Andric return true; // KeepWhitespaceMode 35510b57cec5SDimitry Andric 35520b57cec5SDimitry Andric SkipIgnoredUnits: 35530b57cec5SDimitry Andric CurPtr = BufferPtr; 35540b57cec5SDimitry Andric 35550b57cec5SDimitry Andric // If the next token is obviously a // or /* */ comment, skip it efficiently 35560b57cec5SDimitry Andric // too (without going through the big switch stmt). 35570b57cec5SDimitry Andric if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 3558*81ad6265SDimitry Andric LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) { 35590b57cec5SDimitry Andric if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 35600b57cec5SDimitry Andric return true; // There is a token to return. 35610b57cec5SDimitry Andric goto SkipIgnoredUnits; 35620b57cec5SDimitry Andric } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 35630b57cec5SDimitry Andric if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 35640b57cec5SDimitry Andric return true; // There is a token to return. 35650b57cec5SDimitry Andric goto SkipIgnoredUnits; 35660b57cec5SDimitry Andric } else if (isHorizontalWhitespace(*CurPtr)) { 35670b57cec5SDimitry Andric goto SkipHorizontalWhitespace; 35680b57cec5SDimitry Andric } 35690b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 35700b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 35710b57cec5SDimitry Andric goto LexNextToken; 35720b57cec5SDimitry Andric 35730b57cec5SDimitry Andric // C99 6.4.4.1: Integer Constants. 35740b57cec5SDimitry Andric // C99 6.4.4.2: Floating Constants. 35750b57cec5SDimitry Andric case '0': case '1': case '2': case '3': case '4': 35760b57cec5SDimitry Andric case '5': case '6': case '7': case '8': case '9': 35770b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 35780b57cec5SDimitry Andric MIOpt.ReadToken(); 35790b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 35800b57cec5SDimitry Andric 3581*81ad6265SDimitry Andric // Identifier (e.g., uber), or 3582*81ad6265SDimitry Andric // UTF-8 (C2x/C++17) or UTF-16 (C11/C++11) character literal, or 3583*81ad6265SDimitry Andric // UTF-8 or UTF-16 string literal (C11/C++11). 3584*81ad6265SDimitry Andric case 'u': 35850b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 35860b57cec5SDimitry Andric MIOpt.ReadToken(); 35870b57cec5SDimitry Andric 35880b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) { 35890b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 35900b57cec5SDimitry Andric 35910b57cec5SDimitry Andric // UTF-16 string literal 35920b57cec5SDimitry Andric if (Char == '"') 35930b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 35940b57cec5SDimitry Andric tok::utf16_string_literal); 35950b57cec5SDimitry Andric 35960b57cec5SDimitry Andric // UTF-16 character constant 35970b57cec5SDimitry Andric if (Char == '\'') 35980b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 35990b57cec5SDimitry Andric tok::utf16_char_constant); 36000b57cec5SDimitry Andric 36010b57cec5SDimitry Andric // UTF-16 raw string literal 36020b57cec5SDimitry Andric if (Char == 'R' && LangOpts.CPlusPlus11 && 36030b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 36040b57cec5SDimitry Andric return LexRawStringLiteral(Result, 36050b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 36060b57cec5SDimitry Andric SizeTmp2, Result), 36070b57cec5SDimitry Andric tok::utf16_string_literal); 36080b57cec5SDimitry Andric 36090b57cec5SDimitry Andric if (Char == '8') { 36100b57cec5SDimitry Andric char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 36110b57cec5SDimitry Andric 36120b57cec5SDimitry Andric // UTF-8 string literal 36130b57cec5SDimitry Andric if (Char2 == '"') 36140b57cec5SDimitry Andric return LexStringLiteral(Result, 36150b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 36160b57cec5SDimitry Andric SizeTmp2, Result), 36170b57cec5SDimitry Andric tok::utf8_string_literal); 3618*81ad6265SDimitry Andric if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C2x)) 36190b57cec5SDimitry Andric return LexCharConstant( 36200b57cec5SDimitry Andric Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 36210b57cec5SDimitry Andric SizeTmp2, Result), 36220b57cec5SDimitry Andric tok::utf8_char_constant); 36230b57cec5SDimitry Andric 36240b57cec5SDimitry Andric if (Char2 == 'R' && LangOpts.CPlusPlus11) { 36250b57cec5SDimitry Andric unsigned SizeTmp3; 36260b57cec5SDimitry Andric char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 36270b57cec5SDimitry Andric // UTF-8 raw string literal 36280b57cec5SDimitry Andric if (Char3 == '"') { 36290b57cec5SDimitry Andric return LexRawStringLiteral(Result, 36300b57cec5SDimitry Andric ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 36310b57cec5SDimitry Andric SizeTmp2, Result), 36320b57cec5SDimitry Andric SizeTmp3, Result), 36330b57cec5SDimitry Andric tok::utf8_string_literal); 36340b57cec5SDimitry Andric } 36350b57cec5SDimitry Andric } 36360b57cec5SDimitry Andric } 36370b57cec5SDimitry Andric } 36380b57cec5SDimitry Andric 36390b57cec5SDimitry Andric // treat u like the start of an identifier. 3640349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 36410b57cec5SDimitry Andric 3642*81ad6265SDimitry Andric case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal 36430b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 36440b57cec5SDimitry Andric MIOpt.ReadToken(); 36450b57cec5SDimitry Andric 36460b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) { 36470b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 36480b57cec5SDimitry Andric 36490b57cec5SDimitry Andric // UTF-32 string literal 36500b57cec5SDimitry Andric if (Char == '"') 36510b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 36520b57cec5SDimitry Andric tok::utf32_string_literal); 36530b57cec5SDimitry Andric 36540b57cec5SDimitry Andric // UTF-32 character constant 36550b57cec5SDimitry Andric if (Char == '\'') 36560b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 36570b57cec5SDimitry Andric tok::utf32_char_constant); 36580b57cec5SDimitry Andric 36590b57cec5SDimitry Andric // UTF-32 raw string literal 36600b57cec5SDimitry Andric if (Char == 'R' && LangOpts.CPlusPlus11 && 36610b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 36620b57cec5SDimitry Andric return LexRawStringLiteral(Result, 36630b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 36640b57cec5SDimitry Andric SizeTmp2, Result), 36650b57cec5SDimitry Andric tok::utf32_string_literal); 36660b57cec5SDimitry Andric } 36670b57cec5SDimitry Andric 36680b57cec5SDimitry Andric // treat U like the start of an identifier. 3669349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 36700b57cec5SDimitry Andric 36710b57cec5SDimitry Andric case 'R': // Identifier or C++0x raw string literal 36720b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 36730b57cec5SDimitry Andric MIOpt.ReadToken(); 36740b57cec5SDimitry Andric 36750b57cec5SDimitry Andric if (LangOpts.CPlusPlus11) { 36760b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 36770b57cec5SDimitry Andric 36780b57cec5SDimitry Andric if (Char == '"') 36790b57cec5SDimitry Andric return LexRawStringLiteral(Result, 36800b57cec5SDimitry Andric ConsumeChar(CurPtr, SizeTmp, Result), 36810b57cec5SDimitry Andric tok::string_literal); 36820b57cec5SDimitry Andric } 36830b57cec5SDimitry Andric 36840b57cec5SDimitry Andric // treat R like the start of an identifier. 3685349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 36860b57cec5SDimitry Andric 36870b57cec5SDimitry Andric case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 36880b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 36890b57cec5SDimitry Andric MIOpt.ReadToken(); 36900b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 36910b57cec5SDimitry Andric 36920b57cec5SDimitry Andric // Wide string literal. 36930b57cec5SDimitry Andric if (Char == '"') 36940b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 36950b57cec5SDimitry Andric tok::wide_string_literal); 36960b57cec5SDimitry Andric 36970b57cec5SDimitry Andric // Wide raw string literal. 36980b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 && Char == 'R' && 36990b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 37000b57cec5SDimitry Andric return LexRawStringLiteral(Result, 37010b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 37020b57cec5SDimitry Andric SizeTmp2, Result), 37030b57cec5SDimitry Andric tok::wide_string_literal); 37040b57cec5SDimitry Andric 37050b57cec5SDimitry Andric // Wide character constant. 37060b57cec5SDimitry Andric if (Char == '\'') 37070b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 37080b57cec5SDimitry Andric tok::wide_char_constant); 37090b57cec5SDimitry Andric // FALL THROUGH, treating L like the start of an identifier. 37100b57cec5SDimitry Andric LLVM_FALLTHROUGH; 37110b57cec5SDimitry Andric 37120b57cec5SDimitry Andric // C99 6.4.2: Identifiers. 37130b57cec5SDimitry Andric case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 37140b57cec5SDimitry Andric case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 37150b57cec5SDimitry Andric case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 37160b57cec5SDimitry Andric case 'V': case 'W': case 'X': case 'Y': case 'Z': 37170b57cec5SDimitry Andric case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 37180b57cec5SDimitry Andric case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 37190b57cec5SDimitry Andric case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 37200b57cec5SDimitry Andric case 'v': case 'w': case 'x': case 'y': case 'z': 37210b57cec5SDimitry Andric case '_': 37220b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 37230b57cec5SDimitry Andric MIOpt.ReadToken(); 3724349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 37250b57cec5SDimitry Andric 37260b57cec5SDimitry Andric case '$': // $ in identifiers. 37270b57cec5SDimitry Andric if (LangOpts.DollarIdents) { 37280b57cec5SDimitry Andric if (!isLexingRawMode()) 37290b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_dollar_in_identifier); 37300b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 37310b57cec5SDimitry Andric MIOpt.ReadToken(); 3732349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 37330b57cec5SDimitry Andric } 37340b57cec5SDimitry Andric 37350b57cec5SDimitry Andric Kind = tok::unknown; 37360b57cec5SDimitry Andric break; 37370b57cec5SDimitry Andric 37380b57cec5SDimitry Andric // C99 6.4.4: Character Constants. 37390b57cec5SDimitry Andric case '\'': 37400b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 37410b57cec5SDimitry Andric MIOpt.ReadToken(); 37420b57cec5SDimitry Andric return LexCharConstant(Result, CurPtr, tok::char_constant); 37430b57cec5SDimitry Andric 37440b57cec5SDimitry Andric // C99 6.4.5: String Literals. 37450b57cec5SDimitry Andric case '"': 37460b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 37470b57cec5SDimitry Andric MIOpt.ReadToken(); 37480b57cec5SDimitry Andric return LexStringLiteral(Result, CurPtr, 37490b57cec5SDimitry Andric ParsingFilename ? tok::header_name 37500b57cec5SDimitry Andric : tok::string_literal); 37510b57cec5SDimitry Andric 37520b57cec5SDimitry Andric // C99 6.4.6: Punctuators. 37530b57cec5SDimitry Andric case '?': 37540b57cec5SDimitry Andric Kind = tok::question; 37550b57cec5SDimitry Andric break; 37560b57cec5SDimitry Andric case '[': 37570b57cec5SDimitry Andric Kind = tok::l_square; 37580b57cec5SDimitry Andric break; 37590b57cec5SDimitry Andric case ']': 37600b57cec5SDimitry Andric Kind = tok::r_square; 37610b57cec5SDimitry Andric break; 37620b57cec5SDimitry Andric case '(': 37630b57cec5SDimitry Andric Kind = tok::l_paren; 37640b57cec5SDimitry Andric break; 37650b57cec5SDimitry Andric case ')': 37660b57cec5SDimitry Andric Kind = tok::r_paren; 37670b57cec5SDimitry Andric break; 37680b57cec5SDimitry Andric case '{': 37690b57cec5SDimitry Andric Kind = tok::l_brace; 37700b57cec5SDimitry Andric break; 37710b57cec5SDimitry Andric case '}': 37720b57cec5SDimitry Andric Kind = tok::r_brace; 37730b57cec5SDimitry Andric break; 37740b57cec5SDimitry Andric case '.': 37750b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 37760b57cec5SDimitry Andric if (Char >= '0' && Char <= '9') { 37770b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 37780b57cec5SDimitry Andric MIOpt.ReadToken(); 37790b57cec5SDimitry Andric 37800b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 37810b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus && Char == '*') { 37820b57cec5SDimitry Andric Kind = tok::periodstar; 37830b57cec5SDimitry Andric CurPtr += SizeTmp; 37840b57cec5SDimitry Andric } else if (Char == '.' && 37850b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 37860b57cec5SDimitry Andric Kind = tok::ellipsis; 37870b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 37880b57cec5SDimitry Andric SizeTmp2, Result); 37890b57cec5SDimitry Andric } else { 37900b57cec5SDimitry Andric Kind = tok::period; 37910b57cec5SDimitry Andric } 37920b57cec5SDimitry Andric break; 37930b57cec5SDimitry Andric case '&': 37940b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 37950b57cec5SDimitry Andric if (Char == '&') { 37960b57cec5SDimitry Andric Kind = tok::ampamp; 37970b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 37980b57cec5SDimitry Andric } else if (Char == '=') { 37990b57cec5SDimitry Andric Kind = tok::ampequal; 38000b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38010b57cec5SDimitry Andric } else { 38020b57cec5SDimitry Andric Kind = tok::amp; 38030b57cec5SDimitry Andric } 38040b57cec5SDimitry Andric break; 38050b57cec5SDimitry Andric case '*': 38060b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') { 38070b57cec5SDimitry Andric Kind = tok::starequal; 38080b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38090b57cec5SDimitry Andric } else { 38100b57cec5SDimitry Andric Kind = tok::star; 38110b57cec5SDimitry Andric } 38120b57cec5SDimitry Andric break; 38130b57cec5SDimitry Andric case '+': 38140b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 38150b57cec5SDimitry Andric if (Char == '+') { 38160b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38170b57cec5SDimitry Andric Kind = tok::plusplus; 38180b57cec5SDimitry Andric } else if (Char == '=') { 38190b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38200b57cec5SDimitry Andric Kind = tok::plusequal; 38210b57cec5SDimitry Andric } else { 38220b57cec5SDimitry Andric Kind = tok::plus; 38230b57cec5SDimitry Andric } 38240b57cec5SDimitry Andric break; 38250b57cec5SDimitry Andric case '-': 38260b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 38270b57cec5SDimitry Andric if (Char == '-') { // -- 38280b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38290b57cec5SDimitry Andric Kind = tok::minusminus; 38300b57cec5SDimitry Andric } else if (Char == '>' && LangOpts.CPlusPlus && 38310b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 38320b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 38330b57cec5SDimitry Andric SizeTmp2, Result); 38340b57cec5SDimitry Andric Kind = tok::arrowstar; 38350b57cec5SDimitry Andric } else if (Char == '>') { // -> 38360b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38370b57cec5SDimitry Andric Kind = tok::arrow; 38380b57cec5SDimitry Andric } else if (Char == '=') { // -= 38390b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38400b57cec5SDimitry Andric Kind = tok::minusequal; 38410b57cec5SDimitry Andric } else { 38420b57cec5SDimitry Andric Kind = tok::minus; 38430b57cec5SDimitry Andric } 38440b57cec5SDimitry Andric break; 38450b57cec5SDimitry Andric case '~': 38460b57cec5SDimitry Andric Kind = tok::tilde; 38470b57cec5SDimitry Andric break; 38480b57cec5SDimitry Andric case '!': 38490b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') { 38500b57cec5SDimitry Andric Kind = tok::exclaimequal; 38510b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38520b57cec5SDimitry Andric } else { 38530b57cec5SDimitry Andric Kind = tok::exclaim; 38540b57cec5SDimitry Andric } 38550b57cec5SDimitry Andric break; 38560b57cec5SDimitry Andric case '/': 38570b57cec5SDimitry Andric // 6.4.9: Comments 38580b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 38590b57cec5SDimitry Andric if (Char == '/') { // Line comment. 38600b57cec5SDimitry Andric // Even if Line comments are disabled (e.g. in C89 mode), we generally 38610b57cec5SDimitry Andric // want to lex this as a comment. There is one problem with this though, 38620b57cec5SDimitry Andric // that in one particular corner case, this can change the behavior of the 38630b57cec5SDimitry Andric // resultant program. For example, In "foo //**/ bar", C89 would lex 38640b57cec5SDimitry Andric // this as "foo / bar" and languages with Line comments would lex it as 38650b57cec5SDimitry Andric // "foo". Check to see if the character after the second slash is a '*'. 38660b57cec5SDimitry Andric // If so, we will lex that as a "/" instead of the start of a comment. 38670b57cec5SDimitry Andric // However, we never do this if we are just preprocessing. 3868*81ad6265SDimitry Andric bool TreatAsComment = 3869*81ad6265SDimitry Andric LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP); 38700b57cec5SDimitry Andric if (!TreatAsComment) 38710b57cec5SDimitry Andric if (!(PP && PP->isPreprocessedOutput())) 38720b57cec5SDimitry Andric TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*'; 38730b57cec5SDimitry Andric 38740b57cec5SDimitry Andric if (TreatAsComment) { 38750b57cec5SDimitry Andric if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 38760b57cec5SDimitry Andric TokAtPhysicalStartOfLine)) 38770b57cec5SDimitry Andric return true; // There is a token to return. 38780b57cec5SDimitry Andric 38790b57cec5SDimitry Andric // It is common for the tokens immediately after a // comment to be 38800b57cec5SDimitry Andric // whitespace (indentation for the next line). Instead of going through 38810b57cec5SDimitry Andric // the big switch, handle it efficiently now. 38820b57cec5SDimitry Andric goto SkipIgnoredUnits; 38830b57cec5SDimitry Andric } 38840b57cec5SDimitry Andric } 38850b57cec5SDimitry Andric 38860b57cec5SDimitry Andric if (Char == '*') { // /**/ comment. 38870b57cec5SDimitry Andric if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 38880b57cec5SDimitry Andric TokAtPhysicalStartOfLine)) 38890b57cec5SDimitry Andric return true; // There is a token to return. 38900b57cec5SDimitry Andric 38910b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 38920b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 38930b57cec5SDimitry Andric goto LexNextToken; 38940b57cec5SDimitry Andric } 38950b57cec5SDimitry Andric 38960b57cec5SDimitry Andric if (Char == '=') { 38970b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 38980b57cec5SDimitry Andric Kind = tok::slashequal; 38990b57cec5SDimitry Andric } else { 39000b57cec5SDimitry Andric Kind = tok::slash; 39010b57cec5SDimitry Andric } 39020b57cec5SDimitry Andric break; 39030b57cec5SDimitry Andric case '%': 39040b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39050b57cec5SDimitry Andric if (Char == '=') { 39060b57cec5SDimitry Andric Kind = tok::percentequal; 39070b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39080b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '>') { 39090b57cec5SDimitry Andric Kind = tok::r_brace; // '%>' -> '}' 39100b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39110b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') { 39120b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39130b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39140b57cec5SDimitry Andric if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 39150b57cec5SDimitry Andric Kind = tok::hashhash; // '%:%:' -> '##' 39160b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 39170b57cec5SDimitry Andric SizeTmp2, Result); 39180b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize 39190b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39200b57cec5SDimitry Andric if (!isLexingRawMode()) 39210b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft); 39220b57cec5SDimitry Andric Kind = tok::hashat; 39230b57cec5SDimitry Andric } else { // '%:' -> '#' 39240b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line, 39250b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to 39260b57cec5SDimitry Andric // the preprocessor to handle it. 39270b57cec5SDimitry Andric // TODO: -fpreprocessed mode?? 39280b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 39290b57cec5SDimitry Andric goto HandleDirective; 39300b57cec5SDimitry Andric 39310b57cec5SDimitry Andric Kind = tok::hash; 39320b57cec5SDimitry Andric } 39330b57cec5SDimitry Andric } else { 39340b57cec5SDimitry Andric Kind = tok::percent; 39350b57cec5SDimitry Andric } 39360b57cec5SDimitry Andric break; 39370b57cec5SDimitry Andric case '<': 39380b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39390b57cec5SDimitry Andric if (ParsingFilename) { 39400b57cec5SDimitry Andric return LexAngledStringLiteral(Result, CurPtr); 39410b57cec5SDimitry Andric } else if (Char == '<') { 39420b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 39430b57cec5SDimitry Andric if (After == '=') { 39440b57cec5SDimitry Andric Kind = tok::lesslessequal; 39450b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 39460b57cec5SDimitry Andric SizeTmp2, Result); 39470b57cec5SDimitry Andric } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 39480b57cec5SDimitry Andric // If this is actually a '<<<<<<<' version control conflict marker, 39490b57cec5SDimitry Andric // recognize it as such and recover nicely. 39500b57cec5SDimitry Andric goto LexNextToken; 39510b57cec5SDimitry Andric } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 39520b57cec5SDimitry Andric // If this is '<<<<' and we're in a Perforce-style conflict marker, 39530b57cec5SDimitry Andric // ignore it. 39540b57cec5SDimitry Andric goto LexNextToken; 39550b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '<') { 39560b57cec5SDimitry Andric Kind = tok::lesslessless; 39570b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 39580b57cec5SDimitry Andric SizeTmp2, Result); 39590b57cec5SDimitry Andric } else { 39600b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39610b57cec5SDimitry Andric Kind = tok::lessless; 39620b57cec5SDimitry Andric } 39630b57cec5SDimitry Andric } else if (Char == '=') { 39640b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 39650b57cec5SDimitry Andric if (After == '>') { 3966*81ad6265SDimitry Andric if (LangOpts.CPlusPlus20) { 39670b57cec5SDimitry Andric if (!isLexingRawMode()) 39680b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx17_compat_spaceship); 39690b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 39700b57cec5SDimitry Andric SizeTmp2, Result); 39710b57cec5SDimitry Andric Kind = tok::spaceship; 39720b57cec5SDimitry Andric break; 39730b57cec5SDimitry Andric } 39740b57cec5SDimitry Andric // Suggest adding a space between the '<=' and the '>' to avoid a 39750b57cec5SDimitry Andric // change in semantics if this turns up in C++ <=17 mode. 3976*81ad6265SDimitry Andric if (LangOpts.CPlusPlus && !isLexingRawMode()) { 39775ffd83dbSDimitry Andric Diag(BufferPtr, diag::warn_cxx20_compat_spaceship) 39780b57cec5SDimitry Andric << FixItHint::CreateInsertion( 39790b57cec5SDimitry Andric getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " "); 39800b57cec5SDimitry Andric } 39810b57cec5SDimitry Andric } 39820b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39830b57cec5SDimitry Andric Kind = tok::lessequal; 39840b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' 39850b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 && 39860b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 39870b57cec5SDimitry Andric // C++0x [lex.pptoken]p3: 39880b57cec5SDimitry Andric // Otherwise, if the next three characters are <:: and the subsequent 39890b57cec5SDimitry Andric // character is neither : nor >, the < is treated as a preprocessor 39900b57cec5SDimitry Andric // token by itself and not as the first character of the alternative 39910b57cec5SDimitry Andric // token <:. 39920b57cec5SDimitry Andric unsigned SizeTmp3; 39930b57cec5SDimitry Andric char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 39940b57cec5SDimitry Andric if (After != ':' && After != '>') { 39950b57cec5SDimitry Andric Kind = tok::less; 39960b57cec5SDimitry Andric if (!isLexingRawMode()) 39970b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 39980b57cec5SDimitry Andric break; 39990b57cec5SDimitry Andric } 40000b57cec5SDimitry Andric } 40010b57cec5SDimitry Andric 40020b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40030b57cec5SDimitry Andric Kind = tok::l_square; 40040b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' 40050b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40060b57cec5SDimitry Andric Kind = tok::l_brace; 40070b57cec5SDimitry Andric } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 && 40080b57cec5SDimitry Andric lexEditorPlaceholder(Result, CurPtr)) { 40090b57cec5SDimitry Andric return true; 40100b57cec5SDimitry Andric } else { 40110b57cec5SDimitry Andric Kind = tok::less; 40120b57cec5SDimitry Andric } 40130b57cec5SDimitry Andric break; 40140b57cec5SDimitry Andric case '>': 40150b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 40160b57cec5SDimitry Andric if (Char == '=') { 40170b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40180b57cec5SDimitry Andric Kind = tok::greaterequal; 40190b57cec5SDimitry Andric } else if (Char == '>') { 40200b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 40210b57cec5SDimitry Andric if (After == '=') { 40220b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 40230b57cec5SDimitry Andric SizeTmp2, Result); 40240b57cec5SDimitry Andric Kind = tok::greatergreaterequal; 40250b57cec5SDimitry Andric } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 40260b57cec5SDimitry Andric // If this is actually a '>>>>' conflict marker, recognize it as such 40270b57cec5SDimitry Andric // and recover nicely. 40280b57cec5SDimitry Andric goto LexNextToken; 40290b57cec5SDimitry Andric } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 40300b57cec5SDimitry Andric // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 40310b57cec5SDimitry Andric goto LexNextToken; 40320b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '>') { 40330b57cec5SDimitry Andric Kind = tok::greatergreatergreater; 40340b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 40350b57cec5SDimitry Andric SizeTmp2, Result); 40360b57cec5SDimitry Andric } else { 40370b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40380b57cec5SDimitry Andric Kind = tok::greatergreater; 40390b57cec5SDimitry Andric } 40400b57cec5SDimitry Andric } else { 40410b57cec5SDimitry Andric Kind = tok::greater; 40420b57cec5SDimitry Andric } 40430b57cec5SDimitry Andric break; 40440b57cec5SDimitry Andric case '^': 40450b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 40460b57cec5SDimitry Andric if (Char == '=') { 40470b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40480b57cec5SDimitry Andric Kind = tok::caretequal; 40490b57cec5SDimitry Andric } else if (LangOpts.OpenCL && Char == '^') { 40500b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40510b57cec5SDimitry Andric Kind = tok::caretcaret; 40520b57cec5SDimitry Andric } else { 40530b57cec5SDimitry Andric Kind = tok::caret; 40540b57cec5SDimitry Andric } 40550b57cec5SDimitry Andric break; 40560b57cec5SDimitry Andric case '|': 40570b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 40580b57cec5SDimitry Andric if (Char == '=') { 40590b57cec5SDimitry Andric Kind = tok::pipeequal; 40600b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40610b57cec5SDimitry Andric } else if (Char == '|') { 40620b57cec5SDimitry Andric // If this is '|||||||' and we're in a conflict marker, ignore it. 40630b57cec5SDimitry Andric if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 40640b57cec5SDimitry Andric goto LexNextToken; 40650b57cec5SDimitry Andric Kind = tok::pipepipe; 40660b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40670b57cec5SDimitry Andric } else { 40680b57cec5SDimitry Andric Kind = tok::pipe; 40690b57cec5SDimitry Andric } 40700b57cec5SDimitry Andric break; 40710b57cec5SDimitry Andric case ':': 40720b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 40730b57cec5SDimitry Andric if (LangOpts.Digraphs && Char == '>') { 40740b57cec5SDimitry Andric Kind = tok::r_square; // ':>' -> ']' 40750b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40760b57cec5SDimitry Andric } else if ((LangOpts.CPlusPlus || 40770b57cec5SDimitry Andric LangOpts.DoubleSquareBracketAttributes) && 40780b57cec5SDimitry Andric Char == ':') { 40790b57cec5SDimitry Andric Kind = tok::coloncolon; 40800b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40810b57cec5SDimitry Andric } else { 40820b57cec5SDimitry Andric Kind = tok::colon; 40830b57cec5SDimitry Andric } 40840b57cec5SDimitry Andric break; 40850b57cec5SDimitry Andric case ';': 40860b57cec5SDimitry Andric Kind = tok::semi; 40870b57cec5SDimitry Andric break; 40880b57cec5SDimitry Andric case '=': 40890b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 40900b57cec5SDimitry Andric if (Char == '=') { 40910b57cec5SDimitry Andric // If this is '====' and we're in a conflict marker, ignore it. 40920b57cec5SDimitry Andric if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 40930b57cec5SDimitry Andric goto LexNextToken; 40940b57cec5SDimitry Andric 40950b57cec5SDimitry Andric Kind = tok::equalequal; 40960b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40970b57cec5SDimitry Andric } else { 40980b57cec5SDimitry Andric Kind = tok::equal; 40990b57cec5SDimitry Andric } 41000b57cec5SDimitry Andric break; 41010b57cec5SDimitry Andric case ',': 41020b57cec5SDimitry Andric Kind = tok::comma; 41030b57cec5SDimitry Andric break; 41040b57cec5SDimitry Andric case '#': 41050b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 41060b57cec5SDimitry Andric if (Char == '#') { 41070b57cec5SDimitry Andric Kind = tok::hashhash; 41080b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41090b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize 41100b57cec5SDimitry Andric Kind = tok::hashat; 41110b57cec5SDimitry Andric if (!isLexingRawMode()) 41120b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft); 41130b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41140b57cec5SDimitry Andric } else { 41150b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line, 41160b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to 41170b57cec5SDimitry Andric // the preprocessor to handle it. 41180b57cec5SDimitry Andric // TODO: -fpreprocessed mode?? 41190b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 41200b57cec5SDimitry Andric goto HandleDirective; 41210b57cec5SDimitry Andric 41220b57cec5SDimitry Andric Kind = tok::hash; 41230b57cec5SDimitry Andric } 41240b57cec5SDimitry Andric break; 41250b57cec5SDimitry Andric 41260b57cec5SDimitry Andric case '@': 41270b57cec5SDimitry Andric // Objective C support. 41280b57cec5SDimitry Andric if (CurPtr[-1] == '@' && LangOpts.ObjC) 41290b57cec5SDimitry Andric Kind = tok::at; 41300b57cec5SDimitry Andric else 41310b57cec5SDimitry Andric Kind = tok::unknown; 41320b57cec5SDimitry Andric break; 41330b57cec5SDimitry Andric 41340b57cec5SDimitry Andric // UCNs (C99 6.4.3, C++11 [lex.charset]p2) 41350b57cec5SDimitry Andric case '\\': 41360b57cec5SDimitry Andric if (!LangOpts.AsmPreprocessor) { 41370b57cec5SDimitry Andric if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) { 41380b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 41390b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 41400b57cec5SDimitry Andric return true; // KeepWhitespaceMode 41410b57cec5SDimitry Andric 41420b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 41430b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 41440b57cec5SDimitry Andric goto LexNextToken; 41450b57cec5SDimitry Andric } 41460b57cec5SDimitry Andric 4147349cc55cSDimitry Andric return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); 41480b57cec5SDimitry Andric } 41490b57cec5SDimitry Andric } 41500b57cec5SDimitry Andric 41510b57cec5SDimitry Andric Kind = tok::unknown; 41520b57cec5SDimitry Andric break; 41530b57cec5SDimitry Andric 41540b57cec5SDimitry Andric default: { 41550b57cec5SDimitry Andric if (isASCII(Char)) { 41560b57cec5SDimitry Andric Kind = tok::unknown; 41570b57cec5SDimitry Andric break; 41580b57cec5SDimitry Andric } 41590b57cec5SDimitry Andric 41600b57cec5SDimitry Andric llvm::UTF32 CodePoint; 41610b57cec5SDimitry Andric 41620b57cec5SDimitry Andric // We can't just reset CurPtr to BufferPtr because BufferPtr may point to 41630b57cec5SDimitry Andric // an escaped newline. 41640b57cec5SDimitry Andric --CurPtr; 41650b57cec5SDimitry Andric llvm::ConversionResult Status = 41660b57cec5SDimitry Andric llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, 41670b57cec5SDimitry Andric (const llvm::UTF8 *)BufferEnd, 41680b57cec5SDimitry Andric &CodePoint, 41690b57cec5SDimitry Andric llvm::strictConversion); 41700b57cec5SDimitry Andric if (Status == llvm::conversionOK) { 41710b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 41720b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 41730b57cec5SDimitry Andric return true; // KeepWhitespaceMode 41740b57cec5SDimitry Andric 41750b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 41760b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 41770b57cec5SDimitry Andric goto LexNextToken; 41780b57cec5SDimitry Andric } 4179349cc55cSDimitry Andric return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); 41800b57cec5SDimitry Andric } 41810b57cec5SDimitry Andric 41820b57cec5SDimitry Andric if (isLexingRawMode() || ParsingPreprocessorDirective || 41830b57cec5SDimitry Andric PP->isPreprocessedOutput()) { 41840b57cec5SDimitry Andric ++CurPtr; 41850b57cec5SDimitry Andric Kind = tok::unknown; 41860b57cec5SDimitry Andric break; 41870b57cec5SDimitry Andric } 41880b57cec5SDimitry Andric 41890b57cec5SDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally. 41900b57cec5SDimitry Andric // Instead of letting the parser complain about the unknown token, 41910b57cec5SDimitry Andric // just diagnose the invalid UTF-8, then drop the character. 41920b57cec5SDimitry Andric Diag(CurPtr, diag::err_invalid_utf8); 41930b57cec5SDimitry Andric 41940b57cec5SDimitry Andric BufferPtr = CurPtr+1; 41950b57cec5SDimitry Andric // We're pretending the character didn't exist, so just try again with 41960b57cec5SDimitry Andric // this lexer. 41970b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 41980b57cec5SDimitry Andric goto LexNextToken; 41990b57cec5SDimitry Andric } 42000b57cec5SDimitry Andric } 42010b57cec5SDimitry Andric 42020b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 42030b57cec5SDimitry Andric MIOpt.ReadToken(); 42040b57cec5SDimitry Andric 42050b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 42060b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 42070b57cec5SDimitry Andric return true; 42080b57cec5SDimitry Andric 42090b57cec5SDimitry Andric HandleDirective: 42100b57cec5SDimitry Andric // We parsed a # character and it's the start of a preprocessing directive. 42110b57cec5SDimitry Andric 42120b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::hash); 42130b57cec5SDimitry Andric PP->HandleDirective(Result); 42140b57cec5SDimitry Andric 42150b57cec5SDimitry Andric if (PP->hadModuleLoaderFatalFailure()) { 42160b57cec5SDimitry Andric // With a fatal failure in the module loader, we abort parsing. 42170b57cec5SDimitry Andric assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof"); 42180b57cec5SDimitry Andric return true; 42190b57cec5SDimitry Andric } 42200b57cec5SDimitry Andric 42210b57cec5SDimitry Andric // We parsed the directive; lex a token with the new state. 42220b57cec5SDimitry Andric return false; 42230b57cec5SDimitry Andric } 4224*81ad6265SDimitry Andric 4225*81ad6265SDimitry Andric const char *Lexer::convertDependencyDirectiveToken( 4226*81ad6265SDimitry Andric const dependency_directives_scan::Token &DDTok, Token &Result) { 4227*81ad6265SDimitry Andric const char *TokPtr = BufferStart + DDTok.Offset; 4228*81ad6265SDimitry Andric Result.startToken(); 4229*81ad6265SDimitry Andric Result.setLocation(getSourceLocation(TokPtr)); 4230*81ad6265SDimitry Andric Result.setKind(DDTok.Kind); 4231*81ad6265SDimitry Andric Result.setFlag((Token::TokenFlags)DDTok.Flags); 4232*81ad6265SDimitry Andric Result.setLength(DDTok.Length); 4233*81ad6265SDimitry Andric BufferPtr = TokPtr + DDTok.Length; 4234*81ad6265SDimitry Andric return TokPtr; 4235*81ad6265SDimitry Andric } 4236*81ad6265SDimitry Andric 4237*81ad6265SDimitry Andric bool Lexer::LexDependencyDirectiveToken(Token &Result) { 4238*81ad6265SDimitry Andric assert(isDependencyDirectivesLexer()); 4239*81ad6265SDimitry Andric 4240*81ad6265SDimitry Andric using namespace dependency_directives_scan; 4241*81ad6265SDimitry Andric 4242*81ad6265SDimitry Andric while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) { 4243*81ad6265SDimitry Andric if (DepDirectives.front().Kind == pp_eof) 4244*81ad6265SDimitry Andric return LexEndOfFile(Result, BufferEnd); 4245*81ad6265SDimitry Andric NextDepDirectiveTokenIndex = 0; 4246*81ad6265SDimitry Andric DepDirectives = DepDirectives.drop_front(); 4247*81ad6265SDimitry Andric } 4248*81ad6265SDimitry Andric 4249*81ad6265SDimitry Andric const dependency_directives_scan::Token &DDTok = 4250*81ad6265SDimitry Andric DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++]; 4251*81ad6265SDimitry Andric if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) { 4252*81ad6265SDimitry Andric // Read something other than a preprocessor directive hash. 4253*81ad6265SDimitry Andric MIOpt.ReadToken(); 4254*81ad6265SDimitry Andric } 4255*81ad6265SDimitry Andric 4256*81ad6265SDimitry Andric const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result); 4257*81ad6265SDimitry Andric 4258*81ad6265SDimitry Andric if (Result.is(tok::hash) && Result.isAtStartOfLine()) { 4259*81ad6265SDimitry Andric PP->HandleDirective(Result); 4260*81ad6265SDimitry Andric return false; 4261*81ad6265SDimitry Andric } 4262*81ad6265SDimitry Andric if (Result.is(tok::raw_identifier)) { 4263*81ad6265SDimitry Andric Result.setRawIdentifierData(TokPtr); 4264*81ad6265SDimitry Andric if (!isLexingRawMode()) { 4265*81ad6265SDimitry Andric IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 4266*81ad6265SDimitry Andric if (II->isHandleIdentifierCase()) 4267*81ad6265SDimitry Andric return PP->HandleIdentifier(Result); 4268*81ad6265SDimitry Andric } 4269*81ad6265SDimitry Andric return true; 4270*81ad6265SDimitry Andric } 4271*81ad6265SDimitry Andric if (Result.isLiteral()) { 4272*81ad6265SDimitry Andric Result.setLiteralData(TokPtr); 4273*81ad6265SDimitry Andric return true; 4274*81ad6265SDimitry Andric } 4275*81ad6265SDimitry Andric if (Result.is(tok::colon) && 4276*81ad6265SDimitry Andric (LangOpts.CPlusPlus || LangOpts.DoubleSquareBracketAttributes)) { 4277*81ad6265SDimitry Andric // Convert consecutive colons to 'tok::coloncolon'. 4278*81ad6265SDimitry Andric if (*BufferPtr == ':') { 4279*81ad6265SDimitry Andric assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( 4280*81ad6265SDimitry Andric tok::colon)); 4281*81ad6265SDimitry Andric ++NextDepDirectiveTokenIndex; 4282*81ad6265SDimitry Andric Result.setKind(tok::coloncolon); 4283*81ad6265SDimitry Andric } 4284*81ad6265SDimitry Andric return true; 4285*81ad6265SDimitry Andric } 4286*81ad6265SDimitry Andric if (Result.is(tok::eod)) 4287*81ad6265SDimitry Andric ParsingPreprocessorDirective = false; 4288*81ad6265SDimitry Andric 4289*81ad6265SDimitry Andric return true; 4290*81ad6265SDimitry Andric } 4291*81ad6265SDimitry Andric 4292*81ad6265SDimitry Andric bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) { 4293*81ad6265SDimitry Andric assert(isDependencyDirectivesLexer()); 4294*81ad6265SDimitry Andric 4295*81ad6265SDimitry Andric using namespace dependency_directives_scan; 4296*81ad6265SDimitry Andric 4297*81ad6265SDimitry Andric bool Stop = false; 4298*81ad6265SDimitry Andric unsigned NestedIfs = 0; 4299*81ad6265SDimitry Andric do { 4300*81ad6265SDimitry Andric DepDirectives = DepDirectives.drop_front(); 4301*81ad6265SDimitry Andric switch (DepDirectives.front().Kind) { 4302*81ad6265SDimitry Andric case pp_none: 4303*81ad6265SDimitry Andric llvm_unreachable("unexpected 'pp_none'"); 4304*81ad6265SDimitry Andric case pp_include: 4305*81ad6265SDimitry Andric case pp___include_macros: 4306*81ad6265SDimitry Andric case pp_define: 4307*81ad6265SDimitry Andric case pp_undef: 4308*81ad6265SDimitry Andric case pp_import: 4309*81ad6265SDimitry Andric case pp_pragma_import: 4310*81ad6265SDimitry Andric case pp_pragma_once: 4311*81ad6265SDimitry Andric case pp_pragma_push_macro: 4312*81ad6265SDimitry Andric case pp_pragma_pop_macro: 4313*81ad6265SDimitry Andric case pp_pragma_include_alias: 4314*81ad6265SDimitry Andric case pp_include_next: 4315*81ad6265SDimitry Andric case decl_at_import: 4316*81ad6265SDimitry Andric case cxx_module_decl: 4317*81ad6265SDimitry Andric case cxx_import_decl: 4318*81ad6265SDimitry Andric case cxx_export_module_decl: 4319*81ad6265SDimitry Andric case cxx_export_import_decl: 4320*81ad6265SDimitry Andric break; 4321*81ad6265SDimitry Andric case pp_if: 4322*81ad6265SDimitry Andric case pp_ifdef: 4323*81ad6265SDimitry Andric case pp_ifndef: 4324*81ad6265SDimitry Andric ++NestedIfs; 4325*81ad6265SDimitry Andric break; 4326*81ad6265SDimitry Andric case pp_elif: 4327*81ad6265SDimitry Andric case pp_elifdef: 4328*81ad6265SDimitry Andric case pp_elifndef: 4329*81ad6265SDimitry Andric case pp_else: 4330*81ad6265SDimitry Andric if (!NestedIfs) { 4331*81ad6265SDimitry Andric Stop = true; 4332*81ad6265SDimitry Andric } 4333*81ad6265SDimitry Andric break; 4334*81ad6265SDimitry Andric case pp_endif: 4335*81ad6265SDimitry Andric if (!NestedIfs) { 4336*81ad6265SDimitry Andric Stop = true; 4337*81ad6265SDimitry Andric } else { 4338*81ad6265SDimitry Andric --NestedIfs; 4339*81ad6265SDimitry Andric } 4340*81ad6265SDimitry Andric break; 4341*81ad6265SDimitry Andric case pp_eof: 4342*81ad6265SDimitry Andric NextDepDirectiveTokenIndex = 0; 4343*81ad6265SDimitry Andric return LexEndOfFile(Result, BufferEnd); 4344*81ad6265SDimitry Andric } 4345*81ad6265SDimitry Andric } while (!Stop); 4346*81ad6265SDimitry Andric 4347*81ad6265SDimitry Andric const dependency_directives_scan::Token &DDTok = 4348*81ad6265SDimitry Andric DepDirectives.front().Tokens.front(); 4349*81ad6265SDimitry Andric assert(DDTok.is(tok::hash)); 4350*81ad6265SDimitry Andric NextDepDirectiveTokenIndex = 1; 4351*81ad6265SDimitry Andric 4352*81ad6265SDimitry Andric convertDependencyDirectiveToken(DDTok, Result); 4353*81ad6265SDimitry Andric return false; 4354*81ad6265SDimitry Andric } 4355