//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//  This file implements the Lexer and Token interfaces.
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/Lexer.h"
#include "UnicodeCharSets.h"
#include "clang/Basic/CharInfo.h"
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/IdentifierTable.h"
#include "clang/Basic/LLVM.h"
#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/LiteralSupport.h"
#include "clang/Lex/MultipleIncludeOpt.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/PreprocessorOptions.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
32e8d8bef9SDimitry Andric #include "llvm/ADT/StringSwitch.h" 330b57cec5SDimitry Andric #include "llvm/Support/Compiler.h" 340b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h" 350b57cec5SDimitry Andric #include "llvm/Support/MathExtras.h" 36e8d8bef9SDimitry Andric #include "llvm/Support/MemoryBufferRef.h" 370b57cec5SDimitry Andric #include "llvm/Support/NativeFormatting.h" 3881ad6265SDimitry Andric #include "llvm/Support/Unicode.h" 390b57cec5SDimitry Andric #include "llvm/Support/UnicodeCharRanges.h" 400b57cec5SDimitry Andric #include <algorithm> 410b57cec5SDimitry Andric #include <cassert> 420b57cec5SDimitry Andric #include <cstddef> 430b57cec5SDimitry Andric #include <cstdint> 440b57cec5SDimitry Andric #include <cstring> 45*bdd1243dSDimitry Andric #include <optional> 460b57cec5SDimitry Andric #include <string> 470b57cec5SDimitry Andric #include <tuple> 480b57cec5SDimitry Andric #include <utility> 490b57cec5SDimitry Andric 500b57cec5SDimitry Andric using namespace clang; 510b57cec5SDimitry Andric 520b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 530b57cec5SDimitry Andric // Token Class Implementation 540b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 550b57cec5SDimitry Andric 560b57cec5SDimitry Andric /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 570b57cec5SDimitry Andric bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 580b57cec5SDimitry Andric if (isAnnotation()) 590b57cec5SDimitry Andric return false; 600b57cec5SDimitry Andric if (IdentifierInfo *II = getIdentifierInfo()) 610b57cec5SDimitry Andric return II->getObjCKeywordID() == objcKey; 620b57cec5SDimitry Andric return false; 630b57cec5SDimitry Andric } 640b57cec5SDimitry Andric 650b57cec5SDimitry Andric /// getObjCKeywordID - Return the ObjC keyword kind. 
660b57cec5SDimitry Andric tok::ObjCKeywordKind Token::getObjCKeywordID() const { 670b57cec5SDimitry Andric if (isAnnotation()) 680b57cec5SDimitry Andric return tok::objc_not_keyword; 690b57cec5SDimitry Andric IdentifierInfo *specId = getIdentifierInfo(); 700b57cec5SDimitry Andric return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 710b57cec5SDimitry Andric } 720b57cec5SDimitry Andric 730b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 740b57cec5SDimitry Andric // Lexer Class Implementation 750b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 760b57cec5SDimitry Andric 770b57cec5SDimitry Andric void Lexer::anchor() {} 780b57cec5SDimitry Andric 790b57cec5SDimitry Andric void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 800b57cec5SDimitry Andric const char *BufEnd) { 810b57cec5SDimitry Andric BufferStart = BufStart; 820b57cec5SDimitry Andric BufferPtr = BufPtr; 830b57cec5SDimitry Andric BufferEnd = BufEnd; 840b57cec5SDimitry Andric 850b57cec5SDimitry Andric assert(BufEnd[0] == 0 && 860b57cec5SDimitry Andric "We assume that the input buffer has a null character at the end" 870b57cec5SDimitry Andric " to simplify lexing!"); 880b57cec5SDimitry Andric 890b57cec5SDimitry Andric // Check whether we have a BOM in the beginning of the buffer. If yes - act 900b57cec5SDimitry Andric // accordingly. Right now we support only UTF-8 with and without BOM, so, just 910b57cec5SDimitry Andric // skip the UTF-8 BOM if it's present. 920b57cec5SDimitry Andric if (BufferStart == BufferPtr) { 930b57cec5SDimitry Andric // Determine the size of the BOM. 
940b57cec5SDimitry Andric StringRef Buf(BufferStart, BufferEnd - BufferStart); 950b57cec5SDimitry Andric size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 960b57cec5SDimitry Andric .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 970b57cec5SDimitry Andric .Default(0); 980b57cec5SDimitry Andric 990b57cec5SDimitry Andric // Skip the BOM. 1000b57cec5SDimitry Andric BufferPtr += BOMLength; 1010b57cec5SDimitry Andric } 1020b57cec5SDimitry Andric 1030b57cec5SDimitry Andric Is_PragmaLexer = false; 1040b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None; 1050b57cec5SDimitry Andric 1060b57cec5SDimitry Andric // Start of the file is a start of line. 1070b57cec5SDimitry Andric IsAtStartOfLine = true; 1080b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 1090b57cec5SDimitry Andric 1100b57cec5SDimitry Andric HasLeadingSpace = false; 1110b57cec5SDimitry Andric HasLeadingEmptyMacro = false; 1120b57cec5SDimitry Andric 1130b57cec5SDimitry Andric // We are not after parsing a #. 1140b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 1150b57cec5SDimitry Andric 1160b57cec5SDimitry Andric // We are not after parsing #include. 1170b57cec5SDimitry Andric ParsingFilename = false; 1180b57cec5SDimitry Andric 1190b57cec5SDimitry Andric // We are not in raw mode. Raw mode disables diagnostics and interpretation 1200b57cec5SDimitry Andric // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 1210b57cec5SDimitry Andric // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 1220b57cec5SDimitry Andric // or otherwise skipping over tokens. 1230b57cec5SDimitry Andric LexingRawMode = false; 1240b57cec5SDimitry Andric 1250b57cec5SDimitry Andric // Default to not keeping comments. 
1260b57cec5SDimitry Andric ExtendedTokenMode = 0; 127e8d8bef9SDimitry Andric 128e8d8bef9SDimitry Andric NewLinePtr = nullptr; 1290b57cec5SDimitry Andric } 1300b57cec5SDimitry Andric 1310b57cec5SDimitry Andric /// Lexer constructor - Create a new lexer object for the specified buffer 1320b57cec5SDimitry Andric /// with the specified preprocessor managing the lexing process. This lexer 1330b57cec5SDimitry Andric /// assumes that the associated file buffer and Preprocessor objects will 1340b57cec5SDimitry Andric /// outlive it, so it doesn't take ownership of either of them. 135e8d8bef9SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, 136349cc55cSDimitry Andric Preprocessor &PP, bool IsFirstIncludeOfFile) 1370b57cec5SDimitry Andric : PreprocessorLexer(&PP, FID), 1380b57cec5SDimitry Andric FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 13981ad6265SDimitry Andric LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment), 14081ad6265SDimitry Andric IsFirstTimeLexingFile(IsFirstIncludeOfFile) { 141e8d8bef9SDimitry Andric InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(), 142e8d8bef9SDimitry Andric InputFile.getBufferEnd()); 1430b57cec5SDimitry Andric 1440b57cec5SDimitry Andric resetExtendedTokenMode(); 1450b57cec5SDimitry Andric } 1460b57cec5SDimitry Andric 1470b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only 1480b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 1490b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it. 
1500b57cec5SDimitry Andric Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, 151349cc55cSDimitry Andric const char *BufStart, const char *BufPtr, const char *BufEnd, 152349cc55cSDimitry Andric bool IsFirstIncludeOfFile) 15381ad6265SDimitry Andric : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment), 154349cc55cSDimitry Andric IsFirstTimeLexingFile(IsFirstIncludeOfFile) { 1550b57cec5SDimitry Andric InitLexer(BufStart, BufPtr, BufEnd); 1560b57cec5SDimitry Andric 1570b57cec5SDimitry Andric // We *are* in raw mode. 1580b57cec5SDimitry Andric LexingRawMode = true; 1590b57cec5SDimitry Andric } 1600b57cec5SDimitry Andric 1610b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only 1620b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 1630b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it. 164e8d8bef9SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile, 165349cc55cSDimitry Andric const SourceManager &SM, const LangOptions &langOpts, 166349cc55cSDimitry Andric bool IsFirstIncludeOfFile) 167e8d8bef9SDimitry Andric : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(), 168349cc55cSDimitry Andric FromFile.getBufferStart(), FromFile.getBufferEnd(), 169349cc55cSDimitry Andric IsFirstIncludeOfFile) {} 1700b57cec5SDimitry Andric 1710b57cec5SDimitry Andric void Lexer::resetExtendedTokenMode() { 1720b57cec5SDimitry Andric assert(PP && "Cannot reset token mode without a preprocessor"); 1730b57cec5SDimitry Andric if (LangOpts.TraditionalCPP) 1740b57cec5SDimitry Andric SetKeepWhitespaceMode(true); 1750b57cec5SDimitry Andric else 1760b57cec5SDimitry Andric SetCommentRetentionState(PP->getCommentRetentionState()); 1770b57cec5SDimitry Andric } 1780b57cec5SDimitry Andric 1790b57cec5SDimitry Andric /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 
1800b57cec5SDimitry Andric /// _Pragma expansion. This has a variety of magic semantics that this method 1810b57cec5SDimitry Andric /// sets up. It returns a new'd Lexer that must be delete'd when done. 1820b57cec5SDimitry Andric /// 1830b57cec5SDimitry Andric /// On entrance to this routine, TokStartLoc is a macro location which has a 1840b57cec5SDimitry Andric /// spelling loc that indicates the bytes to be lexed for the token and an 1850b57cec5SDimitry Andric /// expansion location that indicates where all lexed tokens should be 1860b57cec5SDimitry Andric /// "expanded from". 1870b57cec5SDimitry Andric /// 1880b57cec5SDimitry Andric /// TODO: It would really be nice to make _Pragma just be a wrapper around a 1890b57cec5SDimitry Andric /// normal lexer that remaps tokens as they fly by. This would require making 1900b57cec5SDimitry Andric /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 1910b57cec5SDimitry Andric /// interface that could handle this stuff. This would pull GetMappedTokenLoc 1920b57cec5SDimitry Andric /// out of the critical path of the lexer! 1930b57cec5SDimitry Andric /// 1940b57cec5SDimitry Andric Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 1950b57cec5SDimitry Andric SourceLocation ExpansionLocStart, 1960b57cec5SDimitry Andric SourceLocation ExpansionLocEnd, 1970b57cec5SDimitry Andric unsigned TokLen, Preprocessor &PP) { 1980b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager(); 1990b57cec5SDimitry Andric 2000b57cec5SDimitry Andric // Create the lexer as if we were going to lex the file normally. 
2010b57cec5SDimitry Andric FileID SpellingFID = SM.getFileID(SpellingLoc); 202e8d8bef9SDimitry Andric llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID); 2030b57cec5SDimitry Andric Lexer *L = new Lexer(SpellingFID, InputFile, PP); 2040b57cec5SDimitry Andric 2050b57cec5SDimitry Andric // Now that the lexer is created, change the start/end locations so that we 2060b57cec5SDimitry Andric // just lex the subsection of the file that we want. This is lexing from a 2070b57cec5SDimitry Andric // scratch buffer. 2080b57cec5SDimitry Andric const char *StrData = SM.getCharacterData(SpellingLoc); 2090b57cec5SDimitry Andric 2100b57cec5SDimitry Andric L->BufferPtr = StrData; 2110b57cec5SDimitry Andric L->BufferEnd = StrData+TokLen; 2120b57cec5SDimitry Andric assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 2130b57cec5SDimitry Andric 2140b57cec5SDimitry Andric // Set the SourceLocation with the remapping information. This ensures that 2150b57cec5SDimitry Andric // GetMappedTokenLoc will remap the tokens as they are lexed. 2160b57cec5SDimitry Andric L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 2170b57cec5SDimitry Andric ExpansionLocStart, 2180b57cec5SDimitry Andric ExpansionLocEnd, TokLen); 2190b57cec5SDimitry Andric 2200b57cec5SDimitry Andric // Ensure that the lexer thinks it is inside a directive, so that end \n will 2210b57cec5SDimitry Andric // return an EOD token. 2220b57cec5SDimitry Andric L->ParsingPreprocessorDirective = true; 2230b57cec5SDimitry Andric 2240b57cec5SDimitry Andric // This lexer really is for _Pragma. 
2250b57cec5SDimitry Andric L->Is_PragmaLexer = true; 2260b57cec5SDimitry Andric return L; 2270b57cec5SDimitry Andric } 2280b57cec5SDimitry Andric 22981ad6265SDimitry Andric void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) { 23081ad6265SDimitry Andric this->IsAtPhysicalStartOfLine = IsAtStartOfLine; 23181ad6265SDimitry Andric this->IsAtStartOfLine = IsAtStartOfLine; 23281ad6265SDimitry Andric assert((BufferStart + Offset) <= BufferEnd); 23381ad6265SDimitry Andric BufferPtr = BufferStart + Offset; 234a7dea167SDimitry Andric } 235a7dea167SDimitry Andric 2360b57cec5SDimitry Andric template <typename T> static void StringifyImpl(T &Str, char Quote) { 2370b57cec5SDimitry Andric typename T::size_type i = 0, e = Str.size(); 2380b57cec5SDimitry Andric while (i < e) { 2390b57cec5SDimitry Andric if (Str[i] == '\\' || Str[i] == Quote) { 2400b57cec5SDimitry Andric Str.insert(Str.begin() + i, '\\'); 2410b57cec5SDimitry Andric i += 2; 2420b57cec5SDimitry Andric ++e; 2430b57cec5SDimitry Andric } else if (Str[i] == '\n' || Str[i] == '\r') { 2440b57cec5SDimitry Andric // Replace '\r\n' and '\n\r' to '\\' followed by 'n'. 2450b57cec5SDimitry Andric if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && 2460b57cec5SDimitry Andric Str[i] != Str[i + 1]) { 2470b57cec5SDimitry Andric Str[i] = '\\'; 2480b57cec5SDimitry Andric Str[i + 1] = 'n'; 2490b57cec5SDimitry Andric } else { 2500b57cec5SDimitry Andric // Replace '\n' and '\r' to '\\' followed by 'n'. 
2510b57cec5SDimitry Andric Str[i] = '\\'; 2520b57cec5SDimitry Andric Str.insert(Str.begin() + i + 1, 'n'); 2530b57cec5SDimitry Andric ++e; 2540b57cec5SDimitry Andric } 2550b57cec5SDimitry Andric i += 2; 2560b57cec5SDimitry Andric } else 2570b57cec5SDimitry Andric ++i; 2580b57cec5SDimitry Andric } 2590b57cec5SDimitry Andric } 2600b57cec5SDimitry Andric 2610b57cec5SDimitry Andric std::string Lexer::Stringify(StringRef Str, bool Charify) { 2625ffd83dbSDimitry Andric std::string Result = std::string(Str); 2630b57cec5SDimitry Andric char Quote = Charify ? '\'' : '"'; 2640b57cec5SDimitry Andric StringifyImpl(Result, Quote); 2650b57cec5SDimitry Andric return Result; 2660b57cec5SDimitry Andric } 2670b57cec5SDimitry Andric 2680b57cec5SDimitry Andric void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); } 2690b57cec5SDimitry Andric 2700b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 2710b57cec5SDimitry Andric // Token Spelling 2720b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 2730b57cec5SDimitry Andric 2740b57cec5SDimitry Andric /// Slow case of getSpelling. Extract the characters comprising the 2750b57cec5SDimitry Andric /// spelling of this token from the provided input buffer. 2760b57cec5SDimitry Andric static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, 2770b57cec5SDimitry Andric const LangOptions &LangOpts, char *Spelling) { 2780b57cec5SDimitry Andric assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); 2790b57cec5SDimitry Andric 2800b57cec5SDimitry Andric size_t Length = 0; 2810b57cec5SDimitry Andric const char *BufEnd = BufPtr + Tok.getLength(); 2820b57cec5SDimitry Andric 2830b57cec5SDimitry Andric if (tok::isStringLiteral(Tok.getKind())) { 2840b57cec5SDimitry Andric // Munch the encoding-prefix and opening double-quote. 
2850b57cec5SDimitry Andric while (BufPtr < BufEnd) { 2860b57cec5SDimitry Andric unsigned Size; 2870b57cec5SDimitry Andric Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 2880b57cec5SDimitry Andric BufPtr += Size; 2890b57cec5SDimitry Andric 2900b57cec5SDimitry Andric if (Spelling[Length - 1] == '"') 2910b57cec5SDimitry Andric break; 2920b57cec5SDimitry Andric } 2930b57cec5SDimitry Andric 2940b57cec5SDimitry Andric // Raw string literals need special handling; trigraph expansion and line 2950b57cec5SDimitry Andric // splicing do not occur within their d-char-sequence nor within their 2960b57cec5SDimitry Andric // r-char-sequence. 2970b57cec5SDimitry Andric if (Length >= 2 && 2980b57cec5SDimitry Andric Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { 2990b57cec5SDimitry Andric // Search backwards from the end of the token to find the matching closing 3000b57cec5SDimitry Andric // quote. 3010b57cec5SDimitry Andric const char *RawEnd = BufEnd; 3020b57cec5SDimitry Andric do --RawEnd; while (*RawEnd != '"'); 3030b57cec5SDimitry Andric size_t RawLength = RawEnd - BufPtr + 1; 3040b57cec5SDimitry Andric 3050b57cec5SDimitry Andric // Everything between the quotes is included verbatim in the spelling. 3060b57cec5SDimitry Andric memcpy(Spelling + Length, BufPtr, RawLength); 3070b57cec5SDimitry Andric Length += RawLength; 3080b57cec5SDimitry Andric BufPtr += RawLength; 3090b57cec5SDimitry Andric 3100b57cec5SDimitry Andric // The rest of the token is lexed normally. 
3110b57cec5SDimitry Andric } 3120b57cec5SDimitry Andric } 3130b57cec5SDimitry Andric 3140b57cec5SDimitry Andric while (BufPtr < BufEnd) { 3150b57cec5SDimitry Andric unsigned Size; 3160b57cec5SDimitry Andric Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 3170b57cec5SDimitry Andric BufPtr += Size; 3180b57cec5SDimitry Andric } 3190b57cec5SDimitry Andric 3200b57cec5SDimitry Andric assert(Length < Tok.getLength() && 3210b57cec5SDimitry Andric "NeedsCleaning flag set on token that didn't need cleaning!"); 3220b57cec5SDimitry Andric return Length; 3230b57cec5SDimitry Andric } 3240b57cec5SDimitry Andric 3250b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a 3260b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file 3270b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this 3280b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs 3290b57cec5SDimitry Andric /// UCNs, etc. 3300b57cec5SDimitry Andric StringRef Lexer::getSpelling(SourceLocation loc, 3310b57cec5SDimitry Andric SmallVectorImpl<char> &buffer, 3320b57cec5SDimitry Andric const SourceManager &SM, 3330b57cec5SDimitry Andric const LangOptions &options, 3340b57cec5SDimitry Andric bool *invalid) { 3350b57cec5SDimitry Andric // Break down the source location. 3360b57cec5SDimitry Andric std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 3370b57cec5SDimitry Andric 3380b57cec5SDimitry Andric // Try to the load the file buffer. 
3390b57cec5SDimitry Andric bool invalidTemp = false; 3400b57cec5SDimitry Andric StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 3410b57cec5SDimitry Andric if (invalidTemp) { 3420b57cec5SDimitry Andric if (invalid) *invalid = true; 3430b57cec5SDimitry Andric return {}; 3440b57cec5SDimitry Andric } 3450b57cec5SDimitry Andric 3460b57cec5SDimitry Andric const char *tokenBegin = file.data() + locInfo.second; 3470b57cec5SDimitry Andric 3480b57cec5SDimitry Andric // Lex from the start of the given location. 3490b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 3500b57cec5SDimitry Andric file.begin(), tokenBegin, file.end()); 3510b57cec5SDimitry Andric Token token; 3520b57cec5SDimitry Andric lexer.LexFromRawLexer(token); 3530b57cec5SDimitry Andric 3540b57cec5SDimitry Andric unsigned length = token.getLength(); 3550b57cec5SDimitry Andric 3560b57cec5SDimitry Andric // Common case: no need for cleaning. 3570b57cec5SDimitry Andric if (!token.needsCleaning()) 3580b57cec5SDimitry Andric return StringRef(tokenBegin, length); 3590b57cec5SDimitry Andric 3600b57cec5SDimitry Andric // Hard case, we need to relex the characters into the string. 3610b57cec5SDimitry Andric buffer.resize(length); 3620b57cec5SDimitry Andric buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); 3630b57cec5SDimitry Andric return StringRef(buffer.data(), buffer.size()); 3640b57cec5SDimitry Andric } 3650b57cec5SDimitry Andric 3660b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a 3670b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file 3680b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this 3690b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs 3700b57cec5SDimitry Andric /// UCNs, etc. 
3710b57cec5SDimitry Andric std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 3720b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) { 3730b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 3740b57cec5SDimitry Andric 3750b57cec5SDimitry Andric bool CharDataInvalid = false; 3760b57cec5SDimitry Andric const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 3770b57cec5SDimitry Andric &CharDataInvalid); 3780b57cec5SDimitry Andric if (Invalid) 3790b57cec5SDimitry Andric *Invalid = CharDataInvalid; 3800b57cec5SDimitry Andric if (CharDataInvalid) 3810b57cec5SDimitry Andric return {}; 3820b57cec5SDimitry Andric 3830b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly. 3840b57cec5SDimitry Andric if (!Tok.needsCleaning()) 3850b57cec5SDimitry Andric return std::string(TokStart, TokStart + Tok.getLength()); 3860b57cec5SDimitry Andric 3870b57cec5SDimitry Andric std::string Result; 3880b57cec5SDimitry Andric Result.resize(Tok.getLength()); 3890b57cec5SDimitry Andric Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); 3900b57cec5SDimitry Andric return Result; 3910b57cec5SDimitry Andric } 3920b57cec5SDimitry Andric 3930b57cec5SDimitry Andric /// getSpelling - This method is used to get the spelling of a token into a 3940b57cec5SDimitry Andric /// preallocated buffer, instead of as an std::string. The caller is required 3950b57cec5SDimitry Andric /// to allocate enough space for the token, which is guaranteed to be at least 3960b57cec5SDimitry Andric /// Tok.getLength() bytes long. The actual length of the token is returned. 
3970b57cec5SDimitry Andric /// 3980b57cec5SDimitry Andric /// Note that this method may do two possible things: it may either fill in 3990b57cec5SDimitry Andric /// the buffer specified with characters, or it may *change the input pointer* 4000b57cec5SDimitry Andric /// to point to a constant buffer with the data already in it (avoiding a 4010b57cec5SDimitry Andric /// copy). The caller is not allowed to modify the returned buffer pointer 4020b57cec5SDimitry Andric /// if an internal buffer is returned. 4030b57cec5SDimitry Andric unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 4040b57cec5SDimitry Andric const SourceManager &SourceMgr, 4050b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) { 4060b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 4070b57cec5SDimitry Andric 4080b57cec5SDimitry Andric const char *TokStart = nullptr; 4090b57cec5SDimitry Andric // NOTE: this has to be checked *before* testing for an IdentifierInfo. 4100b57cec5SDimitry Andric if (Tok.is(tok::raw_identifier)) 4110b57cec5SDimitry Andric TokStart = Tok.getRawIdentifier().data(); 4120b57cec5SDimitry Andric else if (!Tok.hasUCN()) { 4130b57cec5SDimitry Andric if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 4140b57cec5SDimitry Andric // Just return the string from the identifier table, which is very quick. 4150b57cec5SDimitry Andric Buffer = II->getNameStart(); 4160b57cec5SDimitry Andric return II->getLength(); 4170b57cec5SDimitry Andric } 4180b57cec5SDimitry Andric } 4190b57cec5SDimitry Andric 4200b57cec5SDimitry Andric // NOTE: this can be checked even after testing for an IdentifierInfo. 4210b57cec5SDimitry Andric if (Tok.isLiteral()) 4220b57cec5SDimitry Andric TokStart = Tok.getLiteralData(); 4230b57cec5SDimitry Andric 4240b57cec5SDimitry Andric if (!TokStart) { 4250b57cec5SDimitry Andric // Compute the start of the token in the input lexer buffer. 
4260b57cec5SDimitry Andric bool CharDataInvalid = false; 4270b57cec5SDimitry Andric TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 4280b57cec5SDimitry Andric if (Invalid) 4290b57cec5SDimitry Andric *Invalid = CharDataInvalid; 4300b57cec5SDimitry Andric if (CharDataInvalid) { 4310b57cec5SDimitry Andric Buffer = ""; 4320b57cec5SDimitry Andric return 0; 4330b57cec5SDimitry Andric } 4340b57cec5SDimitry Andric } 4350b57cec5SDimitry Andric 4360b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly. 4370b57cec5SDimitry Andric if (!Tok.needsCleaning()) { 4380b57cec5SDimitry Andric Buffer = TokStart; 4390b57cec5SDimitry Andric return Tok.getLength(); 4400b57cec5SDimitry Andric } 4410b57cec5SDimitry Andric 4420b57cec5SDimitry Andric // Otherwise, hard case, relex the characters into the string. 4430b57cec5SDimitry Andric return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); 4440b57cec5SDimitry Andric } 4450b57cec5SDimitry Andric 4460b57cec5SDimitry Andric /// MeasureTokenLength - Relex the token at the specified location and return 4470b57cec5SDimitry Andric /// its length in bytes in the input file. If the token needs cleaning (e.g. 4480b57cec5SDimitry Andric /// includes a trigraph or an escaped newline) then this count includes bytes 4490b57cec5SDimitry Andric /// that are part of that. 4500b57cec5SDimitry Andric unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 4510b57cec5SDimitry Andric const SourceManager &SM, 4520b57cec5SDimitry Andric const LangOptions &LangOpts) { 4530b57cec5SDimitry Andric Token TheTok; 4540b57cec5SDimitry Andric if (getRawToken(Loc, TheTok, SM, LangOpts)) 4550b57cec5SDimitry Andric return 0; 4560b57cec5SDimitry Andric return TheTok.getLength(); 4570b57cec5SDimitry Andric } 4580b57cec5SDimitry Andric 4590b57cec5SDimitry Andric /// Relex the token at the specified location. 4600b57cec5SDimitry Andric /// \returns true if there was a failure, false on success. 
4610b57cec5SDimitry Andric bool Lexer::getRawToken(SourceLocation Loc, Token &Result, 4620b57cec5SDimitry Andric const SourceManager &SM, 4630b57cec5SDimitry Andric const LangOptions &LangOpts, 4640b57cec5SDimitry Andric bool IgnoreWhiteSpace) { 4650b57cec5SDimitry Andric // TODO: this could be special cased for common tokens like identifiers, ')', 4660b57cec5SDimitry Andric // etc to make this faster, if it mattered. Just look at StrData[0] to handle 4670b57cec5SDimitry Andric // all obviously single-char tokens. This could use 4680b57cec5SDimitry Andric // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 4690b57cec5SDimitry Andric // something. 4700b57cec5SDimitry Andric 4710b57cec5SDimitry Andric // If this comes from a macro expansion, we really do want the macro name, not 4720b57cec5SDimitry Andric // the token this macro expanded to. 4730b57cec5SDimitry Andric Loc = SM.getExpansionLoc(Loc); 4740b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 4750b57cec5SDimitry Andric bool Invalid = false; 4760b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 4770b57cec5SDimitry Andric if (Invalid) 4780b57cec5SDimitry Andric return true; 4790b57cec5SDimitry Andric 4800b57cec5SDimitry Andric const char *StrData = Buffer.data()+LocInfo.second; 4810b57cec5SDimitry Andric 4820b57cec5SDimitry Andric if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) 4830b57cec5SDimitry Andric return true; 4840b57cec5SDimitry Andric 4850b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token. 
4860b57cec5SDimitry Andric Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 4870b57cec5SDimitry Andric Buffer.begin(), StrData, Buffer.end()); 4880b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 4890b57cec5SDimitry Andric TheLexer.LexFromRawLexer(Result); 4900b57cec5SDimitry Andric return false; 4910b57cec5SDimitry Andric } 4920b57cec5SDimitry Andric 4930b57cec5SDimitry Andric /// Returns the pointer that points to the beginning of line that contains 4940b57cec5SDimitry Andric /// the given offset, or null if the offset if invalid. 4950b57cec5SDimitry Andric static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) { 4960b57cec5SDimitry Andric const char *BufStart = Buffer.data(); 4970b57cec5SDimitry Andric if (Offset >= Buffer.size()) 4980b57cec5SDimitry Andric return nullptr; 4990b57cec5SDimitry Andric 5000b57cec5SDimitry Andric const char *LexStart = BufStart + Offset; 5010b57cec5SDimitry Andric for (; LexStart != BufStart; --LexStart) { 5020b57cec5SDimitry Andric if (isVerticalWhitespace(LexStart[0]) && 5030b57cec5SDimitry Andric !Lexer::isNewLineEscaped(BufStart, LexStart)) { 5040b57cec5SDimitry Andric // LexStart should point at first character of logical line. 
5050b57cec5SDimitry Andric ++LexStart; 5060b57cec5SDimitry Andric break; 5070b57cec5SDimitry Andric } 5080b57cec5SDimitry Andric } 5090b57cec5SDimitry Andric return LexStart; 5100b57cec5SDimitry Andric } 5110b57cec5SDimitry Andric 5120b57cec5SDimitry Andric static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 5130b57cec5SDimitry Andric const SourceManager &SM, 5140b57cec5SDimitry Andric const LangOptions &LangOpts) { 5150b57cec5SDimitry Andric assert(Loc.isFileID()); 5160b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 5170b57cec5SDimitry Andric if (LocInfo.first.isInvalid()) 5180b57cec5SDimitry Andric return Loc; 5190b57cec5SDimitry Andric 5200b57cec5SDimitry Andric bool Invalid = false; 5210b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 5220b57cec5SDimitry Andric if (Invalid) 5230b57cec5SDimitry Andric return Loc; 5240b57cec5SDimitry Andric 5250b57cec5SDimitry Andric // Back up from the current location until we hit the beginning of a line 5260b57cec5SDimitry Andric // (or the buffer). We'll relex from that point. 5270b57cec5SDimitry Andric const char *StrData = Buffer.data() + LocInfo.second; 5280b57cec5SDimitry Andric const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second); 5290b57cec5SDimitry Andric if (!LexStart || LexStart == StrData) 5300b57cec5SDimitry Andric return Loc; 5310b57cec5SDimitry Andric 5320b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token. 5330b57cec5SDimitry Andric SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 5340b57cec5SDimitry Andric Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart, 5350b57cec5SDimitry Andric Buffer.end()); 5360b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 5370b57cec5SDimitry Andric 5380b57cec5SDimitry Andric // Lex tokens until we find the token that contains the source location. 
5390b57cec5SDimitry Andric Token TheTok; 5400b57cec5SDimitry Andric do { 5410b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 5420b57cec5SDimitry Andric 5430b57cec5SDimitry Andric if (TheLexer.getBufferLocation() > StrData) { 5440b57cec5SDimitry Andric // Lexing this token has taken the lexer past the source location we're 5450b57cec5SDimitry Andric // looking for. If the current token encompasses our source location, 5460b57cec5SDimitry Andric // return the beginning of that token. 5470b57cec5SDimitry Andric if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 5480b57cec5SDimitry Andric return TheTok.getLocation(); 5490b57cec5SDimitry Andric 5500b57cec5SDimitry Andric // We ended up skipping over the source location entirely, which means 5510b57cec5SDimitry Andric // that it points into whitespace. We're done here. 5520b57cec5SDimitry Andric break; 5530b57cec5SDimitry Andric } 5540b57cec5SDimitry Andric } while (TheTok.getKind() != tok::eof); 5550b57cec5SDimitry Andric 5560b57cec5SDimitry Andric // We've passed our source location; just return the original source location. 
5570b57cec5SDimitry Andric return Loc; 5580b57cec5SDimitry Andric } 5590b57cec5SDimitry Andric 5600b57cec5SDimitry Andric SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 5610b57cec5SDimitry Andric const SourceManager &SM, 5620b57cec5SDimitry Andric const LangOptions &LangOpts) { 5630b57cec5SDimitry Andric if (Loc.isFileID()) 5640b57cec5SDimitry Andric return getBeginningOfFileToken(Loc, SM, LangOpts); 5650b57cec5SDimitry Andric 5660b57cec5SDimitry Andric if (!SM.isMacroArgExpansion(Loc)) 5670b57cec5SDimitry Andric return Loc; 5680b57cec5SDimitry Andric 5690b57cec5SDimitry Andric SourceLocation FileLoc = SM.getSpellingLoc(Loc); 5700b57cec5SDimitry Andric SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 5710b57cec5SDimitry Andric std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 5720b57cec5SDimitry Andric std::pair<FileID, unsigned> BeginFileLocInfo = 5730b57cec5SDimitry Andric SM.getDecomposedLoc(BeginFileLoc); 5740b57cec5SDimitry Andric assert(FileLocInfo.first == BeginFileLocInfo.first && 5750b57cec5SDimitry Andric FileLocInfo.second >= BeginFileLocInfo.second); 5760b57cec5SDimitry Andric return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); 5770b57cec5SDimitry Andric } 5780b57cec5SDimitry Andric 5790b57cec5SDimitry Andric namespace { 5800b57cec5SDimitry Andric 5810b57cec5SDimitry Andric enum PreambleDirectiveKind { 5820b57cec5SDimitry Andric PDK_Skipped, 5830b57cec5SDimitry Andric PDK_Unknown 5840b57cec5SDimitry Andric }; 5850b57cec5SDimitry Andric 5860b57cec5SDimitry Andric } // namespace 5870b57cec5SDimitry Andric 5880b57cec5SDimitry Andric PreambleBounds Lexer::ComputePreamble(StringRef Buffer, 5890b57cec5SDimitry Andric const LangOptions &LangOpts, 5900b57cec5SDimitry Andric unsigned MaxLines) { 5910b57cec5SDimitry Andric // Create a lexer starting at the beginning of the file. 
Note that we use a 5920b57cec5SDimitry Andric // "fake" file source location at offset 1 so that the lexer will track our 5930b57cec5SDimitry Andric // position within the file. 594fe6060f1SDimitry Andric const SourceLocation::UIntTy StartOffset = 1; 5950b57cec5SDimitry Andric SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset); 5960b57cec5SDimitry Andric Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(), 5970b57cec5SDimitry Andric Buffer.end()); 5980b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 5990b57cec5SDimitry Andric 6000b57cec5SDimitry Andric bool InPreprocessorDirective = false; 6010b57cec5SDimitry Andric Token TheTok; 6020b57cec5SDimitry Andric SourceLocation ActiveCommentLoc; 6030b57cec5SDimitry Andric 6040b57cec5SDimitry Andric unsigned MaxLineOffset = 0; 6050b57cec5SDimitry Andric if (MaxLines) { 6060b57cec5SDimitry Andric const char *CurPtr = Buffer.begin(); 6070b57cec5SDimitry Andric unsigned CurLine = 0; 6080b57cec5SDimitry Andric while (CurPtr != Buffer.end()) { 6090b57cec5SDimitry Andric char ch = *CurPtr++; 6100b57cec5SDimitry Andric if (ch == '\n') { 6110b57cec5SDimitry Andric ++CurLine; 6120b57cec5SDimitry Andric if (CurLine == MaxLines) 6130b57cec5SDimitry Andric break; 6140b57cec5SDimitry Andric } 6150b57cec5SDimitry Andric } 6160b57cec5SDimitry Andric if (CurPtr != Buffer.end()) 6170b57cec5SDimitry Andric MaxLineOffset = CurPtr - Buffer.begin(); 6180b57cec5SDimitry Andric } 6190b57cec5SDimitry Andric 6200b57cec5SDimitry Andric do { 6210b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 6220b57cec5SDimitry Andric 6230b57cec5SDimitry Andric if (InPreprocessorDirective) { 6240b57cec5SDimitry Andric // If we've hit the end of the file, we're done. 
6250b57cec5SDimitry Andric if (TheTok.getKind() == tok::eof) { 6260b57cec5SDimitry Andric break; 6270b57cec5SDimitry Andric } 6280b57cec5SDimitry Andric 6290b57cec5SDimitry Andric // If we haven't hit the end of the preprocessor directive, skip this 6300b57cec5SDimitry Andric // token. 6310b57cec5SDimitry Andric if (!TheTok.isAtStartOfLine()) 6320b57cec5SDimitry Andric continue; 6330b57cec5SDimitry Andric 6340b57cec5SDimitry Andric // We've passed the end of the preprocessor directive, and will look 6350b57cec5SDimitry Andric // at this token again below. 6360b57cec5SDimitry Andric InPreprocessorDirective = false; 6370b57cec5SDimitry Andric } 6380b57cec5SDimitry Andric 6390b57cec5SDimitry Andric // Keep track of the # of lines in the preamble. 6400b57cec5SDimitry Andric if (TheTok.isAtStartOfLine()) { 6410b57cec5SDimitry Andric unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 6420b57cec5SDimitry Andric 6430b57cec5SDimitry Andric // If we were asked to limit the number of lines in the preamble, 6440b57cec5SDimitry Andric // and we're about to exceed that limit, we're done. 6450b57cec5SDimitry Andric if (MaxLineOffset && TokOffset >= MaxLineOffset) 6460b57cec5SDimitry Andric break; 6470b57cec5SDimitry Andric } 6480b57cec5SDimitry Andric 6490b57cec5SDimitry Andric // Comments are okay; skip over them. 6500b57cec5SDimitry Andric if (TheTok.getKind() == tok::comment) { 6510b57cec5SDimitry Andric if (ActiveCommentLoc.isInvalid()) 6520b57cec5SDimitry Andric ActiveCommentLoc = TheTok.getLocation(); 6530b57cec5SDimitry Andric continue; 6540b57cec5SDimitry Andric } 6550b57cec5SDimitry Andric 6560b57cec5SDimitry Andric if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 6570b57cec5SDimitry Andric // This is the start of a preprocessor directive. 
6580b57cec5SDimitry Andric Token HashTok = TheTok; 6590b57cec5SDimitry Andric InPreprocessorDirective = true; 6600b57cec5SDimitry Andric ActiveCommentLoc = SourceLocation(); 6610b57cec5SDimitry Andric 6620b57cec5SDimitry Andric // Figure out which directive this is. Since we're lexing raw tokens, 6630b57cec5SDimitry Andric // we don't have an identifier table available. Instead, just look at 6640b57cec5SDimitry Andric // the raw identifier to recognize and categorize preprocessor directives. 6650b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 6660b57cec5SDimitry Andric if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 6670b57cec5SDimitry Andric StringRef Keyword = TheTok.getRawIdentifier(); 6680b57cec5SDimitry Andric PreambleDirectiveKind PDK 6690b57cec5SDimitry Andric = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 6700b57cec5SDimitry Andric .Case("include", PDK_Skipped) 6710b57cec5SDimitry Andric .Case("__include_macros", PDK_Skipped) 6720b57cec5SDimitry Andric .Case("define", PDK_Skipped) 6730b57cec5SDimitry Andric .Case("undef", PDK_Skipped) 6740b57cec5SDimitry Andric .Case("line", PDK_Skipped) 6750b57cec5SDimitry Andric .Case("error", PDK_Skipped) 6760b57cec5SDimitry Andric .Case("pragma", PDK_Skipped) 6770b57cec5SDimitry Andric .Case("import", PDK_Skipped) 6780b57cec5SDimitry Andric .Case("include_next", PDK_Skipped) 6790b57cec5SDimitry Andric .Case("warning", PDK_Skipped) 6800b57cec5SDimitry Andric .Case("ident", PDK_Skipped) 6810b57cec5SDimitry Andric .Case("sccs", PDK_Skipped) 6820b57cec5SDimitry Andric .Case("assert", PDK_Skipped) 6830b57cec5SDimitry Andric .Case("unassert", PDK_Skipped) 6840b57cec5SDimitry Andric .Case("if", PDK_Skipped) 6850b57cec5SDimitry Andric .Case("ifdef", PDK_Skipped) 6860b57cec5SDimitry Andric .Case("ifndef", PDK_Skipped) 6870b57cec5SDimitry Andric .Case("elif", PDK_Skipped) 688fe6060f1SDimitry Andric .Case("elifdef", PDK_Skipped) 689fe6060f1SDimitry Andric .Case("elifndef", PDK_Skipped) 
6900b57cec5SDimitry Andric .Case("else", PDK_Skipped) 6910b57cec5SDimitry Andric .Case("endif", PDK_Skipped) 6920b57cec5SDimitry Andric .Default(PDK_Unknown); 6930b57cec5SDimitry Andric 6940b57cec5SDimitry Andric switch (PDK) { 6950b57cec5SDimitry Andric case PDK_Skipped: 6960b57cec5SDimitry Andric continue; 6970b57cec5SDimitry Andric 6980b57cec5SDimitry Andric case PDK_Unknown: 6990b57cec5SDimitry Andric // We don't know what this directive is; stop at the '#'. 7000b57cec5SDimitry Andric break; 7010b57cec5SDimitry Andric } 7020b57cec5SDimitry Andric } 7030b57cec5SDimitry Andric 7040b57cec5SDimitry Andric // We only end up here if we didn't recognize the preprocessor 7050b57cec5SDimitry Andric // directive or it was one that can't occur in the preamble at this 7060b57cec5SDimitry Andric // point. Roll back the current token to the location of the '#'. 7070b57cec5SDimitry Andric TheTok = HashTok; 7080b57cec5SDimitry Andric } 7090b57cec5SDimitry Andric 7100b57cec5SDimitry Andric // We hit a token that we don't recognize as being in the 7110b57cec5SDimitry Andric // "preprocessing only" part of the file, so we're no longer in 7120b57cec5SDimitry Andric // the preamble. 7130b57cec5SDimitry Andric break; 7140b57cec5SDimitry Andric } while (true); 7150b57cec5SDimitry Andric 7160b57cec5SDimitry Andric SourceLocation End; 7170b57cec5SDimitry Andric if (ActiveCommentLoc.isValid()) 7180b57cec5SDimitry Andric End = ActiveCommentLoc; // don't truncate a decl comment. 
7190b57cec5SDimitry Andric else 7200b57cec5SDimitry Andric End = TheTok.getLocation(); 7210b57cec5SDimitry Andric 7220b57cec5SDimitry Andric return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(), 7230b57cec5SDimitry Andric TheTok.isAtStartOfLine()); 7240b57cec5SDimitry Andric } 7250b57cec5SDimitry Andric 7260b57cec5SDimitry Andric unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, 7270b57cec5SDimitry Andric const SourceManager &SM, 7280b57cec5SDimitry Andric const LangOptions &LangOpts) { 7290b57cec5SDimitry Andric // Figure out how many physical characters away the specified expansion 7300b57cec5SDimitry Andric // character is. This needs to take into consideration newlines and 7310b57cec5SDimitry Andric // trigraphs. 7320b57cec5SDimitry Andric bool Invalid = false; 7330b57cec5SDimitry Andric const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 7340b57cec5SDimitry Andric 7350b57cec5SDimitry Andric // If they request the first char of the token, we're trivially done. 7360b57cec5SDimitry Andric if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 7370b57cec5SDimitry Andric return 0; 7380b57cec5SDimitry Andric 7390b57cec5SDimitry Andric unsigned PhysOffset = 0; 7400b57cec5SDimitry Andric 7410b57cec5SDimitry Andric // The usual case is that tokens don't contain anything interesting. Skip 7420b57cec5SDimitry Andric // over the uninteresting characters. If a token only consists of simple 7430b57cec5SDimitry Andric // chars, this method is extremely fast. 
7440b57cec5SDimitry Andric while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 7450b57cec5SDimitry Andric if (CharNo == 0) 7460b57cec5SDimitry Andric return PhysOffset; 7470b57cec5SDimitry Andric ++TokPtr; 7480b57cec5SDimitry Andric --CharNo; 7490b57cec5SDimitry Andric ++PhysOffset; 7500b57cec5SDimitry Andric } 7510b57cec5SDimitry Andric 7520b57cec5SDimitry Andric // If we have a character that may be a trigraph or escaped newline, use a 7530b57cec5SDimitry Andric // lexer to parse it correctly. 7540b57cec5SDimitry Andric for (; CharNo; --CharNo) { 7550b57cec5SDimitry Andric unsigned Size; 7560b57cec5SDimitry Andric Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts); 7570b57cec5SDimitry Andric TokPtr += Size; 7580b57cec5SDimitry Andric PhysOffset += Size; 7590b57cec5SDimitry Andric } 7600b57cec5SDimitry Andric 7610b57cec5SDimitry Andric // Final detail: if we end up on an escaped newline, we want to return the 7620b57cec5SDimitry Andric // location of the actual byte of the token. For example foo\<newline>bar 7630b57cec5SDimitry Andric // advanced by 3 should return the location of b, not of \\. One compounding 7640b57cec5SDimitry Andric // detail of this is that the escape may be made by a trigraph. 7650b57cec5SDimitry Andric if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 7660b57cec5SDimitry Andric PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 7670b57cec5SDimitry Andric 7680b57cec5SDimitry Andric return PhysOffset; 7690b57cec5SDimitry Andric } 7700b57cec5SDimitry Andric 7710b57cec5SDimitry Andric /// Computes the source location just past the end of the 7720b57cec5SDimitry Andric /// token at this source location. 
7730b57cec5SDimitry Andric /// 7740b57cec5SDimitry Andric /// This routine can be used to produce a source location that 7750b57cec5SDimitry Andric /// points just past the end of the token referenced by \p Loc, and 7760b57cec5SDimitry Andric /// is generally used when a diagnostic needs to point just after a 7770b57cec5SDimitry Andric /// token where it expected something different that it received. If 7780b57cec5SDimitry Andric /// the returned source location would not be meaningful (e.g., if 7790b57cec5SDimitry Andric /// it points into a macro), this routine returns an invalid 7800b57cec5SDimitry Andric /// source location. 7810b57cec5SDimitry Andric /// 7820b57cec5SDimitry Andric /// \param Offset an offset from the end of the token, where the source 7830b57cec5SDimitry Andric /// location should refer to. The default offset (0) produces a source 7840b57cec5SDimitry Andric /// location pointing just past the end of the token; an offset of 1 produces 7850b57cec5SDimitry Andric /// a source location pointing to the last character in the token, etc. 7860b57cec5SDimitry Andric SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 7870b57cec5SDimitry Andric const SourceManager &SM, 7880b57cec5SDimitry Andric const LangOptions &LangOpts) { 7890b57cec5SDimitry Andric if (Loc.isInvalid()) 7900b57cec5SDimitry Andric return {}; 7910b57cec5SDimitry Andric 7920b57cec5SDimitry Andric if (Loc.isMacroID()) { 7930b57cec5SDimitry Andric if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 7940b57cec5SDimitry Andric return {}; // Points inside the macro expansion. 
7950b57cec5SDimitry Andric } 7960b57cec5SDimitry Andric 7970b57cec5SDimitry Andric unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 7980b57cec5SDimitry Andric if (Len > Offset) 7990b57cec5SDimitry Andric Len = Len - Offset; 8000b57cec5SDimitry Andric else 8010b57cec5SDimitry Andric return Loc; 8020b57cec5SDimitry Andric 8030b57cec5SDimitry Andric return Loc.getLocWithOffset(Len); 8040b57cec5SDimitry Andric } 8050b57cec5SDimitry Andric 8060b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the first 8070b57cec5SDimitry Andric /// token of the macro expansion. 8080b57cec5SDimitry Andric bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 8090b57cec5SDimitry Andric const SourceManager &SM, 8100b57cec5SDimitry Andric const LangOptions &LangOpts, 8110b57cec5SDimitry Andric SourceLocation *MacroBegin) { 8120b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 8130b57cec5SDimitry Andric 8140b57cec5SDimitry Andric SourceLocation expansionLoc; 8150b57cec5SDimitry Andric if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc)) 8160b57cec5SDimitry Andric return false; 8170b57cec5SDimitry Andric 8180b57cec5SDimitry Andric if (expansionLoc.isFileID()) { 8190b57cec5SDimitry Andric // No other macro expansions, this is the first. 8200b57cec5SDimitry Andric if (MacroBegin) 8210b57cec5SDimitry Andric *MacroBegin = expansionLoc; 8220b57cec5SDimitry Andric return true; 8230b57cec5SDimitry Andric } 8240b57cec5SDimitry Andric 8250b57cec5SDimitry Andric return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); 8260b57cec5SDimitry Andric } 8270b57cec5SDimitry Andric 8280b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the last 8290b57cec5SDimitry Andric /// token of the macro expansion. 
8300b57cec5SDimitry Andric bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 8310b57cec5SDimitry Andric const SourceManager &SM, 8320b57cec5SDimitry Andric const LangOptions &LangOpts, 8330b57cec5SDimitry Andric SourceLocation *MacroEnd) { 8340b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 8350b57cec5SDimitry Andric 8360b57cec5SDimitry Andric SourceLocation spellLoc = SM.getSpellingLoc(loc); 8370b57cec5SDimitry Andric unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 8380b57cec5SDimitry Andric if (tokLen == 0) 8390b57cec5SDimitry Andric return false; 8400b57cec5SDimitry Andric 8410b57cec5SDimitry Andric SourceLocation afterLoc = loc.getLocWithOffset(tokLen); 8420b57cec5SDimitry Andric SourceLocation expansionLoc; 8430b57cec5SDimitry Andric if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) 8440b57cec5SDimitry Andric return false; 8450b57cec5SDimitry Andric 8460b57cec5SDimitry Andric if (expansionLoc.isFileID()) { 8470b57cec5SDimitry Andric // No other macro expansions. 
8480b57cec5SDimitry Andric if (MacroEnd) 8490b57cec5SDimitry Andric *MacroEnd = expansionLoc; 8500b57cec5SDimitry Andric return true; 8510b57cec5SDimitry Andric } 8520b57cec5SDimitry Andric 8530b57cec5SDimitry Andric return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 8540b57cec5SDimitry Andric } 8550b57cec5SDimitry Andric 8560b57cec5SDimitry Andric static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 8570b57cec5SDimitry Andric const SourceManager &SM, 8580b57cec5SDimitry Andric const LangOptions &LangOpts) { 8590b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin(); 8600b57cec5SDimitry Andric SourceLocation End = Range.getEnd(); 8610b57cec5SDimitry Andric assert(Begin.isFileID() && End.isFileID()); 8620b57cec5SDimitry Andric if (Range.isTokenRange()) { 8630b57cec5SDimitry Andric End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 8640b57cec5SDimitry Andric if (End.isInvalid()) 8650b57cec5SDimitry Andric return {}; 8660b57cec5SDimitry Andric } 8670b57cec5SDimitry Andric 8680b57cec5SDimitry Andric // Break down the source locations. 8690b57cec5SDimitry Andric FileID FID; 8700b57cec5SDimitry Andric unsigned BeginOffs; 8710b57cec5SDimitry Andric std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 8720b57cec5SDimitry Andric if (FID.isInvalid()) 8730b57cec5SDimitry Andric return {}; 8740b57cec5SDimitry Andric 8750b57cec5SDimitry Andric unsigned EndOffs; 8760b57cec5SDimitry Andric if (!SM.isInFileID(End, FID, &EndOffs) || 8770b57cec5SDimitry Andric BeginOffs > EndOffs) 8780b57cec5SDimitry Andric return {}; 8790b57cec5SDimitry Andric 8800b57cec5SDimitry Andric return CharSourceRange::getCharRange(Begin, End); 8810b57cec5SDimitry Andric } 8820b57cec5SDimitry Andric 883fe6060f1SDimitry Andric // Assumes that `Loc` is in an expansion. 
884fe6060f1SDimitry Andric static bool isInExpansionTokenRange(const SourceLocation Loc, 885fe6060f1SDimitry Andric const SourceManager &SM) { 886fe6060f1SDimitry Andric return SM.getSLocEntry(SM.getFileID(Loc)) 887fe6060f1SDimitry Andric .getExpansion() 888fe6060f1SDimitry Andric .isExpansionTokenRange(); 889fe6060f1SDimitry Andric } 890fe6060f1SDimitry Andric 8910b57cec5SDimitry Andric CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, 8920b57cec5SDimitry Andric const SourceManager &SM, 8930b57cec5SDimitry Andric const LangOptions &LangOpts) { 8940b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin(); 8950b57cec5SDimitry Andric SourceLocation End = Range.getEnd(); 8960b57cec5SDimitry Andric if (Begin.isInvalid() || End.isInvalid()) 8970b57cec5SDimitry Andric return {}; 8980b57cec5SDimitry Andric 8990b57cec5SDimitry Andric if (Begin.isFileID() && End.isFileID()) 9000b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9010b57cec5SDimitry Andric 9020b57cec5SDimitry Andric if (Begin.isMacroID() && End.isFileID()) { 9030b57cec5SDimitry Andric if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) 9040b57cec5SDimitry Andric return {}; 9050b57cec5SDimitry Andric Range.setBegin(Begin); 9060b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9070b57cec5SDimitry Andric } 9080b57cec5SDimitry Andric 9090b57cec5SDimitry Andric if (Begin.isFileID() && End.isMacroID()) { 910fe6060f1SDimitry Andric if (Range.isTokenRange()) { 911fe6060f1SDimitry Andric if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End)) 912fe6060f1SDimitry Andric return {}; 913fe6060f1SDimitry Andric // Use the *original* end, not the expanded one in `End`. 
914fe6060f1SDimitry Andric Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM)); 915fe6060f1SDimitry Andric } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End)) 9160b57cec5SDimitry Andric return {}; 9170b57cec5SDimitry Andric Range.setEnd(End); 9180b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9190b57cec5SDimitry Andric } 9200b57cec5SDimitry Andric 9210b57cec5SDimitry Andric assert(Begin.isMacroID() && End.isMacroID()); 9220b57cec5SDimitry Andric SourceLocation MacroBegin, MacroEnd; 9230b57cec5SDimitry Andric if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) && 9240b57cec5SDimitry Andric ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts, 9250b57cec5SDimitry Andric &MacroEnd)) || 9260b57cec5SDimitry Andric (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts, 9270b57cec5SDimitry Andric &MacroEnd)))) { 9280b57cec5SDimitry Andric Range.setBegin(MacroBegin); 9290b57cec5SDimitry Andric Range.setEnd(MacroEnd); 930fe6060f1SDimitry Andric // Use the *original* `End`, not the expanded one in `MacroEnd`. 
931fe6060f1SDimitry Andric if (Range.isTokenRange()) 932fe6060f1SDimitry Andric Range.setTokenRange(isInExpansionTokenRange(End, SM)); 9330b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 9340b57cec5SDimitry Andric } 9350b57cec5SDimitry Andric 9360b57cec5SDimitry Andric bool Invalid = false; 9370b57cec5SDimitry Andric const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin), 9380b57cec5SDimitry Andric &Invalid); 9390b57cec5SDimitry Andric if (Invalid) 9400b57cec5SDimitry Andric return {}; 9410b57cec5SDimitry Andric 9420b57cec5SDimitry Andric if (BeginEntry.getExpansion().isMacroArgExpansion()) { 9430b57cec5SDimitry Andric const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End), 9440b57cec5SDimitry Andric &Invalid); 9450b57cec5SDimitry Andric if (Invalid) 9460b57cec5SDimitry Andric return {}; 9470b57cec5SDimitry Andric 9480b57cec5SDimitry Andric if (EndEntry.getExpansion().isMacroArgExpansion() && 9490b57cec5SDimitry Andric BeginEntry.getExpansion().getExpansionLocStart() == 9500b57cec5SDimitry Andric EndEntry.getExpansion().getExpansionLocStart()) { 9510b57cec5SDimitry Andric Range.setBegin(SM.getImmediateSpellingLoc(Begin)); 9520b57cec5SDimitry Andric Range.setEnd(SM.getImmediateSpellingLoc(End)); 9530b57cec5SDimitry Andric return makeFileCharRange(Range, SM, LangOpts); 9540b57cec5SDimitry Andric } 9550b57cec5SDimitry Andric } 9560b57cec5SDimitry Andric 9570b57cec5SDimitry Andric return {}; 9580b57cec5SDimitry Andric } 9590b57cec5SDimitry Andric 9600b57cec5SDimitry Andric StringRef Lexer::getSourceText(CharSourceRange Range, 9610b57cec5SDimitry Andric const SourceManager &SM, 9620b57cec5SDimitry Andric const LangOptions &LangOpts, 9630b57cec5SDimitry Andric bool *Invalid) { 9640b57cec5SDimitry Andric Range = makeFileCharRange(Range, SM, LangOpts); 9650b57cec5SDimitry Andric if (Range.isInvalid()) { 9660b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9670b57cec5SDimitry Andric return {}; 
9680b57cec5SDimitry Andric } 9690b57cec5SDimitry Andric 9700b57cec5SDimitry Andric // Break down the source location. 9710b57cec5SDimitry Andric std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin()); 9720b57cec5SDimitry Andric if (beginInfo.first.isInvalid()) { 9730b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9740b57cec5SDimitry Andric return {}; 9750b57cec5SDimitry Andric } 9760b57cec5SDimitry Andric 9770b57cec5SDimitry Andric unsigned EndOffs; 9780b57cec5SDimitry Andric if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) || 9790b57cec5SDimitry Andric beginInfo.second > EndOffs) { 9800b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9810b57cec5SDimitry Andric return {}; 9820b57cec5SDimitry Andric } 9830b57cec5SDimitry Andric 9840b57cec5SDimitry Andric // Try to the load the file buffer. 9850b57cec5SDimitry Andric bool invalidTemp = false; 9860b57cec5SDimitry Andric StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp); 9870b57cec5SDimitry Andric if (invalidTemp) { 9880b57cec5SDimitry Andric if (Invalid) *Invalid = true; 9890b57cec5SDimitry Andric return {}; 9900b57cec5SDimitry Andric } 9910b57cec5SDimitry Andric 9920b57cec5SDimitry Andric if (Invalid) *Invalid = false; 9930b57cec5SDimitry Andric return file.substr(beginInfo.second, EndOffs - beginInfo.second); 9940b57cec5SDimitry Andric } 9950b57cec5SDimitry Andric 9960b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroName(SourceLocation Loc, 9970b57cec5SDimitry Andric const SourceManager &SM, 9980b57cec5SDimitry Andric const LangOptions &LangOpts) { 9990b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 10000b57cec5SDimitry Andric 10010b57cec5SDimitry Andric // Find the location of the immediate macro expansion. 
10020b57cec5SDimitry Andric while (true) { 10030b57cec5SDimitry Andric FileID FID = SM.getFileID(Loc); 10040b57cec5SDimitry Andric const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); 10050b57cec5SDimitry Andric const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); 10060b57cec5SDimitry Andric Loc = Expansion.getExpansionLocStart(); 10070b57cec5SDimitry Andric if (!Expansion.isMacroArgExpansion()) 10080b57cec5SDimitry Andric break; 10090b57cec5SDimitry Andric 10100b57cec5SDimitry Andric // For macro arguments we need to check that the argument did not come 10110b57cec5SDimitry Andric // from an inner macro, e.g: "MAC1( MAC2(foo) )" 10120b57cec5SDimitry Andric 10130b57cec5SDimitry Andric // Loc points to the argument id of the macro definition, move to the 10140b57cec5SDimitry Andric // macro expansion. 10150b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 10160b57cec5SDimitry Andric SourceLocation SpellLoc = Expansion.getSpellingLoc(); 10170b57cec5SDimitry Andric if (SpellLoc.isFileID()) 10180b57cec5SDimitry Andric break; // No inner macro. 10190b57cec5SDimitry Andric 10200b57cec5SDimitry Andric // If spelling location resides in the same FileID as macro expansion 10210b57cec5SDimitry Andric // location, it means there is no inner macro. 10220b57cec5SDimitry Andric FileID MacroFID = SM.getFileID(Loc); 10230b57cec5SDimitry Andric if (SM.isInFileID(SpellLoc, MacroFID)) 10240b57cec5SDimitry Andric break; 10250b57cec5SDimitry Andric 10260b57cec5SDimitry Andric // Argument came from inner macro. 10270b57cec5SDimitry Andric Loc = SpellLoc; 10280b57cec5SDimitry Andric } 10290b57cec5SDimitry Andric 10300b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion 10310b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin 10320b57cec5SDimitry Andric // expanding this macro. 
10330b57cec5SDimitry Andric Loc = SM.getSpellingLoc(Loc); 10340b57cec5SDimitry Andric 10350b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the 10360b57cec5SDimitry Andric // name so that we can render it into the expansion note. 10370b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 10380b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 10390b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 10400b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 10410b57cec5SDimitry Andric } 10420b57cec5SDimitry Andric 10430b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroNameForDiagnostics( 10440b57cec5SDimitry Andric SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { 10450b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 10460b57cec5SDimitry Andric // Walk past macro argument expansions. 10470b57cec5SDimitry Andric while (SM.isMacroArgExpansion(Loc)) 10480b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 10490b57cec5SDimitry Andric 1050*bdd1243dSDimitry Andric // If the macro's spelling isn't FileID or from scratch space, then it's 1051*bdd1243dSDimitry Andric // actually a token paste or stringization (or similar) and not a macro at 1052*bdd1243dSDimitry Andric // all. 1053*bdd1243dSDimitry Andric SourceLocation SpellLoc = SM.getSpellingLoc(Loc); 1054*bdd1243dSDimitry Andric if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc)) 10550b57cec5SDimitry Andric return {}; 10560b57cec5SDimitry Andric 10570b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion 10580b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin 10590b57cec5SDimitry Andric // expanding this macro. 
10600b57cec5SDimitry Andric Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin()); 10610b57cec5SDimitry Andric 10620b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the 10630b57cec5SDimitry Andric // name so that we can render it into the expansion note. 10640b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 10650b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 10660b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 10670b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 10680b57cec5SDimitry Andric } 10690b57cec5SDimitry Andric 1070349cc55cSDimitry Andric bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) { 1071349cc55cSDimitry Andric return isAsciiIdentifierContinue(c, LangOpts.DollarIdents); 10720b57cec5SDimitry Andric } 10730b57cec5SDimitry Andric 10740b57cec5SDimitry Andric bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { 10750b57cec5SDimitry Andric assert(isVerticalWhitespace(Str[0])); 10760b57cec5SDimitry Andric if (Str - 1 < BufferStart) 10770b57cec5SDimitry Andric return false; 10780b57cec5SDimitry Andric 10790b57cec5SDimitry Andric if ((Str[0] == '\n' && Str[-1] == '\r') || 10800b57cec5SDimitry Andric (Str[0] == '\r' && Str[-1] == '\n')) { 10810b57cec5SDimitry Andric if (Str - 2 < BufferStart) 10820b57cec5SDimitry Andric return false; 10830b57cec5SDimitry Andric --Str; 10840b57cec5SDimitry Andric } 10850b57cec5SDimitry Andric --Str; 10860b57cec5SDimitry Andric 10870b57cec5SDimitry Andric // Rewind to first non-space character: 10880b57cec5SDimitry Andric while (Str > BufferStart && isHorizontalWhitespace(*Str)) 10890b57cec5SDimitry Andric --Str; 10900b57cec5SDimitry Andric 10910b57cec5SDimitry Andric return *Str == '\\'; 10920b57cec5SDimitry Andric } 10930b57cec5SDimitry 
Andric 10940b57cec5SDimitry Andric StringRef Lexer::getIndentationForLine(SourceLocation Loc, 10950b57cec5SDimitry Andric const SourceManager &SM) { 10960b57cec5SDimitry Andric if (Loc.isInvalid() || Loc.isMacroID()) 10970b57cec5SDimitry Andric return {}; 10980b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 10990b57cec5SDimitry Andric if (LocInfo.first.isInvalid()) 11000b57cec5SDimitry Andric return {}; 11010b57cec5SDimitry Andric bool Invalid = false; 11020b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 11030b57cec5SDimitry Andric if (Invalid) 11040b57cec5SDimitry Andric return {}; 11050b57cec5SDimitry Andric const char *Line = findBeginningOfLine(Buffer, LocInfo.second); 11060b57cec5SDimitry Andric if (!Line) 11070b57cec5SDimitry Andric return {}; 11080b57cec5SDimitry Andric StringRef Rest = Buffer.substr(Line - Buffer.data()); 11090b57cec5SDimitry Andric size_t NumWhitespaceChars = Rest.find_first_not_of(" \t"); 11100b57cec5SDimitry Andric return NumWhitespaceChars == StringRef::npos 11110b57cec5SDimitry Andric ? "" 11120b57cec5SDimitry Andric : Rest.take_front(NumWhitespaceChars); 11130b57cec5SDimitry Andric } 11140b57cec5SDimitry Andric 11150b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11160b57cec5SDimitry Andric // Diagnostics forwarding code. 11170b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11180b57cec5SDimitry Andric 11190b57cec5SDimitry Andric /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 11200b57cec5SDimitry Andric /// lexer buffer was all expanded at a single point, perform the mapping. 11210b57cec5SDimitry Andric /// This is currently only used for _Pragma implementation, so it is the slow 11220b57cec5SDimitry Andric /// path of the hot getSourceLocation method. Do not allow it to be inlined. 
11230b57cec5SDimitry Andric static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 11240b57cec5SDimitry Andric Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 11250b57cec5SDimitry Andric static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 11260b57cec5SDimitry Andric SourceLocation FileLoc, 11270b57cec5SDimitry Andric unsigned CharNo, unsigned TokLen) { 11280b57cec5SDimitry Andric assert(FileLoc.isMacroID() && "Must be a macro expansion"); 11290b57cec5SDimitry Andric 11300b57cec5SDimitry Andric // Otherwise, we're lexing "mapped tokens". This is used for things like 11310b57cec5SDimitry Andric // _Pragma handling. Combine the expansion location of FileLoc with the 11320b57cec5SDimitry Andric // spelling location. 11330b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager(); 11340b57cec5SDimitry Andric 11350b57cec5SDimitry Andric // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 11360b57cec5SDimitry Andric // characters come from spelling(FileLoc)+Offset. 11370b57cec5SDimitry Andric SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 11380b57cec5SDimitry Andric SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 11390b57cec5SDimitry Andric 11400b57cec5SDimitry Andric // Figure out the expansion loc range, which is the range covered by the 11410b57cec5SDimitry Andric // original _Pragma(...) sequence. 11420b57cec5SDimitry Andric CharSourceRange II = SM.getImmediateExpansionRange(FileLoc); 11430b57cec5SDimitry Andric 11440b57cec5SDimitry Andric return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen); 11450b57cec5SDimitry Andric } 11460b57cec5SDimitry Andric 11470b57cec5SDimitry Andric /// getSourceLocation - Return a source location identifier for the specified 11480b57cec5SDimitry Andric /// offset in the current file. 
11490b57cec5SDimitry Andric SourceLocation Lexer::getSourceLocation(const char *Loc, 11500b57cec5SDimitry Andric unsigned TokLen) const { 11510b57cec5SDimitry Andric assert(Loc >= BufferStart && Loc <= BufferEnd && 11520b57cec5SDimitry Andric "Location out of range for this buffer!"); 11530b57cec5SDimitry Andric 11540b57cec5SDimitry Andric // In the normal case, we're just lexing from a simple file buffer, return 11550b57cec5SDimitry Andric // the file id from FileLoc with the offset specified. 11560b57cec5SDimitry Andric unsigned CharNo = Loc-BufferStart; 11570b57cec5SDimitry Andric if (FileLoc.isFileID()) 11580b57cec5SDimitry Andric return FileLoc.getLocWithOffset(CharNo); 11590b57cec5SDimitry Andric 11600b57cec5SDimitry Andric // Otherwise, this is the _Pragma lexer case, which pretends that all of the 11610b57cec5SDimitry Andric // tokens are lexed from where the _Pragma was defined. 11620b57cec5SDimitry Andric assert(PP && "This doesn't work on raw lexers"); 11630b57cec5SDimitry Andric return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 11640b57cec5SDimitry Andric } 11650b57cec5SDimitry Andric 11660b57cec5SDimitry Andric /// Diag - Forwarding function for diagnostics. This translate a source 11670b57cec5SDimitry Andric /// position in the current buffer into a SourceLocation object for rendering. 11680b57cec5SDimitry Andric DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 11690b57cec5SDimitry Andric return PP->Diag(getSourceLocation(Loc), DiagID); 11700b57cec5SDimitry Andric } 11710b57cec5SDimitry Andric 11720b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11730b57cec5SDimitry Andric // Trigraph and Escaped Newline Handling Code. 11740b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 11750b57cec5SDimitry Andric 11760b57cec5SDimitry Andric /// GetTrigraphCharForLetter - Given a character that occurs after a ?? 
/// pair, return the decoded trigraph letter it corresponds to, or '\0' if
/// nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  case '=':  return '#';   // ??=
  case '/':  return '\\';  // ??/
  case '\'': return '^';   // ??'
  case '(':  return '[';   // ??(
  case ')':  return ']';   // ??)
  case '!':  return '|';   // ??!
  case '<':  return '{';   // ??<
  case '>':  return '}';   // ??>
  case '-':  return '~';   // ??-
  default:   return '\0';  // Not the third character of any trigraph.
  }
}

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
/// return the result character.  Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.
119781ad6265SDimitry Andric static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) { 11980b57cec5SDimitry Andric char Res = GetTrigraphCharForLetter(*CP); 1199*bdd1243dSDimitry Andric if (!Res) 1200*bdd1243dSDimitry Andric return Res; 12010b57cec5SDimitry Andric 120281ad6265SDimitry Andric if (!Trigraphs) { 1203*bdd1243dSDimitry Andric if (L && !L->isLexingRawMode()) 12040b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_ignored); 12050b57cec5SDimitry Andric return 0; 12060b57cec5SDimitry Andric } 12070b57cec5SDimitry Andric 1208*bdd1243dSDimitry Andric if (L && !L->isLexingRawMode()) 12090b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 12100b57cec5SDimitry Andric return Res; 12110b57cec5SDimitry Andric } 12120b57cec5SDimitry Andric 12130b57cec5SDimitry Andric /// getEscapedNewLineSize - Return the size of the specified escaped newline, 12140b57cec5SDimitry Andric /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 12150b57cec5SDimitry Andric /// trigraph equivalent on entry to this function. 12160b57cec5SDimitry Andric unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 12170b57cec5SDimitry Andric unsigned Size = 0; 12180b57cec5SDimitry Andric while (isWhitespace(Ptr[Size])) { 12190b57cec5SDimitry Andric ++Size; 12200b57cec5SDimitry Andric 12210b57cec5SDimitry Andric if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 12220b57cec5SDimitry Andric continue; 12230b57cec5SDimitry Andric 12240b57cec5SDimitry Andric // If this is a \r\n or \n\r, skip the other half. 12250b57cec5SDimitry Andric if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 12260b57cec5SDimitry Andric Ptr[Size-1] != Ptr[Size]) 12270b57cec5SDimitry Andric ++Size; 12280b57cec5SDimitry Andric 12290b57cec5SDimitry Andric return Size; 12300b57cec5SDimitry Andric } 12310b57cec5SDimitry Andric 12320b57cec5SDimitry Andric // Not an escaped newline, must be a \t or something else. 
12330b57cec5SDimitry Andric return 0; 12340b57cec5SDimitry Andric } 12350b57cec5SDimitry Andric 12360b57cec5SDimitry Andric /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 12370b57cec5SDimitry Andric /// them), skip over them and return the first non-escaped-newline found, 12380b57cec5SDimitry Andric /// otherwise return P. 12390b57cec5SDimitry Andric const char *Lexer::SkipEscapedNewLines(const char *P) { 12400b57cec5SDimitry Andric while (true) { 12410b57cec5SDimitry Andric const char *AfterEscape; 12420b57cec5SDimitry Andric if (*P == '\\') { 12430b57cec5SDimitry Andric AfterEscape = P+1; 12440b57cec5SDimitry Andric } else if (*P == '?') { 12450b57cec5SDimitry Andric // If not a trigraph for escape, bail out. 12460b57cec5SDimitry Andric if (P[1] != '?' || P[2] != '/') 12470b57cec5SDimitry Andric return P; 12480b57cec5SDimitry Andric // FIXME: Take LangOpts into account; the language might not 12490b57cec5SDimitry Andric // support trigraphs. 12500b57cec5SDimitry Andric AfterEscape = P+3; 12510b57cec5SDimitry Andric } else { 12520b57cec5SDimitry Andric return P; 12530b57cec5SDimitry Andric } 12540b57cec5SDimitry Andric 12550b57cec5SDimitry Andric unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 12560b57cec5SDimitry Andric if (NewLineSize == 0) return P; 12570b57cec5SDimitry Andric P = AfterEscape+NewLineSize; 12580b57cec5SDimitry Andric } 12590b57cec5SDimitry Andric } 12600b57cec5SDimitry Andric 1261*bdd1243dSDimitry Andric std::optional<Token> Lexer::findNextToken(SourceLocation Loc, 12620b57cec5SDimitry Andric const SourceManager &SM, 12630b57cec5SDimitry Andric const LangOptions &LangOpts) { 12640b57cec5SDimitry Andric if (Loc.isMacroID()) { 12650b57cec5SDimitry Andric if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 1266*bdd1243dSDimitry Andric return std::nullopt; 12670b57cec5SDimitry Andric } 12680b57cec5SDimitry Andric Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); 12690b57cec5SDimitry 
Andric 12700b57cec5SDimitry Andric // Break down the source location. 12710b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 12720b57cec5SDimitry Andric 12730b57cec5SDimitry Andric // Try to load the file buffer. 12740b57cec5SDimitry Andric bool InvalidTemp = false; 12750b57cec5SDimitry Andric StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); 12760b57cec5SDimitry Andric if (InvalidTemp) 1277*bdd1243dSDimitry Andric return std::nullopt; 12780b57cec5SDimitry Andric 12790b57cec5SDimitry Andric const char *TokenBegin = File.data() + LocInfo.second; 12800b57cec5SDimitry Andric 12810b57cec5SDimitry Andric // Lex from the start of the given location. 12820b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), 12830b57cec5SDimitry Andric TokenBegin, File.end()); 12840b57cec5SDimitry Andric // Find the token. 12850b57cec5SDimitry Andric Token Tok; 12860b57cec5SDimitry Andric lexer.LexFromRawLexer(Tok); 12870b57cec5SDimitry Andric return Tok; 12880b57cec5SDimitry Andric } 12890b57cec5SDimitry Andric 12900b57cec5SDimitry Andric /// Checks that the given token is the first token that occurs after the 12910b57cec5SDimitry Andric /// given location (this excludes comments and whitespace). Returns the location 12920b57cec5SDimitry Andric /// immediately after the specified token. If the token is not found or the 12930b57cec5SDimitry Andric /// location is inside a macro, the returned source location will be invalid. 
12940b57cec5SDimitry Andric SourceLocation Lexer::findLocationAfterToken( 12950b57cec5SDimitry Andric SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, 12960b57cec5SDimitry Andric const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { 1297*bdd1243dSDimitry Andric std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts); 12980b57cec5SDimitry Andric if (!Tok || Tok->isNot(TKind)) 12990b57cec5SDimitry Andric return {}; 13000b57cec5SDimitry Andric SourceLocation TokenLoc = Tok->getLocation(); 13010b57cec5SDimitry Andric 13020b57cec5SDimitry Andric // Calculate how much whitespace needs to be skipped if any. 13030b57cec5SDimitry Andric unsigned NumWhitespaceChars = 0; 13040b57cec5SDimitry Andric if (SkipTrailingWhitespaceAndNewLine) { 13050b57cec5SDimitry Andric const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); 13060b57cec5SDimitry Andric unsigned char C = *TokenEnd; 13070b57cec5SDimitry Andric while (isHorizontalWhitespace(C)) { 13080b57cec5SDimitry Andric C = *(++TokenEnd); 13090b57cec5SDimitry Andric NumWhitespaceChars++; 13100b57cec5SDimitry Andric } 13110b57cec5SDimitry Andric 13120b57cec5SDimitry Andric // Skip \r, \n, \r\n, or \n\r 13130b57cec5SDimitry Andric if (C == '\n' || C == '\r') { 13140b57cec5SDimitry Andric char PrevC = C; 13150b57cec5SDimitry Andric C = *(++TokenEnd); 13160b57cec5SDimitry Andric NumWhitespaceChars++; 13170b57cec5SDimitry Andric if ((C == '\n' || C == '\r') && C != PrevC) 13180b57cec5SDimitry Andric NumWhitespaceChars++; 13190b57cec5SDimitry Andric } 13200b57cec5SDimitry Andric } 13210b57cec5SDimitry Andric 13220b57cec5SDimitry Andric return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); 13230b57cec5SDimitry Andric } 13240b57cec5SDimitry Andric 13250b57cec5SDimitry Andric /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 13260b57cec5SDimitry Andric /// get its size, and return it. 
This is tricky in several cases: 13270b57cec5SDimitry Andric /// 1. If currently at the start of a trigraph, we warn about the trigraph, 13280b57cec5SDimitry Andric /// then either return the trigraph (skipping 3 chars) or the '?', 13290b57cec5SDimitry Andric /// depending on whether trigraphs are enabled or not. 13300b57cec5SDimitry Andric /// 2. If this is an escaped newline (potentially with whitespace between 13310b57cec5SDimitry Andric /// the backslash and newline), implicitly skip the newline and return 13320b57cec5SDimitry Andric /// the char after it. 13330b57cec5SDimitry Andric /// 13340b57cec5SDimitry Andric /// This handles the slow/uncommon case of the getCharAndSize method. Here we 13350b57cec5SDimitry Andric /// know that we can accumulate into Size, and that we have already incremented 13360b57cec5SDimitry Andric /// Ptr by Size bytes. 13370b57cec5SDimitry Andric /// 13380b57cec5SDimitry Andric /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 13390b57cec5SDimitry Andric /// be updated to match. 13400b57cec5SDimitry Andric char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 13410b57cec5SDimitry Andric Token *Tok) { 13420b57cec5SDimitry Andric // If we have a slash, look for an escaped newline. 13430b57cec5SDimitry Andric if (Ptr[0] == '\\') { 13440b57cec5SDimitry Andric ++Size; 13450b57cec5SDimitry Andric ++Ptr; 13460b57cec5SDimitry Andric Slash: 13470b57cec5SDimitry Andric // Common case, backslash-char where the char is not whitespace. 13480b57cec5SDimitry Andric if (!isWhitespace(Ptr[0])) return '\\'; 13490b57cec5SDimitry Andric 13500b57cec5SDimitry Andric // See if we have optional whitespace characters between the slash and 13510b57cec5SDimitry Andric // newline. 13520b57cec5SDimitry Andric if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 13530b57cec5SDimitry Andric // Remember that this token needs to be cleaned. 
13540b57cec5SDimitry Andric if (Tok) Tok->setFlag(Token::NeedsCleaning); 13550b57cec5SDimitry Andric 13560b57cec5SDimitry Andric // Warn if there was whitespace between the backslash and newline. 13570b57cec5SDimitry Andric if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 13580b57cec5SDimitry Andric Diag(Ptr, diag::backslash_newline_space); 13590b57cec5SDimitry Andric 13600b57cec5SDimitry Andric // Found backslash<whitespace><newline>. Parse the char after it. 13610b57cec5SDimitry Andric Size += EscapedNewLineSize; 13620b57cec5SDimitry Andric Ptr += EscapedNewLineSize; 13630b57cec5SDimitry Andric 13640b57cec5SDimitry Andric // Use slow version to accumulate a correct size field. 13650b57cec5SDimitry Andric return getCharAndSizeSlow(Ptr, Size, Tok); 13660b57cec5SDimitry Andric } 13670b57cec5SDimitry Andric 13680b57cec5SDimitry Andric // Otherwise, this is not an escaped newline, just return the slash. 13690b57cec5SDimitry Andric return '\\'; 13700b57cec5SDimitry Andric } 13710b57cec5SDimitry Andric 13720b57cec5SDimitry Andric // If this is a trigraph, process it. 13730b57cec5SDimitry Andric if (Ptr[0] == '?' && Ptr[1] == '?') { 13740b57cec5SDimitry Andric // If this is actually a legal trigraph (not something like "??x"), emit 13750b57cec5SDimitry Andric // a trigraph warning. If so, and if trigraphs are enabled, return it. 137681ad6265SDimitry Andric if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr, 137781ad6265SDimitry Andric LangOpts.Trigraphs)) { 13780b57cec5SDimitry Andric // Remember that this token needs to be cleaned. 
13790b57cec5SDimitry Andric if (Tok) Tok->setFlag(Token::NeedsCleaning); 13800b57cec5SDimitry Andric 13810b57cec5SDimitry Andric Ptr += 3; 13820b57cec5SDimitry Andric Size += 3; 13830b57cec5SDimitry Andric if (C == '\\') goto Slash; 13840b57cec5SDimitry Andric return C; 13850b57cec5SDimitry Andric } 13860b57cec5SDimitry Andric } 13870b57cec5SDimitry Andric 13880b57cec5SDimitry Andric // If this is neither, return a single character. 13890b57cec5SDimitry Andric ++Size; 13900b57cec5SDimitry Andric return *Ptr; 13910b57cec5SDimitry Andric } 13920b57cec5SDimitry Andric 13930b57cec5SDimitry Andric /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 13940b57cec5SDimitry Andric /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 13950b57cec5SDimitry Andric /// and that we have already incremented Ptr by Size bytes. 13960b57cec5SDimitry Andric /// 13970b57cec5SDimitry Andric /// NOTE: When this method is updated, getCharAndSizeSlow (above) should 13980b57cec5SDimitry Andric /// be updated to match. 13990b57cec5SDimitry Andric char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 14000b57cec5SDimitry Andric const LangOptions &LangOpts) { 14010b57cec5SDimitry Andric // If we have a slash, look for an escaped newline. 14020b57cec5SDimitry Andric if (Ptr[0] == '\\') { 14030b57cec5SDimitry Andric ++Size; 14040b57cec5SDimitry Andric ++Ptr; 14050b57cec5SDimitry Andric Slash: 14060b57cec5SDimitry Andric // Common case, backslash-char where the char is not whitespace. 14070b57cec5SDimitry Andric if (!isWhitespace(Ptr[0])) return '\\'; 14080b57cec5SDimitry Andric 14090b57cec5SDimitry Andric // See if we have optional whitespace characters followed by a newline. 14100b57cec5SDimitry Andric if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 14110b57cec5SDimitry Andric // Found backslash<whitespace><newline>. Parse the char after it. 
14120b57cec5SDimitry Andric Size += EscapedNewLineSize; 14130b57cec5SDimitry Andric Ptr += EscapedNewLineSize; 14140b57cec5SDimitry Andric 14150b57cec5SDimitry Andric // Use slow version to accumulate a correct size field. 14160b57cec5SDimitry Andric return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); 14170b57cec5SDimitry Andric } 14180b57cec5SDimitry Andric 14190b57cec5SDimitry Andric // Otherwise, this is not an escaped newline, just return the slash. 14200b57cec5SDimitry Andric return '\\'; 14210b57cec5SDimitry Andric } 14220b57cec5SDimitry Andric 14230b57cec5SDimitry Andric // If this is a trigraph, process it. 14240b57cec5SDimitry Andric if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 14250b57cec5SDimitry Andric // If this is actually a legal trigraph (not something like "??x"), return 14260b57cec5SDimitry Andric // it. 14270b57cec5SDimitry Andric if (char C = GetTrigraphCharForLetter(Ptr[2])) { 14280b57cec5SDimitry Andric Ptr += 3; 14290b57cec5SDimitry Andric Size += 3; 14300b57cec5SDimitry Andric if (C == '\\') goto Slash; 14310b57cec5SDimitry Andric return C; 14320b57cec5SDimitry Andric } 14330b57cec5SDimitry Andric } 14340b57cec5SDimitry Andric 14350b57cec5SDimitry Andric // If this is neither, return a single character. 14360b57cec5SDimitry Andric ++Size; 14370b57cec5SDimitry Andric return *Ptr; 14380b57cec5SDimitry Andric } 14390b57cec5SDimitry Andric 14400b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 14410b57cec5SDimitry Andric // Helper methods for lexing. 14420b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 14430b57cec5SDimitry Andric 14440b57cec5SDimitry Andric /// Routine that indiscriminately sets the offset into the source file. 
14450b57cec5SDimitry Andric void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { 14460b57cec5SDimitry Andric BufferPtr = BufferStart + Offset; 14470b57cec5SDimitry Andric if (BufferPtr > BufferEnd) 14480b57cec5SDimitry Andric BufferPtr = BufferEnd; 14490b57cec5SDimitry Andric // FIXME: What exactly does the StartOfLine bit mean? There are two 14500b57cec5SDimitry Andric // possible meanings for the "start" of the line: the first token on the 14510b57cec5SDimitry Andric // unexpanded line, or the first token on the expanded line. 14520b57cec5SDimitry Andric IsAtStartOfLine = StartOfLine; 14530b57cec5SDimitry Andric IsAtPhysicalStartOfLine = StartOfLine; 14540b57cec5SDimitry Andric } 14550b57cec5SDimitry Andric 1456349cc55cSDimitry Andric static bool isUnicodeWhitespace(uint32_t Codepoint) { 1457349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( 1458349cc55cSDimitry Andric UnicodeWhitespaceCharRanges); 1459349cc55cSDimitry Andric return UnicodeWhitespaceChars.contains(Codepoint); 1460349cc55cSDimitry Andric } 1461349cc55cSDimitry Andric 1462*bdd1243dSDimitry Andric static llvm::SmallString<5> codepointAsHexString(uint32_t C) { 1463*bdd1243dSDimitry Andric llvm::SmallString<5> CharBuf; 1464*bdd1243dSDimitry Andric llvm::raw_svector_ostream CharOS(CharBuf); 1465*bdd1243dSDimitry Andric llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); 1466*bdd1243dSDimitry Andric return CharBuf; 1467*bdd1243dSDimitry Andric } 1468*bdd1243dSDimitry Andric 1469*bdd1243dSDimitry Andric // To mitigate https://github.com/llvm/llvm-project/issues/54732, 1470*bdd1243dSDimitry Andric // we allow "Mathematical Notation Characters" in identifiers. 1471*bdd1243dSDimitry Andric // This is a proposed profile that extends the XID_Start/XID_continue 1472*bdd1243dSDimitry Andric // with mathematical symbols, superscipts and subscripts digits 1473*bdd1243dSDimitry Andric // found in some production software. 
1474*bdd1243dSDimitry Andric // https://www.unicode.org/L2/L2022/22230-math-profile.pdf 1475*bdd1243dSDimitry Andric static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts, 1476*bdd1243dSDimitry Andric bool IsStart, bool &IsExtension) { 1477*bdd1243dSDimitry Andric static const llvm::sys::UnicodeCharSet MathStartChars( 1478*bdd1243dSDimitry Andric MathematicalNotationProfileIDStartRanges); 1479*bdd1243dSDimitry Andric static const llvm::sys::UnicodeCharSet MathContinueChars( 1480*bdd1243dSDimitry Andric MathematicalNotationProfileIDContinueRanges); 1481*bdd1243dSDimitry Andric if (MathStartChars.contains(C) || 1482*bdd1243dSDimitry Andric (!IsStart && MathContinueChars.contains(C))) { 1483*bdd1243dSDimitry Andric IsExtension = true; 1484*bdd1243dSDimitry Andric return true; 1485*bdd1243dSDimitry Andric } 1486*bdd1243dSDimitry Andric return false; 1487*bdd1243dSDimitry Andric } 1488*bdd1243dSDimitry Andric 1489*bdd1243dSDimitry Andric static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts, 1490*bdd1243dSDimitry Andric bool &IsExtension) { 14910b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) { 14920b57cec5SDimitry Andric return false; 1493480093f4SDimitry Andric } else if (LangOpts.DollarIdents && '$' == C) { 1494480093f4SDimitry Andric return true; 1495fcaf7f86SDimitry Andric } else if (LangOpts.CPlusPlus || LangOpts.C2x) { 1496349cc55cSDimitry Andric // A non-leading codepoint must have the XID_Continue property. 1497349cc55cSDimitry Andric // XIDContinueRanges doesn't contains characters also in XIDStartRanges, 1498349cc55cSDimitry Andric // so we need to check both tables. 1499fcaf7f86SDimitry Andric // '_' doesn't have the XID_Continue property but is allowed in C and C++. 
1500349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1501349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges); 1502*bdd1243dSDimitry Andric if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C)) 1503*bdd1243dSDimitry Andric return true; 1504*bdd1243dSDimitry Andric return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false, 1505*bdd1243dSDimitry Andric IsExtension); 1506349cc55cSDimitry Andric } else if (LangOpts.C11) { 15070b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11AllowedIDChars( 15080b57cec5SDimitry Andric C11AllowedIDCharRanges); 15090b57cec5SDimitry Andric return C11AllowedIDChars.contains(C); 15100b57cec5SDimitry Andric } else { 15110b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 15120b57cec5SDimitry Andric C99AllowedIDCharRanges); 15130b57cec5SDimitry Andric return C99AllowedIDChars.contains(C); 15140b57cec5SDimitry Andric } 15150b57cec5SDimitry Andric } 15160b57cec5SDimitry Andric 1517*bdd1243dSDimitry Andric static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts, 1518*bdd1243dSDimitry Andric bool &IsExtension) { 1519*bdd1243dSDimitry Andric assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint"); 1520*bdd1243dSDimitry Andric IsExtension = false; 15210b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) { 15220b57cec5SDimitry Andric return false; 1523349cc55cSDimitry Andric } 1524fcaf7f86SDimitry Andric if (LangOpts.CPlusPlus || LangOpts.C2x) { 1525349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges); 1526*bdd1243dSDimitry Andric if (XIDStartChars.contains(C)) 1527*bdd1243dSDimitry Andric return true; 1528*bdd1243dSDimitry Andric return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true, 1529*bdd1243dSDimitry Andric IsExtension); 1530349cc55cSDimitry Andric } 1531*bdd1243dSDimitry Andric if 
(!isAllowedIDChar(C, LangOpts, IsExtension)) 1532349cc55cSDimitry Andric return false; 1533349cc55cSDimitry Andric if (LangOpts.C11) { 15340b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( 15350b57cec5SDimitry Andric C11DisallowedInitialIDCharRanges); 15360b57cec5SDimitry Andric return !C11DisallowedInitialIDChars.contains(C); 1537349cc55cSDimitry Andric } 15380b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 15390b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges); 15400b57cec5SDimitry Andric return !C99DisallowedInitialIDChars.contains(C); 15410b57cec5SDimitry Andric } 15420b57cec5SDimitry Andric 1543*bdd1243dSDimitry Andric static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C, 1544*bdd1243dSDimitry Andric CharSourceRange Range) { 1545*bdd1243dSDimitry Andric 1546*bdd1243dSDimitry Andric static const llvm::sys::UnicodeCharSet MathStartChars( 1547*bdd1243dSDimitry Andric MathematicalNotationProfileIDStartRanges); 1548*bdd1243dSDimitry Andric static const llvm::sys::UnicodeCharSet MathContinueChars( 1549*bdd1243dSDimitry Andric MathematicalNotationProfileIDContinueRanges); 1550*bdd1243dSDimitry Andric 1551*bdd1243dSDimitry Andric (void)MathStartChars; 1552*bdd1243dSDimitry Andric (void)MathContinueChars; 1553*bdd1243dSDimitry Andric assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) && 1554*bdd1243dSDimitry Andric "Unexpected mathematical notation codepoint"); 1555*bdd1243dSDimitry Andric Diags.Report(Range.getBegin(), diag::ext_mathematical_notation) 1556*bdd1243dSDimitry Andric << codepointAsHexString(C) << Range; 1557*bdd1243dSDimitry Andric } 1558*bdd1243dSDimitry Andric 15590b57cec5SDimitry Andric static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, 15600b57cec5SDimitry Andric const char *End) { 15610b57cec5SDimitry Andric return CharSourceRange::getCharRange(L.getSourceLocation(Begin), 15620b57cec5SDimitry 
Andric L.getSourceLocation(End)); 15630b57cec5SDimitry Andric } 15640b57cec5SDimitry Andric 15650b57cec5SDimitry Andric static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, 15660b57cec5SDimitry Andric CharSourceRange Range, bool IsFirst) { 15670b57cec5SDimitry Andric // Check C99 compatibility. 15680b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) { 15690b57cec5SDimitry Andric enum { 15700b57cec5SDimitry Andric CannotAppearInIdentifier = 0, 15710b57cec5SDimitry Andric CannotStartIdentifier 15720b57cec5SDimitry Andric }; 15730b57cec5SDimitry Andric 15740b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 15750b57cec5SDimitry Andric C99AllowedIDCharRanges); 15760b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 15770b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges); 15780b57cec5SDimitry Andric if (!C99AllowedIDChars.contains(C)) { 15790b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 15800b57cec5SDimitry Andric << Range 15810b57cec5SDimitry Andric << CannotAppearInIdentifier; 15820b57cec5SDimitry Andric } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) { 15830b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 15840b57cec5SDimitry Andric << Range 15850b57cec5SDimitry Andric << CannotStartIdentifier; 15860b57cec5SDimitry Andric } 15870b57cec5SDimitry Andric } 15880b57cec5SDimitry Andric } 15890b57cec5SDimitry Andric 15900b57cec5SDimitry Andric /// After encountering UTF-8 character C and interpreting it as an identifier 15910b57cec5SDimitry Andric /// character, check whether it's a homoglyph for a common non-identifier 15920b57cec5SDimitry Andric /// source character that is unlikely to be an intentional identifier 15930b57cec5SDimitry Andric /// character and warn if so. 
15940b57cec5SDimitry Andric static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, 15950b57cec5SDimitry Andric CharSourceRange Range) { 15960b57cec5SDimitry Andric // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 15970b57cec5SDimitry Andric struct HomoglyphPair { 15980b57cec5SDimitry Andric uint32_t Character; 15990b57cec5SDimitry Andric char LooksLike; 16000b57cec5SDimitry Andric bool operator<(HomoglyphPair R) const { return Character < R.Character; } 16010b57cec5SDimitry Andric }; 16020b57cec5SDimitry Andric static constexpr HomoglyphPair SortedHomoglyphs[] = { 16030b57cec5SDimitry Andric {U'\u00ad', 0}, // SOFT HYPHEN 16040b57cec5SDimitry Andric {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK 16050b57cec5SDimitry Andric {U'\u037e', ';'}, // GREEK QUESTION MARK 16060b57cec5SDimitry Andric {U'\u200b', 0}, // ZERO WIDTH SPACE 16070b57cec5SDimitry Andric {U'\u200c', 0}, // ZERO WIDTH NON-JOINER 16080b57cec5SDimitry Andric {U'\u200d', 0}, // ZERO WIDTH JOINER 16090b57cec5SDimitry Andric {U'\u2060', 0}, // WORD JOINER 16100b57cec5SDimitry Andric {U'\u2061', 0}, // FUNCTION APPLICATION 16110b57cec5SDimitry Andric {U'\u2062', 0}, // INVISIBLE TIMES 16120b57cec5SDimitry Andric {U'\u2063', 0}, // INVISIBLE SEPARATOR 16130b57cec5SDimitry Andric {U'\u2064', 0}, // INVISIBLE PLUS 16140b57cec5SDimitry Andric {U'\u2212', '-'}, // MINUS SIGN 16150b57cec5SDimitry Andric {U'\u2215', '/'}, // DIVISION SLASH 16160b57cec5SDimitry Andric {U'\u2216', '\\'}, // SET MINUS 16170b57cec5SDimitry Andric {U'\u2217', '*'}, // ASTERISK OPERATOR 16180b57cec5SDimitry Andric {U'\u2223', '|'}, // DIVIDES 16190b57cec5SDimitry Andric {U'\u2227', '^'}, // LOGICAL AND 16200b57cec5SDimitry Andric {U'\u2236', ':'}, // RATIO 16210b57cec5SDimitry Andric {U'\u223c', '~'}, // TILDE OPERATOR 16220b57cec5SDimitry Andric {U'\ua789', ':'}, // MODIFIER LETTER COLON 16230b57cec5SDimitry Andric {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE 16240b57cec5SDimitry 
Andric {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK 16250b57cec5SDimitry Andric {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN 16260b57cec5SDimitry Andric {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN 16270b57cec5SDimitry Andric {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN 16280b57cec5SDimitry Andric {U'\uff06', '&'}, // FULLWIDTH AMPERSAND 16290b57cec5SDimitry Andric {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS 16300b57cec5SDimitry Andric {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS 16310b57cec5SDimitry Andric {U'\uff0a', '*'}, // FULLWIDTH ASTERISK 16320b57cec5SDimitry Andric {U'\uff0b', '+'}, // FULLWIDTH ASTERISK 16330b57cec5SDimitry Andric {U'\uff0c', ','}, // FULLWIDTH COMMA 16340b57cec5SDimitry Andric {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS 16350b57cec5SDimitry Andric {U'\uff0e', '.'}, // FULLWIDTH FULL STOP 16360b57cec5SDimitry Andric {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS 16370b57cec5SDimitry Andric {U'\uff1a', ':'}, // FULLWIDTH COLON 16380b57cec5SDimitry Andric {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON 16390b57cec5SDimitry Andric {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN 16400b57cec5SDimitry Andric {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN 16410b57cec5SDimitry Andric {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN 16420b57cec5SDimitry Andric {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK 16430b57cec5SDimitry Andric {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT 16440b57cec5SDimitry Andric {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET 16450b57cec5SDimitry Andric {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS 16460b57cec5SDimitry Andric {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET 16470b57cec5SDimitry Andric {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT 16480b57cec5SDimitry Andric {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET 16490b57cec5SDimitry Andric {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE 16500b57cec5SDimitry Andric {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET 16510b57cec5SDimitry Andric {U'\uff5e', '~'}, // 
FULLWIDTH TILDE 16520b57cec5SDimitry Andric {0, 0} 16530b57cec5SDimitry Andric }; 16540b57cec5SDimitry Andric auto Homoglyph = 16550b57cec5SDimitry Andric std::lower_bound(std::begin(SortedHomoglyphs), 16560b57cec5SDimitry Andric std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'}); 16570b57cec5SDimitry Andric if (Homoglyph->Character == C) { 16580b57cec5SDimitry Andric if (Homoglyph->LooksLike) { 16590b57cec5SDimitry Andric const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; 16600b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) 1661*bdd1243dSDimitry Andric << Range << codepointAsHexString(C) << LooksLikeStr; 16620b57cec5SDimitry Andric } else { 16630b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width) 1664*bdd1243dSDimitry Andric << Range << codepointAsHexString(C); 16650b57cec5SDimitry Andric } 16660b57cec5SDimitry Andric } 16670b57cec5SDimitry Andric } 16680b57cec5SDimitry Andric 1669349cc55cSDimitry Andric static void diagnoseInvalidUnicodeCodepointInIdentifier( 1670349cc55cSDimitry Andric DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint, 1671349cc55cSDimitry Andric CharSourceRange Range, bool IsFirst) { 1672349cc55cSDimitry Andric if (isASCII(CodePoint)) 1673349cc55cSDimitry Andric return; 1674349cc55cSDimitry Andric 1675*bdd1243dSDimitry Andric bool IsExtension; 1676*bdd1243dSDimitry Andric bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension); 1677*bdd1243dSDimitry Andric bool IsIDContinue = 1678*bdd1243dSDimitry Andric IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension); 1679349cc55cSDimitry Andric 1680349cc55cSDimitry Andric if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue)) 1681349cc55cSDimitry Andric return; 1682349cc55cSDimitry Andric 1683349cc55cSDimitry Andric bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue; 1684349cc55cSDimitry Andric 1685349cc55cSDimitry Andric if (!IsFirst || 
InvalidOnlyAtStart) { 1686349cc55cSDimitry Andric Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier) 1687*bdd1243dSDimitry Andric << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart) 1688349cc55cSDimitry Andric << FixItHint::CreateRemoval(Range); 1689349cc55cSDimitry Andric } else { 1690349cc55cSDimitry Andric Diags.Report(Range.getBegin(), diag::err_character_not_allowed) 1691*bdd1243dSDimitry Andric << Range << codepointAsHexString(CodePoint) 1692*bdd1243dSDimitry Andric << FixItHint::CreateRemoval(Range); 1693349cc55cSDimitry Andric } 1694349cc55cSDimitry Andric } 1695349cc55cSDimitry Andric 16960b57cec5SDimitry Andric bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, 16970b57cec5SDimitry Andric Token &Result) { 16980b57cec5SDimitry Andric const char *UCNPtr = CurPtr + Size; 16990b57cec5SDimitry Andric uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr); 1700349cc55cSDimitry Andric if (CodePoint == 0) { 17010b57cec5SDimitry Andric return false; 1702349cc55cSDimitry Andric } 1703*bdd1243dSDimitry Andric bool IsExtension = false; 1704*bdd1243dSDimitry Andric if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) { 1705349cc55cSDimitry Andric if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) 1706349cc55cSDimitry Andric return false; 1707349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1708349cc55cSDimitry Andric !PP->isPreprocessedOutput()) 1709349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier( 1710349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, CodePoint, 1711349cc55cSDimitry Andric makeCharRange(*this, CurPtr, UCNPtr), 1712349cc55cSDimitry Andric /*IsFirst=*/false); 1713349cc55cSDimitry Andric 1714349cc55cSDimitry Andric // We got a unicode codepoint that is neither a space nor a 1715349cc55cSDimitry Andric // a valid identifier part. 
1716349cc55cSDimitry Andric // Carry on as if the codepoint was valid for recovery purposes. 1717*bdd1243dSDimitry Andric } else if (!isLexingRawMode()) { 1718*bdd1243dSDimitry Andric if (IsExtension) 1719*bdd1243dSDimitry Andric diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint, 1720*bdd1243dSDimitry Andric makeCharRange(*this, CurPtr, UCNPtr)); 1721*bdd1243dSDimitry Andric 17220b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 17230b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UCNPtr), 17240b57cec5SDimitry Andric /*IsFirst=*/false); 1725*bdd1243dSDimitry Andric } 17260b57cec5SDimitry Andric 17270b57cec5SDimitry Andric Result.setFlag(Token::HasUCN); 17280b57cec5SDimitry Andric if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || 17290b57cec5SDimitry Andric (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U')) 17300b57cec5SDimitry Andric CurPtr = UCNPtr; 17310b57cec5SDimitry Andric else 17320b57cec5SDimitry Andric while (CurPtr != UCNPtr) 17330b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result); 17340b57cec5SDimitry Andric return true; 17350b57cec5SDimitry Andric } 17360b57cec5SDimitry Andric 17370b57cec5SDimitry Andric bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { 17380b57cec5SDimitry Andric const char *UnicodePtr = CurPtr; 17390b57cec5SDimitry Andric llvm::UTF32 CodePoint; 17400b57cec5SDimitry Andric llvm::ConversionResult Result = 17410b57cec5SDimitry Andric llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr, 17420b57cec5SDimitry Andric (const llvm::UTF8 *)BufferEnd, 17430b57cec5SDimitry Andric &CodePoint, 17440b57cec5SDimitry Andric llvm::strictConversion); 1745349cc55cSDimitry Andric if (Result != llvm::conversionOK) 17460b57cec5SDimitry Andric return false; 17470b57cec5SDimitry Andric 1748*bdd1243dSDimitry Andric bool IsExtension = false; 1749*bdd1243dSDimitry Andric if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts, 1750*bdd1243dSDimitry Andric IsExtension)) { 
1751349cc55cSDimitry Andric if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint)) 1752349cc55cSDimitry Andric return false; 1753349cc55cSDimitry Andric 1754349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1755349cc55cSDimitry Andric !PP->isPreprocessedOutput()) 1756349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier( 1757349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, CodePoint, 1758349cc55cSDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr), /*IsFirst=*/false); 1759349cc55cSDimitry Andric // We got a unicode codepoint that is neither a space nor a 1760349cc55cSDimitry Andric // a valid identifier part. Carry on as if the codepoint was 1761349cc55cSDimitry Andric // valid for recovery purposes. 1762349cc55cSDimitry Andric } else if (!isLexingRawMode()) { 1763*bdd1243dSDimitry Andric if (IsExtension) 1764*bdd1243dSDimitry Andric diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint, 1765*bdd1243dSDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr)); 17660b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 17670b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr), 17680b57cec5SDimitry Andric /*IsFirst=*/false); 17690b57cec5SDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, 17700b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr)); 17710b57cec5SDimitry Andric } 17720b57cec5SDimitry Andric 17730b57cec5SDimitry Andric CurPtr = UnicodePtr; 17740b57cec5SDimitry Andric return true; 17750b57cec5SDimitry Andric } 17760b57cec5SDimitry Andric 1777349cc55cSDimitry Andric bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C, 1778349cc55cSDimitry Andric const char *CurPtr) { 1779*bdd1243dSDimitry Andric bool IsExtension = false; 1780*bdd1243dSDimitry Andric if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) { 1781349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 
1782349cc55cSDimitry Andric !PP->isPreprocessedOutput()) { 1783*bdd1243dSDimitry Andric if (IsExtension) 1784*bdd1243dSDimitry Andric diagnoseExtensionInIdentifier(PP->getDiagnostics(), C, 1785*bdd1243dSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr)); 1786349cc55cSDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, 1787349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr), 1788349cc55cSDimitry Andric /*IsFirst=*/true); 1789349cc55cSDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, 1790349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr)); 1791349cc55cSDimitry Andric } 1792349cc55cSDimitry Andric 1793349cc55cSDimitry Andric MIOpt.ReadToken(); 1794349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 1795349cc55cSDimitry Andric } 1796349cc55cSDimitry Andric 1797349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 1798349cc55cSDimitry Andric !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && 1799*bdd1243dSDimitry Andric !isUnicodeWhitespace(C)) { 1800349cc55cSDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally. 1801349cc55cSDimitry Andric // Instead of letting the parser complain about the unknown token, 1802349cc55cSDimitry Andric // just drop the character. 1803349cc55cSDimitry Andric // Note that we can /only/ do this when the non-ASCII character is actually 1804349cc55cSDimitry Andric // spelled as Unicode, not written as a UCN. The standard requires that 1805349cc55cSDimitry Andric // we not throw away any possible preprocessor tokens, but there's a 1806349cc55cSDimitry Andric // loophole in the mapping of Unicode characters to basic character set 1807349cc55cSDimitry Andric // characters that allows us to map these particular characters to, say, 1808349cc55cSDimitry Andric // whitespace. 
1809349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier( 1810349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, C, 1811349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true); 1812349cc55cSDimitry Andric BufferPtr = CurPtr; 1813349cc55cSDimitry Andric return false; 1814349cc55cSDimitry Andric } 1815349cc55cSDimitry Andric 1816349cc55cSDimitry Andric // Otherwise, we have an explicit UCN or a character that's unlikely to show 1817349cc55cSDimitry Andric // up by accident. 1818349cc55cSDimitry Andric MIOpt.ReadToken(); 1819349cc55cSDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 1820349cc55cSDimitry Andric return true; 1821349cc55cSDimitry Andric } 1822349cc55cSDimitry Andric 1823349cc55cSDimitry Andric bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) { 1824349cc55cSDimitry Andric // Match [_A-Za-z0-9]*, we have already matched an identifier start. 1825349cc55cSDimitry Andric while (true) { 1826349cc55cSDimitry Andric unsigned char C = *CurPtr; 1827349cc55cSDimitry Andric // Fast path. 1828349cc55cSDimitry Andric if (isAsciiIdentifierContinue(C)) { 1829349cc55cSDimitry Andric ++CurPtr; 1830349cc55cSDimitry Andric continue; 1831349cc55cSDimitry Andric } 1832349cc55cSDimitry Andric 18330b57cec5SDimitry Andric unsigned Size; 1834349cc55cSDimitry Andric // Slow path: handle trigraph, unicode codepoints, UCNs. 1835349cc55cSDimitry Andric C = getCharAndSize(CurPtr, Size); 1836349cc55cSDimitry Andric if (isAsciiIdentifierContinue(C)) { 1837349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1838349cc55cSDimitry Andric continue; 1839349cc55cSDimitry Andric } 1840349cc55cSDimitry Andric if (C == '$') { 1841349cc55cSDimitry Andric // If we hit a $ and they are not supported in identifiers, we are done. 1842349cc55cSDimitry Andric if (!LangOpts.DollarIdents) 1843349cc55cSDimitry Andric break; 1844349cc55cSDimitry Andric // Otherwise, emit a diagnostic and continue. 
1845349cc55cSDimitry Andric if (!isLexingRawMode()) 1846349cc55cSDimitry Andric Diag(CurPtr, diag::ext_dollar_in_identifier); 1847349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1848349cc55cSDimitry Andric continue; 1849349cc55cSDimitry Andric } 1850349cc55cSDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 1851349cc55cSDimitry Andric continue; 1852349cc55cSDimitry Andric if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 1853349cc55cSDimitry Andric continue; 1854349cc55cSDimitry Andric // Neither an expected Unicode codepoint nor a UCN. 1855349cc55cSDimitry Andric break; 1856349cc55cSDimitry Andric } 18570b57cec5SDimitry Andric 18580b57cec5SDimitry Andric const char *IdStart = BufferPtr; 18590b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::raw_identifier); 18600b57cec5SDimitry Andric Result.setRawIdentifierData(IdStart); 18610b57cec5SDimitry Andric 18620b57cec5SDimitry Andric // If we are in raw mode, return this identifier raw. There is no need to 18630b57cec5SDimitry Andric // look up identifier information or attempt to macro expand it. 18640b57cec5SDimitry Andric if (LexingRawMode) 18650b57cec5SDimitry Andric return true; 18660b57cec5SDimitry Andric 18670b57cec5SDimitry Andric // Fill in Result.IdentifierInfo and update the token kind, 18680b57cec5SDimitry Andric // looking up the identifier in the identifier table. 18690b57cec5SDimitry Andric IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 18700b57cec5SDimitry Andric // Note that we have to call PP->LookUpIdentifierInfo() even for code 18710b57cec5SDimitry Andric // completion, it writes IdentifierInfo into Result, and callers rely on it. 18720b57cec5SDimitry Andric 18730b57cec5SDimitry Andric // If the completion point is at the end of an identifier, we want to treat 18740b57cec5SDimitry Andric // the identifier as incomplete even if it resolves to a macro or a keyword. 18750b57cec5SDimitry Andric // This allows e.g. 
'class^' to complete to 'classifier'. 18760b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr)) { 18770b57cec5SDimitry Andric // Return the code-completion token. 18780b57cec5SDimitry Andric Result.setKind(tok::code_completion); 18790b57cec5SDimitry Andric // Skip the code-completion char and all immediate identifier characters. 18800b57cec5SDimitry Andric // This ensures we get consistent behavior when completing at any point in 18810b57cec5SDimitry Andric // an identifier (i.e. at the start, in the middle, at the end). Note that 18820b57cec5SDimitry Andric // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code 18830b57cec5SDimitry Andric // simpler. 18840b57cec5SDimitry Andric assert(*CurPtr == 0 && "Completion character must be 0"); 18850b57cec5SDimitry Andric ++CurPtr; 18860b57cec5SDimitry Andric // Note that code completion token is not added as a separate character 18870b57cec5SDimitry Andric // when the completion point is at the end of the buffer. Therefore, we need 18880b57cec5SDimitry Andric // to check if the buffer has ended. 18890b57cec5SDimitry Andric if (CurPtr < BufferEnd) { 1890349cc55cSDimitry Andric while (isAsciiIdentifierContinue(*CurPtr)) 18910b57cec5SDimitry Andric ++CurPtr; 18920b57cec5SDimitry Andric } 18930b57cec5SDimitry Andric BufferPtr = CurPtr; 18940b57cec5SDimitry Andric return true; 18950b57cec5SDimitry Andric } 18960b57cec5SDimitry Andric 18970b57cec5SDimitry Andric // Finally, now that we know we have an identifier, pass this off to the 18980b57cec5SDimitry Andric // preprocessor, which may macro expand it or something. 18990b57cec5SDimitry Andric if (II->isHandleIdentifierCase()) 19000b57cec5SDimitry Andric return PP->HandleIdentifier(Result); 19010b57cec5SDimitry Andric 19020b57cec5SDimitry Andric return true; 19030b57cec5SDimitry Andric } 19040b57cec5SDimitry Andric 19050b57cec5SDimitry Andric /// isHexaLiteral - Return true if Start points to a hex constant. 
19060b57cec5SDimitry Andric /// in microsoft mode (where this is supposed to be several different tokens). 19070b57cec5SDimitry Andric bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) { 19080b57cec5SDimitry Andric unsigned Size; 19090b57cec5SDimitry Andric char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts); 19100b57cec5SDimitry Andric if (C1 != '0') 19110b57cec5SDimitry Andric return false; 19120b57cec5SDimitry Andric char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts); 19130b57cec5SDimitry Andric return (C2 == 'x' || C2 == 'X'); 19140b57cec5SDimitry Andric } 19150b57cec5SDimitry Andric 19160b57cec5SDimitry Andric /// LexNumericConstant - Lex the remainder of a integer or floating point 19170b57cec5SDimitry Andric /// constant. From[-1] is the first character lexed. Return the end of the 19180b57cec5SDimitry Andric /// constant. 19190b57cec5SDimitry Andric bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 19200b57cec5SDimitry Andric unsigned Size; 19210b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size); 19220b57cec5SDimitry Andric char PrevCh = 0; 19230b57cec5SDimitry Andric while (isPreprocessingNumberBody(C)) { 19240b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 19250b57cec5SDimitry Andric PrevCh = C; 19260b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 19270b57cec5SDimitry Andric } 19280b57cec5SDimitry Andric 19290b57cec5SDimitry Andric // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 19300b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { 19310b57cec5SDimitry Andric // If we are in Microsoft mode, don't continue if the constant is hex. 
19320b57cec5SDimitry Andric // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1 19330b57cec5SDimitry Andric if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts)) 19340b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 19350b57cec5SDimitry Andric } 19360b57cec5SDimitry Andric 19370b57cec5SDimitry Andric // If we have a hex FP constant, continue. 19380b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) { 19390b57cec5SDimitry Andric // Outside C99 and C++17, we accept hexadecimal floating point numbers as a 19400b57cec5SDimitry Andric // not-quite-conforming extension. Only do so if this looks like it's 19410b57cec5SDimitry Andric // actually meant to be a hexfloat, and not if it has a ud-suffix. 19420b57cec5SDimitry Andric bool IsHexFloat = true; 19430b57cec5SDimitry Andric if (!LangOpts.C99) { 19440b57cec5SDimitry Andric if (!isHexaLiteral(BufferPtr, LangOpts)) 19450b57cec5SDimitry Andric IsHexFloat = false; 194681ad6265SDimitry Andric else if (!LangOpts.CPlusPlus17 && 19470b57cec5SDimitry Andric std::find(BufferPtr, CurPtr, '_') != CurPtr) 19480b57cec5SDimitry Andric IsHexFloat = false; 19490b57cec5SDimitry Andric } 19500b57cec5SDimitry Andric if (IsHexFloat) 19510b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 19520b57cec5SDimitry Andric } 19530b57cec5SDimitry Andric 19540b57cec5SDimitry Andric // If we have a digit separator, continue. 195581ad6265SDimitry Andric if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C2x)) { 19560b57cec5SDimitry Andric unsigned NextSize; 195781ad6265SDimitry Andric char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, LangOpts); 1958349cc55cSDimitry Andric if (isAsciiIdentifierContinue(Next)) { 19590b57cec5SDimitry Andric if (!isLexingRawMode()) 196081ad6265SDimitry Andric Diag(CurPtr, LangOpts.CPlusPlus 1961fe6060f1SDimitry Andric ? 
diag::warn_cxx11_compat_digit_separator 1962fe6060f1SDimitry Andric : diag::warn_c2x_compat_digit_separator); 19630b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 19640b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, NextSize, Result); 19650b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 19660b57cec5SDimitry Andric } 19670b57cec5SDimitry Andric } 19680b57cec5SDimitry Andric 19690b57cec5SDimitry Andric // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. 19700b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 19710b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 19720b57cec5SDimitry Andric if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 19730b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 19740b57cec5SDimitry Andric 19750b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 19760b57cec5SDimitry Andric const char *TokStart = BufferPtr; 19770b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 19780b57cec5SDimitry Andric Result.setLiteralData(TokStart); 19790b57cec5SDimitry Andric return true; 19800b57cec5SDimitry Andric } 19810b57cec5SDimitry Andric 19820b57cec5SDimitry Andric /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 19830b57cec5SDimitry Andric /// in C++11, or warn on a ud-suffix in C++98. 19840b57cec5SDimitry Andric const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, 19850b57cec5SDimitry Andric bool IsStringLiteral) { 198681ad6265SDimitry Andric assert(LangOpts.CPlusPlus); 19870b57cec5SDimitry Andric 19880b57cec5SDimitry Andric // Maximally munch an identifier. 
19890b57cec5SDimitry Andric unsigned Size; 19900b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size); 19910b57cec5SDimitry Andric bool Consumed = false; 19920b57cec5SDimitry Andric 1993349cc55cSDimitry Andric if (!isAsciiIdentifierStart(C)) { 19940b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 19950b57cec5SDimitry Andric Consumed = true; 19960b57cec5SDimitry Andric else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 19970b57cec5SDimitry Andric Consumed = true; 19980b57cec5SDimitry Andric else 19990b57cec5SDimitry Andric return CurPtr; 20000b57cec5SDimitry Andric } 20010b57cec5SDimitry Andric 200281ad6265SDimitry Andric if (!LangOpts.CPlusPlus11) { 20030b57cec5SDimitry Andric if (!isLexingRawMode()) 20040b57cec5SDimitry Andric Diag(CurPtr, 20050b57cec5SDimitry Andric C == '_' ? diag::warn_cxx11_compat_user_defined_literal 20060b57cec5SDimitry Andric : diag::warn_cxx11_compat_reserved_user_defined_literal) 20070b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 20080b57cec5SDimitry Andric return CurPtr; 20090b57cec5SDimitry Andric } 20100b57cec5SDimitry Andric 20110b57cec5SDimitry Andric // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix 20120b57cec5SDimitry Andric // that does not start with an underscore is ill-formed. As a conforming 20130b57cec5SDimitry Andric // extension, we treat all such suffixes as if they had whitespace before 20140b57cec5SDimitry Andric // them. We assume a suffix beginning with a UCN or UTF-8 character is more 20150b57cec5SDimitry Andric // likely to be a ud-suffix than a macro, however, and accept that. 
20160b57cec5SDimitry Andric if (!Consumed) { 20170b57cec5SDimitry Andric bool IsUDSuffix = false; 20180b57cec5SDimitry Andric if (C == '_') 20190b57cec5SDimitry Andric IsUDSuffix = true; 202081ad6265SDimitry Andric else if (IsStringLiteral && LangOpts.CPlusPlus14) { 20210b57cec5SDimitry Andric // In C++1y, we need to look ahead a few characters to see if this is a 20220b57cec5SDimitry Andric // valid suffix for a string literal or a numeric literal (this could be 20230b57cec5SDimitry Andric // the 'operator""if' defining a numeric literal operator). 20240b57cec5SDimitry Andric const unsigned MaxStandardSuffixLength = 3; 20250b57cec5SDimitry Andric char Buffer[MaxStandardSuffixLength] = { C }; 20260b57cec5SDimitry Andric unsigned Consumed = Size; 20270b57cec5SDimitry Andric unsigned Chars = 1; 20280b57cec5SDimitry Andric while (true) { 20290b57cec5SDimitry Andric unsigned NextSize; 203081ad6265SDimitry Andric char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, LangOpts); 2031349cc55cSDimitry Andric if (!isAsciiIdentifierContinue(Next)) { 20325ffd83dbSDimitry Andric // End of suffix. Check whether this is on the allowed list. 20330b57cec5SDimitry Andric const StringRef CompleteSuffix(Buffer, Chars); 203481ad6265SDimitry Andric IsUDSuffix = 203581ad6265SDimitry Andric StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix); 20360b57cec5SDimitry Andric break; 20370b57cec5SDimitry Andric } 20380b57cec5SDimitry Andric 20390b57cec5SDimitry Andric if (Chars == MaxStandardSuffixLength) 20400b57cec5SDimitry Andric // Too long: can't be a standard suffix. 
20410b57cec5SDimitry Andric break; 20420b57cec5SDimitry Andric 20430b57cec5SDimitry Andric Buffer[Chars++] = Next; 20440b57cec5SDimitry Andric Consumed += NextSize; 20450b57cec5SDimitry Andric } 20460b57cec5SDimitry Andric } 20470b57cec5SDimitry Andric 20480b57cec5SDimitry Andric if (!IsUDSuffix) { 20490b57cec5SDimitry Andric if (!isLexingRawMode()) 205081ad6265SDimitry Andric Diag(CurPtr, LangOpts.MSVCCompat 20510b57cec5SDimitry Andric ? diag::ext_ms_reserved_user_defined_literal 20520b57cec5SDimitry Andric : diag::ext_reserved_user_defined_literal) 20530b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 20540b57cec5SDimitry Andric return CurPtr; 20550b57cec5SDimitry Andric } 20560b57cec5SDimitry Andric 20570b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 20580b57cec5SDimitry Andric } 20590b57cec5SDimitry Andric 20600b57cec5SDimitry Andric Result.setFlag(Token::HasUDSuffix); 20610b57cec5SDimitry Andric while (true) { 20620b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 2063349cc55cSDimitry Andric if (isAsciiIdentifierContinue(C)) { 2064349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 2065349cc55cSDimitry Andric } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { 2066349cc55cSDimitry Andric } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { 2067349cc55cSDimitry Andric } else 2068349cc55cSDimitry Andric break; 20690b57cec5SDimitry Andric } 20700b57cec5SDimitry Andric 20710b57cec5SDimitry Andric return CurPtr; 20720b57cec5SDimitry Andric } 20730b57cec5SDimitry Andric 20740b57cec5SDimitry Andric /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 20750b57cec5SDimitry Andric /// either " or L" or u8" or u" or U". 
20760b57cec5SDimitry Andric bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 20770b57cec5SDimitry Andric tok::TokenKind Kind) { 20780b57cec5SDimitry Andric const char *AfterQuote = CurPtr; 20790b57cec5SDimitry Andric // Does this string contain the \0 character? 20800b57cec5SDimitry Andric const char *NulCharacter = nullptr; 20810b57cec5SDimitry Andric 20820b57cec5SDimitry Andric if (!isLexingRawMode() && 20830b57cec5SDimitry Andric (Kind == tok::utf8_string_literal || 20840b57cec5SDimitry Andric Kind == tok::utf16_string_literal || 20850b57cec5SDimitry Andric Kind == tok::utf32_string_literal)) 208681ad6265SDimitry Andric Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal 20870b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal); 20880b57cec5SDimitry Andric 20890b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 20900b57cec5SDimitry Andric while (C != '"') { 20910b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by 20920b57cec5SDimitry Andric // getAndAdvanceChar. 20930b57cec5SDimitry Andric if (C == '\\') 20940b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 20950b57cec5SDimitry Andric 20960b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 20970b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
20980b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 20990b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1; 21000b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 21010b57cec5SDimitry Andric return true; 21020b57cec5SDimitry Andric } 21030b57cec5SDimitry Andric 21040b57cec5SDimitry Andric if (C == 0) { 21050b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 21060b57cec5SDimitry Andric if (ParsingFilename) 21070b57cec5SDimitry Andric codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false); 21080b57cec5SDimitry Andric else 21090b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 21100b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 21110b57cec5SDimitry Andric cutOffLexing(); 21120b57cec5SDimitry Andric return true; 21130b57cec5SDimitry Andric } 21140b57cec5SDimitry Andric 21150b57cec5SDimitry Andric NulCharacter = CurPtr-1; 21160b57cec5SDimitry Andric } 21170b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 21180b57cec5SDimitry Andric } 21190b57cec5SDimitry Andric 21200b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 212181ad6265SDimitry Andric if (LangOpts.CPlusPlus) 21220b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true); 21230b57cec5SDimitry Andric 21240b57cec5SDimitry Andric // If a nul character existed in the string, warn about it. 21250b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 21260b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1; 21270b57cec5SDimitry Andric 21280b57cec5SDimitry Andric // Update the location of the token as well as the BufferPtr instance var. 
21290b57cec5SDimitry Andric const char *TokStart = BufferPtr; 21300b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 21310b57cec5SDimitry Andric Result.setLiteralData(TokStart); 21320b57cec5SDimitry Andric return true; 21330b57cec5SDimitry Andric } 21340b57cec5SDimitry Andric 21350b57cec5SDimitry Andric /// LexRawStringLiteral - Lex the remainder of a raw string literal, after 21360b57cec5SDimitry Andric /// having lexed R", LR", u8R", uR", or UR". 21370b57cec5SDimitry Andric bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, 21380b57cec5SDimitry Andric tok::TokenKind Kind) { 21390b57cec5SDimitry Andric // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: 21400b57cec5SDimitry Andric // Between the initial and final double quote characters of the raw string, 21410b57cec5SDimitry Andric // any transformations performed in phases 1 and 2 (trigraphs, 21420b57cec5SDimitry Andric // universal-character-names, and line splicing) are reverted. 21430b57cec5SDimitry Andric 21440b57cec5SDimitry Andric if (!isLexingRawMode()) 21450b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal); 21460b57cec5SDimitry Andric 21470b57cec5SDimitry Andric unsigned PrefixLen = 0; 21480b57cec5SDimitry Andric 21490b57cec5SDimitry Andric while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) 21500b57cec5SDimitry Andric ++PrefixLen; 21510b57cec5SDimitry Andric 21520b57cec5SDimitry Andric // If the last character was not a '(', then we didn't lex a valid delimiter. 
21530b57cec5SDimitry Andric if (CurPtr[PrefixLen] != '(') { 21540b57cec5SDimitry Andric if (!isLexingRawMode()) { 21550b57cec5SDimitry Andric const char *PrefixEnd = &CurPtr[PrefixLen]; 21560b57cec5SDimitry Andric if (PrefixLen == 16) { 21570b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_raw_delim_too_long); 21580b57cec5SDimitry Andric } else { 21590b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_invalid_char_raw_delim) 21600b57cec5SDimitry Andric << StringRef(PrefixEnd, 1); 21610b57cec5SDimitry Andric } 21620b57cec5SDimitry Andric } 21630b57cec5SDimitry Andric 21640b57cec5SDimitry Andric // Search for the next '"' in hopes of salvaging the lexer. Unfortunately, 21650b57cec5SDimitry Andric // it's possible the '"' was intended to be part of the raw string, but 21660b57cec5SDimitry Andric // there's not much we can do about that. 21670b57cec5SDimitry Andric while (true) { 21680b57cec5SDimitry Andric char C = *CurPtr++; 21690b57cec5SDimitry Andric 21700b57cec5SDimitry Andric if (C == '"') 21710b57cec5SDimitry Andric break; 21720b57cec5SDimitry Andric if (C == 0 && CurPtr-1 == BufferEnd) { 21730b57cec5SDimitry Andric --CurPtr; 21740b57cec5SDimitry Andric break; 21750b57cec5SDimitry Andric } 21760b57cec5SDimitry Andric } 21770b57cec5SDimitry Andric 21780b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 21790b57cec5SDimitry Andric return true; 21800b57cec5SDimitry Andric } 21810b57cec5SDimitry Andric 21820b57cec5SDimitry Andric // Save prefix and move CurPtr past it 21830b57cec5SDimitry Andric const char *Prefix = CurPtr; 21840b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '(' 21850b57cec5SDimitry Andric 21860b57cec5SDimitry Andric while (true) { 21870b57cec5SDimitry Andric char C = *CurPtr++; 21880b57cec5SDimitry Andric 21890b57cec5SDimitry Andric if (C == ')') { 21900b57cec5SDimitry Andric // Check for prefix match and closing quote. 
21910b57cec5SDimitry Andric if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') { 21920b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '"' 21930b57cec5SDimitry Andric break; 21940b57cec5SDimitry Andric } 21950b57cec5SDimitry Andric } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. 21960b57cec5SDimitry Andric if (!isLexingRawMode()) 21970b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_raw_string) 21980b57cec5SDimitry Andric << StringRef(Prefix, PrefixLen); 21990b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 22000b57cec5SDimitry Andric return true; 22010b57cec5SDimitry Andric } 22020b57cec5SDimitry Andric } 22030b57cec5SDimitry Andric 22040b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 220581ad6265SDimitry Andric if (LangOpts.CPlusPlus) 22060b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true); 22070b57cec5SDimitry Andric 22080b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 22090b57cec5SDimitry Andric const char *TokStart = BufferPtr; 22100b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 22110b57cec5SDimitry Andric Result.setLiteralData(TokStart); 22120b57cec5SDimitry Andric return true; 22130b57cec5SDimitry Andric } 22140b57cec5SDimitry Andric 22150b57cec5SDimitry Andric /// LexAngledStringLiteral - Lex the remainder of an angled string literal, 22160b57cec5SDimitry Andric /// after having lexed the '<' character. This is used for #include filenames. 22170b57cec5SDimitry Andric bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 22180b57cec5SDimitry Andric // Does this string contain the \0 character? 
22190b57cec5SDimitry Andric const char *NulCharacter = nullptr; 22200b57cec5SDimitry Andric const char *AfterLessPos = CurPtr; 22210b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 22220b57cec5SDimitry Andric while (C != '>') { 22230b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by 22240b57cec5SDimitry Andric // getAndAdvanceChar. 22250b57cec5SDimitry Andric if (C == '\\') 22260b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 22270b57cec5SDimitry Andric 2228fe6060f1SDimitry Andric if (isVerticalWhitespace(C) || // Newline. 22290b57cec5SDimitry Andric (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file. 22300b57cec5SDimitry Andric // If the filename is unterminated, then it must just be a lone < 22310b57cec5SDimitry Andric // character. Return this as such. 22320b57cec5SDimitry Andric FormTokenWithChars(Result, AfterLessPos, tok::less); 22330b57cec5SDimitry Andric return true; 22340b57cec5SDimitry Andric } 22350b57cec5SDimitry Andric 22360b57cec5SDimitry Andric if (C == 0) { 22370b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr - 1)) { 22380b57cec5SDimitry Andric codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true); 22390b57cec5SDimitry Andric cutOffLexing(); 22400b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 22410b57cec5SDimitry Andric return true; 22420b57cec5SDimitry Andric } 22430b57cec5SDimitry Andric NulCharacter = CurPtr-1; 22440b57cec5SDimitry Andric } 22450b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 22460b57cec5SDimitry Andric } 22470b57cec5SDimitry Andric 22480b57cec5SDimitry Andric // If a nul character existed in the string, warn about it. 22490b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 22500b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1; 22510b57cec5SDimitry Andric 22520b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 
22530b57cec5SDimitry Andric const char *TokStart = BufferPtr; 22540b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::header_name); 22550b57cec5SDimitry Andric Result.setLiteralData(TokStart); 22560b57cec5SDimitry Andric return true; 22570b57cec5SDimitry Andric } 22580b57cec5SDimitry Andric 22590b57cec5SDimitry Andric void Lexer::codeCompleteIncludedFile(const char *PathStart, 22600b57cec5SDimitry Andric const char *CompletionPoint, 22610b57cec5SDimitry Andric bool IsAngled) { 22620b57cec5SDimitry Andric // Completion only applies to the filename, after the last slash. 22630b57cec5SDimitry Andric StringRef PartialPath(PathStart, CompletionPoint - PathStart); 22645ffd83dbSDimitry Andric llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/"; 22655ffd83dbSDimitry Andric auto Slash = PartialPath.find_last_of(SlashChars); 22660b57cec5SDimitry Andric StringRef Dir = 22670b57cec5SDimitry Andric (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash); 22680b57cec5SDimitry Andric const char *StartOfFilename = 22690b57cec5SDimitry Andric (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1; 22700b57cec5SDimitry Andric // Code completion filter range is the filename only, up to completion point. 22710b57cec5SDimitry Andric PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get( 22720b57cec5SDimitry Andric StringRef(StartOfFilename, CompletionPoint - StartOfFilename))); 22735ffd83dbSDimitry Andric // We should replace the characters up to the closing quote or closest slash, 22745ffd83dbSDimitry Andric // if any. 22750b57cec5SDimitry Andric while (CompletionPoint < BufferEnd) { 22760b57cec5SDimitry Andric char Next = *(CompletionPoint + 1); 22770b57cec5SDimitry Andric if (Next == 0 || Next == '\r' || Next == '\n') 22780b57cec5SDimitry Andric break; 22790b57cec5SDimitry Andric ++CompletionPoint; 22800b57cec5SDimitry Andric if (Next == (IsAngled ? 
'>' : '"')) 22810b57cec5SDimitry Andric break; 22825ffd83dbSDimitry Andric if (llvm::is_contained(SlashChars, Next)) 22835ffd83dbSDimitry Andric break; 22840b57cec5SDimitry Andric } 22855ffd83dbSDimitry Andric 22860b57cec5SDimitry Andric PP->setCodeCompletionTokenRange( 22870b57cec5SDimitry Andric FileLoc.getLocWithOffset(StartOfFilename - BufferStart), 22880b57cec5SDimitry Andric FileLoc.getLocWithOffset(CompletionPoint - BufferStart)); 22890b57cec5SDimitry Andric PP->CodeCompleteIncludedFile(Dir, IsAngled); 22900b57cec5SDimitry Andric } 22910b57cec5SDimitry Andric 22920b57cec5SDimitry Andric /// LexCharConstant - Lex the remainder of a character constant, after having 22930b57cec5SDimitry Andric /// lexed either ' or L' or u8' or u' or U'. 22940b57cec5SDimitry Andric bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, 22950b57cec5SDimitry Andric tok::TokenKind Kind) { 22960b57cec5SDimitry Andric // Does this character contain the \0 character? 22970b57cec5SDimitry Andric const char *NulCharacter = nullptr; 22980b57cec5SDimitry Andric 22990b57cec5SDimitry Andric if (!isLexingRawMode()) { 23000b57cec5SDimitry Andric if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant) 230181ad6265SDimitry Andric Diag(BufferPtr, LangOpts.CPlusPlus 23020b57cec5SDimitry Andric ? 
diag::warn_cxx98_compat_unicode_literal 23030b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal); 23040b57cec5SDimitry Andric else if (Kind == tok::utf8_char_constant) 23050b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal); 23060b57cec5SDimitry Andric } 23070b57cec5SDimitry Andric 23080b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 23090b57cec5SDimitry Andric if (C == '\'') { 23100b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 23110b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_empty_character); 23120b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 23130b57cec5SDimitry Andric return true; 23140b57cec5SDimitry Andric } 23150b57cec5SDimitry Andric 23160b57cec5SDimitry Andric while (C != '\'') { 23170b57cec5SDimitry Andric // Skip escaped characters. 23180b57cec5SDimitry Andric if (C == '\\') 23190b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 23200b57cec5SDimitry Andric 23210b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 23220b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
23230b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 23240b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0; 23250b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 23260b57cec5SDimitry Andric return true; 23270b57cec5SDimitry Andric } 23280b57cec5SDimitry Andric 23290b57cec5SDimitry Andric if (C == 0) { 23300b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 23310b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 23320b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 23330b57cec5SDimitry Andric cutOffLexing(); 23340b57cec5SDimitry Andric return true; 23350b57cec5SDimitry Andric } 23360b57cec5SDimitry Andric 23370b57cec5SDimitry Andric NulCharacter = CurPtr-1; 23380b57cec5SDimitry Andric } 23390b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 23400b57cec5SDimitry Andric } 23410b57cec5SDimitry Andric 23420b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 234381ad6265SDimitry Andric if (LangOpts.CPlusPlus) 23440b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, false); 23450b57cec5SDimitry Andric 23460b57cec5SDimitry Andric // If a nul character existed in the character, warn about it. 23470b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 23480b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 0; 23490b57cec5SDimitry Andric 23500b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 23510b57cec5SDimitry Andric const char *TokStart = BufferPtr; 23520b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 23530b57cec5SDimitry Andric Result.setLiteralData(TokStart); 23540b57cec5SDimitry Andric return true; 23550b57cec5SDimitry Andric } 23560b57cec5SDimitry Andric 23570b57cec5SDimitry Andric /// SkipWhitespace - Efficiently skip over a series of whitespace characters. 
23580b57cec5SDimitry Andric /// Update BufferPtr to point to the next non-whitespace character and return. 23590b57cec5SDimitry Andric /// 23600b57cec5SDimitry Andric /// This method forms a token and returns true if KeepWhitespaceMode is enabled. 23610b57cec5SDimitry Andric bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr, 23620b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 23630b57cec5SDimitry Andric // Whitespace - Skip it, then return the token after the whitespace. 23640b57cec5SDimitry Andric bool SawNewline = isVerticalWhitespace(CurPtr[-1]); 23650b57cec5SDimitry Andric 23660b57cec5SDimitry Andric unsigned char Char = *CurPtr; 23670b57cec5SDimitry Andric 2368e8d8bef9SDimitry Andric const char *lastNewLine = nullptr; 2369e8d8bef9SDimitry Andric auto setLastNewLine = [&](const char *Ptr) { 2370e8d8bef9SDimitry Andric lastNewLine = Ptr; 2371e8d8bef9SDimitry Andric if (!NewLinePtr) 2372e8d8bef9SDimitry Andric NewLinePtr = Ptr; 2373e8d8bef9SDimitry Andric }; 2374e8d8bef9SDimitry Andric if (SawNewline) 2375e8d8bef9SDimitry Andric setLastNewLine(CurPtr - 1); 2376e8d8bef9SDimitry Andric 23770b57cec5SDimitry Andric // Skip consecutive spaces efficiently. 23780b57cec5SDimitry Andric while (true) { 23790b57cec5SDimitry Andric // Skip horizontal whitespace very aggressively. 23800b57cec5SDimitry Andric while (isHorizontalWhitespace(Char)) 23810b57cec5SDimitry Andric Char = *++CurPtr; 23820b57cec5SDimitry Andric 23830b57cec5SDimitry Andric // Otherwise if we have something other than whitespace, we're done. 23840b57cec5SDimitry Andric if (!isVerticalWhitespace(Char)) 23850b57cec5SDimitry Andric break; 23860b57cec5SDimitry Andric 23870b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 23880b57cec5SDimitry Andric // End of preprocessor directive line, let LexTokenInternal handle this. 
23890b57cec5SDimitry Andric BufferPtr = CurPtr; 23900b57cec5SDimitry Andric return false; 23910b57cec5SDimitry Andric } 23920b57cec5SDimitry Andric 23930b57cec5SDimitry Andric // OK, but handle newline. 2394e8d8bef9SDimitry Andric if (*CurPtr == '\n') 2395e8d8bef9SDimitry Andric setLastNewLine(CurPtr); 23960b57cec5SDimitry Andric SawNewline = true; 23970b57cec5SDimitry Andric Char = *++CurPtr; 23980b57cec5SDimitry Andric } 23990b57cec5SDimitry Andric 24000b57cec5SDimitry Andric // If the client wants us to return whitespace, return it now. 24010b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 24020b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 24030b57cec5SDimitry Andric if (SawNewline) { 24040b57cec5SDimitry Andric IsAtStartOfLine = true; 24050b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 24060b57cec5SDimitry Andric } 24070b57cec5SDimitry Andric // FIXME: The next token will not have LeadingSpace set. 24080b57cec5SDimitry Andric return true; 24090b57cec5SDimitry Andric } 24100b57cec5SDimitry Andric 24110b57cec5SDimitry Andric // If this isn't immediately after a newline, there is leading space. 
24120b57cec5SDimitry Andric char PrevChar = CurPtr[-1]; 24130b57cec5SDimitry Andric bool HasLeadingSpace = !isVerticalWhitespace(PrevChar); 24140b57cec5SDimitry Andric 24150b57cec5SDimitry Andric Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace); 24160b57cec5SDimitry Andric if (SawNewline) { 24170b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 24180b57cec5SDimitry Andric TokAtPhysicalStartOfLine = true; 2419e8d8bef9SDimitry Andric 2420e8d8bef9SDimitry Andric if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) { 2421e8d8bef9SDimitry Andric if (auto *Handler = PP->getEmptylineHandler()) 2422e8d8bef9SDimitry Andric Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1), 2423e8d8bef9SDimitry Andric getSourceLocation(lastNewLine))); 2424e8d8bef9SDimitry Andric } 24250b57cec5SDimitry Andric } 24260b57cec5SDimitry Andric 24270b57cec5SDimitry Andric BufferPtr = CurPtr; 24280b57cec5SDimitry Andric return false; 24290b57cec5SDimitry Andric } 24300b57cec5SDimitry Andric 24310b57cec5SDimitry Andric /// We have just read the // characters from input. Skip until we find the 24320b57cec5SDimitry Andric /// newline character that terminates the comment. Then update BufferPtr and 24330b57cec5SDimitry Andric /// return. 24340b57cec5SDimitry Andric /// 24350b57cec5SDimitry Andric /// If we're in KeepCommentMode or any CommentHandler has inserted 24360b57cec5SDimitry Andric /// some tokens, this will store the first token and return true. 24370b57cec5SDimitry Andric bool Lexer::SkipLineComment(Token &Result, const char *CurPtr, 24380b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 24390b57cec5SDimitry Andric // If Line comments aren't explicitly enabled for this language, emit an 24400b57cec5SDimitry Andric // extension warning. 244181ad6265SDimitry Andric if (!LineComment) { 24421fd87a68SDimitry Andric if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags. 
24430b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_line_comment); 24440b57cec5SDimitry Andric 24450b57cec5SDimitry Andric // Mark them enabled so we only emit one warning for this translation 24460b57cec5SDimitry Andric // unit. 244781ad6265SDimitry Andric LineComment = true; 24480b57cec5SDimitry Andric } 24490b57cec5SDimitry Andric 24500b57cec5SDimitry Andric // Scan over the body of the comment. The common case, when scanning, is that 24510b57cec5SDimitry Andric // the comment contains normal ascii characters with nothing interesting in 24520b57cec5SDimitry Andric // them. As such, optimize for this case with the inner loop. 24530b57cec5SDimitry Andric // 24540b57cec5SDimitry Andric // This loop terminates with CurPtr pointing at the newline (or end of buffer) 24550b57cec5SDimitry Andric // character that ends the line comment. 2456753f127fSDimitry Andric 2457753f127fSDimitry Andric // C++23 [lex.phases] p1 2458753f127fSDimitry Andric // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a 2459753f127fSDimitry Andric // diagnostic only once per entire ill-formed subsequence to avoid 2460753f127fSDimitry Andric // emitting too many diagnostics (see http://unicode.org/review/pr-121.html). 2461753f127fSDimitry Andric bool UnicodeDecodingAlreadyDiagnosed = false; 2462753f127fSDimitry Andric 24630b57cec5SDimitry Andric char C; 24640b57cec5SDimitry Andric while (true) { 24650b57cec5SDimitry Andric C = *CurPtr; 24660b57cec5SDimitry Andric // Skip over characters in the fast loop. 2467753f127fSDimitry Andric while (isASCII(C) && C != 0 && // Potentially EOF. 2468753f127fSDimitry Andric C != '\n' && C != '\r') { // Newline or DOS-style newline.
24690b57cec5SDimitry Andric C = *++CurPtr; 2470753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = false; 2471753f127fSDimitry Andric } 2472753f127fSDimitry Andric 2473753f127fSDimitry Andric if (!isASCII(C)) { 2474753f127fSDimitry Andric unsigned Length = llvm::getUTF8SequenceSize( 2475753f127fSDimitry Andric (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd); 2476753f127fSDimitry Andric if (Length == 0) { 2477753f127fSDimitry Andric if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode()) 2478753f127fSDimitry Andric Diag(CurPtr, diag::warn_invalid_utf8_in_comment); 2479753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = true; 2480753f127fSDimitry Andric ++CurPtr; 2481753f127fSDimitry Andric } else { 2482753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = false; 2483753f127fSDimitry Andric CurPtr += Length; 2484753f127fSDimitry Andric } 2485753f127fSDimitry Andric continue; 2486753f127fSDimitry Andric } 24870b57cec5SDimitry Andric 24880b57cec5SDimitry Andric const char *NextLine = CurPtr; 24890b57cec5SDimitry Andric if (C != 0) { 24900b57cec5SDimitry Andric // We found a newline, see if it's escaped. 24910b57cec5SDimitry Andric const char *EscapePtr = CurPtr-1; 24920b57cec5SDimitry Andric bool HasSpace = false; 24930b57cec5SDimitry Andric while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace. 24940b57cec5SDimitry Andric --EscapePtr; 24950b57cec5SDimitry Andric HasSpace = true; 24960b57cec5SDimitry Andric } 24970b57cec5SDimitry Andric 24980b57cec5SDimitry Andric if (*EscapePtr == '\\') 24990b57cec5SDimitry Andric // Escaped newline. 25000b57cec5SDimitry Andric CurPtr = EscapePtr; 25010b57cec5SDimitry Andric else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' && 25020b57cec5SDimitry Andric EscapePtr[-2] == '?' && LangOpts.Trigraphs) 25030b57cec5SDimitry Andric // Trigraph-escaped newline. 
25040b57cec5SDimitry Andric CurPtr = EscapePtr-2; 25050b57cec5SDimitry Andric else 25060b57cec5SDimitry Andric break; // This is a newline, we're done. 25070b57cec5SDimitry Andric 25080b57cec5SDimitry Andric // If there was space between the backslash and newline, warn about it. 25090b57cec5SDimitry Andric if (HasSpace && !isLexingRawMode()) 25100b57cec5SDimitry Andric Diag(EscapePtr, diag::backslash_newline_space); 25110b57cec5SDimitry Andric } 25120b57cec5SDimitry Andric 25130b57cec5SDimitry Andric // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to 25140b57cec5SDimitry Andric // properly decode the character. Read it in raw mode to avoid emitting 25150b57cec5SDimitry Andric // diagnostics about things like trigraphs. If we see an escaped newline, 25160b57cec5SDimitry Andric // we'll handle it below. 25170b57cec5SDimitry Andric const char *OldPtr = CurPtr; 25180b57cec5SDimitry Andric bool OldRawMode = isLexingRawMode(); 25190b57cec5SDimitry Andric LexingRawMode = true; 25200b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 25210b57cec5SDimitry Andric LexingRawMode = OldRawMode; 25220b57cec5SDimitry Andric 25230b57cec5SDimitry Andric // If we read only one character, then no special handling is needed. 25240b57cec5SDimitry Andric // We're done and can skip forward to the newline. 25250b57cec5SDimitry Andric if (C != 0 && CurPtr == OldPtr+1) { 25260b57cec5SDimitry Andric CurPtr = NextLine; 25270b57cec5SDimitry Andric break; 25280b57cec5SDimitry Andric } 25290b57cec5SDimitry Andric 25300b57cec5SDimitry Andric // If we read multiple characters, and one of those characters was a \r or 25310b57cec5SDimitry Andric // \n, then we had an escaped newline within the comment. Emit diagnostic 25320b57cec5SDimitry Andric // unless the next line is also a // comment.
25330b57cec5SDimitry Andric if (CurPtr != OldPtr + 1 && C != '/' && 25340b57cec5SDimitry Andric (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) { 25350b57cec5SDimitry Andric for (; OldPtr != CurPtr; ++OldPtr) 25360b57cec5SDimitry Andric if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 25370b57cec5SDimitry Andric // Okay, we found a // comment that ends in a newline, if the next 25380b57cec5SDimitry Andric // line is also a // comment, but has spaces, don't emit a diagnostic. 25390b57cec5SDimitry Andric if (isWhitespace(C)) { 25400b57cec5SDimitry Andric const char *ForwardPtr = CurPtr; 25410b57cec5SDimitry Andric while (isWhitespace(*ForwardPtr)) // Skip whitespace. 25420b57cec5SDimitry Andric ++ForwardPtr; 25430b57cec5SDimitry Andric if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 25440b57cec5SDimitry Andric break; 25450b57cec5SDimitry Andric } 25460b57cec5SDimitry Andric 25470b57cec5SDimitry Andric if (!isLexingRawMode()) 25480b57cec5SDimitry Andric Diag(OldPtr-1, diag::ext_multi_line_line_comment); 25490b57cec5SDimitry Andric break; 25500b57cec5SDimitry Andric } 25510b57cec5SDimitry Andric } 25520b57cec5SDimitry Andric 25530b57cec5SDimitry Andric if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) { 25540b57cec5SDimitry Andric --CurPtr; 25550b57cec5SDimitry Andric break; 25560b57cec5SDimitry Andric } 25570b57cec5SDimitry Andric 25580b57cec5SDimitry Andric if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 25590b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 25600b57cec5SDimitry Andric cutOffLexing(); 25610b57cec5SDimitry Andric return false; 25620b57cec5SDimitry Andric } 25630b57cec5SDimitry Andric } 25640b57cec5SDimitry Andric 25650b57cec5SDimitry Andric // Found but did not consume the newline. Notify comment handlers about the 25660b57cec5SDimitry Andric // comment unless we're in a #if 0 block. 
25670b57cec5SDimitry Andric if (PP && !isLexingRawMode() && 25680b57cec5SDimitry Andric PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 25690b57cec5SDimitry Andric getSourceLocation(CurPtr)))) { 25700b57cec5SDimitry Andric BufferPtr = CurPtr; 25710b57cec5SDimitry Andric return true; // A token has to be returned. 25720b57cec5SDimitry Andric } 25730b57cec5SDimitry Andric 25740b57cec5SDimitry Andric // If we are returning comments as tokens, return this comment as a token. 25750b57cec5SDimitry Andric if (inKeepCommentMode()) 25760b57cec5SDimitry Andric return SaveLineComment(Result, CurPtr); 25770b57cec5SDimitry Andric 25780b57cec5SDimitry Andric // If we are inside a preprocessor directive and we see the end of line, 25790b57cec5SDimitry Andric // return immediately, so that the lexer can return this as an EOD token. 25800b57cec5SDimitry Andric if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 25810b57cec5SDimitry Andric BufferPtr = CurPtr; 25820b57cec5SDimitry Andric return false; 25830b57cec5SDimitry Andric } 25840b57cec5SDimitry Andric 25850b57cec5SDimitry Andric // Otherwise, eat the \n character. We don't care if this is a \n\r or 25860b57cec5SDimitry Andric // \r\n sequence. This is an efficiency hack (because we know the \n can't 25870b57cec5SDimitry Andric // contribute to another token), it isn't needed for correctness. Note that 25880b57cec5SDimitry Andric // this is ok even in KeepWhitespaceMode, because we would have returned the 25890b57cec5SDimitry Andric // comment above in that mode. 2590e8d8bef9SDimitry Andric NewLinePtr = CurPtr++; 25910b57cec5SDimitry Andric 25920b57cec5SDimitry Andric // The next returned token is at the start of the line. 25930b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 25940b57cec5SDimitry Andric TokAtPhysicalStartOfLine = true; 25950b57cec5SDimitry Andric // No leading whitespace seen so far.
25960b57cec5SDimitry Andric Result.clearFlag(Token::LeadingSpace); 25970b57cec5SDimitry Andric BufferPtr = CurPtr; 25980b57cec5SDimitry Andric return false; 25990b57cec5SDimitry Andric } 26000b57cec5SDimitry Andric 26010b57cec5SDimitry Andric /// If in save-comment mode, package up this Line comment in an appropriate 26020b57cec5SDimitry Andric /// way and return it. 26030b57cec5SDimitry Andric bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) { 26040b57cec5SDimitry Andric // If we're not in a preprocessor directive, just return the // comment 26050b57cec5SDimitry Andric // directly. 26060b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::comment); 26070b57cec5SDimitry Andric 26080b57cec5SDimitry Andric if (!ParsingPreprocessorDirective || LexingRawMode) 26090b57cec5SDimitry Andric return true; 26100b57cec5SDimitry Andric 26110b57cec5SDimitry Andric // If this Line-style comment is in a macro definition, transmogrify it into 26120b57cec5SDimitry Andric // a C-style block comment. 26130b57cec5SDimitry Andric bool Invalid = false; 26140b57cec5SDimitry Andric std::string Spelling = PP->getSpelling(Result, &Invalid); 26150b57cec5SDimitry Andric if (Invalid) 26160b57cec5SDimitry Andric return true; 26170b57cec5SDimitry Andric 26180b57cec5SDimitry Andric assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?"); 26190b57cec5SDimitry Andric Spelling[1] = '*'; // Change prefix to "/*". 26200b57cec5SDimitry Andric Spelling += "*/"; // add suffix. 
26210b57cec5SDimitry Andric 26220b57cec5SDimitry Andric Result.setKind(tok::comment); 26230b57cec5SDimitry Andric PP->CreateString(Spelling, Result, 26240b57cec5SDimitry Andric Result.getLocation(), Result.getLocation()); 26250b57cec5SDimitry Andric return true; 26260b57cec5SDimitry Andric } 26270b57cec5SDimitry Andric 26280b57cec5SDimitry Andric /// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline 26290b57cec5SDimitry Andric /// character (either \\n or \\r) is part of an escaped newline sequence. Issue 26300b57cec5SDimitry Andric /// a diagnostic if so. We know that the newline is inside of a block comment. 263181ad6265SDimitry Andric static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L, 263281ad6265SDimitry Andric bool Trigraphs) { 26330b57cec5SDimitry Andric assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); 26340b57cec5SDimitry Andric 2635fe6060f1SDimitry Andric // Position of the first trigraph in the ending sequence. 263604eeddc0SDimitry Andric const char *TrigraphPos = nullptr; 2637fe6060f1SDimitry Andric // Position of the first whitespace after a '\' in the ending sequence. 263804eeddc0SDimitry Andric const char *SpacePos = nullptr; 2639fe6060f1SDimitry Andric 2640fe6060f1SDimitry Andric while (true) { 26410b57cec5SDimitry Andric // Back up off the newline. 26420b57cec5SDimitry Andric --CurPtr; 26430b57cec5SDimitry Andric 26440b57cec5SDimitry Andric // If this is a two-character newline sequence, skip the other character. 26450b57cec5SDimitry Andric if (CurPtr[0] == '\n' || CurPtr[0] == '\r') { 26460b57cec5SDimitry Andric // \n\n or \r\r -> not escaped newline. 26470b57cec5SDimitry Andric if (CurPtr[0] == CurPtr[1]) 26480b57cec5SDimitry Andric return false; 26490b57cec5SDimitry Andric // \n\r or \r\n -> skip the newline. 26500b57cec5SDimitry Andric --CurPtr; 26510b57cec5SDimitry Andric } 26520b57cec5SDimitry Andric 26530b57cec5SDimitry Andric // If we have horizontal whitespace, skip over it.
We allow whitespace 26540b57cec5SDimitry Andric // between the slash and newline. 26550b57cec5SDimitry Andric while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 2656fe6060f1SDimitry Andric SpacePos = CurPtr; 26570b57cec5SDimitry Andric --CurPtr; 26580b57cec5SDimitry Andric } 26590b57cec5SDimitry Andric 2660fe6060f1SDimitry Andric // If we have a slash, this is an escaped newline. 26610b57cec5SDimitry Andric if (*CurPtr == '\\') { 2662fe6060f1SDimitry Andric --CurPtr; 2663fe6060f1SDimitry Andric } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') { 2664fe6060f1SDimitry Andric // This is a trigraph encoding of a slash. 2665fe6060f1SDimitry Andric TrigraphPos = CurPtr - 2; 2666fe6060f1SDimitry Andric CurPtr -= 3; 26670b57cec5SDimitry Andric } else { 26680b57cec5SDimitry Andric return false; 2669fe6060f1SDimitry Andric } 26700b57cec5SDimitry Andric 2671fe6060f1SDimitry Andric // If the character preceding the escaped newline is a '*', then after line 2672fe6060f1SDimitry Andric // splicing we have a '*/' ending the comment. 2673fe6060f1SDimitry Andric if (*CurPtr == '*') 2674fe6060f1SDimitry Andric break; 26750b57cec5SDimitry Andric 2676fe6060f1SDimitry Andric if (*CurPtr != '\n' && *CurPtr != '\r') 2677fe6060f1SDimitry Andric return false; 2678fe6060f1SDimitry Andric } 2679fe6060f1SDimitry Andric 2680fe6060f1SDimitry Andric if (TrigraphPos) { 26810b57cec5SDimitry Andric // If no trigraphs are enabled, warn that we ignored this trigraph and 26820b57cec5SDimitry Andric // ignore this * character. 
268381ad6265SDimitry Andric if (!Trigraphs) { 26840b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2685fe6060f1SDimitry Andric L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment); 26860b57cec5SDimitry Andric return false; 26870b57cec5SDimitry Andric } 26880b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2689fe6060f1SDimitry Andric L->Diag(TrigraphPos, diag::trigraph_ends_block_comment); 26900b57cec5SDimitry Andric } 26910b57cec5SDimitry Andric 26920b57cec5SDimitry Andric // Warn about having an escaped newline between the */ characters. 26930b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2694fe6060f1SDimitry Andric L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end); 26950b57cec5SDimitry Andric 26960b57cec5SDimitry Andric // If there was space between the backslash and newline, warn about it. 2697fe6060f1SDimitry Andric if (SpacePos && !L->isLexingRawMode()) 2698fe6060f1SDimitry Andric L->Diag(SpacePos, diag::backslash_newline_space); 26990b57cec5SDimitry Andric 27000b57cec5SDimitry Andric return true; 27010b57cec5SDimitry Andric } 27020b57cec5SDimitry Andric 27030b57cec5SDimitry Andric #ifdef __SSE2__ 27040b57cec5SDimitry Andric #include <emmintrin.h> 27050b57cec5SDimitry Andric #elif __ALTIVEC__ 27060b57cec5SDimitry Andric #include <altivec.h> 27070b57cec5SDimitry Andric #undef bool 27080b57cec5SDimitry Andric #endif 27090b57cec5SDimitry Andric 27100b57cec5SDimitry Andric /// We have just read from input the / and * characters that started a comment. 27110b57cec5SDimitry Andric /// Read until we find the * and / characters that terminate the comment. 27120b57cec5SDimitry Andric /// Note that we don't bother decoding trigraphs or escaped newlines in block 27130b57cec5SDimitry Andric /// comments, because they cannot cause the comment to end. The only thing 27140b57cec5SDimitry Andric /// that can happen is the comment could end with an escaped newline between 27150b57cec5SDimitry Andric /// the terminating * and /. 
27160b57cec5SDimitry Andric /// 27170b57cec5SDimitry Andric /// If we're in KeepCommentMode or any CommentHandler has inserted 27180b57cec5SDimitry Andric /// some tokens, this will store the first token and return true. 27190b57cec5SDimitry Andric bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, 27200b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 27210b57cec5SDimitry Andric // Scan one character past where we should, looking for a '/' character. Once 27220b57cec5SDimitry Andric // we find it, check to see if it was preceded by a *. This common 27230b57cec5SDimitry Andric // optimization helps people who like to put a lot of * characters in their 27240b57cec5SDimitry Andric // comments. 27250b57cec5SDimitry Andric 27260b57cec5SDimitry Andric // The first character we get with newlines and trigraphs skipped to handle 27270b57cec5SDimitry Andric // the degenerate /*/ case below correctly if the * has an escaped newline 27280b57cec5SDimitry Andric // after it. 27290b57cec5SDimitry Andric unsigned CharSize; 27300b57cec5SDimitry Andric unsigned char C = getCharAndSize(CurPtr, CharSize); 27310b57cec5SDimitry Andric CurPtr += CharSize; 27320b57cec5SDimitry Andric if (C == 0 && CurPtr == BufferEnd+1) { 27330b57cec5SDimitry Andric if (!isLexingRawMode()) 27340b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment); 27350b57cec5SDimitry Andric --CurPtr; 27360b57cec5SDimitry Andric 27370b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. Since 27380b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token. 
27390b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 27400b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 27410b57cec5SDimitry Andric return true; 27420b57cec5SDimitry Andric } 27430b57cec5SDimitry Andric 27440b57cec5SDimitry Andric BufferPtr = CurPtr; 27450b57cec5SDimitry Andric return false; 27460b57cec5SDimitry Andric } 27470b57cec5SDimitry Andric 27480b57cec5SDimitry Andric // Check to see if the first character after the '/*' is another /. If so, 27490b57cec5SDimitry Andric // then this slash does not end the block comment, it is part of it. 27500b57cec5SDimitry Andric if (C == '/') 27510b57cec5SDimitry Andric C = *CurPtr++; 27520b57cec5SDimitry Andric 2753753f127fSDimitry Andric // C++23 [lex.phases] p1 2754753f127fSDimitry Andric // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a 2755753f127fSDimitry Andric // diagnostic only once per entire ill-formed subsequence to avoid 2756753f127fSDimitry Andric // emiting to many diagnostics (see http://unicode.org/review/pr-121.html). 2757753f127fSDimitry Andric bool UnicodeDecodingAlreadyDiagnosed = false; 2758753f127fSDimitry Andric 27590b57cec5SDimitry Andric while (true) { 27600b57cec5SDimitry Andric // Skip over all non-interesting characters until we find end of buffer or a 27610b57cec5SDimitry Andric // (probably ending) '/' character. 27620b57cec5SDimitry Andric if (CurPtr + 24 < BufferEnd && 27630b57cec5SDimitry Andric // If there is a code-completion point avoid the fast scan because it 27640b57cec5SDimitry Andric // doesn't check for '\0'. 27650b57cec5SDimitry Andric !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { 27660b57cec5SDimitry Andric // While not aligned to a 16-byte boundary. 
2767753f127fSDimitry Andric while (C != '/' && (intptr_t)CurPtr % 16 != 0) { 2768753f127fSDimitry Andric if (!isASCII(C)) 2769753f127fSDimitry Andric goto MultiByteUTF8; 27700b57cec5SDimitry Andric C = *CurPtr++; 2771753f127fSDimitry Andric } 27720b57cec5SDimitry Andric if (C == '/') goto FoundSlash; 27730b57cec5SDimitry Andric 27740b57cec5SDimitry Andric #ifdef __SSE2__ 27750b57cec5SDimitry Andric __m128i Slashes = _mm_set1_epi8('/'); 2776753f127fSDimitry Andric while (CurPtr + 16 < BufferEnd) { 2777753f127fSDimitry Andric int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr); 2778753f127fSDimitry Andric if (LLVM_UNLIKELY(Mask != 0)) { 2779753f127fSDimitry Andric goto MultiByteUTF8; 2780753f127fSDimitry Andric } 2781753f127fSDimitry Andric // look for slashes 27820b57cec5SDimitry Andric int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr, 27830b57cec5SDimitry Andric Slashes)); 27840b57cec5SDimitry Andric if (cmp != 0) { 27850b57cec5SDimitry Andric // Adjust the pointer to point directly after the first slash. It's 27860b57cec5SDimitry Andric // not necessary to set C here, it will be overwritten at the end of 27870b57cec5SDimitry Andric // the outer loop. 
27880b57cec5SDimitry Andric CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1; 27890b57cec5SDimitry Andric goto FoundSlash; 27900b57cec5SDimitry Andric } 27910b57cec5SDimitry Andric CurPtr += 16; 27920b57cec5SDimitry Andric } 27930b57cec5SDimitry Andric #elif __ALTIVEC__ 2794753f127fSDimitry Andric __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2795753f127fSDimitry Andric 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 2796753f127fSDimitry Andric 0x80, 0x80, 0x80, 0x80}; 27970b57cec5SDimitry Andric __vector unsigned char Slashes = { 27980b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/', 27990b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/' 28000b57cec5SDimitry Andric }; 2801753f127fSDimitry Andric while (CurPtr + 16 < BufferEnd) { 2802753f127fSDimitry Andric if (LLVM_UNLIKELY( 2803753f127fSDimitry Andric vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF))) 2804753f127fSDimitry Andric goto MultiByteUTF8; 2805753f127fSDimitry Andric if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) { 2806753f127fSDimitry Andric break; 2807753f127fSDimitry Andric } 28080b57cec5SDimitry Andric CurPtr += 16; 2809753f127fSDimitry Andric } 2810753f127fSDimitry Andric 28110b57cec5SDimitry Andric #else 2812753f127fSDimitry Andric while (CurPtr + 16 < BufferEnd) { 2813753f127fSDimitry Andric bool HasNonASCII = false; 2814753f127fSDimitry Andric for (unsigned I = 0; I < 16; ++I) 2815753f127fSDimitry Andric HasNonASCII |= !isASCII(CurPtr[I]); 2816753f127fSDimitry Andric 2817753f127fSDimitry Andric if (LLVM_UNLIKELY(HasNonASCII)) 2818753f127fSDimitry Andric goto MultiByteUTF8; 2819753f127fSDimitry Andric 2820753f127fSDimitry Andric bool HasSlash = false; 2821753f127fSDimitry Andric for (unsigned I = 0; I < 16; ++I) 2822753f127fSDimitry Andric HasSlash |= CurPtr[I] == '/'; 2823753f127fSDimitry Andric if (HasSlash) 2824753f127fSDimitry Andric break; 2825753f127fSDimitry Andric CurPtr += 16; 28260b57cec5SDimitry Andric } 
28270b57cec5SDimitry Andric #endif 28280b57cec5SDimitry Andric 28290b57cec5SDimitry Andric // It has to be one of the bytes scanned, increment to it and read one. 28300b57cec5SDimitry Andric C = *CurPtr++; 28310b57cec5SDimitry Andric } 28320b57cec5SDimitry Andric 2833753f127fSDimitry Andric // Loop to scan the remainder, warning on invalid UTF-8 2834753f127fSDimitry Andric // if the corresponding warning is enabled, emitting a diagnostic only once 2835753f127fSDimitry Andric // per sequence that cannot be decoded. 2836753f127fSDimitry Andric while (C != '/' && C != '\0') { 2837753f127fSDimitry Andric if (isASCII(C)) { 2838753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = false; 28390b57cec5SDimitry Andric C = *CurPtr++; 2840753f127fSDimitry Andric continue; 2841753f127fSDimitry Andric } 2842753f127fSDimitry Andric MultiByteUTF8: 2843753f127fSDimitry Andric // CurPtr is 1 code unit past C, so to decode 2844753f127fSDimitry Andric // the codepoint, we need to read from the previous position. 2845753f127fSDimitry Andric unsigned Length = llvm::getUTF8SequenceSize( 2846753f127fSDimitry Andric (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd); 2847753f127fSDimitry Andric if (Length == 0) { 2848753f127fSDimitry Andric if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode()) 2849753f127fSDimitry Andric Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment); 2850753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = true; 2851753f127fSDimitry Andric } else { 2852753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = false; 2853753f127fSDimitry Andric CurPtr += Length - 1; 2854753f127fSDimitry Andric } 2855753f127fSDimitry Andric C = *CurPtr++; 2856753f127fSDimitry Andric } 28570b57cec5SDimitry Andric 28580b57cec5SDimitry Andric if (C == '/') { 28590b57cec5SDimitry Andric FoundSlash: 28600b57cec5SDimitry Andric if (CurPtr[-2] == '*') // We found the final */. We're done! 
28610b57cec5SDimitry Andric break; 28620b57cec5SDimitry Andric 28630b57cec5SDimitry Andric if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 286481ad6265SDimitry Andric if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this, 286581ad6265SDimitry Andric LangOpts.Trigraphs)) { 28660b57cec5SDimitry Andric // We found the final */, though it had an escaped newline between the 28670b57cec5SDimitry Andric // * and /. We're done! 28680b57cec5SDimitry Andric break; 28690b57cec5SDimitry Andric } 28700b57cec5SDimitry Andric } 28710b57cec5SDimitry Andric if (CurPtr[0] == '*' && CurPtr[1] != '/') { 28720b57cec5SDimitry Andric // If this is a /* inside of the comment, emit a warning. Don't do this 28730b57cec5SDimitry Andric // if this is a /*/, which will end the comment. This misses cases with 28740b57cec5SDimitry Andric // embedded escaped newlines, but oh well. 28750b57cec5SDimitry Andric if (!isLexingRawMode()) 28760b57cec5SDimitry Andric Diag(CurPtr-1, diag::warn_nested_block_comment); 28770b57cec5SDimitry Andric } 28780b57cec5SDimitry Andric } else if (C == 0 && CurPtr == BufferEnd+1) { 28790b57cec5SDimitry Andric if (!isLexingRawMode()) 28800b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment); 28810b57cec5SDimitry Andric // Note: the user probably forgot a */. We could continue immediately 28820b57cec5SDimitry Andric // after the /*, but this would involve lexing a lot of what really is the 28830b57cec5SDimitry Andric // comment, which surely would confuse the parser. 28840b57cec5SDimitry Andric --CurPtr; 28850b57cec5SDimitry Andric 28860b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. Since 28870b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token. 
28880b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 28890b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 28900b57cec5SDimitry Andric return true; 28910b57cec5SDimitry Andric } 28920b57cec5SDimitry Andric 28930b57cec5SDimitry Andric BufferPtr = CurPtr; 28940b57cec5SDimitry Andric return false; 28950b57cec5SDimitry Andric } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 28960b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 28970b57cec5SDimitry Andric cutOffLexing(); 28980b57cec5SDimitry Andric return false; 28990b57cec5SDimitry Andric } 29000b57cec5SDimitry Andric 29010b57cec5SDimitry Andric C = *CurPtr++; 29020b57cec5SDimitry Andric } 29030b57cec5SDimitry Andric 29040b57cec5SDimitry Andric // Notify comment handlers about the comment unless we're in a #if 0 block. 29050b57cec5SDimitry Andric if (PP && !isLexingRawMode() && 29060b57cec5SDimitry Andric PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 29070b57cec5SDimitry Andric getSourceLocation(CurPtr)))) { 29080b57cec5SDimitry Andric BufferPtr = CurPtr; 29090b57cec5SDimitry Andric return true; // A token has to be returned. 29100b57cec5SDimitry Andric } 29110b57cec5SDimitry Andric 29120b57cec5SDimitry Andric // If we are returning comments as tokens, return this comment as a token. 29130b57cec5SDimitry Andric if (inKeepCommentMode()) { 29140b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::comment); 29150b57cec5SDimitry Andric return true; 29160b57cec5SDimitry Andric } 29170b57cec5SDimitry Andric 29180b57cec5SDimitry Andric // It is common for the tokens immediately after a /**/ comment to be 29190b57cec5SDimitry Andric // whitespace. Instead of going through the big switch, handle it 29200b57cec5SDimitry Andric // efficiently now. This is safe even in KeepWhitespaceMode because we would 29210b57cec5SDimitry Andric // have already returned above with the comment as a token. 
29220b57cec5SDimitry Andric if (isHorizontalWhitespace(*CurPtr)) { 29230b57cec5SDimitry Andric SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine); 29240b57cec5SDimitry Andric return false; 29250b57cec5SDimitry Andric } 29260b57cec5SDimitry Andric 29270b57cec5SDimitry Andric // Otherwise, just return so that the next character will be lexed as a token. 29280b57cec5SDimitry Andric BufferPtr = CurPtr; 29290b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 29300b57cec5SDimitry Andric return false; 29310b57cec5SDimitry Andric } 29320b57cec5SDimitry Andric 29330b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 29340b57cec5SDimitry Andric // Primary Lexing Entry Points 29350b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 29360b57cec5SDimitry Andric 29370b57cec5SDimitry Andric /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 29380b57cec5SDimitry Andric /// uninterpreted string. This switches the lexer out of directive mode. 29390b57cec5SDimitry Andric void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) { 29400b57cec5SDimitry Andric assert(ParsingPreprocessorDirective && ParsingFilename == false && 29410b57cec5SDimitry Andric "Must be in a preprocessing directive!"); 29420b57cec5SDimitry Andric Token Tmp; 2943480093f4SDimitry Andric Tmp.startToken(); 29440b57cec5SDimitry Andric 29450b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable. 29460b57cec5SDimitry Andric const char *CurPtr = BufferPtr; 29470b57cec5SDimitry Andric while (true) { 29480b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Tmp); 29490b57cec5SDimitry Andric switch (Char) { 29500b57cec5SDimitry Andric default: 29510b57cec5SDimitry Andric if (Result) 29520b57cec5SDimitry Andric Result->push_back(Char); 29530b57cec5SDimitry Andric break; 29540b57cec5SDimitry Andric case 0: // Null. 
29550b57cec5SDimitry Andric // Found end of file? 29560b57cec5SDimitry Andric if (CurPtr-1 != BufferEnd) { 29570b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 29580b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 29590b57cec5SDimitry Andric cutOffLexing(); 29600b57cec5SDimitry Andric return; 29610b57cec5SDimitry Andric } 29620b57cec5SDimitry Andric 29630b57cec5SDimitry Andric // Nope, normal character, continue. 29640b57cec5SDimitry Andric if (Result) 29650b57cec5SDimitry Andric Result->push_back(Char); 29660b57cec5SDimitry Andric break; 29670b57cec5SDimitry Andric } 29680b57cec5SDimitry Andric // FALL THROUGH. 2969*bdd1243dSDimitry Andric [[fallthrough]]; 29700b57cec5SDimitry Andric case '\r': 29710b57cec5SDimitry Andric case '\n': 29720b57cec5SDimitry Andric // Okay, we found the end of the line. First, back up past the \0, \r, \n. 29730b57cec5SDimitry Andric assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 29740b57cec5SDimitry Andric BufferPtr = CurPtr-1; 29750b57cec5SDimitry Andric 29760b57cec5SDimitry Andric // Next, lex the character, which should handle the EOD transition. 29770b57cec5SDimitry Andric Lex(Tmp); 29780b57cec5SDimitry Andric if (Tmp.is(tok::code_completion)) { 29790b57cec5SDimitry Andric if (PP) 29800b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 29810b57cec5SDimitry Andric Lex(Tmp); 29820b57cec5SDimitry Andric } 29830b57cec5SDimitry Andric assert(Tmp.is(tok::eod) && "Unexpected token!"); 29840b57cec5SDimitry Andric 29850b57cec5SDimitry Andric // Finally, we're done; 29860b57cec5SDimitry Andric return; 29870b57cec5SDimitry Andric } 29880b57cec5SDimitry Andric } 29890b57cec5SDimitry Andric } 29900b57cec5SDimitry Andric 29910b57cec5SDimitry Andric /// LexEndOfFile - CurPtr points to the end of this file. Handle this 29920b57cec5SDimitry Andric /// condition, reporting diagnostics and handling other edge cases as required. 
29930b57cec5SDimitry Andric /// This returns true if Result contains a token, false if PP.Lex should be 29940b57cec5SDimitry Andric /// called again. 29950b57cec5SDimitry Andric bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 29960b57cec5SDimitry Andric // If we hit the end of the file while parsing a preprocessor directive, 29970b57cec5SDimitry Andric // end the preprocessor directive first. The next token returned will 29980b57cec5SDimitry Andric // then be the end of file. 29990b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 30000b57cec5SDimitry Andric // Done parsing the "line". 30010b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 30020b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 30030b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::eod); 30040b57cec5SDimitry Andric 30050b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive. 30060b57cec5SDimitry Andric if (PP) 30070b57cec5SDimitry Andric resetExtendedTokenMode(); 30080b57cec5SDimitry Andric return true; // Have a token. 30090b57cec5SDimitry Andric } 30100b57cec5SDimitry Andric 30110b57cec5SDimitry Andric // If we are in raw mode, return this event as an EOF token. Let the caller 30120b57cec5SDimitry Andric // that put us in raw mode handle the event. 30130b57cec5SDimitry Andric if (isLexingRawMode()) { 30140b57cec5SDimitry Andric Result.startToken(); 30150b57cec5SDimitry Andric BufferPtr = BufferEnd; 30160b57cec5SDimitry Andric FormTokenWithChars(Result, BufferEnd, tok::eof); 30170b57cec5SDimitry Andric return true; 30180b57cec5SDimitry Andric } 30190b57cec5SDimitry Andric 30200b57cec5SDimitry Andric if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { 30210b57cec5SDimitry Andric PP->setRecordedPreambleConditionalStack(ConditionalStack); 3022fe6060f1SDimitry Andric // If the preamble cuts off the end of a header guard, consider it guarded. 
3023fe6060f1SDimitry Andric // The guard is valid for the preamble content itself, and for tools the 3024fe6060f1SDimitry Andric // most useful answer is "yes, this file has a header guard". 3025fe6060f1SDimitry Andric if (!ConditionalStack.empty()) 3026fe6060f1SDimitry Andric MIOpt.ExitTopLevelConditional(); 30270b57cec5SDimitry Andric ConditionalStack.clear(); 30280b57cec5SDimitry Andric } 30290b57cec5SDimitry Andric 30300b57cec5SDimitry Andric // Issue diagnostics for unterminated #if and missing newline. 30310b57cec5SDimitry Andric 30320b57cec5SDimitry Andric // If we are in a #if directive, emit an error. 30330b57cec5SDimitry Andric while (!ConditionalStack.empty()) { 30340b57cec5SDimitry Andric if (PP->getCodeCompletionFileLoc() != FileLoc) 30350b57cec5SDimitry Andric PP->Diag(ConditionalStack.back().IfLoc, 30360b57cec5SDimitry Andric diag::err_pp_unterminated_conditional); 30370b57cec5SDimitry Andric ConditionalStack.pop_back(); 30380b57cec5SDimitry Andric } 30390b57cec5SDimitry Andric 30400b57cec5SDimitry Andric // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 30410b57cec5SDimitry Andric // a pedwarn. 30420b57cec5SDimitry Andric if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { 30430b57cec5SDimitry Andric DiagnosticsEngine &Diags = PP->getDiagnostics(); 304481ad6265SDimitry Andric SourceLocation EndLoc = getSourceLocation(BufferEnd); 30450b57cec5SDimitry Andric unsigned DiagID; 30460b57cec5SDimitry Andric 30470b57cec5SDimitry Andric if (LangOpts.CPlusPlus11) { 30480b57cec5SDimitry Andric // C++11 [lex.phases] 2.2 p2 30490b57cec5SDimitry Andric // Prefer the C++98 pedantic compatibility warning over the generic, 30500b57cec5SDimitry Andric // non-extension, user-requested "missing newline at EOF" warning. 
30510b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) { 30520b57cec5SDimitry Andric DiagID = diag::warn_cxx98_compat_no_newline_eof; 30530b57cec5SDimitry Andric } else { 30540b57cec5SDimitry Andric DiagID = diag::warn_no_newline_eof; 30550b57cec5SDimitry Andric } 30560b57cec5SDimitry Andric } else { 30570b57cec5SDimitry Andric DiagID = diag::ext_no_newline_eof; 30580b57cec5SDimitry Andric } 30590b57cec5SDimitry Andric 30600b57cec5SDimitry Andric Diag(BufferEnd, DiagID) 30610b57cec5SDimitry Andric << FixItHint::CreateInsertion(EndLoc, "\n"); 30620b57cec5SDimitry Andric } 30630b57cec5SDimitry Andric 30640b57cec5SDimitry Andric BufferPtr = CurPtr; 30650b57cec5SDimitry Andric 30660b57cec5SDimitry Andric // Finally, let the preprocessor handle this. 306781ad6265SDimitry Andric return PP->HandleEndOfFile(Result, isPragmaLexer()); 30680b57cec5SDimitry Andric } 30690b57cec5SDimitry Andric 30700b57cec5SDimitry Andric /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 30710b57cec5SDimitry Andric /// the specified lexer will return a tok::l_paren token, 0 if it is something 30720b57cec5SDimitry Andric /// else and 2 if there are no more tokens in the buffer controlled by the 30730b57cec5SDimitry Andric /// lexer. 30740b57cec5SDimitry Andric unsigned Lexer::isNextPPTokenLParen() { 30750b57cec5SDimitry Andric assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 30760b57cec5SDimitry Andric 307781ad6265SDimitry Andric if (isDependencyDirectivesLexer()) { 307881ad6265SDimitry Andric if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) 307981ad6265SDimitry Andric return 2; 308081ad6265SDimitry Andric return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( 308181ad6265SDimitry Andric tok::l_paren); 308281ad6265SDimitry Andric } 308381ad6265SDimitry Andric 30840b57cec5SDimitry Andric // Switch to 'skipping' mode. 
This will ensure that we can lex a token 30850b57cec5SDimitry Andric // without emitting diagnostics, disables macro expansion, and will cause EOF 30860b57cec5SDimitry Andric // to return an EOF token instead of popping the include stack. 30870b57cec5SDimitry Andric LexingRawMode = true; 30880b57cec5SDimitry Andric 30890b57cec5SDimitry Andric // Save state that can be changed while lexing so that we can restore it. 30900b57cec5SDimitry Andric const char *TmpBufferPtr = BufferPtr; 30910b57cec5SDimitry Andric bool inPPDirectiveMode = ParsingPreprocessorDirective; 30920b57cec5SDimitry Andric bool atStartOfLine = IsAtStartOfLine; 30930b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 30940b57cec5SDimitry Andric bool leadingSpace = HasLeadingSpace; 30950b57cec5SDimitry Andric 30960b57cec5SDimitry Andric Token Tok; 30970b57cec5SDimitry Andric Lex(Tok); 30980b57cec5SDimitry Andric 30990b57cec5SDimitry Andric // Restore state that may have changed. 31000b57cec5SDimitry Andric BufferPtr = TmpBufferPtr; 31010b57cec5SDimitry Andric ParsingPreprocessorDirective = inPPDirectiveMode; 31020b57cec5SDimitry Andric HasLeadingSpace = leadingSpace; 31030b57cec5SDimitry Andric IsAtStartOfLine = atStartOfLine; 31040b57cec5SDimitry Andric IsAtPhysicalStartOfLine = atPhysicalStartOfLine; 31050b57cec5SDimitry Andric 31060b57cec5SDimitry Andric // Restore the lexer back to non-skipping mode. 31070b57cec5SDimitry Andric LexingRawMode = false; 31080b57cec5SDimitry Andric 31090b57cec5SDimitry Andric if (Tok.is(tok::eof)) 31100b57cec5SDimitry Andric return 2; 31110b57cec5SDimitry Andric return Tok.is(tok::l_paren); 31120b57cec5SDimitry Andric } 31130b57cec5SDimitry Andric 31140b57cec5SDimitry Andric /// Find the end of a version control conflict marker. 
31150b57cec5SDimitry Andric static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 31160b57cec5SDimitry Andric ConflictMarkerKind CMK) { 31170b57cec5SDimitry Andric const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 31180b57cec5SDimitry Andric size_t TermLen = CMK == CMK_Perforce ? 5 : 7; 31190b57cec5SDimitry Andric auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen); 31200b57cec5SDimitry Andric size_t Pos = RestOfBuffer.find(Terminator); 31210b57cec5SDimitry Andric while (Pos != StringRef::npos) { 31220b57cec5SDimitry Andric // Must occur at start of line. 31230b57cec5SDimitry Andric if (Pos == 0 || 31240b57cec5SDimitry Andric (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) { 31250b57cec5SDimitry Andric RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 31260b57cec5SDimitry Andric Pos = RestOfBuffer.find(Terminator); 31270b57cec5SDimitry Andric continue; 31280b57cec5SDimitry Andric } 31290b57cec5SDimitry Andric return RestOfBuffer.data()+Pos; 31300b57cec5SDimitry Andric } 31310b57cec5SDimitry Andric return nullptr; 31320b57cec5SDimitry Andric } 31330b57cec5SDimitry Andric 31340b57cec5SDimitry Andric /// IsStartOfConflictMarker - If the specified pointer is the start of a version 31350b57cec5SDimitry Andric /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 31360b57cec5SDimitry Andric /// and recover nicely. This returns true if it is a conflict marker and false 31370b57cec5SDimitry Andric /// if not. 31380b57cec5SDimitry Andric bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 31390b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line. 31400b57cec5SDimitry Andric if (CurPtr != BufferStart && 31410b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 31420b57cec5SDimitry Andric return false; 31430b57cec5SDimitry Andric 31440b57cec5SDimitry Andric // Check to see if we have <<<<<<< or >>>>. 
31450b57cec5SDimitry Andric if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") && 31460b57cec5SDimitry Andric !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> ")) 31470b57cec5SDimitry Andric return false; 31480b57cec5SDimitry Andric 31490b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore 31500b57cec5SDimitry Andric // it. 31510b57cec5SDimitry Andric if (CurrentConflictMarkerState || isLexingRawMode()) 31520b57cec5SDimitry Andric return false; 31530b57cec5SDimitry Andric 31540b57cec5SDimitry Andric ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 31550b57cec5SDimitry Andric 31560b57cec5SDimitry Andric // Check to see if there is an ending marker somewhere in the buffer at the 31570b57cec5SDimitry Andric // start of a line to terminate this conflict marker. 31580b57cec5SDimitry Andric if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 31590b57cec5SDimitry Andric // We found a match. We are really in a conflict marker. 31600b57cec5SDimitry Andric // Diagnose this, and ignore to the end of line. 31610b57cec5SDimitry Andric Diag(CurPtr, diag::err_conflict_marker); 31620b57cec5SDimitry Andric CurrentConflictMarkerState = Kind; 31630b57cec5SDimitry Andric 31640b57cec5SDimitry Andric // Skip ahead to the end of line. We know this exists because the 31650b57cec5SDimitry Andric // end-of-conflict marker starts with \r or \n. 31660b57cec5SDimitry Andric while (*CurPtr != '\r' && *CurPtr != '\n') { 31670b57cec5SDimitry Andric assert(CurPtr != BufferEnd && "Didn't find end of line"); 31680b57cec5SDimitry Andric ++CurPtr; 31690b57cec5SDimitry Andric } 31700b57cec5SDimitry Andric BufferPtr = CurPtr; 31710b57cec5SDimitry Andric return true; 31720b57cec5SDimitry Andric } 31730b57cec5SDimitry Andric 31740b57cec5SDimitry Andric // No end of conflict marker found. 
31750b57cec5SDimitry Andric return false; 31760b57cec5SDimitry Andric } 31770b57cec5SDimitry Andric 31780b57cec5SDimitry Andric /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 31790b57cec5SDimitry Andric /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 31800b57cec5SDimitry Andric /// is the end of a conflict marker. Handle it by ignoring up until the end of 31810b57cec5SDimitry Andric /// the line. This returns true if it is a conflict marker and false if not. 31820b57cec5SDimitry Andric bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 31830b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line. 31840b57cec5SDimitry Andric if (CurPtr != BufferStart && 31850b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 31860b57cec5SDimitry Andric return false; 31870b57cec5SDimitry Andric 31880b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore 31890b57cec5SDimitry Andric // it. 31900b57cec5SDimitry Andric if (!CurrentConflictMarkerState || isLexingRawMode()) 31910b57cec5SDimitry Andric return false; 31920b57cec5SDimitry Andric 31930b57cec5SDimitry Andric // Check to see if we have the marker (4 characters in a row). 31940b57cec5SDimitry Andric for (unsigned i = 1; i != 4; ++i) 31950b57cec5SDimitry Andric if (CurPtr[i] != CurPtr[0]) 31960b57cec5SDimitry Andric return false; 31970b57cec5SDimitry Andric 31980b57cec5SDimitry Andric // If we do have it, search for the end of the conflict marker. This could 31990b57cec5SDimitry Andric // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 32000b57cec5SDimitry Andric // be the end of conflict marker. 
32010b57cec5SDimitry Andric if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 32020b57cec5SDimitry Andric CurrentConflictMarkerState)) { 32030b57cec5SDimitry Andric CurPtr = End; 32040b57cec5SDimitry Andric 32050b57cec5SDimitry Andric // Skip ahead to the end of line. 32060b57cec5SDimitry Andric while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 32070b57cec5SDimitry Andric ++CurPtr; 32080b57cec5SDimitry Andric 32090b57cec5SDimitry Andric BufferPtr = CurPtr; 32100b57cec5SDimitry Andric 32110b57cec5SDimitry Andric // No longer in the conflict marker. 32120b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None; 32130b57cec5SDimitry Andric return true; 32140b57cec5SDimitry Andric } 32150b57cec5SDimitry Andric 32160b57cec5SDimitry Andric return false; 32170b57cec5SDimitry Andric } 32180b57cec5SDimitry Andric 32190b57cec5SDimitry Andric static const char *findPlaceholderEnd(const char *CurPtr, 32200b57cec5SDimitry Andric const char *BufferEnd) { 32210b57cec5SDimitry Andric if (CurPtr == BufferEnd) 32220b57cec5SDimitry Andric return nullptr; 32230b57cec5SDimitry Andric BufferEnd -= 1; // Scan until the second last character. 
32240b57cec5SDimitry Andric for (; CurPtr != BufferEnd; ++CurPtr) { 32250b57cec5SDimitry Andric if (CurPtr[0] == '#' && CurPtr[1] == '>') 32260b57cec5SDimitry Andric return CurPtr + 2; 32270b57cec5SDimitry Andric } 32280b57cec5SDimitry Andric return nullptr; 32290b57cec5SDimitry Andric } 32300b57cec5SDimitry Andric 32310b57cec5SDimitry Andric bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) { 32320b57cec5SDimitry Andric assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!"); 32330b57cec5SDimitry Andric if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode) 32340b57cec5SDimitry Andric return false; 32350b57cec5SDimitry Andric const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd); 32360b57cec5SDimitry Andric if (!End) 32370b57cec5SDimitry Andric return false; 32380b57cec5SDimitry Andric const char *Start = CurPtr - 1; 32390b57cec5SDimitry Andric if (!LangOpts.AllowEditorPlaceholders) 32400b57cec5SDimitry Andric Diag(Start, diag::err_placeholder_in_source); 32410b57cec5SDimitry Andric Result.startToken(); 32420b57cec5SDimitry Andric FormTokenWithChars(Result, End, tok::raw_identifier); 32430b57cec5SDimitry Andric Result.setRawIdentifierData(Start); 32440b57cec5SDimitry Andric PP->LookUpIdentifierInfo(Result); 32450b57cec5SDimitry Andric Result.setFlag(Token::IsEditorPlaceholder); 32460b57cec5SDimitry Andric BufferPtr = End; 32470b57cec5SDimitry Andric return true; 32480b57cec5SDimitry Andric } 32490b57cec5SDimitry Andric 32500b57cec5SDimitry Andric bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 32510b57cec5SDimitry Andric if (PP && PP->isCodeCompletionEnabled()) { 32520b57cec5SDimitry Andric SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 32530b57cec5SDimitry Andric return Loc == PP->getCodeCompletionLoc(); 32540b57cec5SDimitry Andric } 32550b57cec5SDimitry Andric 32560b57cec5SDimitry Andric return false; 32570b57cec5SDimitry Andric } 32580b57cec5SDimitry Andric 
3259*bdd1243dSDimitry Andric std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr, 326081ad6265SDimitry Andric const char *SlashLoc, 32610b57cec5SDimitry Andric Token *Result) { 32620b57cec5SDimitry Andric unsigned CharSize; 32630b57cec5SDimitry Andric char Kind = getCharAndSize(StartPtr, CharSize); 326481ad6265SDimitry Andric assert((Kind == 'u' || Kind == 'U') && "expected a UCN"); 32650b57cec5SDimitry Andric 32660b57cec5SDimitry Andric unsigned NumHexDigits; 32670b57cec5SDimitry Andric if (Kind == 'u') 32680b57cec5SDimitry Andric NumHexDigits = 4; 32690b57cec5SDimitry Andric else if (Kind == 'U') 32700b57cec5SDimitry Andric NumHexDigits = 8; 327181ad6265SDimitry Andric 327281ad6265SDimitry Andric bool Delimited = false; 327381ad6265SDimitry Andric bool FoundEndDelimiter = false; 327481ad6265SDimitry Andric unsigned Count = 0; 327581ad6265SDimitry Andric bool Diagnose = Result && !isLexingRawMode(); 32760b57cec5SDimitry Andric 32770b57cec5SDimitry Andric if (!LangOpts.CPlusPlus && !LangOpts.C99) { 3278349cc55cSDimitry Andric if (Diagnose) 32790b57cec5SDimitry Andric Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); 3280*bdd1243dSDimitry Andric return std::nullopt; 32810b57cec5SDimitry Andric } 32820b57cec5SDimitry Andric 32830b57cec5SDimitry Andric const char *CurPtr = StartPtr + CharSize; 32840b57cec5SDimitry Andric const char *KindLoc = &CurPtr[-1]; 32850b57cec5SDimitry Andric 32860b57cec5SDimitry Andric uint32_t CodePoint = 0; 3287349cc55cSDimitry Andric while (Count != NumHexDigits || Delimited) { 32880b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, CharSize); 3289*bdd1243dSDimitry Andric if (!Delimited && Count == 0 && C == '{') { 3290349cc55cSDimitry Andric Delimited = true; 3291349cc55cSDimitry Andric CurPtr += CharSize; 3292349cc55cSDimitry Andric continue; 3293349cc55cSDimitry Andric } 3294349cc55cSDimitry Andric 3295349cc55cSDimitry Andric if (Delimited && C == '}') { 3296349cc55cSDimitry Andric CurPtr += CharSize; 
3297349cc55cSDimitry Andric FoundEndDelimiter = true; 3298349cc55cSDimitry Andric break; 3299349cc55cSDimitry Andric } 33000b57cec5SDimitry Andric 33010b57cec5SDimitry Andric unsigned Value = llvm::hexDigitValue(C); 33020b57cec5SDimitry Andric if (Value == -1U) { 3303349cc55cSDimitry Andric if (!Delimited) 3304349cc55cSDimitry Andric break; 3305349cc55cSDimitry Andric if (Diagnose) 3306*bdd1243dSDimitry Andric Diag(SlashLoc, diag::warn_delimited_ucn_incomplete) 330781ad6265SDimitry Andric << StringRef(KindLoc, 1); 3308*bdd1243dSDimitry Andric return std::nullopt; 3309349cc55cSDimitry Andric } 33100b57cec5SDimitry Andric 3311349cc55cSDimitry Andric if (CodePoint & 0xF000'0000) { 3312349cc55cSDimitry Andric if (Diagnose) 3313349cc55cSDimitry Andric Diag(KindLoc, diag::err_escape_too_large) << 0; 3314*bdd1243dSDimitry Andric return std::nullopt; 3315349cc55cSDimitry Andric } 3316349cc55cSDimitry Andric 3317349cc55cSDimitry Andric CodePoint <<= 4; 3318349cc55cSDimitry Andric CodePoint |= Value; 3319349cc55cSDimitry Andric CurPtr += CharSize; 3320349cc55cSDimitry Andric Count++; 3321349cc55cSDimitry Andric } 3322349cc55cSDimitry Andric 3323349cc55cSDimitry Andric if (Count == 0) { 3324349cc55cSDimitry Andric if (Diagnose) 3325*bdd1243dSDimitry Andric Diag(SlashLoc, FoundEndDelimiter ? 
diag::warn_delimited_ucn_empty 3326349cc55cSDimitry Andric : diag::warn_ucn_escape_no_digits) 3327349cc55cSDimitry Andric << StringRef(KindLoc, 1); 3328*bdd1243dSDimitry Andric return std::nullopt; 332981ad6265SDimitry Andric } 333081ad6265SDimitry Andric 333181ad6265SDimitry Andric if (Delimited && Kind == 'U') { 333281ad6265SDimitry Andric if (Diagnose) 3333*bdd1243dSDimitry Andric Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1); 3334*bdd1243dSDimitry Andric return std::nullopt; 3335349cc55cSDimitry Andric } 3336349cc55cSDimitry Andric 3337349cc55cSDimitry Andric if (!Delimited && Count != NumHexDigits) { 3338349cc55cSDimitry Andric if (Diagnose) { 3339*bdd1243dSDimitry Andric Diag(SlashLoc, diag::warn_ucn_escape_incomplete); 33400b57cec5SDimitry Andric // If the user wrote \U1234, suggest a fixit to \u. 3341349cc55cSDimitry Andric if (Count == 4 && NumHexDigits == 8) { 33420b57cec5SDimitry Andric CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); 33430b57cec5SDimitry Andric Diag(KindLoc, diag::note_ucn_four_not_eight) 33440b57cec5SDimitry Andric << FixItHint::CreateReplacement(URange, "u"); 33450b57cec5SDimitry Andric } 33460b57cec5SDimitry Andric } 3347*bdd1243dSDimitry Andric return std::nullopt; 33480b57cec5SDimitry Andric } 33490b57cec5SDimitry Andric 3350349cc55cSDimitry Andric if (Delimited && PP) { 3351*bdd1243dSDimitry Andric Diag(SlashLoc, PP->getLangOpts().CPlusPlus2b 3352753f127fSDimitry Andric ? diag::warn_cxx2b_delimited_escape_sequence 3353753f127fSDimitry Andric : diag::ext_delimited_escape_sequence) 3354753f127fSDimitry Andric << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 
1 : 0); 33550b57cec5SDimitry Andric } 33560b57cec5SDimitry Andric 33570b57cec5SDimitry Andric if (Result) { 33580b57cec5SDimitry Andric Result->setFlag(Token::HasUCN); 3359*bdd1243dSDimitry Andric // If the UCN contains either a trigraph or a line splicing, 3360*bdd1243dSDimitry Andric // we need to call getAndAdvanceChar again to set the appropriate flags 3361*bdd1243dSDimitry Andric // on Result. 3362*bdd1243dSDimitry Andric if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0))) 33630b57cec5SDimitry Andric StartPtr = CurPtr; 33640b57cec5SDimitry Andric else 33650b57cec5SDimitry Andric while (StartPtr != CurPtr) 33660b57cec5SDimitry Andric (void)getAndAdvanceChar(StartPtr, *Result); 33670b57cec5SDimitry Andric } else { 33680b57cec5SDimitry Andric StartPtr = CurPtr; 33690b57cec5SDimitry Andric } 337081ad6265SDimitry Andric return CodePoint; 337181ad6265SDimitry Andric } 337281ad6265SDimitry Andric 3373*bdd1243dSDimitry Andric std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr, 3374*bdd1243dSDimitry Andric const char *SlashLoc, 337581ad6265SDimitry Andric Token *Result) { 337681ad6265SDimitry Andric unsigned CharSize; 337781ad6265SDimitry Andric bool Diagnose = Result && !isLexingRawMode(); 337881ad6265SDimitry Andric 337981ad6265SDimitry Andric char C = getCharAndSize(StartPtr, CharSize); 338081ad6265SDimitry Andric assert(C == 'N' && "expected \\N{...}"); 338181ad6265SDimitry Andric 338281ad6265SDimitry Andric const char *CurPtr = StartPtr + CharSize; 338381ad6265SDimitry Andric const char *KindLoc = &CurPtr[-1]; 338481ad6265SDimitry Andric 338581ad6265SDimitry Andric C = getCharAndSize(CurPtr, CharSize); 338681ad6265SDimitry Andric if (C != '{') { 338781ad6265SDimitry Andric if (Diagnose) 3388*bdd1243dSDimitry Andric Diag(SlashLoc, diag::warn_ucn_escape_incomplete); 3389*bdd1243dSDimitry Andric return std::nullopt; 339081ad6265SDimitry Andric } 339181ad6265SDimitry Andric CurPtr += CharSize; 339281ad6265SDimitry Andric const char 
*StartName = CurPtr; 339381ad6265SDimitry Andric bool FoundEndDelimiter = false; 339481ad6265SDimitry Andric llvm::SmallVector<char, 30> Buffer; 339581ad6265SDimitry Andric while (C) { 339681ad6265SDimitry Andric C = getCharAndSize(CurPtr, CharSize); 339781ad6265SDimitry Andric CurPtr += CharSize; 339881ad6265SDimitry Andric if (C == '}') { 339981ad6265SDimitry Andric FoundEndDelimiter = true; 340081ad6265SDimitry Andric break; 340181ad6265SDimitry Andric } 340281ad6265SDimitry Andric 3403*bdd1243dSDimitry Andric if (isVerticalWhitespace(C)) 340481ad6265SDimitry Andric break; 340581ad6265SDimitry Andric Buffer.push_back(C); 340681ad6265SDimitry Andric } 340781ad6265SDimitry Andric 340881ad6265SDimitry Andric if (!FoundEndDelimiter || Buffer.empty()) { 340981ad6265SDimitry Andric if (Diagnose) 3410*bdd1243dSDimitry Andric Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty 341181ad6265SDimitry Andric : diag::warn_delimited_ucn_incomplete) 341281ad6265SDimitry Andric << StringRef(KindLoc, 1); 3413*bdd1243dSDimitry Andric return std::nullopt; 341481ad6265SDimitry Andric } 341581ad6265SDimitry Andric 341681ad6265SDimitry Andric StringRef Name(Buffer.data(), Buffer.size()); 3417*bdd1243dSDimitry Andric std::optional<char32_t> Match = 341881ad6265SDimitry Andric llvm::sys::unicode::nameToCodepointStrict(Name); 3419*bdd1243dSDimitry Andric std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch; 3420*bdd1243dSDimitry Andric if (!Match) { 342181ad6265SDimitry Andric LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name); 3422*bdd1243dSDimitry Andric if (Diagnose) { 3423*bdd1243dSDimitry Andric Diag(StartName, diag::err_invalid_ucn_name) 3424*bdd1243dSDimitry Andric << StringRef(Buffer.data(), Buffer.size()) 3425*bdd1243dSDimitry Andric << makeCharRange(*this, StartName, CurPtr - CharSize); 342681ad6265SDimitry Andric if (LooseMatch) { 342781ad6265SDimitry Andric Diag(StartName, diag::note_invalid_ucn_name_loose_matching) 
342881ad6265SDimitry Andric << FixItHint::CreateReplacement( 342981ad6265SDimitry Andric makeCharRange(*this, StartName, CurPtr - CharSize), 343081ad6265SDimitry Andric LooseMatch->Name); 343181ad6265SDimitry Andric } 343281ad6265SDimitry Andric } 3433*bdd1243dSDimitry Andric // We do not offer misspelled character names suggestions here 343481ad6265SDimitry Andric // as the set of what would be a valid suggestion depends on context, 343581ad6265SDimitry Andric // and we should not make invalid suggestions. 343681ad6265SDimitry Andric } 343781ad6265SDimitry Andric 3438*bdd1243dSDimitry Andric if (Diagnose && Match) 3439*bdd1243dSDimitry Andric Diag(SlashLoc, PP->getLangOpts().CPlusPlus2b 3440753f127fSDimitry Andric ? diag::warn_cxx2b_delimited_escape_sequence 3441753f127fSDimitry Andric : diag::ext_delimited_escape_sequence) 3442753f127fSDimitry Andric << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0); 344381ad6265SDimitry Andric 3444*bdd1243dSDimitry Andric // If no diagnostic has been emitted yet, likely because we are doing a 3445*bdd1243dSDimitry Andric // tentative lexing, we do not want to recover here to make sure the token 3446*bdd1243dSDimitry Andric // will not be incorrectly considered valid. This function will be called 3447*bdd1243dSDimitry Andric // again and a diagnostic emitted then. 3448*bdd1243dSDimitry Andric if (LooseMatch && Diagnose) 3449*bdd1243dSDimitry Andric Match = LooseMatch->CodePoint; 345081ad6265SDimitry Andric 345181ad6265SDimitry Andric if (Result) { 345281ad6265SDimitry Andric Result->setFlag(Token::HasUCN); 3453*bdd1243dSDimitry Andric // If the UCN contains either a trigraph or a line splicing, 3454*bdd1243dSDimitry Andric // we need to call getAndAdvanceChar again to set the appropriate flags 3455*bdd1243dSDimitry Andric // on Result. 
3456*bdd1243dSDimitry Andric if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3)) 345781ad6265SDimitry Andric StartPtr = CurPtr; 345881ad6265SDimitry Andric else 345981ad6265SDimitry Andric while (StartPtr != CurPtr) 346081ad6265SDimitry Andric (void)getAndAdvanceChar(StartPtr, *Result); 346181ad6265SDimitry Andric } else { 346281ad6265SDimitry Andric StartPtr = CurPtr; 346381ad6265SDimitry Andric } 3464*bdd1243dSDimitry Andric return Match ? std::optional<uint32_t>(*Match) : std::nullopt; 346581ad6265SDimitry Andric } 346681ad6265SDimitry Andric 346781ad6265SDimitry Andric uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, 346881ad6265SDimitry Andric Token *Result) { 346981ad6265SDimitry Andric 347081ad6265SDimitry Andric unsigned CharSize; 3471*bdd1243dSDimitry Andric std::optional<uint32_t> CodePointOpt; 347281ad6265SDimitry Andric char Kind = getCharAndSize(StartPtr, CharSize); 347381ad6265SDimitry Andric if (Kind == 'u' || Kind == 'U') 347481ad6265SDimitry Andric CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result); 347581ad6265SDimitry Andric else if (Kind == 'N') 3476*bdd1243dSDimitry Andric CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result); 347781ad6265SDimitry Andric 347881ad6265SDimitry Andric if (!CodePointOpt) 347981ad6265SDimitry Andric return 0; 348081ad6265SDimitry Andric 348181ad6265SDimitry Andric uint32_t CodePoint = *CodePointOpt; 34820b57cec5SDimitry Andric 34830b57cec5SDimitry Andric // Don't apply C family restrictions to UCNs in assembly mode 34840b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) 34850b57cec5SDimitry Andric return CodePoint; 34860b57cec5SDimitry Andric 34870b57cec5SDimitry Andric // C99 6.4.3p2: A universal character name shall not specify a character whose 34880b57cec5SDimitry Andric // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or 34890b57cec5SDimitry Andric // 0060 (`), nor one in the range D800 through DFFF inclusive.) 
34900b57cec5SDimitry Andric // C++11 [lex.charset]p2: If the hexadecimal value for a 34910b57cec5SDimitry Andric // universal-character-name corresponds to a surrogate code point (in the 34920b57cec5SDimitry Andric // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, 34930b57cec5SDimitry Andric // if the hexadecimal value for a universal-character-name outside the 34940b57cec5SDimitry Andric // c-char-sequence, s-char-sequence, or r-char-sequence of a character or 34950b57cec5SDimitry Andric // string literal corresponds to a control character (in either of the 34960b57cec5SDimitry Andric // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the 34970b57cec5SDimitry Andric // basic source character set, the program is ill-formed. 34980b57cec5SDimitry Andric if (CodePoint < 0xA0) { 34990b57cec5SDimitry Andric if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60) 35000b57cec5SDimitry Andric return CodePoint; 35010b57cec5SDimitry Andric 35020b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to warn about bad 35030b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block. 35040b57cec5SDimitry Andric if (Result && PP) { 35050b57cec5SDimitry Andric if (CodePoint < 0x20 || CodePoint >= 0x7F) 35060b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_control_character); 35070b57cec5SDimitry Andric else { 35080b57cec5SDimitry Andric char C = static_cast<char>(CodePoint); 35090b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 35100b57cec5SDimitry Andric } 35110b57cec5SDimitry Andric } 35120b57cec5SDimitry Andric 35130b57cec5SDimitry Andric return 0; 35140b57cec5SDimitry Andric } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { 35150b57cec5SDimitry Andric // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 
35160b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to diagnose bad 35170b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block. 35180b57cec5SDimitry Andric if (Result && PP) { 35190b57cec5SDimitry Andric if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11) 35200b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_ucn_escape_surrogate); 35210b57cec5SDimitry Andric else 35220b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_invalid); 35230b57cec5SDimitry Andric } 35240b57cec5SDimitry Andric return 0; 35250b57cec5SDimitry Andric } 35260b57cec5SDimitry Andric 35270b57cec5SDimitry Andric return CodePoint; 35280b57cec5SDimitry Andric } 35290b57cec5SDimitry Andric 35300b57cec5SDimitry Andric bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, 35310b57cec5SDimitry Andric const char *CurPtr) { 35320b57cec5SDimitry Andric if (!isLexingRawMode() && !PP->isPreprocessedOutput() && 3533349cc55cSDimitry Andric isUnicodeWhitespace(C)) { 35340b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unicode_whitespace) 35350b57cec5SDimitry Andric << makeCharRange(*this, BufferPtr, CurPtr); 35360b57cec5SDimitry Andric 35370b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 35380b57cec5SDimitry Andric return true; 35390b57cec5SDimitry Andric } 35400b57cec5SDimitry Andric return false; 35410b57cec5SDimitry Andric } 35420b57cec5SDimitry Andric 35430b57cec5SDimitry Andric void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { 35440b57cec5SDimitry Andric IsAtStartOfLine = Result.isAtStartOfLine(); 35450b57cec5SDimitry Andric HasLeadingSpace = Result.hasLeadingSpace(); 35460b57cec5SDimitry Andric HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); 35470b57cec5SDimitry Andric // Note that this doesn't affect IsAtPhysicalStartOfLine. 
35480b57cec5SDimitry Andric } 35490b57cec5SDimitry Andric 35500b57cec5SDimitry Andric bool Lexer::Lex(Token &Result) { 355181ad6265SDimitry Andric assert(!isDependencyDirectivesLexer()); 355281ad6265SDimitry Andric 35530b57cec5SDimitry Andric // Start a new token. 35540b57cec5SDimitry Andric Result.startToken(); 35550b57cec5SDimitry Andric 35560b57cec5SDimitry Andric // Set up misc whitespace flags for LexTokenInternal. 35570b57cec5SDimitry Andric if (IsAtStartOfLine) { 35580b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 35590b57cec5SDimitry Andric IsAtStartOfLine = false; 35600b57cec5SDimitry Andric } 35610b57cec5SDimitry Andric 35620b57cec5SDimitry Andric if (HasLeadingSpace) { 35630b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 35640b57cec5SDimitry Andric HasLeadingSpace = false; 35650b57cec5SDimitry Andric } 35660b57cec5SDimitry Andric 35670b57cec5SDimitry Andric if (HasLeadingEmptyMacro) { 35680b57cec5SDimitry Andric Result.setFlag(Token::LeadingEmptyMacro); 35690b57cec5SDimitry Andric HasLeadingEmptyMacro = false; 35700b57cec5SDimitry Andric } 35710b57cec5SDimitry Andric 35720b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 35730b57cec5SDimitry Andric IsAtPhysicalStartOfLine = false; 35740b57cec5SDimitry Andric bool isRawLex = isLexingRawMode(); 35750b57cec5SDimitry Andric (void) isRawLex; 35760b57cec5SDimitry Andric bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine); 35770b57cec5SDimitry Andric // (After the LexTokenInternal call, the lexer might be destroyed.) 35780b57cec5SDimitry Andric assert((returnedToken || !isRawLex) && "Raw lex must succeed"); 35790b57cec5SDimitry Andric return returnedToken; 35800b57cec5SDimitry Andric } 35810b57cec5SDimitry Andric 35820b57cec5SDimitry Andric /// LexTokenInternal - This implements a simple C family lexer. It is an 35830b57cec5SDimitry Andric /// extremely performance critical piece of code. 
This assumes that the buffer 35840b57cec5SDimitry Andric /// has a null character at the end of the file. This returns a preprocessing 35850b57cec5SDimitry Andric /// token, not a normal token, as such, it is an internal interface. It assumes 35860b57cec5SDimitry Andric /// that the Flags of result have been cleared before calling this. 35870b57cec5SDimitry Andric bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { 3588*bdd1243dSDimitry Andric LexStart: 3589*bdd1243dSDimitry Andric assert(!Result.needsCleaning() && "Result needs cleaning"); 3590*bdd1243dSDimitry Andric assert(!Result.hasPtrData() && "Result has not been reset"); 35910b57cec5SDimitry Andric 35920b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable. 35930b57cec5SDimitry Andric const char *CurPtr = BufferPtr; 35940b57cec5SDimitry Andric 35950b57cec5SDimitry Andric // Small amounts of horizontal whitespace is very common between tokens. 3596fe6060f1SDimitry Andric if (isHorizontalWhitespace(*CurPtr)) { 3597fe6060f1SDimitry Andric do { 35980b57cec5SDimitry Andric ++CurPtr; 3599fe6060f1SDimitry Andric } while (isHorizontalWhitespace(*CurPtr)); 36000b57cec5SDimitry Andric 36010b57cec5SDimitry Andric // If we are keeping whitespace and other tokens, just return what we just 36020b57cec5SDimitry Andric // skipped. The next lexer invocation will return the token after the 36030b57cec5SDimitry Andric // whitespace. 36040b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 36050b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 36060b57cec5SDimitry Andric // FIXME: The next token will not have LeadingSpace set. 
36070b57cec5SDimitry Andric return true; 36080b57cec5SDimitry Andric } 36090b57cec5SDimitry Andric 36100b57cec5SDimitry Andric BufferPtr = CurPtr; 36110b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 36120b57cec5SDimitry Andric } 36130b57cec5SDimitry Andric 36140b57cec5SDimitry Andric unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 36150b57cec5SDimitry Andric 36160b57cec5SDimitry Andric // Read a character, advancing over it. 36170b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Result); 36180b57cec5SDimitry Andric tok::TokenKind Kind; 36190b57cec5SDimitry Andric 3620e8d8bef9SDimitry Andric if (!isVerticalWhitespace(Char)) 3621e8d8bef9SDimitry Andric NewLinePtr = nullptr; 3622e8d8bef9SDimitry Andric 36230b57cec5SDimitry Andric switch (Char) { 36240b57cec5SDimitry Andric case 0: // Null. 36250b57cec5SDimitry Andric // Found end of file? 36260b57cec5SDimitry Andric if (CurPtr-1 == BufferEnd) 36270b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1); 36280b57cec5SDimitry Andric 36290b57cec5SDimitry Andric // Check if we are performing code completion. 36300b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 36310b57cec5SDimitry Andric // Return the code-completion token. 36320b57cec5SDimitry Andric Result.startToken(); 36330b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::code_completion); 36340b57cec5SDimitry Andric return true; 36350b57cec5SDimitry Andric } 36360b57cec5SDimitry Andric 36370b57cec5SDimitry Andric if (!isLexingRawMode()) 36380b57cec5SDimitry Andric Diag(CurPtr-1, diag::null_in_file); 36390b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 36400b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 36410b57cec5SDimitry Andric return true; // KeepWhitespaceMode 36420b57cec5SDimitry Andric 36430b57cec5SDimitry Andric // We know the lexer hasn't changed, so just try again with this lexer. 
36440b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 36450b57cec5SDimitry Andric goto LexNextToken; 36460b57cec5SDimitry Andric 36470b57cec5SDimitry Andric case 26: // DOS & CP/M EOF: "^Z". 36480b57cec5SDimitry Andric // If we're in Microsoft extensions mode, treat this as end of file. 36490b57cec5SDimitry Andric if (LangOpts.MicrosoftExt) { 36500b57cec5SDimitry Andric if (!isLexingRawMode()) 36510b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft); 36520b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1); 36530b57cec5SDimitry Andric } 36540b57cec5SDimitry Andric 36550b57cec5SDimitry Andric // If Microsoft extensions are disabled, this is just random garbage. 36560b57cec5SDimitry Andric Kind = tok::unknown; 36570b57cec5SDimitry Andric break; 36580b57cec5SDimitry Andric 36590b57cec5SDimitry Andric case '\r': 36600b57cec5SDimitry Andric if (CurPtr[0] == '\n') 36610b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result); 3662*bdd1243dSDimitry Andric [[fallthrough]]; 36630b57cec5SDimitry Andric case '\n': 36640b57cec5SDimitry Andric // If we are inside a preprocessor directive and we see the end of line, 36650b57cec5SDimitry Andric // we know we are done with the directive, so return an EOD token. 36660b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 36670b57cec5SDimitry Andric // Done parsing the "line". 36680b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 36690b57cec5SDimitry Andric 36700b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive. 36710b57cec5SDimitry Andric if (PP) 36720b57cec5SDimitry Andric resetExtendedTokenMode(); 36730b57cec5SDimitry Andric 36740b57cec5SDimitry Andric // Since we consumed a newline, we are back at the start of a line. 
36750b57cec5SDimitry Andric IsAtStartOfLine = true; 36760b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 3677e8d8bef9SDimitry Andric NewLinePtr = CurPtr - 1; 36780b57cec5SDimitry Andric 36790b57cec5SDimitry Andric Kind = tok::eod; 36800b57cec5SDimitry Andric break; 36810b57cec5SDimitry Andric } 36820b57cec5SDimitry Andric 36830b57cec5SDimitry Andric // No leading whitespace seen so far. 36840b57cec5SDimitry Andric Result.clearFlag(Token::LeadingSpace); 36850b57cec5SDimitry Andric 36860b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 36870b57cec5SDimitry Andric return true; // KeepWhitespaceMode 36880b57cec5SDimitry Andric 36890b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 36900b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 36910b57cec5SDimitry Andric goto LexNextToken; 36920b57cec5SDimitry Andric case ' ': 36930b57cec5SDimitry Andric case '\t': 36940b57cec5SDimitry Andric case '\f': 36950b57cec5SDimitry Andric case '\v': 36960b57cec5SDimitry Andric SkipHorizontalWhitespace: 36970b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 36980b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 36990b57cec5SDimitry Andric return true; // KeepWhitespaceMode 37000b57cec5SDimitry Andric 37010b57cec5SDimitry Andric SkipIgnoredUnits: 37020b57cec5SDimitry Andric CurPtr = BufferPtr; 37030b57cec5SDimitry Andric 37040b57cec5SDimitry Andric // If the next token is obviously a // or /* */ comment, skip it efficiently 37050b57cec5SDimitry Andric // too (without going through the big switch stmt). 
37060b57cec5SDimitry Andric if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 370781ad6265SDimitry Andric LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) { 37080b57cec5SDimitry Andric if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 37090b57cec5SDimitry Andric return true; // There is a token to return. 37100b57cec5SDimitry Andric goto SkipIgnoredUnits; 37110b57cec5SDimitry Andric } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 37120b57cec5SDimitry Andric if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 37130b57cec5SDimitry Andric return true; // There is a token to return. 37140b57cec5SDimitry Andric goto SkipIgnoredUnits; 37150b57cec5SDimitry Andric } else if (isHorizontalWhitespace(*CurPtr)) { 37160b57cec5SDimitry Andric goto SkipHorizontalWhitespace; 37170b57cec5SDimitry Andric } 37180b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 37190b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 37200b57cec5SDimitry Andric goto LexNextToken; 37210b57cec5SDimitry Andric 37220b57cec5SDimitry Andric // C99 6.4.4.1: Integer Constants. 37230b57cec5SDimitry Andric // C99 6.4.4.2: Floating Constants. 37240b57cec5SDimitry Andric case '0': case '1': case '2': case '3': case '4': 37250b57cec5SDimitry Andric case '5': case '6': case '7': case '8': case '9': 37260b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 37270b57cec5SDimitry Andric MIOpt.ReadToken(); 37280b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 37290b57cec5SDimitry Andric 373081ad6265SDimitry Andric // Identifier (e.g., uber), or 373181ad6265SDimitry Andric // UTF-8 (C2x/C++17) or UTF-16 (C11/C++11) character literal, or 373281ad6265SDimitry Andric // UTF-8 or UTF-16 string literal (C11/C++11). 
373381ad6265SDimitry Andric case 'u': 37340b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 37350b57cec5SDimitry Andric MIOpt.ReadToken(); 37360b57cec5SDimitry Andric 37370b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) { 37380b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 37390b57cec5SDimitry Andric 37400b57cec5SDimitry Andric // UTF-16 string literal 37410b57cec5SDimitry Andric if (Char == '"') 37420b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 37430b57cec5SDimitry Andric tok::utf16_string_literal); 37440b57cec5SDimitry Andric 37450b57cec5SDimitry Andric // UTF-16 character constant 37460b57cec5SDimitry Andric if (Char == '\'') 37470b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 37480b57cec5SDimitry Andric tok::utf16_char_constant); 37490b57cec5SDimitry Andric 37500b57cec5SDimitry Andric // UTF-16 raw string literal 37510b57cec5SDimitry Andric if (Char == 'R' && LangOpts.CPlusPlus11 && 37520b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 37530b57cec5SDimitry Andric return LexRawStringLiteral(Result, 37540b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 37550b57cec5SDimitry Andric SizeTmp2, Result), 37560b57cec5SDimitry Andric tok::utf16_string_literal); 37570b57cec5SDimitry Andric 37580b57cec5SDimitry Andric if (Char == '8') { 37590b57cec5SDimitry Andric char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 37600b57cec5SDimitry Andric 37610b57cec5SDimitry Andric // UTF-8 string literal 37620b57cec5SDimitry Andric if (Char2 == '"') 37630b57cec5SDimitry Andric return LexStringLiteral(Result, 37640b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 37650b57cec5SDimitry Andric SizeTmp2, Result), 37660b57cec5SDimitry Andric tok::utf8_string_literal); 376781ad6265SDimitry Andric if (Char2 == '\'' && (LangOpts.CPlusPlus17 || 
LangOpts.C2x)) 37680b57cec5SDimitry Andric return LexCharConstant( 37690b57cec5SDimitry Andric Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 37700b57cec5SDimitry Andric SizeTmp2, Result), 37710b57cec5SDimitry Andric tok::utf8_char_constant); 37720b57cec5SDimitry Andric 37730b57cec5SDimitry Andric if (Char2 == 'R' && LangOpts.CPlusPlus11) { 37740b57cec5SDimitry Andric unsigned SizeTmp3; 37750b57cec5SDimitry Andric char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 37760b57cec5SDimitry Andric // UTF-8 raw string literal 37770b57cec5SDimitry Andric if (Char3 == '"') { 37780b57cec5SDimitry Andric return LexRawStringLiteral(Result, 37790b57cec5SDimitry Andric ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 37800b57cec5SDimitry Andric SizeTmp2, Result), 37810b57cec5SDimitry Andric SizeTmp3, Result), 37820b57cec5SDimitry Andric tok::utf8_string_literal); 37830b57cec5SDimitry Andric } 37840b57cec5SDimitry Andric } 37850b57cec5SDimitry Andric } 37860b57cec5SDimitry Andric } 37870b57cec5SDimitry Andric 37880b57cec5SDimitry Andric // treat u like the start of an identifier. 3789349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 37900b57cec5SDimitry Andric 379181ad6265SDimitry Andric case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal 37920b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 
37930b57cec5SDimitry Andric MIOpt.ReadToken(); 37940b57cec5SDimitry Andric 37950b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) { 37960b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 37970b57cec5SDimitry Andric 37980b57cec5SDimitry Andric // UTF-32 string literal 37990b57cec5SDimitry Andric if (Char == '"') 38000b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 38010b57cec5SDimitry Andric tok::utf32_string_literal); 38020b57cec5SDimitry Andric 38030b57cec5SDimitry Andric // UTF-32 character constant 38040b57cec5SDimitry Andric if (Char == '\'') 38050b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 38060b57cec5SDimitry Andric tok::utf32_char_constant); 38070b57cec5SDimitry Andric 38080b57cec5SDimitry Andric // UTF-32 raw string literal 38090b57cec5SDimitry Andric if (Char == 'R' && LangOpts.CPlusPlus11 && 38100b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 38110b57cec5SDimitry Andric return LexRawStringLiteral(Result, 38120b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 38130b57cec5SDimitry Andric SizeTmp2, Result), 38140b57cec5SDimitry Andric tok::utf32_string_literal); 38150b57cec5SDimitry Andric } 38160b57cec5SDimitry Andric 38170b57cec5SDimitry Andric // treat U like the start of an identifier. 3818349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 38190b57cec5SDimitry Andric 38200b57cec5SDimitry Andric case 'R': // Identifier or C++0x raw string literal 38210b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 
38220b57cec5SDimitry Andric MIOpt.ReadToken(); 38230b57cec5SDimitry Andric 38240b57cec5SDimitry Andric if (LangOpts.CPlusPlus11) { 38250b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 38260b57cec5SDimitry Andric 38270b57cec5SDimitry Andric if (Char == '"') 38280b57cec5SDimitry Andric return LexRawStringLiteral(Result, 38290b57cec5SDimitry Andric ConsumeChar(CurPtr, SizeTmp, Result), 38300b57cec5SDimitry Andric tok::string_literal); 38310b57cec5SDimitry Andric } 38320b57cec5SDimitry Andric 38330b57cec5SDimitry Andric // treat R like the start of an identifier. 3834349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 38350b57cec5SDimitry Andric 38360b57cec5SDimitry Andric case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 38370b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 38380b57cec5SDimitry Andric MIOpt.ReadToken(); 38390b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 38400b57cec5SDimitry Andric 38410b57cec5SDimitry Andric // Wide string literal. 38420b57cec5SDimitry Andric if (Char == '"') 38430b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 38440b57cec5SDimitry Andric tok::wide_string_literal); 38450b57cec5SDimitry Andric 38460b57cec5SDimitry Andric // Wide raw string literal. 38470b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 && Char == 'R' && 38480b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 38490b57cec5SDimitry Andric return LexRawStringLiteral(Result, 38500b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 38510b57cec5SDimitry Andric SizeTmp2, Result), 38520b57cec5SDimitry Andric tok::wide_string_literal); 38530b57cec5SDimitry Andric 38540b57cec5SDimitry Andric // Wide character constant. 
38550b57cec5SDimitry Andric if (Char == '\'') 38560b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 38570b57cec5SDimitry Andric tok::wide_char_constant); 38580b57cec5SDimitry Andric // FALL THROUGH, treating L like the start of an identifier. 3859*bdd1243dSDimitry Andric [[fallthrough]]; 38600b57cec5SDimitry Andric 38610b57cec5SDimitry Andric // C99 6.4.2: Identifiers. 38620b57cec5SDimitry Andric case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 38630b57cec5SDimitry Andric case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 38640b57cec5SDimitry Andric case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 38650b57cec5SDimitry Andric case 'V': case 'W': case 'X': case 'Y': case 'Z': 38660b57cec5SDimitry Andric case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 38670b57cec5SDimitry Andric case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 38680b57cec5SDimitry Andric case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 38690b57cec5SDimitry Andric case 'v': case 'w': case 'x': case 'y': case 'z': 38700b57cec5SDimitry Andric case '_': 38710b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 38720b57cec5SDimitry Andric MIOpt.ReadToken(); 3873349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 38740b57cec5SDimitry Andric 38750b57cec5SDimitry Andric case '$': // $ in identifiers. 38760b57cec5SDimitry Andric if (LangOpts.DollarIdents) { 38770b57cec5SDimitry Andric if (!isLexingRawMode()) 38780b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_dollar_in_identifier); 38790b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 
38800b57cec5SDimitry Andric MIOpt.ReadToken(); 3881349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr); 38820b57cec5SDimitry Andric } 38830b57cec5SDimitry Andric 38840b57cec5SDimitry Andric Kind = tok::unknown; 38850b57cec5SDimitry Andric break; 38860b57cec5SDimitry Andric 38870b57cec5SDimitry Andric // C99 6.4.4: Character Constants. 38880b57cec5SDimitry Andric case '\'': 38890b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 38900b57cec5SDimitry Andric MIOpt.ReadToken(); 38910b57cec5SDimitry Andric return LexCharConstant(Result, CurPtr, tok::char_constant); 38920b57cec5SDimitry Andric 38930b57cec5SDimitry Andric // C99 6.4.5: String Literals. 38940b57cec5SDimitry Andric case '"': 38950b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 38960b57cec5SDimitry Andric MIOpt.ReadToken(); 38970b57cec5SDimitry Andric return LexStringLiteral(Result, CurPtr, 38980b57cec5SDimitry Andric ParsingFilename ? tok::header_name 38990b57cec5SDimitry Andric : tok::string_literal); 39000b57cec5SDimitry Andric 39010b57cec5SDimitry Andric // C99 6.4.6: Punctuators. 
39020b57cec5SDimitry Andric case '?': 39030b57cec5SDimitry Andric Kind = tok::question; 39040b57cec5SDimitry Andric break; 39050b57cec5SDimitry Andric case '[': 39060b57cec5SDimitry Andric Kind = tok::l_square; 39070b57cec5SDimitry Andric break; 39080b57cec5SDimitry Andric case ']': 39090b57cec5SDimitry Andric Kind = tok::r_square; 39100b57cec5SDimitry Andric break; 39110b57cec5SDimitry Andric case '(': 39120b57cec5SDimitry Andric Kind = tok::l_paren; 39130b57cec5SDimitry Andric break; 39140b57cec5SDimitry Andric case ')': 39150b57cec5SDimitry Andric Kind = tok::r_paren; 39160b57cec5SDimitry Andric break; 39170b57cec5SDimitry Andric case '{': 39180b57cec5SDimitry Andric Kind = tok::l_brace; 39190b57cec5SDimitry Andric break; 39200b57cec5SDimitry Andric case '}': 39210b57cec5SDimitry Andric Kind = tok::r_brace; 39220b57cec5SDimitry Andric break; 39230b57cec5SDimitry Andric case '.': 39240b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39250b57cec5SDimitry Andric if (Char >= '0' && Char <= '9') { 39260b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 39270b57cec5SDimitry Andric MIOpt.ReadToken(); 39280b57cec5SDimitry Andric 39290b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 39300b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus && Char == '*') { 39310b57cec5SDimitry Andric Kind = tok::periodstar; 39320b57cec5SDimitry Andric CurPtr += SizeTmp; 39330b57cec5SDimitry Andric } else if (Char == '.' 
&& 39340b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 39350b57cec5SDimitry Andric Kind = tok::ellipsis; 39360b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 39370b57cec5SDimitry Andric SizeTmp2, Result); 39380b57cec5SDimitry Andric } else { 39390b57cec5SDimitry Andric Kind = tok::period; 39400b57cec5SDimitry Andric } 39410b57cec5SDimitry Andric break; 39420b57cec5SDimitry Andric case '&': 39430b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39440b57cec5SDimitry Andric if (Char == '&') { 39450b57cec5SDimitry Andric Kind = tok::ampamp; 39460b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39470b57cec5SDimitry Andric } else if (Char == '=') { 39480b57cec5SDimitry Andric Kind = tok::ampequal; 39490b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39500b57cec5SDimitry Andric } else { 39510b57cec5SDimitry Andric Kind = tok::amp; 39520b57cec5SDimitry Andric } 39530b57cec5SDimitry Andric break; 39540b57cec5SDimitry Andric case '*': 39550b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') { 39560b57cec5SDimitry Andric Kind = tok::starequal; 39570b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39580b57cec5SDimitry Andric } else { 39590b57cec5SDimitry Andric Kind = tok::star; 39600b57cec5SDimitry Andric } 39610b57cec5SDimitry Andric break; 39620b57cec5SDimitry Andric case '+': 39630b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39640b57cec5SDimitry Andric if (Char == '+') { 39650b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39660b57cec5SDimitry Andric Kind = tok::plusplus; 39670b57cec5SDimitry Andric } else if (Char == '=') { 39680b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39690b57cec5SDimitry Andric Kind = tok::plusequal; 39700b57cec5SDimitry Andric } else { 39710b57cec5SDimitry Andric Kind = tok::plus; 39720b57cec5SDimitry Andric } 39730b57cec5SDimitry 
Andric break; 39740b57cec5SDimitry Andric case '-': 39750b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 39760b57cec5SDimitry Andric if (Char == '-') { // -- 39770b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39780b57cec5SDimitry Andric Kind = tok::minusminus; 39790b57cec5SDimitry Andric } else if (Char == '>' && LangOpts.CPlusPlus && 39800b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 39810b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 39820b57cec5SDimitry Andric SizeTmp2, Result); 39830b57cec5SDimitry Andric Kind = tok::arrowstar; 39840b57cec5SDimitry Andric } else if (Char == '>') { // -> 39850b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39860b57cec5SDimitry Andric Kind = tok::arrow; 39870b57cec5SDimitry Andric } else if (Char == '=') { // -= 39880b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 39890b57cec5SDimitry Andric Kind = tok::minusequal; 39900b57cec5SDimitry Andric } else { 39910b57cec5SDimitry Andric Kind = tok::minus; 39920b57cec5SDimitry Andric } 39930b57cec5SDimitry Andric break; 39940b57cec5SDimitry Andric case '~': 39950b57cec5SDimitry Andric Kind = tok::tilde; 39960b57cec5SDimitry Andric break; 39970b57cec5SDimitry Andric case '!': 39980b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') { 39990b57cec5SDimitry Andric Kind = tok::exclaimequal; 40000b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40010b57cec5SDimitry Andric } else { 40020b57cec5SDimitry Andric Kind = tok::exclaim; 40030b57cec5SDimitry Andric } 40040b57cec5SDimitry Andric break; 40050b57cec5SDimitry Andric case '/': 40060b57cec5SDimitry Andric // 6.4.9: Comments 40070b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 40080b57cec5SDimitry Andric if (Char == '/') { // Line comment. 40090b57cec5SDimitry Andric // Even if Line comments are disabled (e.g. 
in C89 mode), we generally 40100b57cec5SDimitry Andric // want to lex this as a comment. There is one problem with this though, 40110b57cec5SDimitry Andric // that in one particular corner case, this can change the behavior of the 40120b57cec5SDimitry Andric // resultant program. For example, In "foo //**/ bar", C89 would lex 40130b57cec5SDimitry Andric // this as "foo / bar" and languages with Line comments would lex it as 40140b57cec5SDimitry Andric // "foo". Check to see if the character after the second slash is a '*'. 40150b57cec5SDimitry Andric // If so, we will lex that as a "/" instead of the start of a comment. 40160b57cec5SDimitry Andric // However, we never do this if we are just preprocessing. 401781ad6265SDimitry Andric bool TreatAsComment = 401881ad6265SDimitry Andric LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP); 40190b57cec5SDimitry Andric if (!TreatAsComment) 40200b57cec5SDimitry Andric if (!(PP && PP->isPreprocessedOutput())) 40210b57cec5SDimitry Andric TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*'; 40220b57cec5SDimitry Andric 40230b57cec5SDimitry Andric if (TreatAsComment) { 40240b57cec5SDimitry Andric if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 40250b57cec5SDimitry Andric TokAtPhysicalStartOfLine)) 40260b57cec5SDimitry Andric return true; // There is a token to return. 40270b57cec5SDimitry Andric 40280b57cec5SDimitry Andric // It is common for the tokens immediately after a // comment to be 40290b57cec5SDimitry Andric // whitespace (indentation for the next line). Instead of going through 40300b57cec5SDimitry Andric // the big switch, handle it efficiently now. 40310b57cec5SDimitry Andric goto SkipIgnoredUnits; 40320b57cec5SDimitry Andric } 40330b57cec5SDimitry Andric } 40340b57cec5SDimitry Andric 40350b57cec5SDimitry Andric if (Char == '*') { // /**/ comment. 
40360b57cec5SDimitry Andric if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 40370b57cec5SDimitry Andric TokAtPhysicalStartOfLine)) 40380b57cec5SDimitry Andric return true; // There is a token to return. 40390b57cec5SDimitry Andric 40400b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 40410b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 40420b57cec5SDimitry Andric goto LexNextToken; 40430b57cec5SDimitry Andric } 40440b57cec5SDimitry Andric 40450b57cec5SDimitry Andric if (Char == '=') { 40460b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40470b57cec5SDimitry Andric Kind = tok::slashequal; 40480b57cec5SDimitry Andric } else { 40490b57cec5SDimitry Andric Kind = tok::slash; 40500b57cec5SDimitry Andric } 40510b57cec5SDimitry Andric break; 40520b57cec5SDimitry Andric case '%': 40530b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 40540b57cec5SDimitry Andric if (Char == '=') { 40550b57cec5SDimitry Andric Kind = tok::percentequal; 40560b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40570b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '>') { 40580b57cec5SDimitry Andric Kind = tok::r_brace; // '%>' -> '}' 40590b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40600b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') { 40610b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40620b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 40630b57cec5SDimitry Andric if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 40640b57cec5SDimitry Andric Kind = tok::hashhash; // '%:%:' -> '##' 40650b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 40660b57cec5SDimitry Andric SizeTmp2, Result); 40670b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize 40680b57cec5SDimitry 
Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 40690b57cec5SDimitry Andric if (!isLexingRawMode()) 40700b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft); 40710b57cec5SDimitry Andric Kind = tok::hashat; 40720b57cec5SDimitry Andric } else { // '%:' -> '#' 40730b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line, 40740b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to 40750b57cec5SDimitry Andric // the preprocessor to handle it. 40760b57cec5SDimitry Andric // TODO: -fpreprocessed mode?? 40770b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 40780b57cec5SDimitry Andric goto HandleDirective; 40790b57cec5SDimitry Andric 40800b57cec5SDimitry Andric Kind = tok::hash; 40810b57cec5SDimitry Andric } 40820b57cec5SDimitry Andric } else { 40830b57cec5SDimitry Andric Kind = tok::percent; 40840b57cec5SDimitry Andric } 40850b57cec5SDimitry Andric break; 40860b57cec5SDimitry Andric case '<': 40870b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 40880b57cec5SDimitry Andric if (ParsingFilename) { 40890b57cec5SDimitry Andric return LexAngledStringLiteral(Result, CurPtr); 40900b57cec5SDimitry Andric } else if (Char == '<') { 40910b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 40920b57cec5SDimitry Andric if (After == '=') { 40930b57cec5SDimitry Andric Kind = tok::lesslessequal; 40940b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 40950b57cec5SDimitry Andric SizeTmp2, Result); 40960b57cec5SDimitry Andric } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 40970b57cec5SDimitry Andric // If this is actually a '<<<<<<<' version control conflict marker, 40980b57cec5SDimitry Andric // recognize it as such and recover nicely. 
40990b57cec5SDimitry Andric goto LexNextToken; 41000b57cec5SDimitry Andric } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 41010b57cec5SDimitry Andric // If this is '<<<<' and we're in a Perforce-style conflict marker, 41020b57cec5SDimitry Andric // ignore it. 41030b57cec5SDimitry Andric goto LexNextToken; 41040b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '<') { 41050b57cec5SDimitry Andric Kind = tok::lesslessless; 41060b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 41070b57cec5SDimitry Andric SizeTmp2, Result); 41080b57cec5SDimitry Andric } else { 41090b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41100b57cec5SDimitry Andric Kind = tok::lessless; 41110b57cec5SDimitry Andric } 41120b57cec5SDimitry Andric } else if (Char == '=') { 41130b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 41140b57cec5SDimitry Andric if (After == '>') { 411581ad6265SDimitry Andric if (LangOpts.CPlusPlus20) { 41160b57cec5SDimitry Andric if (!isLexingRawMode()) 41170b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx17_compat_spaceship); 41180b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 41190b57cec5SDimitry Andric SizeTmp2, Result); 41200b57cec5SDimitry Andric Kind = tok::spaceship; 41210b57cec5SDimitry Andric break; 41220b57cec5SDimitry Andric } 41230b57cec5SDimitry Andric // Suggest adding a space between the '<=' and the '>' to avoid a 41240b57cec5SDimitry Andric // change in semantics if this turns up in C++ <=17 mode. 
412581ad6265SDimitry Andric if (LangOpts.CPlusPlus && !isLexingRawMode()) { 41265ffd83dbSDimitry Andric Diag(BufferPtr, diag::warn_cxx20_compat_spaceship) 41270b57cec5SDimitry Andric << FixItHint::CreateInsertion( 41280b57cec5SDimitry Andric getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " "); 41290b57cec5SDimitry Andric } 41300b57cec5SDimitry Andric } 41310b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41320b57cec5SDimitry Andric Kind = tok::lessequal; 41330b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' 41340b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 && 41350b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 41360b57cec5SDimitry Andric // C++0x [lex.pptoken]p3: 41370b57cec5SDimitry Andric // Otherwise, if the next three characters are <:: and the subsequent 41380b57cec5SDimitry Andric // character is neither : nor >, the < is treated as a preprocessor 41390b57cec5SDimitry Andric // token by itself and not as the first character of the alternative 41400b57cec5SDimitry Andric // token <:. 
41410b57cec5SDimitry Andric unsigned SizeTmp3; 41420b57cec5SDimitry Andric char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 41430b57cec5SDimitry Andric if (After != ':' && After != '>') { 41440b57cec5SDimitry Andric Kind = tok::less; 41450b57cec5SDimitry Andric if (!isLexingRawMode()) 41460b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 41470b57cec5SDimitry Andric break; 41480b57cec5SDimitry Andric } 41490b57cec5SDimitry Andric } 41500b57cec5SDimitry Andric 41510b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41520b57cec5SDimitry Andric Kind = tok::l_square; 41530b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' 41540b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41550b57cec5SDimitry Andric Kind = tok::l_brace; 41560b57cec5SDimitry Andric } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 && 41570b57cec5SDimitry Andric lexEditorPlaceholder(Result, CurPtr)) { 41580b57cec5SDimitry Andric return true; 41590b57cec5SDimitry Andric } else { 41600b57cec5SDimitry Andric Kind = tok::less; 41610b57cec5SDimitry Andric } 41620b57cec5SDimitry Andric break; 41630b57cec5SDimitry Andric case '>': 41640b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 41650b57cec5SDimitry Andric if (Char == '=') { 41660b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41670b57cec5SDimitry Andric Kind = tok::greaterequal; 41680b57cec5SDimitry Andric } else if (Char == '>') { 41690b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 41700b57cec5SDimitry Andric if (After == '=') { 41710b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 41720b57cec5SDimitry Andric SizeTmp2, Result); 41730b57cec5SDimitry Andric Kind = tok::greatergreaterequal; 41740b57cec5SDimitry Andric } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 41750b57cec5SDimitry Andric // If this is 
actually a '>>>>' conflict marker, recognize it as such 41760b57cec5SDimitry Andric // and recover nicely. 41770b57cec5SDimitry Andric goto LexNextToken; 41780b57cec5SDimitry Andric } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 41790b57cec5SDimitry Andric // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 41800b57cec5SDimitry Andric goto LexNextToken; 41810b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '>') { 41820b57cec5SDimitry Andric Kind = tok::greatergreatergreater; 41830b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 41840b57cec5SDimitry Andric SizeTmp2, Result); 41850b57cec5SDimitry Andric } else { 41860b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41870b57cec5SDimitry Andric Kind = tok::greatergreater; 41880b57cec5SDimitry Andric } 41890b57cec5SDimitry Andric } else { 41900b57cec5SDimitry Andric Kind = tok::greater; 41910b57cec5SDimitry Andric } 41920b57cec5SDimitry Andric break; 41930b57cec5SDimitry Andric case '^': 41940b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 41950b57cec5SDimitry Andric if (Char == '=') { 41960b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 41970b57cec5SDimitry Andric Kind = tok::caretequal; 41980b57cec5SDimitry Andric } else if (LangOpts.OpenCL && Char == '^') { 41990b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 42000b57cec5SDimitry Andric Kind = tok::caretcaret; 42010b57cec5SDimitry Andric } else { 42020b57cec5SDimitry Andric Kind = tok::caret; 42030b57cec5SDimitry Andric } 42040b57cec5SDimitry Andric break; 42050b57cec5SDimitry Andric case '|': 42060b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 42070b57cec5SDimitry Andric if (Char == '=') { 42080b57cec5SDimitry Andric Kind = tok::pipeequal; 42090b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 42100b57cec5SDimitry Andric } else if (Char == '|') { 42110b57cec5SDimitry Andric 
// If this is '|||||||' and we're in a conflict marker, ignore it. 42120b57cec5SDimitry Andric if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 42130b57cec5SDimitry Andric goto LexNextToken; 42140b57cec5SDimitry Andric Kind = tok::pipepipe; 42150b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 42160b57cec5SDimitry Andric } else { 42170b57cec5SDimitry Andric Kind = tok::pipe; 42180b57cec5SDimitry Andric } 42190b57cec5SDimitry Andric break; 42200b57cec5SDimitry Andric case ':': 42210b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 42220b57cec5SDimitry Andric if (LangOpts.Digraphs && Char == '>') { 42230b57cec5SDimitry Andric Kind = tok::r_square; // ':>' -> ']' 42240b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 42250b57cec5SDimitry Andric } else if ((LangOpts.CPlusPlus || 42260b57cec5SDimitry Andric LangOpts.DoubleSquareBracketAttributes) && 42270b57cec5SDimitry Andric Char == ':') { 42280b57cec5SDimitry Andric Kind = tok::coloncolon; 42290b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 42300b57cec5SDimitry Andric } else { 42310b57cec5SDimitry Andric Kind = tok::colon; 42320b57cec5SDimitry Andric } 42330b57cec5SDimitry Andric break; 42340b57cec5SDimitry Andric case ';': 42350b57cec5SDimitry Andric Kind = tok::semi; 42360b57cec5SDimitry Andric break; 42370b57cec5SDimitry Andric case '=': 42380b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 42390b57cec5SDimitry Andric if (Char == '=') { 42400b57cec5SDimitry Andric // If this is '====' and we're in a conflict marker, ignore it. 
42410b57cec5SDimitry Andric if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 42420b57cec5SDimitry Andric goto LexNextToken; 42430b57cec5SDimitry Andric 42440b57cec5SDimitry Andric Kind = tok::equalequal; 42450b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 42460b57cec5SDimitry Andric } else { 42470b57cec5SDimitry Andric Kind = tok::equal; 42480b57cec5SDimitry Andric } 42490b57cec5SDimitry Andric break; 42500b57cec5SDimitry Andric case ',': 42510b57cec5SDimitry Andric Kind = tok::comma; 42520b57cec5SDimitry Andric break; 42530b57cec5SDimitry Andric case '#': 42540b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 42550b57cec5SDimitry Andric if (Char == '#') { 42560b57cec5SDimitry Andric Kind = tok::hashhash; 42570b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 42580b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize 42590b57cec5SDimitry Andric Kind = tok::hashat; 42600b57cec5SDimitry Andric if (!isLexingRawMode()) 42610b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft); 42620b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 42630b57cec5SDimitry Andric } else { 42640b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line, 42650b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to 42660b57cec5SDimitry Andric // the preprocessor to handle it. 42670b57cec5SDimitry Andric // TODO: -fpreprocessed mode?? 42680b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 42690b57cec5SDimitry Andric goto HandleDirective; 42700b57cec5SDimitry Andric 42710b57cec5SDimitry Andric Kind = tok::hash; 42720b57cec5SDimitry Andric } 42730b57cec5SDimitry Andric break; 42740b57cec5SDimitry Andric 42750b57cec5SDimitry Andric case '@': 42760b57cec5SDimitry Andric // Objective C support. 
42770b57cec5SDimitry Andric if (CurPtr[-1] == '@' && LangOpts.ObjC) 42780b57cec5SDimitry Andric Kind = tok::at; 42790b57cec5SDimitry Andric else 42800b57cec5SDimitry Andric Kind = tok::unknown; 42810b57cec5SDimitry Andric break; 42820b57cec5SDimitry Andric 42830b57cec5SDimitry Andric // UCNs (C99 6.4.3, C++11 [lex.charset]p2) 42840b57cec5SDimitry Andric case '\\': 42850b57cec5SDimitry Andric if (!LangOpts.AsmPreprocessor) { 42860b57cec5SDimitry Andric if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) { 42870b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 42880b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 42890b57cec5SDimitry Andric return true; // KeepWhitespaceMode 42900b57cec5SDimitry Andric 42910b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 42920b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 42930b57cec5SDimitry Andric goto LexNextToken; 42940b57cec5SDimitry Andric } 42950b57cec5SDimitry Andric 4296349cc55cSDimitry Andric return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); 42970b57cec5SDimitry Andric } 42980b57cec5SDimitry Andric } 42990b57cec5SDimitry Andric 43000b57cec5SDimitry Andric Kind = tok::unknown; 43010b57cec5SDimitry Andric break; 43020b57cec5SDimitry Andric 43030b57cec5SDimitry Andric default: { 43040b57cec5SDimitry Andric if (isASCII(Char)) { 43050b57cec5SDimitry Andric Kind = tok::unknown; 43060b57cec5SDimitry Andric break; 43070b57cec5SDimitry Andric } 43080b57cec5SDimitry Andric 43090b57cec5SDimitry Andric llvm::UTF32 CodePoint; 43100b57cec5SDimitry Andric 43110b57cec5SDimitry Andric // We can't just reset CurPtr to BufferPtr because BufferPtr may point to 43120b57cec5SDimitry Andric // an escaped newline. 
43130b57cec5SDimitry Andric --CurPtr; 43140b57cec5SDimitry Andric llvm::ConversionResult Status = 43150b57cec5SDimitry Andric llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, 43160b57cec5SDimitry Andric (const llvm::UTF8 *)BufferEnd, 43170b57cec5SDimitry Andric &CodePoint, 43180b57cec5SDimitry Andric llvm::strictConversion); 43190b57cec5SDimitry Andric if (Status == llvm::conversionOK) { 43200b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 43210b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 43220b57cec5SDimitry Andric return true; // KeepWhitespaceMode 43230b57cec5SDimitry Andric 43240b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 43250b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 43260b57cec5SDimitry Andric goto LexNextToken; 43270b57cec5SDimitry Andric } 4328349cc55cSDimitry Andric return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr); 43290b57cec5SDimitry Andric } 43300b57cec5SDimitry Andric 43310b57cec5SDimitry Andric if (isLexingRawMode() || ParsingPreprocessorDirective || 43320b57cec5SDimitry Andric PP->isPreprocessedOutput()) { 43330b57cec5SDimitry Andric ++CurPtr; 43340b57cec5SDimitry Andric Kind = tok::unknown; 43350b57cec5SDimitry Andric break; 43360b57cec5SDimitry Andric } 43370b57cec5SDimitry Andric 43380b57cec5SDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally. 43390b57cec5SDimitry Andric // Instead of letting the parser complain about the unknown token, 43400b57cec5SDimitry Andric // just diagnose the invalid UTF-8, then drop the character. 43410b57cec5SDimitry Andric Diag(CurPtr, diag::err_invalid_utf8); 43420b57cec5SDimitry Andric 43430b57cec5SDimitry Andric BufferPtr = CurPtr+1; 43440b57cec5SDimitry Andric // We're pretending the character didn't exist, so just try again with 43450b57cec5SDimitry Andric // this lexer. 
43460b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 43470b57cec5SDimitry Andric goto LexNextToken; 43480b57cec5SDimitry Andric } 43490b57cec5SDimitry Andric } 43500b57cec5SDimitry Andric 43510b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 43520b57cec5SDimitry Andric MIOpt.ReadToken(); 43530b57cec5SDimitry Andric 43540b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 43550b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 43560b57cec5SDimitry Andric return true; 43570b57cec5SDimitry Andric 43580b57cec5SDimitry Andric HandleDirective: 43590b57cec5SDimitry Andric // We parsed a # character and it's the start of a preprocessing directive. 43600b57cec5SDimitry Andric 43610b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::hash); 43620b57cec5SDimitry Andric PP->HandleDirective(Result); 43630b57cec5SDimitry Andric 43640b57cec5SDimitry Andric if (PP->hadModuleLoaderFatalFailure()) { 43650b57cec5SDimitry Andric // With a fatal failure in the module loader, we abort parsing. 43660b57cec5SDimitry Andric assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof"); 43670b57cec5SDimitry Andric return true; 43680b57cec5SDimitry Andric } 43690b57cec5SDimitry Andric 43700b57cec5SDimitry Andric // We parsed the directive; lex a token with the new state. 
43710b57cec5SDimitry Andric return false; 4372*bdd1243dSDimitry Andric 4373*bdd1243dSDimitry Andric LexNextToken: 4374*bdd1243dSDimitry Andric Result.clearFlag(Token::NeedsCleaning); 4375*bdd1243dSDimitry Andric goto LexStart; 43760b57cec5SDimitry Andric } 437781ad6265SDimitry Andric 437881ad6265SDimitry Andric const char *Lexer::convertDependencyDirectiveToken( 437981ad6265SDimitry Andric const dependency_directives_scan::Token &DDTok, Token &Result) { 438081ad6265SDimitry Andric const char *TokPtr = BufferStart + DDTok.Offset; 438181ad6265SDimitry Andric Result.startToken(); 438281ad6265SDimitry Andric Result.setLocation(getSourceLocation(TokPtr)); 438381ad6265SDimitry Andric Result.setKind(DDTok.Kind); 438481ad6265SDimitry Andric Result.setFlag((Token::TokenFlags)DDTok.Flags); 438581ad6265SDimitry Andric Result.setLength(DDTok.Length); 438681ad6265SDimitry Andric BufferPtr = TokPtr + DDTok.Length; 438781ad6265SDimitry Andric return TokPtr; 438881ad6265SDimitry Andric } 438981ad6265SDimitry Andric 439081ad6265SDimitry Andric bool Lexer::LexDependencyDirectiveToken(Token &Result) { 439181ad6265SDimitry Andric assert(isDependencyDirectivesLexer()); 439281ad6265SDimitry Andric 439381ad6265SDimitry Andric using namespace dependency_directives_scan; 439481ad6265SDimitry Andric 439581ad6265SDimitry Andric while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) { 439681ad6265SDimitry Andric if (DepDirectives.front().Kind == pp_eof) 439781ad6265SDimitry Andric return LexEndOfFile(Result, BufferEnd); 4398*bdd1243dSDimitry Andric if (DepDirectives.front().Kind == tokens_present_before_eof) 4399*bdd1243dSDimitry Andric MIOpt.ReadToken(); 440081ad6265SDimitry Andric NextDepDirectiveTokenIndex = 0; 440181ad6265SDimitry Andric DepDirectives = DepDirectives.drop_front(); 440281ad6265SDimitry Andric } 440381ad6265SDimitry Andric 440481ad6265SDimitry Andric const dependency_directives_scan::Token &DDTok = 440581ad6265SDimitry Andric 
DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++]; 440681ad6265SDimitry Andric if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) { 440781ad6265SDimitry Andric // Read something other than a preprocessor directive hash. 440881ad6265SDimitry Andric MIOpt.ReadToken(); 440981ad6265SDimitry Andric } 441081ad6265SDimitry Andric 4411*bdd1243dSDimitry Andric if (ParsingFilename && DDTok.is(tok::less)) { 4412*bdd1243dSDimitry Andric BufferPtr = BufferStart + DDTok.Offset; 4413*bdd1243dSDimitry Andric LexAngledStringLiteral(Result, BufferPtr + 1); 4414*bdd1243dSDimitry Andric if (Result.isNot(tok::header_name)) 4415*bdd1243dSDimitry Andric return true; 4416*bdd1243dSDimitry Andric // Advance the index of lexed tokens. 4417*bdd1243dSDimitry Andric while (true) { 4418*bdd1243dSDimitry Andric const dependency_directives_scan::Token &NextTok = 4419*bdd1243dSDimitry Andric DepDirectives.front().Tokens[NextDepDirectiveTokenIndex]; 4420*bdd1243dSDimitry Andric if (BufferStart + NextTok.Offset >= BufferPtr) 4421*bdd1243dSDimitry Andric break; 4422*bdd1243dSDimitry Andric ++NextDepDirectiveTokenIndex; 4423*bdd1243dSDimitry Andric } 4424*bdd1243dSDimitry Andric return true; 4425*bdd1243dSDimitry Andric } 4426*bdd1243dSDimitry Andric 442781ad6265SDimitry Andric const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result); 442881ad6265SDimitry Andric 442981ad6265SDimitry Andric if (Result.is(tok::hash) && Result.isAtStartOfLine()) { 443081ad6265SDimitry Andric PP->HandleDirective(Result); 443181ad6265SDimitry Andric return false; 443281ad6265SDimitry Andric } 443381ad6265SDimitry Andric if (Result.is(tok::raw_identifier)) { 443481ad6265SDimitry Andric Result.setRawIdentifierData(TokPtr); 443581ad6265SDimitry Andric if (!isLexingRawMode()) { 443681ad6265SDimitry Andric IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 443781ad6265SDimitry Andric if (II->isHandleIdentifierCase()) 443881ad6265SDimitry Andric return PP->HandleIdentifier(Result); 
443981ad6265SDimitry Andric } 444081ad6265SDimitry Andric return true; 444181ad6265SDimitry Andric } 444281ad6265SDimitry Andric if (Result.isLiteral()) { 444381ad6265SDimitry Andric Result.setLiteralData(TokPtr); 444481ad6265SDimitry Andric return true; 444581ad6265SDimitry Andric } 444681ad6265SDimitry Andric if (Result.is(tok::colon) && 444781ad6265SDimitry Andric (LangOpts.CPlusPlus || LangOpts.DoubleSquareBracketAttributes)) { 444881ad6265SDimitry Andric // Convert consecutive colons to 'tok::coloncolon'. 444981ad6265SDimitry Andric if (*BufferPtr == ':') { 445081ad6265SDimitry Andric assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is( 445181ad6265SDimitry Andric tok::colon)); 445281ad6265SDimitry Andric ++NextDepDirectiveTokenIndex; 445381ad6265SDimitry Andric Result.setKind(tok::coloncolon); 445481ad6265SDimitry Andric } 445581ad6265SDimitry Andric return true; 445681ad6265SDimitry Andric } 445781ad6265SDimitry Andric if (Result.is(tok::eod)) 445881ad6265SDimitry Andric ParsingPreprocessorDirective = false; 445981ad6265SDimitry Andric 446081ad6265SDimitry Andric return true; 446181ad6265SDimitry Andric } 446281ad6265SDimitry Andric 446381ad6265SDimitry Andric bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) { 446481ad6265SDimitry Andric assert(isDependencyDirectivesLexer()); 446581ad6265SDimitry Andric 446681ad6265SDimitry Andric using namespace dependency_directives_scan; 446781ad6265SDimitry Andric 446881ad6265SDimitry Andric bool Stop = false; 446981ad6265SDimitry Andric unsigned NestedIfs = 0; 447081ad6265SDimitry Andric do { 447181ad6265SDimitry Andric DepDirectives = DepDirectives.drop_front(); 447281ad6265SDimitry Andric switch (DepDirectives.front().Kind) { 447381ad6265SDimitry Andric case pp_none: 447481ad6265SDimitry Andric llvm_unreachable("unexpected 'pp_none'"); 447581ad6265SDimitry Andric case pp_include: 447681ad6265SDimitry Andric case pp___include_macros: 447781ad6265SDimitry Andric case pp_define: 
447881ad6265SDimitry Andric case pp_undef: 447981ad6265SDimitry Andric case pp_import: 448081ad6265SDimitry Andric case pp_pragma_import: 448181ad6265SDimitry Andric case pp_pragma_once: 448281ad6265SDimitry Andric case pp_pragma_push_macro: 448381ad6265SDimitry Andric case pp_pragma_pop_macro: 448481ad6265SDimitry Andric case pp_pragma_include_alias: 448581ad6265SDimitry Andric case pp_include_next: 448681ad6265SDimitry Andric case decl_at_import: 448781ad6265SDimitry Andric case cxx_module_decl: 448881ad6265SDimitry Andric case cxx_import_decl: 448981ad6265SDimitry Andric case cxx_export_module_decl: 449081ad6265SDimitry Andric case cxx_export_import_decl: 4491*bdd1243dSDimitry Andric case tokens_present_before_eof: 449281ad6265SDimitry Andric break; 449381ad6265SDimitry Andric case pp_if: 449481ad6265SDimitry Andric case pp_ifdef: 449581ad6265SDimitry Andric case pp_ifndef: 449681ad6265SDimitry Andric ++NestedIfs; 449781ad6265SDimitry Andric break; 449881ad6265SDimitry Andric case pp_elif: 449981ad6265SDimitry Andric case pp_elifdef: 450081ad6265SDimitry Andric case pp_elifndef: 450181ad6265SDimitry Andric case pp_else: 450281ad6265SDimitry Andric if (!NestedIfs) { 450381ad6265SDimitry Andric Stop = true; 450481ad6265SDimitry Andric } 450581ad6265SDimitry Andric break; 450681ad6265SDimitry Andric case pp_endif: 450781ad6265SDimitry Andric if (!NestedIfs) { 450881ad6265SDimitry Andric Stop = true; 450981ad6265SDimitry Andric } else { 451081ad6265SDimitry Andric --NestedIfs; 451181ad6265SDimitry Andric } 451281ad6265SDimitry Andric break; 451381ad6265SDimitry Andric case pp_eof: 451481ad6265SDimitry Andric NextDepDirectiveTokenIndex = 0; 451581ad6265SDimitry Andric return LexEndOfFile(Result, BufferEnd); 451681ad6265SDimitry Andric } 451781ad6265SDimitry Andric } while (!Stop); 451881ad6265SDimitry Andric 451981ad6265SDimitry Andric const dependency_directives_scan::Token &DDTok = 452081ad6265SDimitry Andric DepDirectives.front().Tokens.front(); 
452181ad6265SDimitry Andric assert(DDTok.is(tok::hash)); 452281ad6265SDimitry Andric NextDepDirectiveTokenIndex = 1; 452381ad6265SDimitry Andric 452481ad6265SDimitry Andric convertDependencyDirectiveToken(DDTok, Result); 452581ad6265SDimitry Andric return false; 452681ad6265SDimitry Andric } 4527