1*0b57cec5SDimitry Andric //===- Lexer.cpp - C Language Family Lexer --------------------------------===// 2*0b57cec5SDimitry Andric // 3*0b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4*0b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information. 5*0b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6*0b57cec5SDimitry Andric // 7*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 8*0b57cec5SDimitry Andric // 9*0b57cec5SDimitry Andric // This file implements the Lexer and Token interfaces. 10*0b57cec5SDimitry Andric // 11*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 12*0b57cec5SDimitry Andric 13*0b57cec5SDimitry Andric #include "clang/Lex/Lexer.h" 14*0b57cec5SDimitry Andric #include "UnicodeCharSets.h" 15*0b57cec5SDimitry Andric #include "clang/Basic/CharInfo.h" 16*0b57cec5SDimitry Andric #include "clang/Basic/IdentifierTable.h" 17*0b57cec5SDimitry Andric #include "clang/Basic/LangOptions.h" 18*0b57cec5SDimitry Andric #include "clang/Basic/SourceLocation.h" 19*0b57cec5SDimitry Andric #include "clang/Basic/SourceManager.h" 20*0b57cec5SDimitry Andric #include "clang/Basic/TokenKinds.h" 21*0b57cec5SDimitry Andric #include "clang/Lex/LexDiagnostic.h" 22*0b57cec5SDimitry Andric #include "clang/Lex/LiteralSupport.h" 23*0b57cec5SDimitry Andric #include "clang/Lex/MultipleIncludeOpt.h" 24*0b57cec5SDimitry Andric #include "clang/Lex/Preprocessor.h" 25*0b57cec5SDimitry Andric #include "clang/Lex/PreprocessorOptions.h" 26*0b57cec5SDimitry Andric #include "clang/Lex/Token.h" 27*0b57cec5SDimitry Andric #include "clang/Basic/Diagnostic.h" 28*0b57cec5SDimitry Andric #include "clang/Basic/LLVM.h" 29*0b57cec5SDimitry Andric #include "clang/Basic/TokenKinds.h" 30*0b57cec5SDimitry Andric #include "llvm/ADT/None.h" 31*0b57cec5SDimitry Andric #include 
"llvm/ADT/Optional.h" 32*0b57cec5SDimitry Andric #include "llvm/ADT/StringExtras.h" 33*0b57cec5SDimitry Andric #include "llvm/ADT/StringSwitch.h" 34*0b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h" 35*0b57cec5SDimitry Andric #include "llvm/Support/Compiler.h" 36*0b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h" 37*0b57cec5SDimitry Andric #include "llvm/Support/MathExtras.h" 38*0b57cec5SDimitry Andric #include "llvm/Support/MemoryBuffer.h" 39*0b57cec5SDimitry Andric #include "llvm/Support/NativeFormatting.h" 40*0b57cec5SDimitry Andric #include "llvm/Support/UnicodeCharRanges.h" 41*0b57cec5SDimitry Andric #include <algorithm> 42*0b57cec5SDimitry Andric #include <cassert> 43*0b57cec5SDimitry Andric #include <cstddef> 44*0b57cec5SDimitry Andric #include <cstdint> 45*0b57cec5SDimitry Andric #include <cstring> 46*0b57cec5SDimitry Andric #include <string> 47*0b57cec5SDimitry Andric #include <tuple> 48*0b57cec5SDimitry Andric #include <utility> 49*0b57cec5SDimitry Andric 50*0b57cec5SDimitry Andric using namespace clang; 51*0b57cec5SDimitry Andric 52*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 53*0b57cec5SDimitry Andric // Token Class Implementation 54*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 55*0b57cec5SDimitry Andric 56*0b57cec5SDimitry Andric /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 57*0b57cec5SDimitry Andric bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 58*0b57cec5SDimitry Andric if (isAnnotation()) 59*0b57cec5SDimitry Andric return false; 60*0b57cec5SDimitry Andric if (IdentifierInfo *II = getIdentifierInfo()) 61*0b57cec5SDimitry Andric return II->getObjCKeywordID() == objcKey; 62*0b57cec5SDimitry Andric return false; 63*0b57cec5SDimitry Andric } 64*0b57cec5SDimitry Andric 65*0b57cec5SDimitry Andric /// getObjCKeywordID - Return the ObjC keyword kind. 
66*0b57cec5SDimitry Andric tok::ObjCKeywordKind Token::getObjCKeywordID() const { 67*0b57cec5SDimitry Andric if (isAnnotation()) 68*0b57cec5SDimitry Andric return tok::objc_not_keyword; 69*0b57cec5SDimitry Andric IdentifierInfo *specId = getIdentifierInfo(); 70*0b57cec5SDimitry Andric return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 71*0b57cec5SDimitry Andric } 72*0b57cec5SDimitry Andric 73*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 74*0b57cec5SDimitry Andric // Lexer Class Implementation 75*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 76*0b57cec5SDimitry Andric 77*0b57cec5SDimitry Andric void Lexer::anchor() {} 78*0b57cec5SDimitry Andric 79*0b57cec5SDimitry Andric void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 80*0b57cec5SDimitry Andric const char *BufEnd) { 81*0b57cec5SDimitry Andric BufferStart = BufStart; 82*0b57cec5SDimitry Andric BufferPtr = BufPtr; 83*0b57cec5SDimitry Andric BufferEnd = BufEnd; 84*0b57cec5SDimitry Andric 85*0b57cec5SDimitry Andric assert(BufEnd[0] == 0 && 86*0b57cec5SDimitry Andric "We assume that the input buffer has a null character at the end" 87*0b57cec5SDimitry Andric " to simplify lexing!"); 88*0b57cec5SDimitry Andric 89*0b57cec5SDimitry Andric // Check whether we have a BOM in the beginning of the buffer. If yes - act 90*0b57cec5SDimitry Andric // accordingly. Right now we support only UTF-8 with and without BOM, so, just 91*0b57cec5SDimitry Andric // skip the UTF-8 BOM if it's present. 92*0b57cec5SDimitry Andric if (BufferStart == BufferPtr) { 93*0b57cec5SDimitry Andric // Determine the size of the BOM. 
94*0b57cec5SDimitry Andric StringRef Buf(BufferStart, BufferEnd - BufferStart); 95*0b57cec5SDimitry Andric size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 96*0b57cec5SDimitry Andric .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 97*0b57cec5SDimitry Andric .Default(0); 98*0b57cec5SDimitry Andric 99*0b57cec5SDimitry Andric // Skip the BOM. 100*0b57cec5SDimitry Andric BufferPtr += BOMLength; 101*0b57cec5SDimitry Andric } 102*0b57cec5SDimitry Andric 103*0b57cec5SDimitry Andric Is_PragmaLexer = false; 104*0b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None; 105*0b57cec5SDimitry Andric 106*0b57cec5SDimitry Andric // Start of the file is a start of line. 107*0b57cec5SDimitry Andric IsAtStartOfLine = true; 108*0b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 109*0b57cec5SDimitry Andric 110*0b57cec5SDimitry Andric HasLeadingSpace = false; 111*0b57cec5SDimitry Andric HasLeadingEmptyMacro = false; 112*0b57cec5SDimitry Andric 113*0b57cec5SDimitry Andric // We are not after parsing a #. 114*0b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 115*0b57cec5SDimitry Andric 116*0b57cec5SDimitry Andric // We are not after parsing #include. 117*0b57cec5SDimitry Andric ParsingFilename = false; 118*0b57cec5SDimitry Andric 119*0b57cec5SDimitry Andric // We are not in raw mode. Raw mode disables diagnostics and interpretation 120*0b57cec5SDimitry Andric // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 121*0b57cec5SDimitry Andric // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 122*0b57cec5SDimitry Andric // or otherwise skipping over tokens. 123*0b57cec5SDimitry Andric LexingRawMode = false; 124*0b57cec5SDimitry Andric 125*0b57cec5SDimitry Andric // Default to not keeping comments. 
126*0b57cec5SDimitry Andric ExtendedTokenMode = 0; 127*0b57cec5SDimitry Andric } 128*0b57cec5SDimitry Andric 129*0b57cec5SDimitry Andric /// Lexer constructor - Create a new lexer object for the specified buffer 130*0b57cec5SDimitry Andric /// with the specified preprocessor managing the lexing process. This lexer 131*0b57cec5SDimitry Andric /// assumes that the associated file buffer and Preprocessor objects will 132*0b57cec5SDimitry Andric /// outlive it, so it doesn't take ownership of either of them. 133*0b57cec5SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP) 134*0b57cec5SDimitry Andric : PreprocessorLexer(&PP, FID), 135*0b57cec5SDimitry Andric FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 136*0b57cec5SDimitry Andric LangOpts(PP.getLangOpts()) { 137*0b57cec5SDimitry Andric InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 138*0b57cec5SDimitry Andric InputFile->getBufferEnd()); 139*0b57cec5SDimitry Andric 140*0b57cec5SDimitry Andric resetExtendedTokenMode(); 141*0b57cec5SDimitry Andric } 142*0b57cec5SDimitry Andric 143*0b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only 144*0b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 145*0b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it. 146*0b57cec5SDimitry Andric Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, 147*0b57cec5SDimitry Andric const char *BufStart, const char *BufPtr, const char *BufEnd) 148*0b57cec5SDimitry Andric : FileLoc(fileloc), LangOpts(langOpts) { 149*0b57cec5SDimitry Andric InitLexer(BufStart, BufPtr, BufEnd); 150*0b57cec5SDimitry Andric 151*0b57cec5SDimitry Andric // We *are* in raw mode. 
152*0b57cec5SDimitry Andric LexingRawMode = true; 153*0b57cec5SDimitry Andric } 154*0b57cec5SDimitry Andric 155*0b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only 156*0b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 157*0b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it. 158*0b57cec5SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile, 159*0b57cec5SDimitry Andric const SourceManager &SM, const LangOptions &langOpts) 160*0b57cec5SDimitry Andric : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile->getBufferStart(), 161*0b57cec5SDimitry Andric FromFile->getBufferStart(), FromFile->getBufferEnd()) {} 162*0b57cec5SDimitry Andric 163*0b57cec5SDimitry Andric void Lexer::resetExtendedTokenMode() { 164*0b57cec5SDimitry Andric assert(PP && "Cannot reset token mode without a preprocessor"); 165*0b57cec5SDimitry Andric if (LangOpts.TraditionalCPP) 166*0b57cec5SDimitry Andric SetKeepWhitespaceMode(true); 167*0b57cec5SDimitry Andric else 168*0b57cec5SDimitry Andric SetCommentRetentionState(PP->getCommentRetentionState()); 169*0b57cec5SDimitry Andric } 170*0b57cec5SDimitry Andric 171*0b57cec5SDimitry Andric /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 172*0b57cec5SDimitry Andric /// _Pragma expansion. This has a variety of magic semantics that this method 173*0b57cec5SDimitry Andric /// sets up. It returns a new'd Lexer that must be delete'd when done. 174*0b57cec5SDimitry Andric /// 175*0b57cec5SDimitry Andric /// On entrance to this routine, TokStartLoc is a macro location which has a 176*0b57cec5SDimitry Andric /// spelling loc that indicates the bytes to be lexed for the token and an 177*0b57cec5SDimitry Andric /// expansion location that indicates where all lexed tokens should be 178*0b57cec5SDimitry Andric /// "expanded from". 
179*0b57cec5SDimitry Andric /// 180*0b57cec5SDimitry Andric /// TODO: It would really be nice to make _Pragma just be a wrapper around a 181*0b57cec5SDimitry Andric /// normal lexer that remaps tokens as they fly by. This would require making 182*0b57cec5SDimitry Andric /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 183*0b57cec5SDimitry Andric /// interface that could handle this stuff. This would pull GetMappedTokenLoc 184*0b57cec5SDimitry Andric /// out of the critical path of the lexer! 185*0b57cec5SDimitry Andric /// 186*0b57cec5SDimitry Andric Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 187*0b57cec5SDimitry Andric SourceLocation ExpansionLocStart, 188*0b57cec5SDimitry Andric SourceLocation ExpansionLocEnd, 189*0b57cec5SDimitry Andric unsigned TokLen, Preprocessor &PP) { 190*0b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager(); 191*0b57cec5SDimitry Andric 192*0b57cec5SDimitry Andric // Create the lexer as if we were going to lex the file normally. 193*0b57cec5SDimitry Andric FileID SpellingFID = SM.getFileID(SpellingLoc); 194*0b57cec5SDimitry Andric const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); 195*0b57cec5SDimitry Andric Lexer *L = new Lexer(SpellingFID, InputFile, PP); 196*0b57cec5SDimitry Andric 197*0b57cec5SDimitry Andric // Now that the lexer is created, change the start/end locations so that we 198*0b57cec5SDimitry Andric // just lex the subsection of the file that we want. This is lexing from a 199*0b57cec5SDimitry Andric // scratch buffer. 200*0b57cec5SDimitry Andric const char *StrData = SM.getCharacterData(SpellingLoc); 201*0b57cec5SDimitry Andric 202*0b57cec5SDimitry Andric L->BufferPtr = StrData; 203*0b57cec5SDimitry Andric L->BufferEnd = StrData+TokLen; 204*0b57cec5SDimitry Andric assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 205*0b57cec5SDimitry Andric 206*0b57cec5SDimitry Andric // Set the SourceLocation with the remapping information. 
This ensures that 207*0b57cec5SDimitry Andric // GetMappedTokenLoc will remap the tokens as they are lexed. 208*0b57cec5SDimitry Andric L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 209*0b57cec5SDimitry Andric ExpansionLocStart, 210*0b57cec5SDimitry Andric ExpansionLocEnd, TokLen); 211*0b57cec5SDimitry Andric 212*0b57cec5SDimitry Andric // Ensure that the lexer thinks it is inside a directive, so that end \n will 213*0b57cec5SDimitry Andric // return an EOD token. 214*0b57cec5SDimitry Andric L->ParsingPreprocessorDirective = true; 215*0b57cec5SDimitry Andric 216*0b57cec5SDimitry Andric // This lexer really is for _Pragma. 217*0b57cec5SDimitry Andric L->Is_PragmaLexer = true; 218*0b57cec5SDimitry Andric return L; 219*0b57cec5SDimitry Andric } 220*0b57cec5SDimitry Andric 221*0b57cec5SDimitry Andric template <typename T> static void StringifyImpl(T &Str, char Quote) { 222*0b57cec5SDimitry Andric typename T::size_type i = 0, e = Str.size(); 223*0b57cec5SDimitry Andric while (i < e) { 224*0b57cec5SDimitry Andric if (Str[i] == '\\' || Str[i] == Quote) { 225*0b57cec5SDimitry Andric Str.insert(Str.begin() + i, '\\'); 226*0b57cec5SDimitry Andric i += 2; 227*0b57cec5SDimitry Andric ++e; 228*0b57cec5SDimitry Andric } else if (Str[i] == '\n' || Str[i] == '\r') { 229*0b57cec5SDimitry Andric // Replace '\r\n' and '\n\r' to '\\' followed by 'n'. 230*0b57cec5SDimitry Andric if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && 231*0b57cec5SDimitry Andric Str[i] != Str[i + 1]) { 232*0b57cec5SDimitry Andric Str[i] = '\\'; 233*0b57cec5SDimitry Andric Str[i + 1] = 'n'; 234*0b57cec5SDimitry Andric } else { 235*0b57cec5SDimitry Andric // Replace '\n' and '\r' to '\\' followed by 'n'. 
236*0b57cec5SDimitry Andric Str[i] = '\\'; 237*0b57cec5SDimitry Andric Str.insert(Str.begin() + i + 1, 'n'); 238*0b57cec5SDimitry Andric ++e; 239*0b57cec5SDimitry Andric } 240*0b57cec5SDimitry Andric i += 2; 241*0b57cec5SDimitry Andric } else 242*0b57cec5SDimitry Andric ++i; 243*0b57cec5SDimitry Andric } 244*0b57cec5SDimitry Andric } 245*0b57cec5SDimitry Andric 246*0b57cec5SDimitry Andric std::string Lexer::Stringify(StringRef Str, bool Charify) { 247*0b57cec5SDimitry Andric std::string Result = Str; 248*0b57cec5SDimitry Andric char Quote = Charify ? '\'' : '"'; 249*0b57cec5SDimitry Andric StringifyImpl(Result, Quote); 250*0b57cec5SDimitry Andric return Result; 251*0b57cec5SDimitry Andric } 252*0b57cec5SDimitry Andric 253*0b57cec5SDimitry Andric void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); } 254*0b57cec5SDimitry Andric 255*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 256*0b57cec5SDimitry Andric // Token Spelling 257*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 258*0b57cec5SDimitry Andric 259*0b57cec5SDimitry Andric /// Slow case of getSpelling. Extract the characters comprising the 260*0b57cec5SDimitry Andric /// spelling of this token from the provided input buffer. 261*0b57cec5SDimitry Andric static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, 262*0b57cec5SDimitry Andric const LangOptions &LangOpts, char *Spelling) { 263*0b57cec5SDimitry Andric assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); 264*0b57cec5SDimitry Andric 265*0b57cec5SDimitry Andric size_t Length = 0; 266*0b57cec5SDimitry Andric const char *BufEnd = BufPtr + Tok.getLength(); 267*0b57cec5SDimitry Andric 268*0b57cec5SDimitry Andric if (tok::isStringLiteral(Tok.getKind())) { 269*0b57cec5SDimitry Andric // Munch the encoding-prefix and opening double-quote. 
270*0b57cec5SDimitry Andric while (BufPtr < BufEnd) { 271*0b57cec5SDimitry Andric unsigned Size; 272*0b57cec5SDimitry Andric Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 273*0b57cec5SDimitry Andric BufPtr += Size; 274*0b57cec5SDimitry Andric 275*0b57cec5SDimitry Andric if (Spelling[Length - 1] == '"') 276*0b57cec5SDimitry Andric break; 277*0b57cec5SDimitry Andric } 278*0b57cec5SDimitry Andric 279*0b57cec5SDimitry Andric // Raw string literals need special handling; trigraph expansion and line 280*0b57cec5SDimitry Andric // splicing do not occur within their d-char-sequence nor within their 281*0b57cec5SDimitry Andric // r-char-sequence. 282*0b57cec5SDimitry Andric if (Length >= 2 && 283*0b57cec5SDimitry Andric Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { 284*0b57cec5SDimitry Andric // Search backwards from the end of the token to find the matching closing 285*0b57cec5SDimitry Andric // quote. 286*0b57cec5SDimitry Andric const char *RawEnd = BufEnd; 287*0b57cec5SDimitry Andric do --RawEnd; while (*RawEnd != '"'); 288*0b57cec5SDimitry Andric size_t RawLength = RawEnd - BufPtr + 1; 289*0b57cec5SDimitry Andric 290*0b57cec5SDimitry Andric // Everything between the quotes is included verbatim in the spelling. 291*0b57cec5SDimitry Andric memcpy(Spelling + Length, BufPtr, RawLength); 292*0b57cec5SDimitry Andric Length += RawLength; 293*0b57cec5SDimitry Andric BufPtr += RawLength; 294*0b57cec5SDimitry Andric 295*0b57cec5SDimitry Andric // The rest of the token is lexed normally. 
296*0b57cec5SDimitry Andric } 297*0b57cec5SDimitry Andric } 298*0b57cec5SDimitry Andric 299*0b57cec5SDimitry Andric while (BufPtr < BufEnd) { 300*0b57cec5SDimitry Andric unsigned Size; 301*0b57cec5SDimitry Andric Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 302*0b57cec5SDimitry Andric BufPtr += Size; 303*0b57cec5SDimitry Andric } 304*0b57cec5SDimitry Andric 305*0b57cec5SDimitry Andric assert(Length < Tok.getLength() && 306*0b57cec5SDimitry Andric "NeedsCleaning flag set on token that didn't need cleaning!"); 307*0b57cec5SDimitry Andric return Length; 308*0b57cec5SDimitry Andric } 309*0b57cec5SDimitry Andric 310*0b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a 311*0b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file 312*0b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this 313*0b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs 314*0b57cec5SDimitry Andric /// UCNs, etc. 315*0b57cec5SDimitry Andric StringRef Lexer::getSpelling(SourceLocation loc, 316*0b57cec5SDimitry Andric SmallVectorImpl<char> &buffer, 317*0b57cec5SDimitry Andric const SourceManager &SM, 318*0b57cec5SDimitry Andric const LangOptions &options, 319*0b57cec5SDimitry Andric bool *invalid) { 320*0b57cec5SDimitry Andric // Break down the source location. 321*0b57cec5SDimitry Andric std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 322*0b57cec5SDimitry Andric 323*0b57cec5SDimitry Andric // Try to the load the file buffer. 
324*0b57cec5SDimitry Andric bool invalidTemp = false; 325*0b57cec5SDimitry Andric StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 326*0b57cec5SDimitry Andric if (invalidTemp) { 327*0b57cec5SDimitry Andric if (invalid) *invalid = true; 328*0b57cec5SDimitry Andric return {}; 329*0b57cec5SDimitry Andric } 330*0b57cec5SDimitry Andric 331*0b57cec5SDimitry Andric const char *tokenBegin = file.data() + locInfo.second; 332*0b57cec5SDimitry Andric 333*0b57cec5SDimitry Andric // Lex from the start of the given location. 334*0b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 335*0b57cec5SDimitry Andric file.begin(), tokenBegin, file.end()); 336*0b57cec5SDimitry Andric Token token; 337*0b57cec5SDimitry Andric lexer.LexFromRawLexer(token); 338*0b57cec5SDimitry Andric 339*0b57cec5SDimitry Andric unsigned length = token.getLength(); 340*0b57cec5SDimitry Andric 341*0b57cec5SDimitry Andric // Common case: no need for cleaning. 342*0b57cec5SDimitry Andric if (!token.needsCleaning()) 343*0b57cec5SDimitry Andric return StringRef(tokenBegin, length); 344*0b57cec5SDimitry Andric 345*0b57cec5SDimitry Andric // Hard case, we need to relex the characters into the string. 346*0b57cec5SDimitry Andric buffer.resize(length); 347*0b57cec5SDimitry Andric buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); 348*0b57cec5SDimitry Andric return StringRef(buffer.data(), buffer.size()); 349*0b57cec5SDimitry Andric } 350*0b57cec5SDimitry Andric 351*0b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a 352*0b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file 353*0b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this 354*0b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs 355*0b57cec5SDimitry Andric /// UCNs, etc. 
356*0b57cec5SDimitry Andric std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 357*0b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) { 358*0b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 359*0b57cec5SDimitry Andric 360*0b57cec5SDimitry Andric bool CharDataInvalid = false; 361*0b57cec5SDimitry Andric const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 362*0b57cec5SDimitry Andric &CharDataInvalid); 363*0b57cec5SDimitry Andric if (Invalid) 364*0b57cec5SDimitry Andric *Invalid = CharDataInvalid; 365*0b57cec5SDimitry Andric if (CharDataInvalid) 366*0b57cec5SDimitry Andric return {}; 367*0b57cec5SDimitry Andric 368*0b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly. 369*0b57cec5SDimitry Andric if (!Tok.needsCleaning()) 370*0b57cec5SDimitry Andric return std::string(TokStart, TokStart + Tok.getLength()); 371*0b57cec5SDimitry Andric 372*0b57cec5SDimitry Andric std::string Result; 373*0b57cec5SDimitry Andric Result.resize(Tok.getLength()); 374*0b57cec5SDimitry Andric Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); 375*0b57cec5SDimitry Andric return Result; 376*0b57cec5SDimitry Andric } 377*0b57cec5SDimitry Andric 378*0b57cec5SDimitry Andric /// getSpelling - This method is used to get the spelling of a token into a 379*0b57cec5SDimitry Andric /// preallocated buffer, instead of as an std::string. The caller is required 380*0b57cec5SDimitry Andric /// to allocate enough space for the token, which is guaranteed to be at least 381*0b57cec5SDimitry Andric /// Tok.getLength() bytes long. The actual length of the token is returned. 
382*0b57cec5SDimitry Andric /// 383*0b57cec5SDimitry Andric /// Note that this method may do two possible things: it may either fill in 384*0b57cec5SDimitry Andric /// the buffer specified with characters, or it may *change the input pointer* 385*0b57cec5SDimitry Andric /// to point to a constant buffer with the data already in it (avoiding a 386*0b57cec5SDimitry Andric /// copy). The caller is not allowed to modify the returned buffer pointer 387*0b57cec5SDimitry Andric /// if an internal buffer is returned. 388*0b57cec5SDimitry Andric unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 389*0b57cec5SDimitry Andric const SourceManager &SourceMgr, 390*0b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) { 391*0b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 392*0b57cec5SDimitry Andric 393*0b57cec5SDimitry Andric const char *TokStart = nullptr; 394*0b57cec5SDimitry Andric // NOTE: this has to be checked *before* testing for an IdentifierInfo. 395*0b57cec5SDimitry Andric if (Tok.is(tok::raw_identifier)) 396*0b57cec5SDimitry Andric TokStart = Tok.getRawIdentifier().data(); 397*0b57cec5SDimitry Andric else if (!Tok.hasUCN()) { 398*0b57cec5SDimitry Andric if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 399*0b57cec5SDimitry Andric // Just return the string from the identifier table, which is very quick. 400*0b57cec5SDimitry Andric Buffer = II->getNameStart(); 401*0b57cec5SDimitry Andric return II->getLength(); 402*0b57cec5SDimitry Andric } 403*0b57cec5SDimitry Andric } 404*0b57cec5SDimitry Andric 405*0b57cec5SDimitry Andric // NOTE: this can be checked even after testing for an IdentifierInfo. 406*0b57cec5SDimitry Andric if (Tok.isLiteral()) 407*0b57cec5SDimitry Andric TokStart = Tok.getLiteralData(); 408*0b57cec5SDimitry Andric 409*0b57cec5SDimitry Andric if (!TokStart) { 410*0b57cec5SDimitry Andric // Compute the start of the token in the input lexer buffer. 
411*0b57cec5SDimitry Andric bool CharDataInvalid = false; 412*0b57cec5SDimitry Andric TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 413*0b57cec5SDimitry Andric if (Invalid) 414*0b57cec5SDimitry Andric *Invalid = CharDataInvalid; 415*0b57cec5SDimitry Andric if (CharDataInvalid) { 416*0b57cec5SDimitry Andric Buffer = ""; 417*0b57cec5SDimitry Andric return 0; 418*0b57cec5SDimitry Andric } 419*0b57cec5SDimitry Andric } 420*0b57cec5SDimitry Andric 421*0b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly. 422*0b57cec5SDimitry Andric if (!Tok.needsCleaning()) { 423*0b57cec5SDimitry Andric Buffer = TokStart; 424*0b57cec5SDimitry Andric return Tok.getLength(); 425*0b57cec5SDimitry Andric } 426*0b57cec5SDimitry Andric 427*0b57cec5SDimitry Andric // Otherwise, hard case, relex the characters into the string. 428*0b57cec5SDimitry Andric return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); 429*0b57cec5SDimitry Andric } 430*0b57cec5SDimitry Andric 431*0b57cec5SDimitry Andric /// MeasureTokenLength - Relex the token at the specified location and return 432*0b57cec5SDimitry Andric /// its length in bytes in the input file. If the token needs cleaning (e.g. 433*0b57cec5SDimitry Andric /// includes a trigraph or an escaped newline) then this count includes bytes 434*0b57cec5SDimitry Andric /// that are part of that. 435*0b57cec5SDimitry Andric unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 436*0b57cec5SDimitry Andric const SourceManager &SM, 437*0b57cec5SDimitry Andric const LangOptions &LangOpts) { 438*0b57cec5SDimitry Andric Token TheTok; 439*0b57cec5SDimitry Andric if (getRawToken(Loc, TheTok, SM, LangOpts)) 440*0b57cec5SDimitry Andric return 0; 441*0b57cec5SDimitry Andric return TheTok.getLength(); 442*0b57cec5SDimitry Andric } 443*0b57cec5SDimitry Andric 444*0b57cec5SDimitry Andric /// Relex the token at the specified location. 
445*0b57cec5SDimitry Andric /// \returns true if there was a failure, false on success. 446*0b57cec5SDimitry Andric bool Lexer::getRawToken(SourceLocation Loc, Token &Result, 447*0b57cec5SDimitry Andric const SourceManager &SM, 448*0b57cec5SDimitry Andric const LangOptions &LangOpts, 449*0b57cec5SDimitry Andric bool IgnoreWhiteSpace) { 450*0b57cec5SDimitry Andric // TODO: this could be special cased for common tokens like identifiers, ')', 451*0b57cec5SDimitry Andric // etc to make this faster, if it mattered. Just look at StrData[0] to handle 452*0b57cec5SDimitry Andric // all obviously single-char tokens. This could use 453*0b57cec5SDimitry Andric // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 454*0b57cec5SDimitry Andric // something. 455*0b57cec5SDimitry Andric 456*0b57cec5SDimitry Andric // If this comes from a macro expansion, we really do want the macro name, not 457*0b57cec5SDimitry Andric // the token this macro expanded to. 458*0b57cec5SDimitry Andric Loc = SM.getExpansionLoc(Loc); 459*0b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 460*0b57cec5SDimitry Andric bool Invalid = false; 461*0b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 462*0b57cec5SDimitry Andric if (Invalid) 463*0b57cec5SDimitry Andric return true; 464*0b57cec5SDimitry Andric 465*0b57cec5SDimitry Andric const char *StrData = Buffer.data()+LocInfo.second; 466*0b57cec5SDimitry Andric 467*0b57cec5SDimitry Andric if (!IgnoreWhiteSpace && isWhitespace(StrData[0])) 468*0b57cec5SDimitry Andric return true; 469*0b57cec5SDimitry Andric 470*0b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token. 
471*0b57cec5SDimitry Andric Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 472*0b57cec5SDimitry Andric Buffer.begin(), StrData, Buffer.end()); 473*0b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 474*0b57cec5SDimitry Andric TheLexer.LexFromRawLexer(Result); 475*0b57cec5SDimitry Andric return false; 476*0b57cec5SDimitry Andric } 477*0b57cec5SDimitry Andric 478*0b57cec5SDimitry Andric /// Returns the pointer that points to the beginning of line that contains 479*0b57cec5SDimitry Andric /// the given offset, or null if the offset if invalid. 480*0b57cec5SDimitry Andric static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) { 481*0b57cec5SDimitry Andric const char *BufStart = Buffer.data(); 482*0b57cec5SDimitry Andric if (Offset >= Buffer.size()) 483*0b57cec5SDimitry Andric return nullptr; 484*0b57cec5SDimitry Andric 485*0b57cec5SDimitry Andric const char *LexStart = BufStart + Offset; 486*0b57cec5SDimitry Andric for (; LexStart != BufStart; --LexStart) { 487*0b57cec5SDimitry Andric if (isVerticalWhitespace(LexStart[0]) && 488*0b57cec5SDimitry Andric !Lexer::isNewLineEscaped(BufStart, LexStart)) { 489*0b57cec5SDimitry Andric // LexStart should point at first character of logical line. 
490*0b57cec5SDimitry Andric ++LexStart; 491*0b57cec5SDimitry Andric break; 492*0b57cec5SDimitry Andric } 493*0b57cec5SDimitry Andric } 494*0b57cec5SDimitry Andric return LexStart; 495*0b57cec5SDimitry Andric } 496*0b57cec5SDimitry Andric 497*0b57cec5SDimitry Andric static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 498*0b57cec5SDimitry Andric const SourceManager &SM, 499*0b57cec5SDimitry Andric const LangOptions &LangOpts) { 500*0b57cec5SDimitry Andric assert(Loc.isFileID()); 501*0b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 502*0b57cec5SDimitry Andric if (LocInfo.first.isInvalid()) 503*0b57cec5SDimitry Andric return Loc; 504*0b57cec5SDimitry Andric 505*0b57cec5SDimitry Andric bool Invalid = false; 506*0b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 507*0b57cec5SDimitry Andric if (Invalid) 508*0b57cec5SDimitry Andric return Loc; 509*0b57cec5SDimitry Andric 510*0b57cec5SDimitry Andric // Back up from the current location until we hit the beginning of a line 511*0b57cec5SDimitry Andric // (or the buffer). We'll relex from that point. 512*0b57cec5SDimitry Andric const char *StrData = Buffer.data() + LocInfo.second; 513*0b57cec5SDimitry Andric const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second); 514*0b57cec5SDimitry Andric if (!LexStart || LexStart == StrData) 515*0b57cec5SDimitry Andric return Loc; 516*0b57cec5SDimitry Andric 517*0b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token. 518*0b57cec5SDimitry Andric SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 519*0b57cec5SDimitry Andric Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart, 520*0b57cec5SDimitry Andric Buffer.end()); 521*0b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 522*0b57cec5SDimitry Andric 523*0b57cec5SDimitry Andric // Lex tokens until we find the token that contains the source location. 
524*0b57cec5SDimitry Andric Token TheTok; 525*0b57cec5SDimitry Andric do { 526*0b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 527*0b57cec5SDimitry Andric 528*0b57cec5SDimitry Andric if (TheLexer.getBufferLocation() > StrData) { 529*0b57cec5SDimitry Andric // Lexing this token has taken the lexer past the source location we're 530*0b57cec5SDimitry Andric // looking for. If the current token encompasses our source location, 531*0b57cec5SDimitry Andric // return the beginning of that token. 532*0b57cec5SDimitry Andric if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 533*0b57cec5SDimitry Andric return TheTok.getLocation(); 534*0b57cec5SDimitry Andric 535*0b57cec5SDimitry Andric // We ended up skipping over the source location entirely, which means 536*0b57cec5SDimitry Andric // that it points into whitespace. We're done here. 537*0b57cec5SDimitry Andric break; 538*0b57cec5SDimitry Andric } 539*0b57cec5SDimitry Andric } while (TheTok.getKind() != tok::eof); 540*0b57cec5SDimitry Andric 541*0b57cec5SDimitry Andric // We've passed our source location; just return the original source location. 
542*0b57cec5SDimitry Andric return Loc; 543*0b57cec5SDimitry Andric } 544*0b57cec5SDimitry Andric 545*0b57cec5SDimitry Andric SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 546*0b57cec5SDimitry Andric const SourceManager &SM, 547*0b57cec5SDimitry Andric const LangOptions &LangOpts) { 548*0b57cec5SDimitry Andric if (Loc.isFileID()) 549*0b57cec5SDimitry Andric return getBeginningOfFileToken(Loc, SM, LangOpts); 550*0b57cec5SDimitry Andric 551*0b57cec5SDimitry Andric if (!SM.isMacroArgExpansion(Loc)) 552*0b57cec5SDimitry Andric return Loc; 553*0b57cec5SDimitry Andric 554*0b57cec5SDimitry Andric SourceLocation FileLoc = SM.getSpellingLoc(Loc); 555*0b57cec5SDimitry Andric SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 556*0b57cec5SDimitry Andric std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 557*0b57cec5SDimitry Andric std::pair<FileID, unsigned> BeginFileLocInfo = 558*0b57cec5SDimitry Andric SM.getDecomposedLoc(BeginFileLoc); 559*0b57cec5SDimitry Andric assert(FileLocInfo.first == BeginFileLocInfo.first && 560*0b57cec5SDimitry Andric FileLocInfo.second >= BeginFileLocInfo.second); 561*0b57cec5SDimitry Andric return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); 562*0b57cec5SDimitry Andric } 563*0b57cec5SDimitry Andric 564*0b57cec5SDimitry Andric namespace { 565*0b57cec5SDimitry Andric 566*0b57cec5SDimitry Andric enum PreambleDirectiveKind { 567*0b57cec5SDimitry Andric PDK_Skipped, 568*0b57cec5SDimitry Andric PDK_Unknown 569*0b57cec5SDimitry Andric }; 570*0b57cec5SDimitry Andric 571*0b57cec5SDimitry Andric } // namespace 572*0b57cec5SDimitry Andric 573*0b57cec5SDimitry Andric PreambleBounds Lexer::ComputePreamble(StringRef Buffer, 574*0b57cec5SDimitry Andric const LangOptions &LangOpts, 575*0b57cec5SDimitry Andric unsigned MaxLines) { 576*0b57cec5SDimitry Andric // Create a lexer starting at the beginning of the file. 
Note that we use a 577*0b57cec5SDimitry Andric // "fake" file source location at offset 1 so that the lexer will track our 578*0b57cec5SDimitry Andric // position within the file. 579*0b57cec5SDimitry Andric const unsigned StartOffset = 1; 580*0b57cec5SDimitry Andric SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset); 581*0b57cec5SDimitry Andric Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(), 582*0b57cec5SDimitry Andric Buffer.end()); 583*0b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true); 584*0b57cec5SDimitry Andric 585*0b57cec5SDimitry Andric bool InPreprocessorDirective = false; 586*0b57cec5SDimitry Andric Token TheTok; 587*0b57cec5SDimitry Andric SourceLocation ActiveCommentLoc; 588*0b57cec5SDimitry Andric 589*0b57cec5SDimitry Andric unsigned MaxLineOffset = 0; 590*0b57cec5SDimitry Andric if (MaxLines) { 591*0b57cec5SDimitry Andric const char *CurPtr = Buffer.begin(); 592*0b57cec5SDimitry Andric unsigned CurLine = 0; 593*0b57cec5SDimitry Andric while (CurPtr != Buffer.end()) { 594*0b57cec5SDimitry Andric char ch = *CurPtr++; 595*0b57cec5SDimitry Andric if (ch == '\n') { 596*0b57cec5SDimitry Andric ++CurLine; 597*0b57cec5SDimitry Andric if (CurLine == MaxLines) 598*0b57cec5SDimitry Andric break; 599*0b57cec5SDimitry Andric } 600*0b57cec5SDimitry Andric } 601*0b57cec5SDimitry Andric if (CurPtr != Buffer.end()) 602*0b57cec5SDimitry Andric MaxLineOffset = CurPtr - Buffer.begin(); 603*0b57cec5SDimitry Andric } 604*0b57cec5SDimitry Andric 605*0b57cec5SDimitry Andric do { 606*0b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 607*0b57cec5SDimitry Andric 608*0b57cec5SDimitry Andric if (InPreprocessorDirective) { 609*0b57cec5SDimitry Andric // If we've hit the end of the file, we're done. 
610*0b57cec5SDimitry Andric if (TheTok.getKind() == tok::eof) { 611*0b57cec5SDimitry Andric break; 612*0b57cec5SDimitry Andric } 613*0b57cec5SDimitry Andric 614*0b57cec5SDimitry Andric // If we haven't hit the end of the preprocessor directive, skip this 615*0b57cec5SDimitry Andric // token. 616*0b57cec5SDimitry Andric if (!TheTok.isAtStartOfLine()) 617*0b57cec5SDimitry Andric continue; 618*0b57cec5SDimitry Andric 619*0b57cec5SDimitry Andric // We've passed the end of the preprocessor directive, and will look 620*0b57cec5SDimitry Andric // at this token again below. 621*0b57cec5SDimitry Andric InPreprocessorDirective = false; 622*0b57cec5SDimitry Andric } 623*0b57cec5SDimitry Andric 624*0b57cec5SDimitry Andric // Keep track of the # of lines in the preamble. 625*0b57cec5SDimitry Andric if (TheTok.isAtStartOfLine()) { 626*0b57cec5SDimitry Andric unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 627*0b57cec5SDimitry Andric 628*0b57cec5SDimitry Andric // If we were asked to limit the number of lines in the preamble, 629*0b57cec5SDimitry Andric // and we're about to exceed that limit, we're done. 630*0b57cec5SDimitry Andric if (MaxLineOffset && TokOffset >= MaxLineOffset) 631*0b57cec5SDimitry Andric break; 632*0b57cec5SDimitry Andric } 633*0b57cec5SDimitry Andric 634*0b57cec5SDimitry Andric // Comments are okay; skip over them. 635*0b57cec5SDimitry Andric if (TheTok.getKind() == tok::comment) { 636*0b57cec5SDimitry Andric if (ActiveCommentLoc.isInvalid()) 637*0b57cec5SDimitry Andric ActiveCommentLoc = TheTok.getLocation(); 638*0b57cec5SDimitry Andric continue; 639*0b57cec5SDimitry Andric } 640*0b57cec5SDimitry Andric 641*0b57cec5SDimitry Andric if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 642*0b57cec5SDimitry Andric // This is the start of a preprocessor directive. 
643*0b57cec5SDimitry Andric Token HashTok = TheTok; 644*0b57cec5SDimitry Andric InPreprocessorDirective = true; 645*0b57cec5SDimitry Andric ActiveCommentLoc = SourceLocation(); 646*0b57cec5SDimitry Andric 647*0b57cec5SDimitry Andric // Figure out which directive this is. Since we're lexing raw tokens, 648*0b57cec5SDimitry Andric // we don't have an identifier table available. Instead, just look at 649*0b57cec5SDimitry Andric // the raw identifier to recognize and categorize preprocessor directives. 650*0b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok); 651*0b57cec5SDimitry Andric if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 652*0b57cec5SDimitry Andric StringRef Keyword = TheTok.getRawIdentifier(); 653*0b57cec5SDimitry Andric PreambleDirectiveKind PDK 654*0b57cec5SDimitry Andric = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 655*0b57cec5SDimitry Andric .Case("include", PDK_Skipped) 656*0b57cec5SDimitry Andric .Case("__include_macros", PDK_Skipped) 657*0b57cec5SDimitry Andric .Case("define", PDK_Skipped) 658*0b57cec5SDimitry Andric .Case("undef", PDK_Skipped) 659*0b57cec5SDimitry Andric .Case("line", PDK_Skipped) 660*0b57cec5SDimitry Andric .Case("error", PDK_Skipped) 661*0b57cec5SDimitry Andric .Case("pragma", PDK_Skipped) 662*0b57cec5SDimitry Andric .Case("import", PDK_Skipped) 663*0b57cec5SDimitry Andric .Case("include_next", PDK_Skipped) 664*0b57cec5SDimitry Andric .Case("warning", PDK_Skipped) 665*0b57cec5SDimitry Andric .Case("ident", PDK_Skipped) 666*0b57cec5SDimitry Andric .Case("sccs", PDK_Skipped) 667*0b57cec5SDimitry Andric .Case("assert", PDK_Skipped) 668*0b57cec5SDimitry Andric .Case("unassert", PDK_Skipped) 669*0b57cec5SDimitry Andric .Case("if", PDK_Skipped) 670*0b57cec5SDimitry Andric .Case("ifdef", PDK_Skipped) 671*0b57cec5SDimitry Andric .Case("ifndef", PDK_Skipped) 672*0b57cec5SDimitry Andric .Case("elif", PDK_Skipped) 673*0b57cec5SDimitry Andric .Case("else", PDK_Skipped) 674*0b57cec5SDimitry Andric 
.Case("endif", PDK_Skipped) 675*0b57cec5SDimitry Andric .Default(PDK_Unknown); 676*0b57cec5SDimitry Andric 677*0b57cec5SDimitry Andric switch (PDK) { 678*0b57cec5SDimitry Andric case PDK_Skipped: 679*0b57cec5SDimitry Andric continue; 680*0b57cec5SDimitry Andric 681*0b57cec5SDimitry Andric case PDK_Unknown: 682*0b57cec5SDimitry Andric // We don't know what this directive is; stop at the '#'. 683*0b57cec5SDimitry Andric break; 684*0b57cec5SDimitry Andric } 685*0b57cec5SDimitry Andric } 686*0b57cec5SDimitry Andric 687*0b57cec5SDimitry Andric // We only end up here if we didn't recognize the preprocessor 688*0b57cec5SDimitry Andric // directive or it was one that can't occur in the preamble at this 689*0b57cec5SDimitry Andric // point. Roll back the current token to the location of the '#'. 690*0b57cec5SDimitry Andric TheTok = HashTok; 691*0b57cec5SDimitry Andric } 692*0b57cec5SDimitry Andric 693*0b57cec5SDimitry Andric // We hit a token that we don't recognize as being in the 694*0b57cec5SDimitry Andric // "preprocessing only" part of the file, so we're no longer in 695*0b57cec5SDimitry Andric // the preamble. 696*0b57cec5SDimitry Andric break; 697*0b57cec5SDimitry Andric } while (true); 698*0b57cec5SDimitry Andric 699*0b57cec5SDimitry Andric SourceLocation End; 700*0b57cec5SDimitry Andric if (ActiveCommentLoc.isValid()) 701*0b57cec5SDimitry Andric End = ActiveCommentLoc; // don't truncate a decl comment. 
702*0b57cec5SDimitry Andric else 703*0b57cec5SDimitry Andric End = TheTok.getLocation(); 704*0b57cec5SDimitry Andric 705*0b57cec5SDimitry Andric return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(), 706*0b57cec5SDimitry Andric TheTok.isAtStartOfLine()); 707*0b57cec5SDimitry Andric } 708*0b57cec5SDimitry Andric 709*0b57cec5SDimitry Andric unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo, 710*0b57cec5SDimitry Andric const SourceManager &SM, 711*0b57cec5SDimitry Andric const LangOptions &LangOpts) { 712*0b57cec5SDimitry Andric // Figure out how many physical characters away the specified expansion 713*0b57cec5SDimitry Andric // character is. This needs to take into consideration newlines and 714*0b57cec5SDimitry Andric // trigraphs. 715*0b57cec5SDimitry Andric bool Invalid = false; 716*0b57cec5SDimitry Andric const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 717*0b57cec5SDimitry Andric 718*0b57cec5SDimitry Andric // If they request the first char of the token, we're trivially done. 719*0b57cec5SDimitry Andric if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 720*0b57cec5SDimitry Andric return 0; 721*0b57cec5SDimitry Andric 722*0b57cec5SDimitry Andric unsigned PhysOffset = 0; 723*0b57cec5SDimitry Andric 724*0b57cec5SDimitry Andric // The usual case is that tokens don't contain anything interesting. Skip 725*0b57cec5SDimitry Andric // over the uninteresting characters. If a token only consists of simple 726*0b57cec5SDimitry Andric // chars, this method is extremely fast. 
727*0b57cec5SDimitry Andric while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 728*0b57cec5SDimitry Andric if (CharNo == 0) 729*0b57cec5SDimitry Andric return PhysOffset; 730*0b57cec5SDimitry Andric ++TokPtr; 731*0b57cec5SDimitry Andric --CharNo; 732*0b57cec5SDimitry Andric ++PhysOffset; 733*0b57cec5SDimitry Andric } 734*0b57cec5SDimitry Andric 735*0b57cec5SDimitry Andric // If we have a character that may be a trigraph or escaped newline, use a 736*0b57cec5SDimitry Andric // lexer to parse it correctly. 737*0b57cec5SDimitry Andric for (; CharNo; --CharNo) { 738*0b57cec5SDimitry Andric unsigned Size; 739*0b57cec5SDimitry Andric Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts); 740*0b57cec5SDimitry Andric TokPtr += Size; 741*0b57cec5SDimitry Andric PhysOffset += Size; 742*0b57cec5SDimitry Andric } 743*0b57cec5SDimitry Andric 744*0b57cec5SDimitry Andric // Final detail: if we end up on an escaped newline, we want to return the 745*0b57cec5SDimitry Andric // location of the actual byte of the token. For example foo\<newline>bar 746*0b57cec5SDimitry Andric // advanced by 3 should return the location of b, not of \\. One compounding 747*0b57cec5SDimitry Andric // detail of this is that the escape may be made by a trigraph. 748*0b57cec5SDimitry Andric if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 749*0b57cec5SDimitry Andric PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 750*0b57cec5SDimitry Andric 751*0b57cec5SDimitry Andric return PhysOffset; 752*0b57cec5SDimitry Andric } 753*0b57cec5SDimitry Andric 754*0b57cec5SDimitry Andric /// Computes the source location just past the end of the 755*0b57cec5SDimitry Andric /// token at this source location. 
756*0b57cec5SDimitry Andric /// 757*0b57cec5SDimitry Andric /// This routine can be used to produce a source location that 758*0b57cec5SDimitry Andric /// points just past the end of the token referenced by \p Loc, and 759*0b57cec5SDimitry Andric /// is generally used when a diagnostic needs to point just after a 760*0b57cec5SDimitry Andric /// token where it expected something different that it received. If 761*0b57cec5SDimitry Andric /// the returned source location would not be meaningful (e.g., if 762*0b57cec5SDimitry Andric /// it points into a macro), this routine returns an invalid 763*0b57cec5SDimitry Andric /// source location. 764*0b57cec5SDimitry Andric /// 765*0b57cec5SDimitry Andric /// \param Offset an offset from the end of the token, where the source 766*0b57cec5SDimitry Andric /// location should refer to. The default offset (0) produces a source 767*0b57cec5SDimitry Andric /// location pointing just past the end of the token; an offset of 1 produces 768*0b57cec5SDimitry Andric /// a source location pointing to the last character in the token, etc. 769*0b57cec5SDimitry Andric SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 770*0b57cec5SDimitry Andric const SourceManager &SM, 771*0b57cec5SDimitry Andric const LangOptions &LangOpts) { 772*0b57cec5SDimitry Andric if (Loc.isInvalid()) 773*0b57cec5SDimitry Andric return {}; 774*0b57cec5SDimitry Andric 775*0b57cec5SDimitry Andric if (Loc.isMacroID()) { 776*0b57cec5SDimitry Andric if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 777*0b57cec5SDimitry Andric return {}; // Points inside the macro expansion. 
778*0b57cec5SDimitry Andric } 779*0b57cec5SDimitry Andric 780*0b57cec5SDimitry Andric unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 781*0b57cec5SDimitry Andric if (Len > Offset) 782*0b57cec5SDimitry Andric Len = Len - Offset; 783*0b57cec5SDimitry Andric else 784*0b57cec5SDimitry Andric return Loc; 785*0b57cec5SDimitry Andric 786*0b57cec5SDimitry Andric return Loc.getLocWithOffset(Len); 787*0b57cec5SDimitry Andric } 788*0b57cec5SDimitry Andric 789*0b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the first 790*0b57cec5SDimitry Andric /// token of the macro expansion. 791*0b57cec5SDimitry Andric bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 792*0b57cec5SDimitry Andric const SourceManager &SM, 793*0b57cec5SDimitry Andric const LangOptions &LangOpts, 794*0b57cec5SDimitry Andric SourceLocation *MacroBegin) { 795*0b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 796*0b57cec5SDimitry Andric 797*0b57cec5SDimitry Andric SourceLocation expansionLoc; 798*0b57cec5SDimitry Andric if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc)) 799*0b57cec5SDimitry Andric return false; 800*0b57cec5SDimitry Andric 801*0b57cec5SDimitry Andric if (expansionLoc.isFileID()) { 802*0b57cec5SDimitry Andric // No other macro expansions, this is the first. 803*0b57cec5SDimitry Andric if (MacroBegin) 804*0b57cec5SDimitry Andric *MacroBegin = expansionLoc; 805*0b57cec5SDimitry Andric return true; 806*0b57cec5SDimitry Andric } 807*0b57cec5SDimitry Andric 808*0b57cec5SDimitry Andric return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); 809*0b57cec5SDimitry Andric } 810*0b57cec5SDimitry Andric 811*0b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the last 812*0b57cec5SDimitry Andric /// token of the macro expansion. 
813*0b57cec5SDimitry Andric bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 814*0b57cec5SDimitry Andric const SourceManager &SM, 815*0b57cec5SDimitry Andric const LangOptions &LangOpts, 816*0b57cec5SDimitry Andric SourceLocation *MacroEnd) { 817*0b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 818*0b57cec5SDimitry Andric 819*0b57cec5SDimitry Andric SourceLocation spellLoc = SM.getSpellingLoc(loc); 820*0b57cec5SDimitry Andric unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 821*0b57cec5SDimitry Andric if (tokLen == 0) 822*0b57cec5SDimitry Andric return false; 823*0b57cec5SDimitry Andric 824*0b57cec5SDimitry Andric SourceLocation afterLoc = loc.getLocWithOffset(tokLen); 825*0b57cec5SDimitry Andric SourceLocation expansionLoc; 826*0b57cec5SDimitry Andric if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc)) 827*0b57cec5SDimitry Andric return false; 828*0b57cec5SDimitry Andric 829*0b57cec5SDimitry Andric if (expansionLoc.isFileID()) { 830*0b57cec5SDimitry Andric // No other macro expansions. 
831*0b57cec5SDimitry Andric if (MacroEnd) 832*0b57cec5SDimitry Andric *MacroEnd = expansionLoc; 833*0b57cec5SDimitry Andric return true; 834*0b57cec5SDimitry Andric } 835*0b57cec5SDimitry Andric 836*0b57cec5SDimitry Andric return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 837*0b57cec5SDimitry Andric } 838*0b57cec5SDimitry Andric 839*0b57cec5SDimitry Andric static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 840*0b57cec5SDimitry Andric const SourceManager &SM, 841*0b57cec5SDimitry Andric const LangOptions &LangOpts) { 842*0b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin(); 843*0b57cec5SDimitry Andric SourceLocation End = Range.getEnd(); 844*0b57cec5SDimitry Andric assert(Begin.isFileID() && End.isFileID()); 845*0b57cec5SDimitry Andric if (Range.isTokenRange()) { 846*0b57cec5SDimitry Andric End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 847*0b57cec5SDimitry Andric if (End.isInvalid()) 848*0b57cec5SDimitry Andric return {}; 849*0b57cec5SDimitry Andric } 850*0b57cec5SDimitry Andric 851*0b57cec5SDimitry Andric // Break down the source locations. 
852*0b57cec5SDimitry Andric FileID FID; 853*0b57cec5SDimitry Andric unsigned BeginOffs; 854*0b57cec5SDimitry Andric std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 855*0b57cec5SDimitry Andric if (FID.isInvalid()) 856*0b57cec5SDimitry Andric return {}; 857*0b57cec5SDimitry Andric 858*0b57cec5SDimitry Andric unsigned EndOffs; 859*0b57cec5SDimitry Andric if (!SM.isInFileID(End, FID, &EndOffs) || 860*0b57cec5SDimitry Andric BeginOffs > EndOffs) 861*0b57cec5SDimitry Andric return {}; 862*0b57cec5SDimitry Andric 863*0b57cec5SDimitry Andric return CharSourceRange::getCharRange(Begin, End); 864*0b57cec5SDimitry Andric } 865*0b57cec5SDimitry Andric 866*0b57cec5SDimitry Andric CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, 867*0b57cec5SDimitry Andric const SourceManager &SM, 868*0b57cec5SDimitry Andric const LangOptions &LangOpts) { 869*0b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin(); 870*0b57cec5SDimitry Andric SourceLocation End = Range.getEnd(); 871*0b57cec5SDimitry Andric if (Begin.isInvalid() || End.isInvalid()) 872*0b57cec5SDimitry Andric return {}; 873*0b57cec5SDimitry Andric 874*0b57cec5SDimitry Andric if (Begin.isFileID() && End.isFileID()) 875*0b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 876*0b57cec5SDimitry Andric 877*0b57cec5SDimitry Andric if (Begin.isMacroID() && End.isFileID()) { 878*0b57cec5SDimitry Andric if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) 879*0b57cec5SDimitry Andric return {}; 880*0b57cec5SDimitry Andric Range.setBegin(Begin); 881*0b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 882*0b57cec5SDimitry Andric } 883*0b57cec5SDimitry Andric 884*0b57cec5SDimitry Andric if (Begin.isFileID() && End.isMacroID()) { 885*0b57cec5SDimitry Andric if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts, 886*0b57cec5SDimitry Andric &End)) || 887*0b57cec5SDimitry Andric (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts, 
888*0b57cec5SDimitry Andric &End))) 889*0b57cec5SDimitry Andric return {}; 890*0b57cec5SDimitry Andric Range.setEnd(End); 891*0b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 892*0b57cec5SDimitry Andric } 893*0b57cec5SDimitry Andric 894*0b57cec5SDimitry Andric assert(Begin.isMacroID() && End.isMacroID()); 895*0b57cec5SDimitry Andric SourceLocation MacroBegin, MacroEnd; 896*0b57cec5SDimitry Andric if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) && 897*0b57cec5SDimitry Andric ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts, 898*0b57cec5SDimitry Andric &MacroEnd)) || 899*0b57cec5SDimitry Andric (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts, 900*0b57cec5SDimitry Andric &MacroEnd)))) { 901*0b57cec5SDimitry Andric Range.setBegin(MacroBegin); 902*0b57cec5SDimitry Andric Range.setEnd(MacroEnd); 903*0b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts); 904*0b57cec5SDimitry Andric } 905*0b57cec5SDimitry Andric 906*0b57cec5SDimitry Andric bool Invalid = false; 907*0b57cec5SDimitry Andric const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin), 908*0b57cec5SDimitry Andric &Invalid); 909*0b57cec5SDimitry Andric if (Invalid) 910*0b57cec5SDimitry Andric return {}; 911*0b57cec5SDimitry Andric 912*0b57cec5SDimitry Andric if (BeginEntry.getExpansion().isMacroArgExpansion()) { 913*0b57cec5SDimitry Andric const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End), 914*0b57cec5SDimitry Andric &Invalid); 915*0b57cec5SDimitry Andric if (Invalid) 916*0b57cec5SDimitry Andric return {}; 917*0b57cec5SDimitry Andric 918*0b57cec5SDimitry Andric if (EndEntry.getExpansion().isMacroArgExpansion() && 919*0b57cec5SDimitry Andric BeginEntry.getExpansion().getExpansionLocStart() == 920*0b57cec5SDimitry Andric EndEntry.getExpansion().getExpansionLocStart()) { 921*0b57cec5SDimitry Andric Range.setBegin(SM.getImmediateSpellingLoc(Begin)); 922*0b57cec5SDimitry Andric 
Range.setEnd(SM.getImmediateSpellingLoc(End)); 923*0b57cec5SDimitry Andric return makeFileCharRange(Range, SM, LangOpts); 924*0b57cec5SDimitry Andric } 925*0b57cec5SDimitry Andric } 926*0b57cec5SDimitry Andric 927*0b57cec5SDimitry Andric return {}; 928*0b57cec5SDimitry Andric } 929*0b57cec5SDimitry Andric 930*0b57cec5SDimitry Andric StringRef Lexer::getSourceText(CharSourceRange Range, 931*0b57cec5SDimitry Andric const SourceManager &SM, 932*0b57cec5SDimitry Andric const LangOptions &LangOpts, 933*0b57cec5SDimitry Andric bool *Invalid) { 934*0b57cec5SDimitry Andric Range = makeFileCharRange(Range, SM, LangOpts); 935*0b57cec5SDimitry Andric if (Range.isInvalid()) { 936*0b57cec5SDimitry Andric if (Invalid) *Invalid = true; 937*0b57cec5SDimitry Andric return {}; 938*0b57cec5SDimitry Andric } 939*0b57cec5SDimitry Andric 940*0b57cec5SDimitry Andric // Break down the source location. 941*0b57cec5SDimitry Andric std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin()); 942*0b57cec5SDimitry Andric if (beginInfo.first.isInvalid()) { 943*0b57cec5SDimitry Andric if (Invalid) *Invalid = true; 944*0b57cec5SDimitry Andric return {}; 945*0b57cec5SDimitry Andric } 946*0b57cec5SDimitry Andric 947*0b57cec5SDimitry Andric unsigned EndOffs; 948*0b57cec5SDimitry Andric if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) || 949*0b57cec5SDimitry Andric beginInfo.second > EndOffs) { 950*0b57cec5SDimitry Andric if (Invalid) *Invalid = true; 951*0b57cec5SDimitry Andric return {}; 952*0b57cec5SDimitry Andric } 953*0b57cec5SDimitry Andric 954*0b57cec5SDimitry Andric // Try to the load the file buffer. 
955*0b57cec5SDimitry Andric bool invalidTemp = false; 956*0b57cec5SDimitry Andric StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp); 957*0b57cec5SDimitry Andric if (invalidTemp) { 958*0b57cec5SDimitry Andric if (Invalid) *Invalid = true; 959*0b57cec5SDimitry Andric return {}; 960*0b57cec5SDimitry Andric } 961*0b57cec5SDimitry Andric 962*0b57cec5SDimitry Andric if (Invalid) *Invalid = false; 963*0b57cec5SDimitry Andric return file.substr(beginInfo.second, EndOffs - beginInfo.second); 964*0b57cec5SDimitry Andric } 965*0b57cec5SDimitry Andric 966*0b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroName(SourceLocation Loc, 967*0b57cec5SDimitry Andric const SourceManager &SM, 968*0b57cec5SDimitry Andric const LangOptions &LangOpts) { 969*0b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 970*0b57cec5SDimitry Andric 971*0b57cec5SDimitry Andric // Find the location of the immediate macro expansion. 972*0b57cec5SDimitry Andric while (true) { 973*0b57cec5SDimitry Andric FileID FID = SM.getFileID(Loc); 974*0b57cec5SDimitry Andric const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); 975*0b57cec5SDimitry Andric const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); 976*0b57cec5SDimitry Andric Loc = Expansion.getExpansionLocStart(); 977*0b57cec5SDimitry Andric if (!Expansion.isMacroArgExpansion()) 978*0b57cec5SDimitry Andric break; 979*0b57cec5SDimitry Andric 980*0b57cec5SDimitry Andric // For macro arguments we need to check that the argument did not come 981*0b57cec5SDimitry Andric // from an inner macro, e.g: "MAC1( MAC2(foo) )" 982*0b57cec5SDimitry Andric 983*0b57cec5SDimitry Andric // Loc points to the argument id of the macro definition, move to the 984*0b57cec5SDimitry Andric // macro expansion. 
985*0b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 986*0b57cec5SDimitry Andric SourceLocation SpellLoc = Expansion.getSpellingLoc(); 987*0b57cec5SDimitry Andric if (SpellLoc.isFileID()) 988*0b57cec5SDimitry Andric break; // No inner macro. 989*0b57cec5SDimitry Andric 990*0b57cec5SDimitry Andric // If spelling location resides in the same FileID as macro expansion 991*0b57cec5SDimitry Andric // location, it means there is no inner macro. 992*0b57cec5SDimitry Andric FileID MacroFID = SM.getFileID(Loc); 993*0b57cec5SDimitry Andric if (SM.isInFileID(SpellLoc, MacroFID)) 994*0b57cec5SDimitry Andric break; 995*0b57cec5SDimitry Andric 996*0b57cec5SDimitry Andric // Argument came from inner macro. 997*0b57cec5SDimitry Andric Loc = SpellLoc; 998*0b57cec5SDimitry Andric } 999*0b57cec5SDimitry Andric 1000*0b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion 1001*0b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin 1002*0b57cec5SDimitry Andric // expanding this macro. 1003*0b57cec5SDimitry Andric Loc = SM.getSpellingLoc(Loc); 1004*0b57cec5SDimitry Andric 1005*0b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the 1006*0b57cec5SDimitry Andric // name so that we can render it into the expansion note. 
1007*0b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 1008*0b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 1009*0b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 1010*0b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 1011*0b57cec5SDimitry Andric } 1012*0b57cec5SDimitry Andric 1013*0b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroNameForDiagnostics( 1014*0b57cec5SDimitry Andric SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) { 1015*0b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros"); 1016*0b57cec5SDimitry Andric // Walk past macro argument expansions. 1017*0b57cec5SDimitry Andric while (SM.isMacroArgExpansion(Loc)) 1018*0b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin(); 1019*0b57cec5SDimitry Andric 1020*0b57cec5SDimitry Andric // If the macro's spelling has no FileID, then it's actually a token paste 1021*0b57cec5SDimitry Andric // or stringization (or similar) and not a macro at all. 1022*0b57cec5SDimitry Andric if (!SM.getFileEntryForID(SM.getFileID(SM.getSpellingLoc(Loc)))) 1023*0b57cec5SDimitry Andric return {}; 1024*0b57cec5SDimitry Andric 1025*0b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion 1026*0b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin 1027*0b57cec5SDimitry Andric // expanding this macro. 1028*0b57cec5SDimitry Andric Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin()); 1029*0b57cec5SDimitry Andric 1030*0b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the 1031*0b57cec5SDimitry Andric // name so that we can render it into the expansion note. 
1032*0b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 1033*0b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 1034*0b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 1035*0b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 1036*0b57cec5SDimitry Andric } 1037*0b57cec5SDimitry Andric 1038*0b57cec5SDimitry Andric bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) { 1039*0b57cec5SDimitry Andric return isIdentifierBody(c, LangOpts.DollarIdents); 1040*0b57cec5SDimitry Andric } 1041*0b57cec5SDimitry Andric 1042*0b57cec5SDimitry Andric bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) { 1043*0b57cec5SDimitry Andric assert(isVerticalWhitespace(Str[0])); 1044*0b57cec5SDimitry Andric if (Str - 1 < BufferStart) 1045*0b57cec5SDimitry Andric return false; 1046*0b57cec5SDimitry Andric 1047*0b57cec5SDimitry Andric if ((Str[0] == '\n' && Str[-1] == '\r') || 1048*0b57cec5SDimitry Andric (Str[0] == '\r' && Str[-1] == '\n')) { 1049*0b57cec5SDimitry Andric if (Str - 2 < BufferStart) 1050*0b57cec5SDimitry Andric return false; 1051*0b57cec5SDimitry Andric --Str; 1052*0b57cec5SDimitry Andric } 1053*0b57cec5SDimitry Andric --Str; 1054*0b57cec5SDimitry Andric 1055*0b57cec5SDimitry Andric // Rewind to first non-space character: 1056*0b57cec5SDimitry Andric while (Str > BufferStart && isHorizontalWhitespace(*Str)) 1057*0b57cec5SDimitry Andric --Str; 1058*0b57cec5SDimitry Andric 1059*0b57cec5SDimitry Andric return *Str == '\\'; 1060*0b57cec5SDimitry Andric } 1061*0b57cec5SDimitry Andric 1062*0b57cec5SDimitry Andric StringRef Lexer::getIndentationForLine(SourceLocation Loc, 1063*0b57cec5SDimitry Andric const SourceManager &SM) { 1064*0b57cec5SDimitry Andric if (Loc.isInvalid() || Loc.isMacroID()) 1065*0b57cec5SDimitry Andric return {}; 1066*0b57cec5SDimitry Andric std::pair<FileID, 
unsigned> LocInfo = SM.getDecomposedLoc(Loc); 1067*0b57cec5SDimitry Andric if (LocInfo.first.isInvalid()) 1068*0b57cec5SDimitry Andric return {}; 1069*0b57cec5SDimitry Andric bool Invalid = false; 1070*0b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 1071*0b57cec5SDimitry Andric if (Invalid) 1072*0b57cec5SDimitry Andric return {}; 1073*0b57cec5SDimitry Andric const char *Line = findBeginningOfLine(Buffer, LocInfo.second); 1074*0b57cec5SDimitry Andric if (!Line) 1075*0b57cec5SDimitry Andric return {}; 1076*0b57cec5SDimitry Andric StringRef Rest = Buffer.substr(Line - Buffer.data()); 1077*0b57cec5SDimitry Andric size_t NumWhitespaceChars = Rest.find_first_not_of(" \t"); 1078*0b57cec5SDimitry Andric return NumWhitespaceChars == StringRef::npos 1079*0b57cec5SDimitry Andric ? "" 1080*0b57cec5SDimitry Andric : Rest.take_front(NumWhitespaceChars); 1081*0b57cec5SDimitry Andric } 1082*0b57cec5SDimitry Andric 1083*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 1084*0b57cec5SDimitry Andric // Diagnostics forwarding code. 1085*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 1086*0b57cec5SDimitry Andric 1087*0b57cec5SDimitry Andric /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 1088*0b57cec5SDimitry Andric /// lexer buffer was all expanded at a single point, perform the mapping. 1089*0b57cec5SDimitry Andric /// This is currently only used for _Pragma implementation, so it is the slow 1090*0b57cec5SDimitry Andric /// path of the hot getSourceLocation method. Do not allow it to be inlined. 
1091*0b57cec5SDimitry Andric static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 1092*0b57cec5SDimitry Andric Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 1093*0b57cec5SDimitry Andric static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 1094*0b57cec5SDimitry Andric SourceLocation FileLoc, 1095*0b57cec5SDimitry Andric unsigned CharNo, unsigned TokLen) { 1096*0b57cec5SDimitry Andric assert(FileLoc.isMacroID() && "Must be a macro expansion"); 1097*0b57cec5SDimitry Andric 1098*0b57cec5SDimitry Andric // Otherwise, we're lexing "mapped tokens". This is used for things like 1099*0b57cec5SDimitry Andric // _Pragma handling. Combine the expansion location of FileLoc with the 1100*0b57cec5SDimitry Andric // spelling location. 1101*0b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager(); 1102*0b57cec5SDimitry Andric 1103*0b57cec5SDimitry Andric // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 1104*0b57cec5SDimitry Andric // characters come from spelling(FileLoc)+Offset. 1105*0b57cec5SDimitry Andric SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 1106*0b57cec5SDimitry Andric SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 1107*0b57cec5SDimitry Andric 1108*0b57cec5SDimitry Andric // Figure out the expansion loc range, which is the range covered by the 1109*0b57cec5SDimitry Andric // original _Pragma(...) sequence. 1110*0b57cec5SDimitry Andric CharSourceRange II = SM.getImmediateExpansionRange(FileLoc); 1111*0b57cec5SDimitry Andric 1112*0b57cec5SDimitry Andric return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen); 1113*0b57cec5SDimitry Andric } 1114*0b57cec5SDimitry Andric 1115*0b57cec5SDimitry Andric /// getSourceLocation - Return a source location identifier for the specified 1116*0b57cec5SDimitry Andric /// offset in the current file. 
1117*0b57cec5SDimitry Andric SourceLocation Lexer::getSourceLocation(const char *Loc, 1118*0b57cec5SDimitry Andric unsigned TokLen) const { 1119*0b57cec5SDimitry Andric assert(Loc >= BufferStart && Loc <= BufferEnd && 1120*0b57cec5SDimitry Andric "Location out of range for this buffer!"); 1121*0b57cec5SDimitry Andric 1122*0b57cec5SDimitry Andric // In the normal case, we're just lexing from a simple file buffer, return 1123*0b57cec5SDimitry Andric // the file id from FileLoc with the offset specified. 1124*0b57cec5SDimitry Andric unsigned CharNo = Loc-BufferStart; 1125*0b57cec5SDimitry Andric if (FileLoc.isFileID()) 1126*0b57cec5SDimitry Andric return FileLoc.getLocWithOffset(CharNo); 1127*0b57cec5SDimitry Andric 1128*0b57cec5SDimitry Andric // Otherwise, this is the _Pragma lexer case, which pretends that all of the 1129*0b57cec5SDimitry Andric // tokens are lexed from where the _Pragma was defined. 1130*0b57cec5SDimitry Andric assert(PP && "This doesn't work on raw lexers"); 1131*0b57cec5SDimitry Andric return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 1132*0b57cec5SDimitry Andric } 1133*0b57cec5SDimitry Andric 1134*0b57cec5SDimitry Andric /// Diag - Forwarding function for diagnostics. This translate a source 1135*0b57cec5SDimitry Andric /// position in the current buffer into a SourceLocation object for rendering. 1136*0b57cec5SDimitry Andric DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 1137*0b57cec5SDimitry Andric return PP->Diag(getSourceLocation(Loc), DiagID); 1138*0b57cec5SDimitry Andric } 1139*0b57cec5SDimitry Andric 1140*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 1141*0b57cec5SDimitry Andric // Trigraph and Escaped Newline Handling Code. 
//===----------------------------------------------------------------------===//

/// GetTrigraphCharForLetter - Map the third character of a "??x" sequence to
/// the character the trigraph stands for.
///
/// \param Letter the character that follows the "??" pair.
/// \returns the decoded character, or '\0' when \p Letter does not complete
///          one of the nine trigraphs.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  case '=':
    return '#';
  case '(':
    return '[';
  case '/':
    return '\\';
  case ')':
    return ']';
  case '\'':
    return '^';
  case '<':
    return '{';
  case '!':
    return '|';
  case '>':
    return '}';
  case '-':
    return '~';
  default:
    return '\0';
  }
}

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
/// return the result character.  Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.
1165*0b57cec5SDimitry Andric static char DecodeTrigraphChar(const char *CP, Lexer *L) { 1166*0b57cec5SDimitry Andric char Res = GetTrigraphCharForLetter(*CP); 1167*0b57cec5SDimitry Andric if (!Res || !L) return Res; 1168*0b57cec5SDimitry Andric 1169*0b57cec5SDimitry Andric if (!L->getLangOpts().Trigraphs) { 1170*0b57cec5SDimitry Andric if (!L->isLexingRawMode()) 1171*0b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_ignored); 1172*0b57cec5SDimitry Andric return 0; 1173*0b57cec5SDimitry Andric } 1174*0b57cec5SDimitry Andric 1175*0b57cec5SDimitry Andric if (!L->isLexingRawMode()) 1176*0b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 1177*0b57cec5SDimitry Andric return Res; 1178*0b57cec5SDimitry Andric } 1179*0b57cec5SDimitry Andric 1180*0b57cec5SDimitry Andric /// getEscapedNewLineSize - Return the size of the specified escaped newline, 1181*0b57cec5SDimitry Andric /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 1182*0b57cec5SDimitry Andric /// trigraph equivalent on entry to this function. 1183*0b57cec5SDimitry Andric unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 1184*0b57cec5SDimitry Andric unsigned Size = 0; 1185*0b57cec5SDimitry Andric while (isWhitespace(Ptr[Size])) { 1186*0b57cec5SDimitry Andric ++Size; 1187*0b57cec5SDimitry Andric 1188*0b57cec5SDimitry Andric if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 1189*0b57cec5SDimitry Andric continue; 1190*0b57cec5SDimitry Andric 1191*0b57cec5SDimitry Andric // If this is a \r\n or \n\r, skip the other half. 1192*0b57cec5SDimitry Andric if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 1193*0b57cec5SDimitry Andric Ptr[Size-1] != Ptr[Size]) 1194*0b57cec5SDimitry Andric ++Size; 1195*0b57cec5SDimitry Andric 1196*0b57cec5SDimitry Andric return Size; 1197*0b57cec5SDimitry Andric } 1198*0b57cec5SDimitry Andric 1199*0b57cec5SDimitry Andric // Not an escaped newline, must be a \t or something else. 
1200*0b57cec5SDimitry Andric return 0; 1201*0b57cec5SDimitry Andric } 1202*0b57cec5SDimitry Andric 1203*0b57cec5SDimitry Andric /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 1204*0b57cec5SDimitry Andric /// them), skip over them and return the first non-escaped-newline found, 1205*0b57cec5SDimitry Andric /// otherwise return P. 1206*0b57cec5SDimitry Andric const char *Lexer::SkipEscapedNewLines(const char *P) { 1207*0b57cec5SDimitry Andric while (true) { 1208*0b57cec5SDimitry Andric const char *AfterEscape; 1209*0b57cec5SDimitry Andric if (*P == '\\') { 1210*0b57cec5SDimitry Andric AfterEscape = P+1; 1211*0b57cec5SDimitry Andric } else if (*P == '?') { 1212*0b57cec5SDimitry Andric // If not a trigraph for escape, bail out. 1213*0b57cec5SDimitry Andric if (P[1] != '?' || P[2] != '/') 1214*0b57cec5SDimitry Andric return P; 1215*0b57cec5SDimitry Andric // FIXME: Take LangOpts into account; the language might not 1216*0b57cec5SDimitry Andric // support trigraphs. 
1217*0b57cec5SDimitry Andric AfterEscape = P+3; 1218*0b57cec5SDimitry Andric } else { 1219*0b57cec5SDimitry Andric return P; 1220*0b57cec5SDimitry Andric } 1221*0b57cec5SDimitry Andric 1222*0b57cec5SDimitry Andric unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 1223*0b57cec5SDimitry Andric if (NewLineSize == 0) return P; 1224*0b57cec5SDimitry Andric P = AfterEscape+NewLineSize; 1225*0b57cec5SDimitry Andric } 1226*0b57cec5SDimitry Andric } 1227*0b57cec5SDimitry Andric 1228*0b57cec5SDimitry Andric Optional<Token> Lexer::findNextToken(SourceLocation Loc, 1229*0b57cec5SDimitry Andric const SourceManager &SM, 1230*0b57cec5SDimitry Andric const LangOptions &LangOpts) { 1231*0b57cec5SDimitry Andric if (Loc.isMacroID()) { 1232*0b57cec5SDimitry Andric if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 1233*0b57cec5SDimitry Andric return None; 1234*0b57cec5SDimitry Andric } 1235*0b57cec5SDimitry Andric Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); 1236*0b57cec5SDimitry Andric 1237*0b57cec5SDimitry Andric // Break down the source location. 1238*0b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 1239*0b57cec5SDimitry Andric 1240*0b57cec5SDimitry Andric // Try to load the file buffer. 1241*0b57cec5SDimitry Andric bool InvalidTemp = false; 1242*0b57cec5SDimitry Andric StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); 1243*0b57cec5SDimitry Andric if (InvalidTemp) 1244*0b57cec5SDimitry Andric return None; 1245*0b57cec5SDimitry Andric 1246*0b57cec5SDimitry Andric const char *TokenBegin = File.data() + LocInfo.second; 1247*0b57cec5SDimitry Andric 1248*0b57cec5SDimitry Andric // Lex from the start of the given location. 1249*0b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), 1250*0b57cec5SDimitry Andric TokenBegin, File.end()); 1251*0b57cec5SDimitry Andric // Find the token. 
1252*0b57cec5SDimitry Andric Token Tok; 1253*0b57cec5SDimitry Andric lexer.LexFromRawLexer(Tok); 1254*0b57cec5SDimitry Andric return Tok; 1255*0b57cec5SDimitry Andric } 1256*0b57cec5SDimitry Andric 1257*0b57cec5SDimitry Andric /// Checks that the given token is the first token that occurs after the 1258*0b57cec5SDimitry Andric /// given location (this excludes comments and whitespace). Returns the location 1259*0b57cec5SDimitry Andric /// immediately after the specified token. If the token is not found or the 1260*0b57cec5SDimitry Andric /// location is inside a macro, the returned source location will be invalid. 1261*0b57cec5SDimitry Andric SourceLocation Lexer::findLocationAfterToken( 1262*0b57cec5SDimitry Andric SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM, 1263*0b57cec5SDimitry Andric const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) { 1264*0b57cec5SDimitry Andric Optional<Token> Tok = findNextToken(Loc, SM, LangOpts); 1265*0b57cec5SDimitry Andric if (!Tok || Tok->isNot(TKind)) 1266*0b57cec5SDimitry Andric return {}; 1267*0b57cec5SDimitry Andric SourceLocation TokenLoc = Tok->getLocation(); 1268*0b57cec5SDimitry Andric 1269*0b57cec5SDimitry Andric // Calculate how much whitespace needs to be skipped if any. 
1270*0b57cec5SDimitry Andric unsigned NumWhitespaceChars = 0; 1271*0b57cec5SDimitry Andric if (SkipTrailingWhitespaceAndNewLine) { 1272*0b57cec5SDimitry Andric const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); 1273*0b57cec5SDimitry Andric unsigned char C = *TokenEnd; 1274*0b57cec5SDimitry Andric while (isHorizontalWhitespace(C)) { 1275*0b57cec5SDimitry Andric C = *(++TokenEnd); 1276*0b57cec5SDimitry Andric NumWhitespaceChars++; 1277*0b57cec5SDimitry Andric } 1278*0b57cec5SDimitry Andric 1279*0b57cec5SDimitry Andric // Skip \r, \n, \r\n, or \n\r 1280*0b57cec5SDimitry Andric if (C == '\n' || C == '\r') { 1281*0b57cec5SDimitry Andric char PrevC = C; 1282*0b57cec5SDimitry Andric C = *(++TokenEnd); 1283*0b57cec5SDimitry Andric NumWhitespaceChars++; 1284*0b57cec5SDimitry Andric if ((C == '\n' || C == '\r') && C != PrevC) 1285*0b57cec5SDimitry Andric NumWhitespaceChars++; 1286*0b57cec5SDimitry Andric } 1287*0b57cec5SDimitry Andric } 1288*0b57cec5SDimitry Andric 1289*0b57cec5SDimitry Andric return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); 1290*0b57cec5SDimitry Andric } 1291*0b57cec5SDimitry Andric 1292*0b57cec5SDimitry Andric /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 1293*0b57cec5SDimitry Andric /// get its size, and return it. This is tricky in several cases: 1294*0b57cec5SDimitry Andric /// 1. If currently at the start of a trigraph, we warn about the trigraph, 1295*0b57cec5SDimitry Andric /// then either return the trigraph (skipping 3 chars) or the '?', 1296*0b57cec5SDimitry Andric /// depending on whether trigraphs are enabled or not. 1297*0b57cec5SDimitry Andric /// 2. If this is an escaped newline (potentially with whitespace between 1298*0b57cec5SDimitry Andric /// the backslash and newline), implicitly skip the newline and return 1299*0b57cec5SDimitry Andric /// the char after it. 
1300*0b57cec5SDimitry Andric /// 1301*0b57cec5SDimitry Andric /// This handles the slow/uncommon case of the getCharAndSize method. Here we 1302*0b57cec5SDimitry Andric /// know that we can accumulate into Size, and that we have already incremented 1303*0b57cec5SDimitry Andric /// Ptr by Size bytes. 1304*0b57cec5SDimitry Andric /// 1305*0b57cec5SDimitry Andric /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 1306*0b57cec5SDimitry Andric /// be updated to match. 1307*0b57cec5SDimitry Andric char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 1308*0b57cec5SDimitry Andric Token *Tok) { 1309*0b57cec5SDimitry Andric // If we have a slash, look for an escaped newline. 1310*0b57cec5SDimitry Andric if (Ptr[0] == '\\') { 1311*0b57cec5SDimitry Andric ++Size; 1312*0b57cec5SDimitry Andric ++Ptr; 1313*0b57cec5SDimitry Andric Slash: 1314*0b57cec5SDimitry Andric // Common case, backslash-char where the char is not whitespace. 1315*0b57cec5SDimitry Andric if (!isWhitespace(Ptr[0])) return '\\'; 1316*0b57cec5SDimitry Andric 1317*0b57cec5SDimitry Andric // See if we have optional whitespace characters between the slash and 1318*0b57cec5SDimitry Andric // newline. 1319*0b57cec5SDimitry Andric if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1320*0b57cec5SDimitry Andric // Remember that this token needs to be cleaned. 1321*0b57cec5SDimitry Andric if (Tok) Tok->setFlag(Token::NeedsCleaning); 1322*0b57cec5SDimitry Andric 1323*0b57cec5SDimitry Andric // Warn if there was whitespace between the backslash and newline. 1324*0b57cec5SDimitry Andric if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 1325*0b57cec5SDimitry Andric Diag(Ptr, diag::backslash_newline_space); 1326*0b57cec5SDimitry Andric 1327*0b57cec5SDimitry Andric // Found backslash<whitespace><newline>. Parse the char after it. 
1328*0b57cec5SDimitry Andric Size += EscapedNewLineSize; 1329*0b57cec5SDimitry Andric Ptr += EscapedNewLineSize; 1330*0b57cec5SDimitry Andric 1331*0b57cec5SDimitry Andric // Use slow version to accumulate a correct size field. 1332*0b57cec5SDimitry Andric return getCharAndSizeSlow(Ptr, Size, Tok); 1333*0b57cec5SDimitry Andric } 1334*0b57cec5SDimitry Andric 1335*0b57cec5SDimitry Andric // Otherwise, this is not an escaped newline, just return the slash. 1336*0b57cec5SDimitry Andric return '\\'; 1337*0b57cec5SDimitry Andric } 1338*0b57cec5SDimitry Andric 1339*0b57cec5SDimitry Andric // If this is a trigraph, process it. 1340*0b57cec5SDimitry Andric if (Ptr[0] == '?' && Ptr[1] == '?') { 1341*0b57cec5SDimitry Andric // If this is actually a legal trigraph (not something like "??x"), emit 1342*0b57cec5SDimitry Andric // a trigraph warning. If so, and if trigraphs are enabled, return it. 1343*0b57cec5SDimitry Andric if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : nullptr)) { 1344*0b57cec5SDimitry Andric // Remember that this token needs to be cleaned. 1345*0b57cec5SDimitry Andric if (Tok) Tok->setFlag(Token::NeedsCleaning); 1346*0b57cec5SDimitry Andric 1347*0b57cec5SDimitry Andric Ptr += 3; 1348*0b57cec5SDimitry Andric Size += 3; 1349*0b57cec5SDimitry Andric if (C == '\\') goto Slash; 1350*0b57cec5SDimitry Andric return C; 1351*0b57cec5SDimitry Andric } 1352*0b57cec5SDimitry Andric } 1353*0b57cec5SDimitry Andric 1354*0b57cec5SDimitry Andric // If this is neither, return a single character. 1355*0b57cec5SDimitry Andric ++Size; 1356*0b57cec5SDimitry Andric return *Ptr; 1357*0b57cec5SDimitry Andric } 1358*0b57cec5SDimitry Andric 1359*0b57cec5SDimitry Andric /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 1360*0b57cec5SDimitry Andric /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 1361*0b57cec5SDimitry Andric /// and that we have already incremented Ptr by Size bytes. 
1362*0b57cec5SDimitry Andric /// 1363*0b57cec5SDimitry Andric /// NOTE: When this method is updated, getCharAndSizeSlow (above) should 1364*0b57cec5SDimitry Andric /// be updated to match. 1365*0b57cec5SDimitry Andric char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 1366*0b57cec5SDimitry Andric const LangOptions &LangOpts) { 1367*0b57cec5SDimitry Andric // If we have a slash, look for an escaped newline. 1368*0b57cec5SDimitry Andric if (Ptr[0] == '\\') { 1369*0b57cec5SDimitry Andric ++Size; 1370*0b57cec5SDimitry Andric ++Ptr; 1371*0b57cec5SDimitry Andric Slash: 1372*0b57cec5SDimitry Andric // Common case, backslash-char where the char is not whitespace. 1373*0b57cec5SDimitry Andric if (!isWhitespace(Ptr[0])) return '\\'; 1374*0b57cec5SDimitry Andric 1375*0b57cec5SDimitry Andric // See if we have optional whitespace characters followed by a newline. 1376*0b57cec5SDimitry Andric if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1377*0b57cec5SDimitry Andric // Found backslash<whitespace><newline>. Parse the char after it. 1378*0b57cec5SDimitry Andric Size += EscapedNewLineSize; 1379*0b57cec5SDimitry Andric Ptr += EscapedNewLineSize; 1380*0b57cec5SDimitry Andric 1381*0b57cec5SDimitry Andric // Use slow version to accumulate a correct size field. 1382*0b57cec5SDimitry Andric return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); 1383*0b57cec5SDimitry Andric } 1384*0b57cec5SDimitry Andric 1385*0b57cec5SDimitry Andric // Otherwise, this is not an escaped newline, just return the slash. 1386*0b57cec5SDimitry Andric return '\\'; 1387*0b57cec5SDimitry Andric } 1388*0b57cec5SDimitry Andric 1389*0b57cec5SDimitry Andric // If this is a trigraph, process it. 1390*0b57cec5SDimitry Andric if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 1391*0b57cec5SDimitry Andric // If this is actually a legal trigraph (not something like "??x"), return 1392*0b57cec5SDimitry Andric // it. 
1393*0b57cec5SDimitry Andric if (char C = GetTrigraphCharForLetter(Ptr[2])) { 1394*0b57cec5SDimitry Andric Ptr += 3; 1395*0b57cec5SDimitry Andric Size += 3; 1396*0b57cec5SDimitry Andric if (C == '\\') goto Slash; 1397*0b57cec5SDimitry Andric return C; 1398*0b57cec5SDimitry Andric } 1399*0b57cec5SDimitry Andric } 1400*0b57cec5SDimitry Andric 1401*0b57cec5SDimitry Andric // If this is neither, return a single character. 1402*0b57cec5SDimitry Andric ++Size; 1403*0b57cec5SDimitry Andric return *Ptr; 1404*0b57cec5SDimitry Andric } 1405*0b57cec5SDimitry Andric 1406*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 1407*0b57cec5SDimitry Andric // Helper methods for lexing. 1408*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 1409*0b57cec5SDimitry Andric 1410*0b57cec5SDimitry Andric /// Routine that indiscriminately sets the offset into the source file. 1411*0b57cec5SDimitry Andric void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) { 1412*0b57cec5SDimitry Andric BufferPtr = BufferStart + Offset; 1413*0b57cec5SDimitry Andric if (BufferPtr > BufferEnd) 1414*0b57cec5SDimitry Andric BufferPtr = BufferEnd; 1415*0b57cec5SDimitry Andric // FIXME: What exactly does the StartOfLine bit mean? There are two 1416*0b57cec5SDimitry Andric // possible meanings for the "start" of the line: the first token on the 1417*0b57cec5SDimitry Andric // unexpanded line, or the first token on the expanded line. 
1418*0b57cec5SDimitry Andric IsAtStartOfLine = StartOfLine; 1419*0b57cec5SDimitry Andric IsAtPhysicalStartOfLine = StartOfLine; 1420*0b57cec5SDimitry Andric } 1421*0b57cec5SDimitry Andric 1422*0b57cec5SDimitry Andric static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) { 1423*0b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) { 1424*0b57cec5SDimitry Andric return false; 1425*0b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus11 || LangOpts.C11) { 1426*0b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11AllowedIDChars( 1427*0b57cec5SDimitry Andric C11AllowedIDCharRanges); 1428*0b57cec5SDimitry Andric return C11AllowedIDChars.contains(C); 1429*0b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus) { 1430*0b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( 1431*0b57cec5SDimitry Andric CXX03AllowedIDCharRanges); 1432*0b57cec5SDimitry Andric return CXX03AllowedIDChars.contains(C); 1433*0b57cec5SDimitry Andric } else { 1434*0b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 1435*0b57cec5SDimitry Andric C99AllowedIDCharRanges); 1436*0b57cec5SDimitry Andric return C99AllowedIDChars.contains(C); 1437*0b57cec5SDimitry Andric } 1438*0b57cec5SDimitry Andric } 1439*0b57cec5SDimitry Andric 1440*0b57cec5SDimitry Andric static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) { 1441*0b57cec5SDimitry Andric assert(isAllowedIDChar(C, LangOpts)); 1442*0b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) { 1443*0b57cec5SDimitry Andric return false; 1444*0b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus11 || LangOpts.C11) { 1445*0b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars( 1446*0b57cec5SDimitry Andric C11DisallowedInitialIDCharRanges); 1447*0b57cec5SDimitry Andric return !C11DisallowedInitialIDChars.contains(C); 1448*0b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus) { 1449*0b57cec5SDimitry Andric return true; 
1450*0b57cec5SDimitry Andric } else { 1451*0b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 1452*0b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges); 1453*0b57cec5SDimitry Andric return !C99DisallowedInitialIDChars.contains(C); 1454*0b57cec5SDimitry Andric } 1455*0b57cec5SDimitry Andric } 1456*0b57cec5SDimitry Andric 1457*0b57cec5SDimitry Andric static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin, 1458*0b57cec5SDimitry Andric const char *End) { 1459*0b57cec5SDimitry Andric return CharSourceRange::getCharRange(L.getSourceLocation(Begin), 1460*0b57cec5SDimitry Andric L.getSourceLocation(End)); 1461*0b57cec5SDimitry Andric } 1462*0b57cec5SDimitry Andric 1463*0b57cec5SDimitry Andric static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C, 1464*0b57cec5SDimitry Andric CharSourceRange Range, bool IsFirst) { 1465*0b57cec5SDimitry Andric // Check C99 compatibility. 1466*0b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) { 1467*0b57cec5SDimitry Andric enum { 1468*0b57cec5SDimitry Andric CannotAppearInIdentifier = 0, 1469*0b57cec5SDimitry Andric CannotStartIdentifier 1470*0b57cec5SDimitry Andric }; 1471*0b57cec5SDimitry Andric 1472*0b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars( 1473*0b57cec5SDimitry Andric C99AllowedIDCharRanges); 1474*0b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars( 1475*0b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges); 1476*0b57cec5SDimitry Andric if (!C99AllowedIDChars.contains(C)) { 1477*0b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id) 1478*0b57cec5SDimitry Andric << Range 1479*0b57cec5SDimitry Andric << CannotAppearInIdentifier; 1480*0b57cec5SDimitry Andric } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) { 1481*0b57cec5SDimitry Andric Diags.Report(Range.getBegin(), 
diag::warn_c99_compat_unicode_id) 1482*0b57cec5SDimitry Andric << Range 1483*0b57cec5SDimitry Andric << CannotStartIdentifier; 1484*0b57cec5SDimitry Andric } 1485*0b57cec5SDimitry Andric } 1486*0b57cec5SDimitry Andric 1487*0b57cec5SDimitry Andric // Check C++98 compatibility. 1488*0b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_cxx98_compat_unicode_id, Range.getBegin())) { 1489*0b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet CXX03AllowedIDChars( 1490*0b57cec5SDimitry Andric CXX03AllowedIDCharRanges); 1491*0b57cec5SDimitry Andric if (!CXX03AllowedIDChars.contains(C)) { 1492*0b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id) 1493*0b57cec5SDimitry Andric << Range; 1494*0b57cec5SDimitry Andric } 1495*0b57cec5SDimitry Andric } 1496*0b57cec5SDimitry Andric } 1497*0b57cec5SDimitry Andric 1498*0b57cec5SDimitry Andric /// After encountering UTF-8 character C and interpreting it as an identifier 1499*0b57cec5SDimitry Andric /// character, check whether it's a homoglyph for a common non-identifier 1500*0b57cec5SDimitry Andric /// source character that is unlikely to be an intentional identifier 1501*0b57cec5SDimitry Andric /// character and warn if so. 1502*0b57cec5SDimitry Andric static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C, 1503*0b57cec5SDimitry Andric CharSourceRange Range) { 1504*0b57cec5SDimitry Andric // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes). 
1505*0b57cec5SDimitry Andric struct HomoglyphPair { 1506*0b57cec5SDimitry Andric uint32_t Character; 1507*0b57cec5SDimitry Andric char LooksLike; 1508*0b57cec5SDimitry Andric bool operator<(HomoglyphPair R) const { return Character < R.Character; } 1509*0b57cec5SDimitry Andric }; 1510*0b57cec5SDimitry Andric static constexpr HomoglyphPair SortedHomoglyphs[] = { 1511*0b57cec5SDimitry Andric {U'\u00ad', 0}, // SOFT HYPHEN 1512*0b57cec5SDimitry Andric {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK 1513*0b57cec5SDimitry Andric {U'\u037e', ';'}, // GREEK QUESTION MARK 1514*0b57cec5SDimitry Andric {U'\u200b', 0}, // ZERO WIDTH SPACE 1515*0b57cec5SDimitry Andric {U'\u200c', 0}, // ZERO WIDTH NON-JOINER 1516*0b57cec5SDimitry Andric {U'\u200d', 0}, // ZERO WIDTH JOINER 1517*0b57cec5SDimitry Andric {U'\u2060', 0}, // WORD JOINER 1518*0b57cec5SDimitry Andric {U'\u2061', 0}, // FUNCTION APPLICATION 1519*0b57cec5SDimitry Andric {U'\u2062', 0}, // INVISIBLE TIMES 1520*0b57cec5SDimitry Andric {U'\u2063', 0}, // INVISIBLE SEPARATOR 1521*0b57cec5SDimitry Andric {U'\u2064', 0}, // INVISIBLE PLUS 1522*0b57cec5SDimitry Andric {U'\u2212', '-'}, // MINUS SIGN 1523*0b57cec5SDimitry Andric {U'\u2215', '/'}, // DIVISION SLASH 1524*0b57cec5SDimitry Andric {U'\u2216', '\\'}, // SET MINUS 1525*0b57cec5SDimitry Andric {U'\u2217', '*'}, // ASTERISK OPERATOR 1526*0b57cec5SDimitry Andric {U'\u2223', '|'}, // DIVIDES 1527*0b57cec5SDimitry Andric {U'\u2227', '^'}, // LOGICAL AND 1528*0b57cec5SDimitry Andric {U'\u2236', ':'}, // RATIO 1529*0b57cec5SDimitry Andric {U'\u223c', '~'}, // TILDE OPERATOR 1530*0b57cec5SDimitry Andric {U'\ua789', ':'}, // MODIFIER LETTER COLON 1531*0b57cec5SDimitry Andric {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE 1532*0b57cec5SDimitry Andric {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK 1533*0b57cec5SDimitry Andric {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN 1534*0b57cec5SDimitry Andric {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN 1535*0b57cec5SDimitry Andric 
{U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN 1536*0b57cec5SDimitry Andric {U'\uff06', '&'}, // FULLWIDTH AMPERSAND 1537*0b57cec5SDimitry Andric {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS 1538*0b57cec5SDimitry Andric {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS 1539*0b57cec5SDimitry Andric {U'\uff0a', '*'}, // FULLWIDTH ASTERISK 1540*0b57cec5SDimitry Andric {U'\uff0b', '+'}, // FULLWIDTH ASTERISK 1541*0b57cec5SDimitry Andric {U'\uff0c', ','}, // FULLWIDTH COMMA 1542*0b57cec5SDimitry Andric {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS 1543*0b57cec5SDimitry Andric {U'\uff0e', '.'}, // FULLWIDTH FULL STOP 1544*0b57cec5SDimitry Andric {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS 1545*0b57cec5SDimitry Andric {U'\uff1a', ':'}, // FULLWIDTH COLON 1546*0b57cec5SDimitry Andric {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON 1547*0b57cec5SDimitry Andric {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN 1548*0b57cec5SDimitry Andric {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN 1549*0b57cec5SDimitry Andric {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN 1550*0b57cec5SDimitry Andric {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK 1551*0b57cec5SDimitry Andric {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT 1552*0b57cec5SDimitry Andric {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET 1553*0b57cec5SDimitry Andric {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS 1554*0b57cec5SDimitry Andric {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET 1555*0b57cec5SDimitry Andric {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT 1556*0b57cec5SDimitry Andric {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET 1557*0b57cec5SDimitry Andric {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE 1558*0b57cec5SDimitry Andric {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET 1559*0b57cec5SDimitry Andric {U'\uff5e', '~'}, // FULLWIDTH TILDE 1560*0b57cec5SDimitry Andric {0, 0} 1561*0b57cec5SDimitry Andric }; 1562*0b57cec5SDimitry Andric auto Homoglyph = 1563*0b57cec5SDimitry Andric std::lower_bound(std::begin(SortedHomoglyphs), 
1564*0b57cec5SDimitry Andric std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'}); 1565*0b57cec5SDimitry Andric if (Homoglyph->Character == C) { 1566*0b57cec5SDimitry Andric llvm::SmallString<5> CharBuf; 1567*0b57cec5SDimitry Andric { 1568*0b57cec5SDimitry Andric llvm::raw_svector_ostream CharOS(CharBuf); 1569*0b57cec5SDimitry Andric llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4); 1570*0b57cec5SDimitry Andric } 1571*0b57cec5SDimitry Andric if (Homoglyph->LooksLike) { 1572*0b57cec5SDimitry Andric const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; 1573*0b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph) 1574*0b57cec5SDimitry Andric << Range << CharBuf << LooksLikeStr; 1575*0b57cec5SDimitry Andric } else { 1576*0b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width) 1577*0b57cec5SDimitry Andric << Range << CharBuf; 1578*0b57cec5SDimitry Andric } 1579*0b57cec5SDimitry Andric } 1580*0b57cec5SDimitry Andric } 1581*0b57cec5SDimitry Andric 1582*0b57cec5SDimitry Andric bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size, 1583*0b57cec5SDimitry Andric Token &Result) { 1584*0b57cec5SDimitry Andric const char *UCNPtr = CurPtr + Size; 1585*0b57cec5SDimitry Andric uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr); 1586*0b57cec5SDimitry Andric if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts)) 1587*0b57cec5SDimitry Andric return false; 1588*0b57cec5SDimitry Andric 1589*0b57cec5SDimitry Andric if (!isLexingRawMode()) 1590*0b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 1591*0b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UCNPtr), 1592*0b57cec5SDimitry Andric /*IsFirst=*/false); 1593*0b57cec5SDimitry Andric 1594*0b57cec5SDimitry Andric Result.setFlag(Token::HasUCN); 1595*0b57cec5SDimitry Andric if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || 1596*0b57cec5SDimitry Andric (UCNPtr - CurPtr == 10 && CurPtr[1] == 
'U')) 1597*0b57cec5SDimitry Andric CurPtr = UCNPtr; 1598*0b57cec5SDimitry Andric else 1599*0b57cec5SDimitry Andric while (CurPtr != UCNPtr) 1600*0b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result); 1601*0b57cec5SDimitry Andric return true; 1602*0b57cec5SDimitry Andric } 1603*0b57cec5SDimitry Andric 1604*0b57cec5SDimitry Andric bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr) { 1605*0b57cec5SDimitry Andric const char *UnicodePtr = CurPtr; 1606*0b57cec5SDimitry Andric llvm::UTF32 CodePoint; 1607*0b57cec5SDimitry Andric llvm::ConversionResult Result = 1608*0b57cec5SDimitry Andric llvm::convertUTF8Sequence((const llvm::UTF8 **)&UnicodePtr, 1609*0b57cec5SDimitry Andric (const llvm::UTF8 *)BufferEnd, 1610*0b57cec5SDimitry Andric &CodePoint, 1611*0b57cec5SDimitry Andric llvm::strictConversion); 1612*0b57cec5SDimitry Andric if (Result != llvm::conversionOK || 1613*0b57cec5SDimitry Andric !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts)) 1614*0b57cec5SDimitry Andric return false; 1615*0b57cec5SDimitry Andric 1616*0b57cec5SDimitry Andric if (!isLexingRawMode()) { 1617*0b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, 1618*0b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr), 1619*0b57cec5SDimitry Andric /*IsFirst=*/false); 1620*0b57cec5SDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, 1621*0b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UnicodePtr)); 1622*0b57cec5SDimitry Andric } 1623*0b57cec5SDimitry Andric 1624*0b57cec5SDimitry Andric CurPtr = UnicodePtr; 1625*0b57cec5SDimitry Andric return true; 1626*0b57cec5SDimitry Andric } 1627*0b57cec5SDimitry Andric 1628*0b57cec5SDimitry Andric bool Lexer::LexIdentifier(Token &Result, const char *CurPtr) { 1629*0b57cec5SDimitry Andric // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] 1630*0b57cec5SDimitry Andric unsigned Size; 1631*0b57cec5SDimitry Andric unsigned char C = *CurPtr++; 1632*0b57cec5SDimitry Andric 
while (isIdentifierBody(C)) 1633*0b57cec5SDimitry Andric C = *CurPtr++; 1634*0b57cec5SDimitry Andric 1635*0b57cec5SDimitry Andric --CurPtr; // Back up over the skipped character. 1636*0b57cec5SDimitry Andric 1637*0b57cec5SDimitry Andric // Fast path, no $,\,? in identifier found. '\' might be an escaped newline 1638*0b57cec5SDimitry Andric // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. 1639*0b57cec5SDimitry Andric // 1640*0b57cec5SDimitry Andric // TODO: Could merge these checks into an InfoTable flag to make the 1641*0b57cec5SDimitry Andric // comparison cheaper 1642*0b57cec5SDimitry Andric if (isASCII(C) && C != '\\' && C != '?' && 1643*0b57cec5SDimitry Andric (C != '$' || !LangOpts.DollarIdents)) { 1644*0b57cec5SDimitry Andric FinishIdentifier: 1645*0b57cec5SDimitry Andric const char *IdStart = BufferPtr; 1646*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::raw_identifier); 1647*0b57cec5SDimitry Andric Result.setRawIdentifierData(IdStart); 1648*0b57cec5SDimitry Andric 1649*0b57cec5SDimitry Andric // If we are in raw mode, return this identifier raw. There is no need to 1650*0b57cec5SDimitry Andric // look up identifier information or attempt to macro expand it. 1651*0b57cec5SDimitry Andric if (LexingRawMode) 1652*0b57cec5SDimitry Andric return true; 1653*0b57cec5SDimitry Andric 1654*0b57cec5SDimitry Andric // Fill in Result.IdentifierInfo and update the token kind, 1655*0b57cec5SDimitry Andric // looking up the identifier in the identifier table. 1656*0b57cec5SDimitry Andric IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 1657*0b57cec5SDimitry Andric // Note that we have to call PP->LookUpIdentifierInfo() even for code 1658*0b57cec5SDimitry Andric // completion, it writes IdentifierInfo into Result, and callers rely on it. 
1659*0b57cec5SDimitry Andric 1660*0b57cec5SDimitry Andric // If the completion point is at the end of an identifier, we want to treat 1661*0b57cec5SDimitry Andric // the identifier as incomplete even if it resolves to a macro or a keyword. 1662*0b57cec5SDimitry Andric // This allows e.g. 'class^' to complete to 'classifier'. 1663*0b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr)) { 1664*0b57cec5SDimitry Andric // Return the code-completion token. 1665*0b57cec5SDimitry Andric Result.setKind(tok::code_completion); 1666*0b57cec5SDimitry Andric // Skip the code-completion char and all immediate identifier characters. 1667*0b57cec5SDimitry Andric // This ensures we get consistent behavior when completing at any point in 1668*0b57cec5SDimitry Andric // an identifier (i.e. at the start, in the middle, at the end). Note that 1669*0b57cec5SDimitry Andric // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code 1670*0b57cec5SDimitry Andric // simpler. 1671*0b57cec5SDimitry Andric assert(*CurPtr == 0 && "Completion character must be 0"); 1672*0b57cec5SDimitry Andric ++CurPtr; 1673*0b57cec5SDimitry Andric // Note that code completion token is not added as a separate character 1674*0b57cec5SDimitry Andric // when the completion point is at the end of the buffer. Therefore, we need 1675*0b57cec5SDimitry Andric // to check if the buffer has ended. 1676*0b57cec5SDimitry Andric if (CurPtr < BufferEnd) { 1677*0b57cec5SDimitry Andric while (isIdentifierBody(*CurPtr)) 1678*0b57cec5SDimitry Andric ++CurPtr; 1679*0b57cec5SDimitry Andric } 1680*0b57cec5SDimitry Andric BufferPtr = CurPtr; 1681*0b57cec5SDimitry Andric return true; 1682*0b57cec5SDimitry Andric } 1683*0b57cec5SDimitry Andric 1684*0b57cec5SDimitry Andric // Finally, now that we know we have an identifier, pass this off to the 1685*0b57cec5SDimitry Andric // preprocessor, which may macro expand it or something. 
1686*0b57cec5SDimitry Andric if (II->isHandleIdentifierCase()) 1687*0b57cec5SDimitry Andric return PP->HandleIdentifier(Result); 1688*0b57cec5SDimitry Andric 1689*0b57cec5SDimitry Andric return true; 1690*0b57cec5SDimitry Andric } 1691*0b57cec5SDimitry Andric 1692*0b57cec5SDimitry Andric // Otherwise, $,\,? in identifier found. Enter slower path. 1693*0b57cec5SDimitry Andric 1694*0b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 1695*0b57cec5SDimitry Andric while (true) { 1696*0b57cec5SDimitry Andric if (C == '$') { 1697*0b57cec5SDimitry Andric // If we hit a $ and they are not supported in identifiers, we are done. 1698*0b57cec5SDimitry Andric if (!LangOpts.DollarIdents) goto FinishIdentifier; 1699*0b57cec5SDimitry Andric 1700*0b57cec5SDimitry Andric // Otherwise, emit a diagnostic and continue. 1701*0b57cec5SDimitry Andric if (!isLexingRawMode()) 1702*0b57cec5SDimitry Andric Diag(CurPtr, diag::ext_dollar_in_identifier); 1703*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1704*0b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 1705*0b57cec5SDimitry Andric continue; 1706*0b57cec5SDimitry Andric } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) { 1707*0b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 1708*0b57cec5SDimitry Andric continue; 1709*0b57cec5SDimitry Andric } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) { 1710*0b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 1711*0b57cec5SDimitry Andric continue; 1712*0b57cec5SDimitry Andric } else if (!isIdentifierBody(C)) { 1713*0b57cec5SDimitry Andric goto FinishIdentifier; 1714*0b57cec5SDimitry Andric } 1715*0b57cec5SDimitry Andric 1716*0b57cec5SDimitry Andric // Otherwise, this character is good, consume it. 
1717*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1718*0b57cec5SDimitry Andric 1719*0b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 1720*0b57cec5SDimitry Andric while (isIdentifierBody(C)) { 1721*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1722*0b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 1723*0b57cec5SDimitry Andric } 1724*0b57cec5SDimitry Andric } 1725*0b57cec5SDimitry Andric } 1726*0b57cec5SDimitry Andric 1727*0b57cec5SDimitry Andric /// isHexaLiteral - Return true if Start points to a hex constant. 1728*0b57cec5SDimitry Andric /// in microsoft mode (where this is supposed to be several different tokens). 1729*0b57cec5SDimitry Andric bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) { 1730*0b57cec5SDimitry Andric unsigned Size; 1731*0b57cec5SDimitry Andric char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts); 1732*0b57cec5SDimitry Andric if (C1 != '0') 1733*0b57cec5SDimitry Andric return false; 1734*0b57cec5SDimitry Andric char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts); 1735*0b57cec5SDimitry Andric return (C2 == 'x' || C2 == 'X'); 1736*0b57cec5SDimitry Andric } 1737*0b57cec5SDimitry Andric 1738*0b57cec5SDimitry Andric /// LexNumericConstant - Lex the remainder of a integer or floating point 1739*0b57cec5SDimitry Andric /// constant. From[-1] is the first character lexed. Return the end of the 1740*0b57cec5SDimitry Andric /// constant. 
1741*0b57cec5SDimitry Andric bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 1742*0b57cec5SDimitry Andric unsigned Size; 1743*0b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size); 1744*0b57cec5SDimitry Andric char PrevCh = 0; 1745*0b57cec5SDimitry Andric while (isPreprocessingNumberBody(C)) { 1746*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1747*0b57cec5SDimitry Andric PrevCh = C; 1748*0b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 1749*0b57cec5SDimitry Andric } 1750*0b57cec5SDimitry Andric 1751*0b57cec5SDimitry Andric // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 1752*0b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { 1753*0b57cec5SDimitry Andric // If we are in Microsoft mode, don't continue if the constant is hex. 1754*0b57cec5SDimitry Andric // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1 1755*0b57cec5SDimitry Andric if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts)) 1756*0b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 1757*0b57cec5SDimitry Andric } 1758*0b57cec5SDimitry Andric 1759*0b57cec5SDimitry Andric // If we have a hex FP constant, continue. 1760*0b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) { 1761*0b57cec5SDimitry Andric // Outside C99 and C++17, we accept hexadecimal floating point numbers as a 1762*0b57cec5SDimitry Andric // not-quite-conforming extension. Only do so if this looks like it's 1763*0b57cec5SDimitry Andric // actually meant to be a hexfloat, and not if it has a ud-suffix. 
1764*0b57cec5SDimitry Andric bool IsHexFloat = true; 1765*0b57cec5SDimitry Andric if (!LangOpts.C99) { 1766*0b57cec5SDimitry Andric if (!isHexaLiteral(BufferPtr, LangOpts)) 1767*0b57cec5SDimitry Andric IsHexFloat = false; 1768*0b57cec5SDimitry Andric else if (!getLangOpts().CPlusPlus17 && 1769*0b57cec5SDimitry Andric std::find(BufferPtr, CurPtr, '_') != CurPtr) 1770*0b57cec5SDimitry Andric IsHexFloat = false; 1771*0b57cec5SDimitry Andric } 1772*0b57cec5SDimitry Andric if (IsHexFloat) 1773*0b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 1774*0b57cec5SDimitry Andric } 1775*0b57cec5SDimitry Andric 1776*0b57cec5SDimitry Andric // If we have a digit separator, continue. 1777*0b57cec5SDimitry Andric if (C == '\'' && getLangOpts().CPlusPlus14) { 1778*0b57cec5SDimitry Andric unsigned NextSize; 1779*0b57cec5SDimitry Andric char Next = getCharAndSizeNoWarn(CurPtr + Size, NextSize, getLangOpts()); 1780*0b57cec5SDimitry Andric if (isIdentifierBody(Next)) { 1781*0b57cec5SDimitry Andric if (!isLexingRawMode()) 1782*0b57cec5SDimitry Andric Diag(CurPtr, diag::warn_cxx11_compat_digit_separator); 1783*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1784*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, NextSize, Result); 1785*0b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 1786*0b57cec5SDimitry Andric } 1787*0b57cec5SDimitry Andric } 1788*0b57cec5SDimitry Andric 1789*0b57cec5SDimitry Andric // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. 1790*0b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 1791*0b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 1792*0b57cec5SDimitry Andric if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 1793*0b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 1794*0b57cec5SDimitry Andric 1795*0b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 
1796*0b57cec5SDimitry Andric const char *TokStart = BufferPtr; 1797*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 1798*0b57cec5SDimitry Andric Result.setLiteralData(TokStart); 1799*0b57cec5SDimitry Andric return true; 1800*0b57cec5SDimitry Andric } 1801*0b57cec5SDimitry Andric 1802*0b57cec5SDimitry Andric /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 1803*0b57cec5SDimitry Andric /// in C++11, or warn on a ud-suffix in C++98. 1804*0b57cec5SDimitry Andric const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr, 1805*0b57cec5SDimitry Andric bool IsStringLiteral) { 1806*0b57cec5SDimitry Andric assert(getLangOpts().CPlusPlus); 1807*0b57cec5SDimitry Andric 1808*0b57cec5SDimitry Andric // Maximally munch an identifier. 1809*0b57cec5SDimitry Andric unsigned Size; 1810*0b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size); 1811*0b57cec5SDimitry Andric bool Consumed = false; 1812*0b57cec5SDimitry Andric 1813*0b57cec5SDimitry Andric if (!isIdentifierHead(C)) { 1814*0b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) 1815*0b57cec5SDimitry Andric Consumed = true; 1816*0b57cec5SDimitry Andric else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) 1817*0b57cec5SDimitry Andric Consumed = true; 1818*0b57cec5SDimitry Andric else 1819*0b57cec5SDimitry Andric return CurPtr; 1820*0b57cec5SDimitry Andric } 1821*0b57cec5SDimitry Andric 1822*0b57cec5SDimitry Andric if (!getLangOpts().CPlusPlus11) { 1823*0b57cec5SDimitry Andric if (!isLexingRawMode()) 1824*0b57cec5SDimitry Andric Diag(CurPtr, 1825*0b57cec5SDimitry Andric C == '_' ? 
diag::warn_cxx11_compat_user_defined_literal 1826*0b57cec5SDimitry Andric : diag::warn_cxx11_compat_reserved_user_defined_literal) 1827*0b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 1828*0b57cec5SDimitry Andric return CurPtr; 1829*0b57cec5SDimitry Andric } 1830*0b57cec5SDimitry Andric 1831*0b57cec5SDimitry Andric // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix 1832*0b57cec5SDimitry Andric // that does not start with an underscore is ill-formed. As a conforming 1833*0b57cec5SDimitry Andric // extension, we treat all such suffixes as if they had whitespace before 1834*0b57cec5SDimitry Andric // them. We assume a suffix beginning with a UCN or UTF-8 character is more 1835*0b57cec5SDimitry Andric // likely to be a ud-suffix than a macro, however, and accept that. 1836*0b57cec5SDimitry Andric if (!Consumed) { 1837*0b57cec5SDimitry Andric bool IsUDSuffix = false; 1838*0b57cec5SDimitry Andric if (C == '_') 1839*0b57cec5SDimitry Andric IsUDSuffix = true; 1840*0b57cec5SDimitry Andric else if (IsStringLiteral && getLangOpts().CPlusPlus14) { 1841*0b57cec5SDimitry Andric // In C++1y, we need to look ahead a few characters to see if this is a 1842*0b57cec5SDimitry Andric // valid suffix for a string literal or a numeric literal (this could be 1843*0b57cec5SDimitry Andric // the 'operator""if' defining a numeric literal operator). 1844*0b57cec5SDimitry Andric const unsigned MaxStandardSuffixLength = 3; 1845*0b57cec5SDimitry Andric char Buffer[MaxStandardSuffixLength] = { C }; 1846*0b57cec5SDimitry Andric unsigned Consumed = Size; 1847*0b57cec5SDimitry Andric unsigned Chars = 1; 1848*0b57cec5SDimitry Andric while (true) { 1849*0b57cec5SDimitry Andric unsigned NextSize; 1850*0b57cec5SDimitry Andric char Next = getCharAndSizeNoWarn(CurPtr + Consumed, NextSize, 1851*0b57cec5SDimitry Andric getLangOpts()); 1852*0b57cec5SDimitry Andric if (!isIdentifierBody(Next)) { 1853*0b57cec5SDimitry Andric // End of suffix. 
Check whether this is on the whitelist. 1854*0b57cec5SDimitry Andric const StringRef CompleteSuffix(Buffer, Chars); 1855*0b57cec5SDimitry Andric IsUDSuffix = StringLiteralParser::isValidUDSuffix(getLangOpts(), 1856*0b57cec5SDimitry Andric CompleteSuffix); 1857*0b57cec5SDimitry Andric break; 1858*0b57cec5SDimitry Andric } 1859*0b57cec5SDimitry Andric 1860*0b57cec5SDimitry Andric if (Chars == MaxStandardSuffixLength) 1861*0b57cec5SDimitry Andric // Too long: can't be a standard suffix. 1862*0b57cec5SDimitry Andric break; 1863*0b57cec5SDimitry Andric 1864*0b57cec5SDimitry Andric Buffer[Chars++] = Next; 1865*0b57cec5SDimitry Andric Consumed += NextSize; 1866*0b57cec5SDimitry Andric } 1867*0b57cec5SDimitry Andric } 1868*0b57cec5SDimitry Andric 1869*0b57cec5SDimitry Andric if (!IsUDSuffix) { 1870*0b57cec5SDimitry Andric if (!isLexingRawMode()) 1871*0b57cec5SDimitry Andric Diag(CurPtr, getLangOpts().MSVCCompat 1872*0b57cec5SDimitry Andric ? diag::ext_ms_reserved_user_defined_literal 1873*0b57cec5SDimitry Andric : diag::ext_reserved_user_defined_literal) 1874*0b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 1875*0b57cec5SDimitry Andric return CurPtr; 1876*0b57cec5SDimitry Andric } 1877*0b57cec5SDimitry Andric 1878*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result); 1879*0b57cec5SDimitry Andric } 1880*0b57cec5SDimitry Andric 1881*0b57cec5SDimitry Andric Result.setFlag(Token::HasUDSuffix); 1882*0b57cec5SDimitry Andric while (true) { 1883*0b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size); 1884*0b57cec5SDimitry Andric if (isIdentifierBody(C)) { CurPtr = ConsumeChar(CurPtr, Size, Result); } 1885*0b57cec5SDimitry Andric else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {} 1886*0b57cec5SDimitry Andric else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr)) {} 1887*0b57cec5SDimitry Andric else break; 1888*0b57cec5SDimitry Andric } 1889*0b57cec5SDimitry Andric 1890*0b57cec5SDimitry Andric return 
CurPtr; 1891*0b57cec5SDimitry Andric } 1892*0b57cec5SDimitry Andric 1893*0b57cec5SDimitry Andric /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 1894*0b57cec5SDimitry Andric /// either " or L" or u8" or u" or U". 1895*0b57cec5SDimitry Andric bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 1896*0b57cec5SDimitry Andric tok::TokenKind Kind) { 1897*0b57cec5SDimitry Andric const char *AfterQuote = CurPtr; 1898*0b57cec5SDimitry Andric // Does this string contain the \0 character? 1899*0b57cec5SDimitry Andric const char *NulCharacter = nullptr; 1900*0b57cec5SDimitry Andric 1901*0b57cec5SDimitry Andric if (!isLexingRawMode() && 1902*0b57cec5SDimitry Andric (Kind == tok::utf8_string_literal || 1903*0b57cec5SDimitry Andric Kind == tok::utf16_string_literal || 1904*0b57cec5SDimitry Andric Kind == tok::utf32_string_literal)) 1905*0b57cec5SDimitry Andric Diag(BufferPtr, getLangOpts().CPlusPlus 1906*0b57cec5SDimitry Andric ? diag::warn_cxx98_compat_unicode_literal 1907*0b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal); 1908*0b57cec5SDimitry Andric 1909*0b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 1910*0b57cec5SDimitry Andric while (C != '"') { 1911*0b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by 1912*0b57cec5SDimitry Andric // getAndAdvanceChar. 1913*0b57cec5SDimitry Andric if (C == '\\') 1914*0b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 1915*0b57cec5SDimitry Andric 1916*0b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 1917*0b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
1918*0b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 1919*0b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1; 1920*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 1921*0b57cec5SDimitry Andric return true; 1922*0b57cec5SDimitry Andric } 1923*0b57cec5SDimitry Andric 1924*0b57cec5SDimitry Andric if (C == 0) { 1925*0b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 1926*0b57cec5SDimitry Andric if (ParsingFilename) 1927*0b57cec5SDimitry Andric codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false); 1928*0b57cec5SDimitry Andric else 1929*0b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 1930*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 1931*0b57cec5SDimitry Andric cutOffLexing(); 1932*0b57cec5SDimitry Andric return true; 1933*0b57cec5SDimitry Andric } 1934*0b57cec5SDimitry Andric 1935*0b57cec5SDimitry Andric NulCharacter = CurPtr-1; 1936*0b57cec5SDimitry Andric } 1937*0b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 1938*0b57cec5SDimitry Andric } 1939*0b57cec5SDimitry Andric 1940*0b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 1941*0b57cec5SDimitry Andric if (getLangOpts().CPlusPlus) 1942*0b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true); 1943*0b57cec5SDimitry Andric 1944*0b57cec5SDimitry Andric // If a nul character existed in the string, warn about it. 1945*0b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 1946*0b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1; 1947*0b57cec5SDimitry Andric 1948*0b57cec5SDimitry Andric // Update the location of the token as well as the BufferPtr instance var. 
1949*0b57cec5SDimitry Andric const char *TokStart = BufferPtr; 1950*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 1951*0b57cec5SDimitry Andric Result.setLiteralData(TokStart); 1952*0b57cec5SDimitry Andric return true; 1953*0b57cec5SDimitry Andric } 1954*0b57cec5SDimitry Andric 1955*0b57cec5SDimitry Andric /// LexRawStringLiteral - Lex the remainder of a raw string literal, after 1956*0b57cec5SDimitry Andric /// having lexed R", LR", u8R", uR", or UR". 1957*0b57cec5SDimitry Andric bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, 1958*0b57cec5SDimitry Andric tok::TokenKind Kind) { 1959*0b57cec5SDimitry Andric // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: 1960*0b57cec5SDimitry Andric // Between the initial and final double quote characters of the raw string, 1961*0b57cec5SDimitry Andric // any transformations performed in phases 1 and 2 (trigraphs, 1962*0b57cec5SDimitry Andric // universal-character-names, and line splicing) are reverted. 1963*0b57cec5SDimitry Andric 1964*0b57cec5SDimitry Andric if (!isLexingRawMode()) 1965*0b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal); 1966*0b57cec5SDimitry Andric 1967*0b57cec5SDimitry Andric unsigned PrefixLen = 0; 1968*0b57cec5SDimitry Andric 1969*0b57cec5SDimitry Andric while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) 1970*0b57cec5SDimitry Andric ++PrefixLen; 1971*0b57cec5SDimitry Andric 1972*0b57cec5SDimitry Andric // If the last character was not a '(', then we didn't lex a valid delimiter. 
1973*0b57cec5SDimitry Andric if (CurPtr[PrefixLen] != '(') { 1974*0b57cec5SDimitry Andric if (!isLexingRawMode()) { 1975*0b57cec5SDimitry Andric const char *PrefixEnd = &CurPtr[PrefixLen]; 1976*0b57cec5SDimitry Andric if (PrefixLen == 16) { 1977*0b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_raw_delim_too_long); 1978*0b57cec5SDimitry Andric } else { 1979*0b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_invalid_char_raw_delim) 1980*0b57cec5SDimitry Andric << StringRef(PrefixEnd, 1); 1981*0b57cec5SDimitry Andric } 1982*0b57cec5SDimitry Andric } 1983*0b57cec5SDimitry Andric 1984*0b57cec5SDimitry Andric // Search for the next '"' in hopes of salvaging the lexer. Unfortunately, 1985*0b57cec5SDimitry Andric // it's possible the '"' was intended to be part of the raw string, but 1986*0b57cec5SDimitry Andric // there's not much we can do about that. 1987*0b57cec5SDimitry Andric while (true) { 1988*0b57cec5SDimitry Andric char C = *CurPtr++; 1989*0b57cec5SDimitry Andric 1990*0b57cec5SDimitry Andric if (C == '"') 1991*0b57cec5SDimitry Andric break; 1992*0b57cec5SDimitry Andric if (C == 0 && CurPtr-1 == BufferEnd) { 1993*0b57cec5SDimitry Andric --CurPtr; 1994*0b57cec5SDimitry Andric break; 1995*0b57cec5SDimitry Andric } 1996*0b57cec5SDimitry Andric } 1997*0b57cec5SDimitry Andric 1998*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 1999*0b57cec5SDimitry Andric return true; 2000*0b57cec5SDimitry Andric } 2001*0b57cec5SDimitry Andric 2002*0b57cec5SDimitry Andric // Save prefix and move CurPtr past it 2003*0b57cec5SDimitry Andric const char *Prefix = CurPtr; 2004*0b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '(' 2005*0b57cec5SDimitry Andric 2006*0b57cec5SDimitry Andric while (true) { 2007*0b57cec5SDimitry Andric char C = *CurPtr++; 2008*0b57cec5SDimitry Andric 2009*0b57cec5SDimitry Andric if (C == ')') { 2010*0b57cec5SDimitry Andric // Check for prefix match and closing quote. 
2011*0b57cec5SDimitry Andric if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') { 2012*0b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '"' 2013*0b57cec5SDimitry Andric break; 2014*0b57cec5SDimitry Andric } 2015*0b57cec5SDimitry Andric } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. 2016*0b57cec5SDimitry Andric if (!isLexingRawMode()) 2017*0b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_raw_string) 2018*0b57cec5SDimitry Andric << StringRef(Prefix, PrefixLen); 2019*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 2020*0b57cec5SDimitry Andric return true; 2021*0b57cec5SDimitry Andric } 2022*0b57cec5SDimitry Andric } 2023*0b57cec5SDimitry Andric 2024*0b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 2025*0b57cec5SDimitry Andric if (getLangOpts().CPlusPlus) 2026*0b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true); 2027*0b57cec5SDimitry Andric 2028*0b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 2029*0b57cec5SDimitry Andric const char *TokStart = BufferPtr; 2030*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 2031*0b57cec5SDimitry Andric Result.setLiteralData(TokStart); 2032*0b57cec5SDimitry Andric return true; 2033*0b57cec5SDimitry Andric } 2034*0b57cec5SDimitry Andric 2035*0b57cec5SDimitry Andric /// LexAngledStringLiteral - Lex the remainder of an angled string literal, 2036*0b57cec5SDimitry Andric /// after having lexed the '<' character. This is used for #include filenames. 2037*0b57cec5SDimitry Andric bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 2038*0b57cec5SDimitry Andric // Does this string contain the \0 character? 
2039*0b57cec5SDimitry Andric const char *NulCharacter = nullptr; 2040*0b57cec5SDimitry Andric const char *AfterLessPos = CurPtr; 2041*0b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 2042*0b57cec5SDimitry Andric while (C != '>') { 2043*0b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by 2044*0b57cec5SDimitry Andric // getAndAdvanceChar. 2045*0b57cec5SDimitry Andric if (C == '\\') 2046*0b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 2047*0b57cec5SDimitry Andric 2048*0b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 2049*0b57cec5SDimitry Andric (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file. 2050*0b57cec5SDimitry Andric // If the filename is unterminated, then it must just be a lone < 2051*0b57cec5SDimitry Andric // character. Return this as such. 2052*0b57cec5SDimitry Andric FormTokenWithChars(Result, AfterLessPos, tok::less); 2053*0b57cec5SDimitry Andric return true; 2054*0b57cec5SDimitry Andric } 2055*0b57cec5SDimitry Andric 2056*0b57cec5SDimitry Andric if (C == 0) { 2057*0b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr - 1)) { 2058*0b57cec5SDimitry Andric codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true); 2059*0b57cec5SDimitry Andric cutOffLexing(); 2060*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown); 2061*0b57cec5SDimitry Andric return true; 2062*0b57cec5SDimitry Andric } 2063*0b57cec5SDimitry Andric NulCharacter = CurPtr-1; 2064*0b57cec5SDimitry Andric } 2065*0b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 2066*0b57cec5SDimitry Andric } 2067*0b57cec5SDimitry Andric 2068*0b57cec5SDimitry Andric // If a nul character existed in the string, warn about it. 
2069*0b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 2070*0b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1; 2071*0b57cec5SDimitry Andric 2072*0b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 2073*0b57cec5SDimitry Andric const char *TokStart = BufferPtr; 2074*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::header_name); 2075*0b57cec5SDimitry Andric Result.setLiteralData(TokStart); 2076*0b57cec5SDimitry Andric return true; 2077*0b57cec5SDimitry Andric } 2078*0b57cec5SDimitry Andric 2079*0b57cec5SDimitry Andric void Lexer::codeCompleteIncludedFile(const char *PathStart, 2080*0b57cec5SDimitry Andric const char *CompletionPoint, 2081*0b57cec5SDimitry Andric bool IsAngled) { 2082*0b57cec5SDimitry Andric // Completion only applies to the filename, after the last slash. 2083*0b57cec5SDimitry Andric StringRef PartialPath(PathStart, CompletionPoint - PathStart); 2084*0b57cec5SDimitry Andric auto Slash = PartialPath.find_last_of(LangOpts.MSVCCompat ? "/\\" : "/"); 2085*0b57cec5SDimitry Andric StringRef Dir = 2086*0b57cec5SDimitry Andric (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash); 2087*0b57cec5SDimitry Andric const char *StartOfFilename = 2088*0b57cec5SDimitry Andric (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1; 2089*0b57cec5SDimitry Andric // Code completion filter range is the filename only, up to completion point. 2090*0b57cec5SDimitry Andric PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get( 2091*0b57cec5SDimitry Andric StringRef(StartOfFilename, CompletionPoint - StartOfFilename))); 2092*0b57cec5SDimitry Andric // We should replace the characters up to the closing quote, if any. 
2093*0b57cec5SDimitry Andric while (CompletionPoint < BufferEnd) { 2094*0b57cec5SDimitry Andric char Next = *(CompletionPoint + 1); 2095*0b57cec5SDimitry Andric if (Next == 0 || Next == '\r' || Next == '\n') 2096*0b57cec5SDimitry Andric break; 2097*0b57cec5SDimitry Andric ++CompletionPoint; 2098*0b57cec5SDimitry Andric if (Next == (IsAngled ? '>' : '"')) 2099*0b57cec5SDimitry Andric break; 2100*0b57cec5SDimitry Andric } 2101*0b57cec5SDimitry Andric PP->setCodeCompletionTokenRange( 2102*0b57cec5SDimitry Andric FileLoc.getLocWithOffset(StartOfFilename - BufferStart), 2103*0b57cec5SDimitry Andric FileLoc.getLocWithOffset(CompletionPoint - BufferStart)); 2104*0b57cec5SDimitry Andric PP->CodeCompleteIncludedFile(Dir, IsAngled); 2105*0b57cec5SDimitry Andric } 2106*0b57cec5SDimitry Andric 2107*0b57cec5SDimitry Andric /// LexCharConstant - Lex the remainder of a character constant, after having 2108*0b57cec5SDimitry Andric /// lexed either ' or L' or u8' or u' or U'. 2109*0b57cec5SDimitry Andric bool Lexer::LexCharConstant(Token &Result, const char *CurPtr, 2110*0b57cec5SDimitry Andric tok::TokenKind Kind) { 2111*0b57cec5SDimitry Andric // Does this character contain the \0 character? 2112*0b57cec5SDimitry Andric const char *NulCharacter = nullptr; 2113*0b57cec5SDimitry Andric 2114*0b57cec5SDimitry Andric if (!isLexingRawMode()) { 2115*0b57cec5SDimitry Andric if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant) 2116*0b57cec5SDimitry Andric Diag(BufferPtr, getLangOpts().CPlusPlus 2117*0b57cec5SDimitry Andric ? 
diag::warn_cxx98_compat_unicode_literal 2118*0b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal); 2119*0b57cec5SDimitry Andric else if (Kind == tok::utf8_char_constant) 2120*0b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal); 2121*0b57cec5SDimitry Andric } 2122*0b57cec5SDimitry Andric 2123*0b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result); 2124*0b57cec5SDimitry Andric if (C == '\'') { 2125*0b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 2126*0b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_empty_character); 2127*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 2128*0b57cec5SDimitry Andric return true; 2129*0b57cec5SDimitry Andric } 2130*0b57cec5SDimitry Andric 2131*0b57cec5SDimitry Andric while (C != '\'') { 2132*0b57cec5SDimitry Andric // Skip escaped characters. 2133*0b57cec5SDimitry Andric if (C == '\\') 2134*0b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 2135*0b57cec5SDimitry Andric 2136*0b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline. 2137*0b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
2138*0b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 2139*0b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0; 2140*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 2141*0b57cec5SDimitry Andric return true; 2142*0b57cec5SDimitry Andric } 2143*0b57cec5SDimitry Andric 2144*0b57cec5SDimitry Andric if (C == 0) { 2145*0b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 2146*0b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 2147*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown); 2148*0b57cec5SDimitry Andric cutOffLexing(); 2149*0b57cec5SDimitry Andric return true; 2150*0b57cec5SDimitry Andric } 2151*0b57cec5SDimitry Andric 2152*0b57cec5SDimitry Andric NulCharacter = CurPtr-1; 2153*0b57cec5SDimitry Andric } 2154*0b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 2155*0b57cec5SDimitry Andric } 2156*0b57cec5SDimitry Andric 2157*0b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix. 2158*0b57cec5SDimitry Andric if (getLangOpts().CPlusPlus) 2159*0b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, false); 2160*0b57cec5SDimitry Andric 2161*0b57cec5SDimitry Andric // If a nul character existed in the character, warn about it. 2162*0b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode()) 2163*0b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 0; 2164*0b57cec5SDimitry Andric 2165*0b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 2166*0b57cec5SDimitry Andric const char *TokStart = BufferPtr; 2167*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 2168*0b57cec5SDimitry Andric Result.setLiteralData(TokStart); 2169*0b57cec5SDimitry Andric return true; 2170*0b57cec5SDimitry Andric } 2171*0b57cec5SDimitry Andric 2172*0b57cec5SDimitry Andric /// SkipWhitespace - Efficiently skip over a series of whitespace characters. 
2173*0b57cec5SDimitry Andric /// Update BufferPtr to point to the next non-whitespace character and return. 2174*0b57cec5SDimitry Andric /// 2175*0b57cec5SDimitry Andric /// This method forms a token and returns true if KeepWhitespaceMode is enabled. 2176*0b57cec5SDimitry Andric bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr, 2177*0b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 2178*0b57cec5SDimitry Andric // Whitespace - Skip it, then return the token after the whitespace. 2179*0b57cec5SDimitry Andric bool SawNewline = isVerticalWhitespace(CurPtr[-1]); 2180*0b57cec5SDimitry Andric 2181*0b57cec5SDimitry Andric unsigned char Char = *CurPtr; 2182*0b57cec5SDimitry Andric 2183*0b57cec5SDimitry Andric // Skip consecutive spaces efficiently. 2184*0b57cec5SDimitry Andric while (true) { 2185*0b57cec5SDimitry Andric // Skip horizontal whitespace very aggressively. 2186*0b57cec5SDimitry Andric while (isHorizontalWhitespace(Char)) 2187*0b57cec5SDimitry Andric Char = *++CurPtr; 2188*0b57cec5SDimitry Andric 2189*0b57cec5SDimitry Andric // Otherwise if we have something other than whitespace, we're done. 2190*0b57cec5SDimitry Andric if (!isVerticalWhitespace(Char)) 2191*0b57cec5SDimitry Andric break; 2192*0b57cec5SDimitry Andric 2193*0b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 2194*0b57cec5SDimitry Andric // End of preprocessor directive line, let LexTokenInternal handle this. 2195*0b57cec5SDimitry Andric BufferPtr = CurPtr; 2196*0b57cec5SDimitry Andric return false; 2197*0b57cec5SDimitry Andric } 2198*0b57cec5SDimitry Andric 2199*0b57cec5SDimitry Andric // OK, but handle newline. 2200*0b57cec5SDimitry Andric SawNewline = true; 2201*0b57cec5SDimitry Andric Char = *++CurPtr; 2202*0b57cec5SDimitry Andric } 2203*0b57cec5SDimitry Andric 2204*0b57cec5SDimitry Andric // If the client wants us to return whitespace, return it now. 
2205*0b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 2206*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 2207*0b57cec5SDimitry Andric if (SawNewline) { 2208*0b57cec5SDimitry Andric IsAtStartOfLine = true; 2209*0b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 2210*0b57cec5SDimitry Andric } 2211*0b57cec5SDimitry Andric // FIXME: The next token will not have LeadingSpace set. 2212*0b57cec5SDimitry Andric return true; 2213*0b57cec5SDimitry Andric } 2214*0b57cec5SDimitry Andric 2215*0b57cec5SDimitry Andric // If this isn't immediately after a newline, there is leading space. 2216*0b57cec5SDimitry Andric char PrevChar = CurPtr[-1]; 2217*0b57cec5SDimitry Andric bool HasLeadingSpace = !isVerticalWhitespace(PrevChar); 2218*0b57cec5SDimitry Andric 2219*0b57cec5SDimitry Andric Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace); 2220*0b57cec5SDimitry Andric if (SawNewline) { 2221*0b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 2222*0b57cec5SDimitry Andric TokAtPhysicalStartOfLine = true; 2223*0b57cec5SDimitry Andric } 2224*0b57cec5SDimitry Andric 2225*0b57cec5SDimitry Andric BufferPtr = CurPtr; 2226*0b57cec5SDimitry Andric return false; 2227*0b57cec5SDimitry Andric } 2228*0b57cec5SDimitry Andric 2229*0b57cec5SDimitry Andric /// We have just read the // characters from input. Skip until we find the 2230*0b57cec5SDimitry Andric /// newline character that terminates the comment. Then update BufferPtr and 2231*0b57cec5SDimitry Andric /// return. 2232*0b57cec5SDimitry Andric /// 2233*0b57cec5SDimitry Andric /// If we're in KeepCommentMode or any CommentHandler has inserted 2234*0b57cec5SDimitry Andric /// some tokens, this will store the first token and return true. 
2235*0b57cec5SDimitry Andric bool Lexer::SkipLineComment(Token &Result, const char *CurPtr, 2236*0b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 2237*0b57cec5SDimitry Andric // If Line comments aren't explicitly enabled for this language, emit an 2238*0b57cec5SDimitry Andric // extension warning. 2239*0b57cec5SDimitry Andric if (!LangOpts.LineComment && !isLexingRawMode()) { 2240*0b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_line_comment); 2241*0b57cec5SDimitry Andric 2242*0b57cec5SDimitry Andric // Mark them enabled so we only emit one warning for this translation 2243*0b57cec5SDimitry Andric // unit. 2244*0b57cec5SDimitry Andric LangOpts.LineComment = true; 2245*0b57cec5SDimitry Andric } 2246*0b57cec5SDimitry Andric 2247*0b57cec5SDimitry Andric // Scan over the body of the comment. The common case, when scanning, is that 2248*0b57cec5SDimitry Andric // the comment contains normal ascii characters with nothing interesting in 2249*0b57cec5SDimitry Andric // them. As such, optimize for this case with the inner loop. 2250*0b57cec5SDimitry Andric // 2251*0b57cec5SDimitry Andric // This loop terminates with CurPtr pointing at the newline (or end of buffer) 2252*0b57cec5SDimitry Andric // character that ends the line comment. 2253*0b57cec5SDimitry Andric char C; 2254*0b57cec5SDimitry Andric while (true) { 2255*0b57cec5SDimitry Andric C = *CurPtr; 2256*0b57cec5SDimitry Andric // Skip over characters in the fast loop. 2257*0b57cec5SDimitry Andric while (C != 0 && // Potentially EOF. 2258*0b57cec5SDimitry Andric C != '\n' && C != '\r') // Newline or DOS-style newline. 2259*0b57cec5SDimitry Andric C = *++CurPtr; 2260*0b57cec5SDimitry Andric 2261*0b57cec5SDimitry Andric const char *NextLine = CurPtr; 2262*0b57cec5SDimitry Andric if (C != 0) { 2263*0b57cec5SDimitry Andric // We found a newline, see if it's escaped. 
2264*0b57cec5SDimitry Andric const char *EscapePtr = CurPtr-1; 2265*0b57cec5SDimitry Andric bool HasSpace = false; 2266*0b57cec5SDimitry Andric while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace. 2267*0b57cec5SDimitry Andric --EscapePtr; 2268*0b57cec5SDimitry Andric HasSpace = true; 2269*0b57cec5SDimitry Andric } 2270*0b57cec5SDimitry Andric 2271*0b57cec5SDimitry Andric if (*EscapePtr == '\\') 2272*0b57cec5SDimitry Andric // Escaped newline. 2273*0b57cec5SDimitry Andric CurPtr = EscapePtr; 2274*0b57cec5SDimitry Andric else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' && 2275*0b57cec5SDimitry Andric EscapePtr[-2] == '?' && LangOpts.Trigraphs) 2276*0b57cec5SDimitry Andric // Trigraph-escaped newline. 2277*0b57cec5SDimitry Andric CurPtr = EscapePtr-2; 2278*0b57cec5SDimitry Andric else 2279*0b57cec5SDimitry Andric break; // This is a newline, we're done. 2280*0b57cec5SDimitry Andric 2281*0b57cec5SDimitry Andric // If there was space between the backslash and newline, warn about it. 2282*0b57cec5SDimitry Andric if (HasSpace && !isLexingRawMode()) 2283*0b57cec5SDimitry Andric Diag(EscapePtr, diag::backslash_newline_space); 2284*0b57cec5SDimitry Andric } 2285*0b57cec5SDimitry Andric 2286*0b57cec5SDimitry Andric // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to 2287*0b57cec5SDimitry Andric // properly decode the character. Read it in raw mode to avoid emitting 2288*0b57cec5SDimitry Andric // diagnostics about things like trigraphs. If we see an escaped newline, 2289*0b57cec5SDimitry Andric // we'll handle it below. 
2290*0b57cec5SDimitry Andric const char *OldPtr = CurPtr; 2291*0b57cec5SDimitry Andric bool OldRawMode = isLexingRawMode(); 2292*0b57cec5SDimitry Andric LexingRawMode = true; 2293*0b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result); 2294*0b57cec5SDimitry Andric LexingRawMode = OldRawMode; 2295*0b57cec5SDimitry Andric 2296*0b57cec5SDimitry Andric // If we only read only one character, then no special handling is needed. 2297*0b57cec5SDimitry Andric // We're done and can skip forward to the newline. 2298*0b57cec5SDimitry Andric if (C != 0 && CurPtr == OldPtr+1) { 2299*0b57cec5SDimitry Andric CurPtr = NextLine; 2300*0b57cec5SDimitry Andric break; 2301*0b57cec5SDimitry Andric } 2302*0b57cec5SDimitry Andric 2303*0b57cec5SDimitry Andric // If we read multiple characters, and one of those characters was a \r or 2304*0b57cec5SDimitry Andric // \n, then we had an escaped newline within the comment. Emit diagnostic 2305*0b57cec5SDimitry Andric // unless the next line is also a // comment. 2306*0b57cec5SDimitry Andric if (CurPtr != OldPtr + 1 && C != '/' && 2307*0b57cec5SDimitry Andric (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) { 2308*0b57cec5SDimitry Andric for (; OldPtr != CurPtr; ++OldPtr) 2309*0b57cec5SDimitry Andric if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 2310*0b57cec5SDimitry Andric // Okay, we found a // comment that ends in a newline, if the next 2311*0b57cec5SDimitry Andric // line is also a // comment, but has spaces, don't emit a diagnostic. 2312*0b57cec5SDimitry Andric if (isWhitespace(C)) { 2313*0b57cec5SDimitry Andric const char *ForwardPtr = CurPtr; 2314*0b57cec5SDimitry Andric while (isWhitespace(*ForwardPtr)) // Skip whitespace. 
2315*0b57cec5SDimitry Andric ++ForwardPtr; 2316*0b57cec5SDimitry Andric if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 2317*0b57cec5SDimitry Andric break; 2318*0b57cec5SDimitry Andric } 2319*0b57cec5SDimitry Andric 2320*0b57cec5SDimitry Andric if (!isLexingRawMode()) 2321*0b57cec5SDimitry Andric Diag(OldPtr-1, diag::ext_multi_line_line_comment); 2322*0b57cec5SDimitry Andric break; 2323*0b57cec5SDimitry Andric } 2324*0b57cec5SDimitry Andric } 2325*0b57cec5SDimitry Andric 2326*0b57cec5SDimitry Andric if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) { 2327*0b57cec5SDimitry Andric --CurPtr; 2328*0b57cec5SDimitry Andric break; 2329*0b57cec5SDimitry Andric } 2330*0b57cec5SDimitry Andric 2331*0b57cec5SDimitry Andric if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 2332*0b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 2333*0b57cec5SDimitry Andric cutOffLexing(); 2334*0b57cec5SDimitry Andric return false; 2335*0b57cec5SDimitry Andric } 2336*0b57cec5SDimitry Andric } 2337*0b57cec5SDimitry Andric 2338*0b57cec5SDimitry Andric // Found but did not consume the newline. Notify comment handlers about the 2339*0b57cec5SDimitry Andric // comment unless we're in a #if 0 block. 2340*0b57cec5SDimitry Andric if (PP && !isLexingRawMode() && 2341*0b57cec5SDimitry Andric PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 2342*0b57cec5SDimitry Andric getSourceLocation(CurPtr)))) { 2343*0b57cec5SDimitry Andric BufferPtr = CurPtr; 2344*0b57cec5SDimitry Andric return true; // A token has to be returned. 2345*0b57cec5SDimitry Andric } 2346*0b57cec5SDimitry Andric 2347*0b57cec5SDimitry Andric // If we are returning comments as tokens, return this comment as a token. 
2348*0b57cec5SDimitry Andric if (inKeepCommentMode()) 2349*0b57cec5SDimitry Andric return SaveLineComment(Result, CurPtr); 2350*0b57cec5SDimitry Andric 2351*0b57cec5SDimitry Andric // If we are inside a preprocessor directive and we see the end of line, 2352*0b57cec5SDimitry Andric // return immediately, so that the lexer can return this as an EOD token. 2353*0b57cec5SDimitry Andric if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 2354*0b57cec5SDimitry Andric BufferPtr = CurPtr; 2355*0b57cec5SDimitry Andric return false; 2356*0b57cec5SDimitry Andric } 2357*0b57cec5SDimitry Andric 2358*0b57cec5SDimitry Andric // Otherwise, eat the \n character. We don't care if this is a \n\r or 2359*0b57cec5SDimitry Andric // \r\n sequence. This is an efficiency hack (because we know the \n can't 2360*0b57cec5SDimitry Andric // contribute to another token), it isn't needed for correctness. Note that 2361*0b57cec5SDimitry Andric // this is ok even in KeepWhitespaceMode, because we would have returned the 2362*0b57cec5SDimitry Andric /// comment above in that mode. 2363*0b57cec5SDimitry Andric ++CurPtr; 2364*0b57cec5SDimitry Andric 2365*0b57cec5SDimitry Andric // The next returned token is at the start of the line. 2366*0b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 2367*0b57cec5SDimitry Andric TokAtPhysicalStartOfLine = true; 2368*0b57cec5SDimitry Andric // No leading whitespace seen so far. 2369*0b57cec5SDimitry Andric Result.clearFlag(Token::LeadingSpace); 2370*0b57cec5SDimitry Andric BufferPtr = CurPtr; 2371*0b57cec5SDimitry Andric return false; 2372*0b57cec5SDimitry Andric } 2373*0b57cec5SDimitry Andric 2374*0b57cec5SDimitry Andric /// If in save-comment mode, package up this Line comment in an appropriate 2375*0b57cec5SDimitry Andric /// way and return it. 
2376*0b57cec5SDimitry Andric bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) { 2377*0b57cec5SDimitry Andric // If we're not in a preprocessor directive, just return the // comment 2378*0b57cec5SDimitry Andric // directly. 2379*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::comment); 2380*0b57cec5SDimitry Andric 2381*0b57cec5SDimitry Andric if (!ParsingPreprocessorDirective || LexingRawMode) 2382*0b57cec5SDimitry Andric return true; 2383*0b57cec5SDimitry Andric 2384*0b57cec5SDimitry Andric // If this Line-style comment is in a macro definition, transmogrify it into 2385*0b57cec5SDimitry Andric // a C-style block comment. 2386*0b57cec5SDimitry Andric bool Invalid = false; 2387*0b57cec5SDimitry Andric std::string Spelling = PP->getSpelling(Result, &Invalid); 2388*0b57cec5SDimitry Andric if (Invalid) 2389*0b57cec5SDimitry Andric return true; 2390*0b57cec5SDimitry Andric 2391*0b57cec5SDimitry Andric assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?"); 2392*0b57cec5SDimitry Andric Spelling[1] = '*'; // Change prefix to "/*". 2393*0b57cec5SDimitry Andric Spelling += "*/"; // add suffix. 2394*0b57cec5SDimitry Andric 2395*0b57cec5SDimitry Andric Result.setKind(tok::comment); 2396*0b57cec5SDimitry Andric PP->CreateString(Spelling, Result, 2397*0b57cec5SDimitry Andric Result.getLocation(), Result.getLocation()); 2398*0b57cec5SDimitry Andric return true; 2399*0b57cec5SDimitry Andric } 2400*0b57cec5SDimitry Andric 2401*0b57cec5SDimitry Andric /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline 2402*0b57cec5SDimitry Andric /// character (either \\n or \\r) is part of an escaped newline sequence. Issue 2403*0b57cec5SDimitry Andric /// a diagnostic if so. We know that the newline is inside of a block comment. 
2404*0b57cec5SDimitry Andric static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, 2405*0b57cec5SDimitry Andric Lexer *L) { 2406*0b57cec5SDimitry Andric assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); 2407*0b57cec5SDimitry Andric 2408*0b57cec5SDimitry Andric // Back up off the newline. 2409*0b57cec5SDimitry Andric --CurPtr; 2410*0b57cec5SDimitry Andric 2411*0b57cec5SDimitry Andric // If this is a two-character newline sequence, skip the other character. 2412*0b57cec5SDimitry Andric if (CurPtr[0] == '\n' || CurPtr[0] == '\r') { 2413*0b57cec5SDimitry Andric // \n\n or \r\r -> not escaped newline. 2414*0b57cec5SDimitry Andric if (CurPtr[0] == CurPtr[1]) 2415*0b57cec5SDimitry Andric return false; 2416*0b57cec5SDimitry Andric // \n\r or \r\n -> skip the newline. 2417*0b57cec5SDimitry Andric --CurPtr; 2418*0b57cec5SDimitry Andric } 2419*0b57cec5SDimitry Andric 2420*0b57cec5SDimitry Andric // If we have horizontal whitespace, skip over it. We allow whitespace 2421*0b57cec5SDimitry Andric // between the slash and newline. 2422*0b57cec5SDimitry Andric bool HasSpace = false; 2423*0b57cec5SDimitry Andric while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 2424*0b57cec5SDimitry Andric --CurPtr; 2425*0b57cec5SDimitry Andric HasSpace = true; 2426*0b57cec5SDimitry Andric } 2427*0b57cec5SDimitry Andric 2428*0b57cec5SDimitry Andric // If we have a slash, we know this is an escaped newline. 2429*0b57cec5SDimitry Andric if (*CurPtr == '\\') { 2430*0b57cec5SDimitry Andric if (CurPtr[-1] != '*') return false; 2431*0b57cec5SDimitry Andric } else { 2432*0b57cec5SDimitry Andric // It isn't a slash, is it the ?? / trigraph? 2433*0b57cec5SDimitry Andric if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' || 2434*0b57cec5SDimitry Andric CurPtr[-3] != '*') 2435*0b57cec5SDimitry Andric return false; 2436*0b57cec5SDimitry Andric 2437*0b57cec5SDimitry Andric // This is the trigraph ending the comment. Emit a stern warning! 
2438*0b57cec5SDimitry Andric CurPtr -= 2; 2439*0b57cec5SDimitry Andric 2440*0b57cec5SDimitry Andric // If no trigraphs are enabled, warn that we ignored this trigraph and 2441*0b57cec5SDimitry Andric // ignore this * character. 2442*0b57cec5SDimitry Andric if (!L->getLangOpts().Trigraphs) { 2443*0b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2444*0b57cec5SDimitry Andric L->Diag(CurPtr, diag::trigraph_ignored_block_comment); 2445*0b57cec5SDimitry Andric return false; 2446*0b57cec5SDimitry Andric } 2447*0b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2448*0b57cec5SDimitry Andric L->Diag(CurPtr, diag::trigraph_ends_block_comment); 2449*0b57cec5SDimitry Andric } 2450*0b57cec5SDimitry Andric 2451*0b57cec5SDimitry Andric // Warn about having an escaped newline between the */ characters. 2452*0b57cec5SDimitry Andric if (!L->isLexingRawMode()) 2453*0b57cec5SDimitry Andric L->Diag(CurPtr, diag::escaped_newline_block_comment_end); 2454*0b57cec5SDimitry Andric 2455*0b57cec5SDimitry Andric // If there was space between the backslash and newline, warn about it. 2456*0b57cec5SDimitry Andric if (HasSpace && !L->isLexingRawMode()) 2457*0b57cec5SDimitry Andric L->Diag(CurPtr, diag::backslash_newline_space); 2458*0b57cec5SDimitry Andric 2459*0b57cec5SDimitry Andric return true; 2460*0b57cec5SDimitry Andric } 2461*0b57cec5SDimitry Andric 2462*0b57cec5SDimitry Andric #ifdef __SSE2__ 2463*0b57cec5SDimitry Andric #include <emmintrin.h> 2464*0b57cec5SDimitry Andric #elif __ALTIVEC__ 2465*0b57cec5SDimitry Andric #include <altivec.h> 2466*0b57cec5SDimitry Andric #undef bool 2467*0b57cec5SDimitry Andric #endif 2468*0b57cec5SDimitry Andric 2469*0b57cec5SDimitry Andric /// We have just read from input the / and * characters that started a comment. 2470*0b57cec5SDimitry Andric /// Read until we find the * and / characters that terminate the comment. 
2471*0b57cec5SDimitry Andric /// Note that we don't bother decoding trigraphs or escaped newlines in block 2472*0b57cec5SDimitry Andric /// comments, because they cannot cause the comment to end. The only thing 2473*0b57cec5SDimitry Andric /// that can happen is the comment could end with an escaped newline between 2474*0b57cec5SDimitry Andric /// the terminating * and /. 2475*0b57cec5SDimitry Andric /// 2476*0b57cec5SDimitry Andric /// If we're in KeepCommentMode or any CommentHandler has inserted 2477*0b57cec5SDimitry Andric /// some tokens, this will store the first token and return true. 2478*0b57cec5SDimitry Andric bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr, 2479*0b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) { 2480*0b57cec5SDimitry Andric // Scan one character past where we should, looking for a '/' character. Once 2481*0b57cec5SDimitry Andric // we find it, check to see if it was preceded by a *. This common 2482*0b57cec5SDimitry Andric // optimization helps people who like to put a lot of * characters in their 2483*0b57cec5SDimitry Andric // comments. 2484*0b57cec5SDimitry Andric 2485*0b57cec5SDimitry Andric // The first character we get with newlines and trigraphs skipped to handle 2486*0b57cec5SDimitry Andric // the degenerate /*/ case below correctly if the * has an escaped newline 2487*0b57cec5SDimitry Andric // after it. 2488*0b57cec5SDimitry Andric unsigned CharSize; 2489*0b57cec5SDimitry Andric unsigned char C = getCharAndSize(CurPtr, CharSize); 2490*0b57cec5SDimitry Andric CurPtr += CharSize; 2491*0b57cec5SDimitry Andric if (C == 0 && CurPtr == BufferEnd+1) { 2492*0b57cec5SDimitry Andric if (!isLexingRawMode()) 2493*0b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment); 2494*0b57cec5SDimitry Andric --CurPtr; 2495*0b57cec5SDimitry Andric 2496*0b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. 
Since 2497*0b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token. 2498*0b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 2499*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 2500*0b57cec5SDimitry Andric return true; 2501*0b57cec5SDimitry Andric } 2502*0b57cec5SDimitry Andric 2503*0b57cec5SDimitry Andric BufferPtr = CurPtr; 2504*0b57cec5SDimitry Andric return false; 2505*0b57cec5SDimitry Andric } 2506*0b57cec5SDimitry Andric 2507*0b57cec5SDimitry Andric // Check to see if the first character after the '/*' is another /. If so, 2508*0b57cec5SDimitry Andric // then this slash does not end the block comment, it is part of it. 2509*0b57cec5SDimitry Andric if (C == '/') 2510*0b57cec5SDimitry Andric C = *CurPtr++; 2511*0b57cec5SDimitry Andric 2512*0b57cec5SDimitry Andric while (true) { 2513*0b57cec5SDimitry Andric // Skip over all non-interesting characters until we find end of buffer or a 2514*0b57cec5SDimitry Andric // (probably ending) '/' character. 2515*0b57cec5SDimitry Andric if (CurPtr + 24 < BufferEnd && 2516*0b57cec5SDimitry Andric // If there is a code-completion point avoid the fast scan because it 2517*0b57cec5SDimitry Andric // doesn't check for '\0'. 2518*0b57cec5SDimitry Andric !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { 2519*0b57cec5SDimitry Andric // While not aligned to a 16-byte boundary. 
2520*0b57cec5SDimitry Andric while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) 2521*0b57cec5SDimitry Andric C = *CurPtr++; 2522*0b57cec5SDimitry Andric 2523*0b57cec5SDimitry Andric if (C == '/') goto FoundSlash; 2524*0b57cec5SDimitry Andric 2525*0b57cec5SDimitry Andric #ifdef __SSE2__ 2526*0b57cec5SDimitry Andric __m128i Slashes = _mm_set1_epi8('/'); 2527*0b57cec5SDimitry Andric while (CurPtr+16 <= BufferEnd) { 2528*0b57cec5SDimitry Andric int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr, 2529*0b57cec5SDimitry Andric Slashes)); 2530*0b57cec5SDimitry Andric if (cmp != 0) { 2531*0b57cec5SDimitry Andric // Adjust the pointer to point directly after the first slash. It's 2532*0b57cec5SDimitry Andric // not necessary to set C here, it will be overwritten at the end of 2533*0b57cec5SDimitry Andric // the outer loop. 2534*0b57cec5SDimitry Andric CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1; 2535*0b57cec5SDimitry Andric goto FoundSlash; 2536*0b57cec5SDimitry Andric } 2537*0b57cec5SDimitry Andric CurPtr += 16; 2538*0b57cec5SDimitry Andric } 2539*0b57cec5SDimitry Andric #elif __ALTIVEC__ 2540*0b57cec5SDimitry Andric __vector unsigned char Slashes = { 2541*0b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/', 2542*0b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/' 2543*0b57cec5SDimitry Andric }; 2544*0b57cec5SDimitry Andric while (CurPtr+16 <= BufferEnd && 2545*0b57cec5SDimitry Andric !vec_any_eq(*(const vector unsigned char*)CurPtr, Slashes)) 2546*0b57cec5SDimitry Andric CurPtr += 16; 2547*0b57cec5SDimitry Andric #else 2548*0b57cec5SDimitry Andric // Scan for '/' quickly. Many block comments are very large. 
2549*0b57cec5SDimitry Andric while (CurPtr[0] != '/' && 2550*0b57cec5SDimitry Andric CurPtr[1] != '/' && 2551*0b57cec5SDimitry Andric CurPtr[2] != '/' && 2552*0b57cec5SDimitry Andric CurPtr[3] != '/' && 2553*0b57cec5SDimitry Andric CurPtr+4 < BufferEnd) { 2554*0b57cec5SDimitry Andric CurPtr += 4; 2555*0b57cec5SDimitry Andric } 2556*0b57cec5SDimitry Andric #endif 2557*0b57cec5SDimitry Andric 2558*0b57cec5SDimitry Andric // It has to be one of the bytes scanned, increment to it and read one. 2559*0b57cec5SDimitry Andric C = *CurPtr++; 2560*0b57cec5SDimitry Andric } 2561*0b57cec5SDimitry Andric 2562*0b57cec5SDimitry Andric // Loop to scan the remainder. 2563*0b57cec5SDimitry Andric while (C != '/' && C != '\0') 2564*0b57cec5SDimitry Andric C = *CurPtr++; 2565*0b57cec5SDimitry Andric 2566*0b57cec5SDimitry Andric if (C == '/') { 2567*0b57cec5SDimitry Andric FoundSlash: 2568*0b57cec5SDimitry Andric if (CurPtr[-2] == '*') // We found the final */. We're done! 2569*0b57cec5SDimitry Andric break; 2570*0b57cec5SDimitry Andric 2571*0b57cec5SDimitry Andric if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 2572*0b57cec5SDimitry Andric if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { 2573*0b57cec5SDimitry Andric // We found the final */, though it had an escaped newline between the 2574*0b57cec5SDimitry Andric // * and /. We're done! 2575*0b57cec5SDimitry Andric break; 2576*0b57cec5SDimitry Andric } 2577*0b57cec5SDimitry Andric } 2578*0b57cec5SDimitry Andric if (CurPtr[0] == '*' && CurPtr[1] != '/') { 2579*0b57cec5SDimitry Andric // If this is a /* inside of the comment, emit a warning. Don't do this 2580*0b57cec5SDimitry Andric // if this is a /*/, which will end the comment. This misses cases with 2581*0b57cec5SDimitry Andric // embedded escaped newlines, but oh well. 
2582*0b57cec5SDimitry Andric if (!isLexingRawMode()) 2583*0b57cec5SDimitry Andric Diag(CurPtr-1, diag::warn_nested_block_comment); 2584*0b57cec5SDimitry Andric } 2585*0b57cec5SDimitry Andric } else if (C == 0 && CurPtr == BufferEnd+1) { 2586*0b57cec5SDimitry Andric if (!isLexingRawMode()) 2587*0b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment); 2588*0b57cec5SDimitry Andric // Note: the user probably forgot a */. We could continue immediately 2589*0b57cec5SDimitry Andric // after the /*, but this would involve lexing a lot of what really is the 2590*0b57cec5SDimitry Andric // comment, which surely would confuse the parser. 2591*0b57cec5SDimitry Andric --CurPtr; 2592*0b57cec5SDimitry Andric 2593*0b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. Since 2594*0b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token. 2595*0b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 2596*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 2597*0b57cec5SDimitry Andric return true; 2598*0b57cec5SDimitry Andric } 2599*0b57cec5SDimitry Andric 2600*0b57cec5SDimitry Andric BufferPtr = CurPtr; 2601*0b57cec5SDimitry Andric return false; 2602*0b57cec5SDimitry Andric } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 2603*0b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 2604*0b57cec5SDimitry Andric cutOffLexing(); 2605*0b57cec5SDimitry Andric return false; 2606*0b57cec5SDimitry Andric } 2607*0b57cec5SDimitry Andric 2608*0b57cec5SDimitry Andric C = *CurPtr++; 2609*0b57cec5SDimitry Andric } 2610*0b57cec5SDimitry Andric 2611*0b57cec5SDimitry Andric // Notify comment handlers about the comment unless we're in a #if 0 block. 
2612*0b57cec5SDimitry Andric if (PP && !isLexingRawMode() && 2613*0b57cec5SDimitry Andric PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 2614*0b57cec5SDimitry Andric getSourceLocation(CurPtr)))) { 2615*0b57cec5SDimitry Andric BufferPtr = CurPtr; 2616*0b57cec5SDimitry Andric return true; // A token has to be returned. 2617*0b57cec5SDimitry Andric } 2618*0b57cec5SDimitry Andric 2619*0b57cec5SDimitry Andric // If we are returning comments as tokens, return this comment as a token. 2620*0b57cec5SDimitry Andric if (inKeepCommentMode()) { 2621*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::comment); 2622*0b57cec5SDimitry Andric return true; 2623*0b57cec5SDimitry Andric } 2624*0b57cec5SDimitry Andric 2625*0b57cec5SDimitry Andric // It is common for the tokens immediately after a /**/ comment to be 2626*0b57cec5SDimitry Andric // whitespace. Instead of going through the big switch, handle it 2627*0b57cec5SDimitry Andric // efficiently now. This is safe even in KeepWhitespaceMode because we would 2628*0b57cec5SDimitry Andric // have already returned above with the comment as a token. 2629*0b57cec5SDimitry Andric if (isHorizontalWhitespace(*CurPtr)) { 2630*0b57cec5SDimitry Andric SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine); 2631*0b57cec5SDimitry Andric return false; 2632*0b57cec5SDimitry Andric } 2633*0b57cec5SDimitry Andric 2634*0b57cec5SDimitry Andric // Otherwise, just return so that the next character will be lexed as a token. 
2635*0b57cec5SDimitry Andric BufferPtr = CurPtr; 2636*0b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 2637*0b57cec5SDimitry Andric return false; 2638*0b57cec5SDimitry Andric } 2639*0b57cec5SDimitry Andric 2640*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 2641*0b57cec5SDimitry Andric // Primary Lexing Entry Points 2642*0b57cec5SDimitry Andric //===----------------------------------------------------------------------===// 2643*0b57cec5SDimitry Andric 2644*0b57cec5SDimitry Andric /// ReadToEndOfLine - Read the rest of the current preprocessor line as an 2645*0b57cec5SDimitry Andric /// uninterpreted string. This switches the lexer out of directive mode. 2646*0b57cec5SDimitry Andric void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) { 2647*0b57cec5SDimitry Andric assert(ParsingPreprocessorDirective && ParsingFilename == false && 2648*0b57cec5SDimitry Andric "Must be in a preprocessing directive!"); 2649*0b57cec5SDimitry Andric Token Tmp; 2650*0b57cec5SDimitry Andric 2651*0b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable. 2652*0b57cec5SDimitry Andric const char *CurPtr = BufferPtr; 2653*0b57cec5SDimitry Andric while (true) { 2654*0b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Tmp); 2655*0b57cec5SDimitry Andric switch (Char) { 2656*0b57cec5SDimitry Andric default: 2657*0b57cec5SDimitry Andric if (Result) 2658*0b57cec5SDimitry Andric Result->push_back(Char); 2659*0b57cec5SDimitry Andric break; 2660*0b57cec5SDimitry Andric case 0: // Null. 2661*0b57cec5SDimitry Andric // Found end of file? 
2662*0b57cec5SDimitry Andric if (CurPtr-1 != BufferEnd) { 2663*0b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 2664*0b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 2665*0b57cec5SDimitry Andric cutOffLexing(); 2666*0b57cec5SDimitry Andric return; 2667*0b57cec5SDimitry Andric } 2668*0b57cec5SDimitry Andric 2669*0b57cec5SDimitry Andric // Nope, normal character, continue. 2670*0b57cec5SDimitry Andric if (Result) 2671*0b57cec5SDimitry Andric Result->push_back(Char); 2672*0b57cec5SDimitry Andric break; 2673*0b57cec5SDimitry Andric } 2674*0b57cec5SDimitry Andric // FALL THROUGH. 2675*0b57cec5SDimitry Andric LLVM_FALLTHROUGH; 2676*0b57cec5SDimitry Andric case '\r': 2677*0b57cec5SDimitry Andric case '\n': 2678*0b57cec5SDimitry Andric // Okay, we found the end of the line. First, back up past the \0, \r, \n. 2679*0b57cec5SDimitry Andric assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 2680*0b57cec5SDimitry Andric BufferPtr = CurPtr-1; 2681*0b57cec5SDimitry Andric 2682*0b57cec5SDimitry Andric // Next, lex the character, which should handle the EOD transition. 2683*0b57cec5SDimitry Andric Lex(Tmp); 2684*0b57cec5SDimitry Andric if (Tmp.is(tok::code_completion)) { 2685*0b57cec5SDimitry Andric if (PP) 2686*0b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage(); 2687*0b57cec5SDimitry Andric Lex(Tmp); 2688*0b57cec5SDimitry Andric } 2689*0b57cec5SDimitry Andric assert(Tmp.is(tok::eod) && "Unexpected token!"); 2690*0b57cec5SDimitry Andric 2691*0b57cec5SDimitry Andric // Finally, we're done; 2692*0b57cec5SDimitry Andric return; 2693*0b57cec5SDimitry Andric } 2694*0b57cec5SDimitry Andric } 2695*0b57cec5SDimitry Andric } 2696*0b57cec5SDimitry Andric 2697*0b57cec5SDimitry Andric /// LexEndOfFile - CurPtr points to the end of this file. Handle this 2698*0b57cec5SDimitry Andric /// condition, reporting diagnostics and handling other edge cases as required. 
2699*0b57cec5SDimitry Andric /// This returns true if Result contains a token, false if PP.Lex should be 2700*0b57cec5SDimitry Andric /// called again. 2701*0b57cec5SDimitry Andric bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 2702*0b57cec5SDimitry Andric // If we hit the end of the file while parsing a preprocessor directive, 2703*0b57cec5SDimitry Andric // end the preprocessor directive first. The next token returned will 2704*0b57cec5SDimitry Andric // then be the end of file. 2705*0b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 2706*0b57cec5SDimitry Andric // Done parsing the "line". 2707*0b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 2708*0b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 2709*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::eod); 2710*0b57cec5SDimitry Andric 2711*0b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive. 2712*0b57cec5SDimitry Andric if (PP) 2713*0b57cec5SDimitry Andric resetExtendedTokenMode(); 2714*0b57cec5SDimitry Andric return true; // Have a token. 2715*0b57cec5SDimitry Andric } 2716*0b57cec5SDimitry Andric 2717*0b57cec5SDimitry Andric // If we are in raw mode, return this event as an EOF token. Let the caller 2718*0b57cec5SDimitry Andric // that put us in raw mode handle the event. 
2719*0b57cec5SDimitry Andric if (isLexingRawMode()) { 2720*0b57cec5SDimitry Andric Result.startToken(); 2721*0b57cec5SDimitry Andric BufferPtr = BufferEnd; 2722*0b57cec5SDimitry Andric FormTokenWithChars(Result, BufferEnd, tok::eof); 2723*0b57cec5SDimitry Andric return true; 2724*0b57cec5SDimitry Andric } 2725*0b57cec5SDimitry Andric 2726*0b57cec5SDimitry Andric if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { 2727*0b57cec5SDimitry Andric PP->setRecordedPreambleConditionalStack(ConditionalStack); 2728*0b57cec5SDimitry Andric ConditionalStack.clear(); 2729*0b57cec5SDimitry Andric } 2730*0b57cec5SDimitry Andric 2731*0b57cec5SDimitry Andric // Issue diagnostics for unterminated #if and missing newline. 2732*0b57cec5SDimitry Andric 2733*0b57cec5SDimitry Andric // If we are in a #if directive, emit an error. 2734*0b57cec5SDimitry Andric while (!ConditionalStack.empty()) { 2735*0b57cec5SDimitry Andric if (PP->getCodeCompletionFileLoc() != FileLoc) 2736*0b57cec5SDimitry Andric PP->Diag(ConditionalStack.back().IfLoc, 2737*0b57cec5SDimitry Andric diag::err_pp_unterminated_conditional); 2738*0b57cec5SDimitry Andric ConditionalStack.pop_back(); 2739*0b57cec5SDimitry Andric } 2740*0b57cec5SDimitry Andric 2741*0b57cec5SDimitry Andric // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 2742*0b57cec5SDimitry Andric // a pedwarn. 
2743*0b57cec5SDimitry Andric if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { 2744*0b57cec5SDimitry Andric DiagnosticsEngine &Diags = PP->getDiagnostics(); 2745*0b57cec5SDimitry Andric SourceLocation EndLoc = getSourceLocation(BufferEnd); 2746*0b57cec5SDimitry Andric unsigned DiagID; 2747*0b57cec5SDimitry Andric 2748*0b57cec5SDimitry Andric if (LangOpts.CPlusPlus11) { 2749*0b57cec5SDimitry Andric // C++11 [lex.phases] 2.2 p2 2750*0b57cec5SDimitry Andric // Prefer the C++98 pedantic compatibility warning over the generic, 2751*0b57cec5SDimitry Andric // non-extension, user-requested "missing newline at EOF" warning. 2752*0b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) { 2753*0b57cec5SDimitry Andric DiagID = diag::warn_cxx98_compat_no_newline_eof; 2754*0b57cec5SDimitry Andric } else { 2755*0b57cec5SDimitry Andric DiagID = diag::warn_no_newline_eof; 2756*0b57cec5SDimitry Andric } 2757*0b57cec5SDimitry Andric } else { 2758*0b57cec5SDimitry Andric DiagID = diag::ext_no_newline_eof; 2759*0b57cec5SDimitry Andric } 2760*0b57cec5SDimitry Andric 2761*0b57cec5SDimitry Andric Diag(BufferEnd, DiagID) 2762*0b57cec5SDimitry Andric << FixItHint::CreateInsertion(EndLoc, "\n"); 2763*0b57cec5SDimitry Andric } 2764*0b57cec5SDimitry Andric 2765*0b57cec5SDimitry Andric BufferPtr = CurPtr; 2766*0b57cec5SDimitry Andric 2767*0b57cec5SDimitry Andric // Finally, let the preprocessor handle this. 2768*0b57cec5SDimitry Andric return PP->HandleEndOfFile(Result, isPragmaLexer()); 2769*0b57cec5SDimitry Andric } 2770*0b57cec5SDimitry Andric 2771*0b57cec5SDimitry Andric /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 2772*0b57cec5SDimitry Andric /// the specified lexer will return a tok::l_paren token, 0 if it is something 2773*0b57cec5SDimitry Andric /// else and 2 if there are no more tokens in the buffer controlled by the 2774*0b57cec5SDimitry Andric /// lexer. 
2775*0b57cec5SDimitry Andric unsigned Lexer::isNextPPTokenLParen() { 2776*0b57cec5SDimitry Andric assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 2777*0b57cec5SDimitry Andric 2778*0b57cec5SDimitry Andric // Switch to 'skipping' mode. This will ensure that we can lex a token 2779*0b57cec5SDimitry Andric // without emitting diagnostics, disables macro expansion, and will cause EOF 2780*0b57cec5SDimitry Andric // to return an EOF token instead of popping the include stack. 2781*0b57cec5SDimitry Andric LexingRawMode = true; 2782*0b57cec5SDimitry Andric 2783*0b57cec5SDimitry Andric // Save state that can be changed while lexing so that we can restore it. 2784*0b57cec5SDimitry Andric const char *TmpBufferPtr = BufferPtr; 2785*0b57cec5SDimitry Andric bool inPPDirectiveMode = ParsingPreprocessorDirective; 2786*0b57cec5SDimitry Andric bool atStartOfLine = IsAtStartOfLine; 2787*0b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 2788*0b57cec5SDimitry Andric bool leadingSpace = HasLeadingSpace; 2789*0b57cec5SDimitry Andric 2790*0b57cec5SDimitry Andric Token Tok; 2791*0b57cec5SDimitry Andric Lex(Tok); 2792*0b57cec5SDimitry Andric 2793*0b57cec5SDimitry Andric // Restore state that may have changed. 2794*0b57cec5SDimitry Andric BufferPtr = TmpBufferPtr; 2795*0b57cec5SDimitry Andric ParsingPreprocessorDirective = inPPDirectiveMode; 2796*0b57cec5SDimitry Andric HasLeadingSpace = leadingSpace; 2797*0b57cec5SDimitry Andric IsAtStartOfLine = atStartOfLine; 2798*0b57cec5SDimitry Andric IsAtPhysicalStartOfLine = atPhysicalStartOfLine; 2799*0b57cec5SDimitry Andric 2800*0b57cec5SDimitry Andric // Restore the lexer back to non-skipping mode. 
2801*0b57cec5SDimitry Andric LexingRawMode = false; 2802*0b57cec5SDimitry Andric 2803*0b57cec5SDimitry Andric if (Tok.is(tok::eof)) 2804*0b57cec5SDimitry Andric return 2; 2805*0b57cec5SDimitry Andric return Tok.is(tok::l_paren); 2806*0b57cec5SDimitry Andric } 2807*0b57cec5SDimitry Andric 2808*0b57cec5SDimitry Andric /// Find the end of a version control conflict marker. 2809*0b57cec5SDimitry Andric static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 2810*0b57cec5SDimitry Andric ConflictMarkerKind CMK) { 2811*0b57cec5SDimitry Andric const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 2812*0b57cec5SDimitry Andric size_t TermLen = CMK == CMK_Perforce ? 5 : 7; 2813*0b57cec5SDimitry Andric auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen); 2814*0b57cec5SDimitry Andric size_t Pos = RestOfBuffer.find(Terminator); 2815*0b57cec5SDimitry Andric while (Pos != StringRef::npos) { 2816*0b57cec5SDimitry Andric // Must occur at start of line. 2817*0b57cec5SDimitry Andric if (Pos == 0 || 2818*0b57cec5SDimitry Andric (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) { 2819*0b57cec5SDimitry Andric RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 2820*0b57cec5SDimitry Andric Pos = RestOfBuffer.find(Terminator); 2821*0b57cec5SDimitry Andric continue; 2822*0b57cec5SDimitry Andric } 2823*0b57cec5SDimitry Andric return RestOfBuffer.data()+Pos; 2824*0b57cec5SDimitry Andric } 2825*0b57cec5SDimitry Andric return nullptr; 2826*0b57cec5SDimitry Andric } 2827*0b57cec5SDimitry Andric 2828*0b57cec5SDimitry Andric /// IsStartOfConflictMarker - If the specified pointer is the start of a version 2829*0b57cec5SDimitry Andric /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 2830*0b57cec5SDimitry Andric /// and recover nicely. This returns true if it is a conflict marker and false 2831*0b57cec5SDimitry Andric /// if not. 
2832*0b57cec5SDimitry Andric bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 2833*0b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line. 2834*0b57cec5SDimitry Andric if (CurPtr != BufferStart && 2835*0b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2836*0b57cec5SDimitry Andric return false; 2837*0b57cec5SDimitry Andric 2838*0b57cec5SDimitry Andric // Check to see if we have <<<<<<< or >>>>. 2839*0b57cec5SDimitry Andric if (!StringRef(CurPtr, BufferEnd - CurPtr).startswith("<<<<<<<") && 2840*0b57cec5SDimitry Andric !StringRef(CurPtr, BufferEnd - CurPtr).startswith(">>>> ")) 2841*0b57cec5SDimitry Andric return false; 2842*0b57cec5SDimitry Andric 2843*0b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore 2844*0b57cec5SDimitry Andric // it. 2845*0b57cec5SDimitry Andric if (CurrentConflictMarkerState || isLexingRawMode()) 2846*0b57cec5SDimitry Andric return false; 2847*0b57cec5SDimitry Andric 2848*0b57cec5SDimitry Andric ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 2849*0b57cec5SDimitry Andric 2850*0b57cec5SDimitry Andric // Check to see if there is an ending marker somewhere in the buffer at the 2851*0b57cec5SDimitry Andric // start of a line to terminate this conflict marker. 2852*0b57cec5SDimitry Andric if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 2853*0b57cec5SDimitry Andric // We found a match. We are really in a conflict marker. 2854*0b57cec5SDimitry Andric // Diagnose this, and ignore to the end of line. 2855*0b57cec5SDimitry Andric Diag(CurPtr, diag::err_conflict_marker); 2856*0b57cec5SDimitry Andric CurrentConflictMarkerState = Kind; 2857*0b57cec5SDimitry Andric 2858*0b57cec5SDimitry Andric // Skip ahead to the end of line. We know this exists because the 2859*0b57cec5SDimitry Andric // end-of-conflict marker starts with \r or \n. 
2860*0b57cec5SDimitry Andric while (*CurPtr != '\r' && *CurPtr != '\n') { 2861*0b57cec5SDimitry Andric assert(CurPtr != BufferEnd && "Didn't find end of line"); 2862*0b57cec5SDimitry Andric ++CurPtr; 2863*0b57cec5SDimitry Andric } 2864*0b57cec5SDimitry Andric BufferPtr = CurPtr; 2865*0b57cec5SDimitry Andric return true; 2866*0b57cec5SDimitry Andric } 2867*0b57cec5SDimitry Andric 2868*0b57cec5SDimitry Andric // No end of conflict marker found. 2869*0b57cec5SDimitry Andric return false; 2870*0b57cec5SDimitry Andric } 2871*0b57cec5SDimitry Andric 2872*0b57cec5SDimitry Andric /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 2873*0b57cec5SDimitry Andric /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 2874*0b57cec5SDimitry Andric /// is the end of a conflict marker. Handle it by ignoring up until the end of 2875*0b57cec5SDimitry Andric /// the line. This returns true if it is a conflict marker and false if not. 2876*0b57cec5SDimitry Andric bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 2877*0b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line. 2878*0b57cec5SDimitry Andric if (CurPtr != BufferStart && 2879*0b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2880*0b57cec5SDimitry Andric return false; 2881*0b57cec5SDimitry Andric 2882*0b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore 2883*0b57cec5SDimitry Andric // it. 2884*0b57cec5SDimitry Andric if (!CurrentConflictMarkerState || isLexingRawMode()) 2885*0b57cec5SDimitry Andric return false; 2886*0b57cec5SDimitry Andric 2887*0b57cec5SDimitry Andric // Check to see if we have the marker (4 characters in a row). 
2888*0b57cec5SDimitry Andric for (unsigned i = 1; i != 4; ++i) 2889*0b57cec5SDimitry Andric if (CurPtr[i] != CurPtr[0]) 2890*0b57cec5SDimitry Andric return false; 2891*0b57cec5SDimitry Andric 2892*0b57cec5SDimitry Andric // If we do have it, search for the end of the conflict marker. This could 2893*0b57cec5SDimitry Andric // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 2894*0b57cec5SDimitry Andric // be the end of conflict marker. 2895*0b57cec5SDimitry Andric if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 2896*0b57cec5SDimitry Andric CurrentConflictMarkerState)) { 2897*0b57cec5SDimitry Andric CurPtr = End; 2898*0b57cec5SDimitry Andric 2899*0b57cec5SDimitry Andric // Skip ahead to the end of line. 2900*0b57cec5SDimitry Andric while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 2901*0b57cec5SDimitry Andric ++CurPtr; 2902*0b57cec5SDimitry Andric 2903*0b57cec5SDimitry Andric BufferPtr = CurPtr; 2904*0b57cec5SDimitry Andric 2905*0b57cec5SDimitry Andric // No longer in the conflict marker. 2906*0b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None; 2907*0b57cec5SDimitry Andric return true; 2908*0b57cec5SDimitry Andric } 2909*0b57cec5SDimitry Andric 2910*0b57cec5SDimitry Andric return false; 2911*0b57cec5SDimitry Andric } 2912*0b57cec5SDimitry Andric 2913*0b57cec5SDimitry Andric static const char *findPlaceholderEnd(const char *CurPtr, 2914*0b57cec5SDimitry Andric const char *BufferEnd) { 2915*0b57cec5SDimitry Andric if (CurPtr == BufferEnd) 2916*0b57cec5SDimitry Andric return nullptr; 2917*0b57cec5SDimitry Andric BufferEnd -= 1; // Scan until the second last character. 
2918*0b57cec5SDimitry Andric for (; CurPtr != BufferEnd; ++CurPtr) { 2919*0b57cec5SDimitry Andric if (CurPtr[0] == '#' && CurPtr[1] == '>') 2920*0b57cec5SDimitry Andric return CurPtr + 2; 2921*0b57cec5SDimitry Andric } 2922*0b57cec5SDimitry Andric return nullptr; 2923*0b57cec5SDimitry Andric } 2924*0b57cec5SDimitry Andric 2925*0b57cec5SDimitry Andric bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) { 2926*0b57cec5SDimitry Andric assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!"); 2927*0b57cec5SDimitry Andric if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode) 2928*0b57cec5SDimitry Andric return false; 2929*0b57cec5SDimitry Andric const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd); 2930*0b57cec5SDimitry Andric if (!End) 2931*0b57cec5SDimitry Andric return false; 2932*0b57cec5SDimitry Andric const char *Start = CurPtr - 1; 2933*0b57cec5SDimitry Andric if (!LangOpts.AllowEditorPlaceholders) 2934*0b57cec5SDimitry Andric Diag(Start, diag::err_placeholder_in_source); 2935*0b57cec5SDimitry Andric Result.startToken(); 2936*0b57cec5SDimitry Andric FormTokenWithChars(Result, End, tok::raw_identifier); 2937*0b57cec5SDimitry Andric Result.setRawIdentifierData(Start); 2938*0b57cec5SDimitry Andric PP->LookUpIdentifierInfo(Result); 2939*0b57cec5SDimitry Andric Result.setFlag(Token::IsEditorPlaceholder); 2940*0b57cec5SDimitry Andric BufferPtr = End; 2941*0b57cec5SDimitry Andric return true; 2942*0b57cec5SDimitry Andric } 2943*0b57cec5SDimitry Andric 2944*0b57cec5SDimitry Andric bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 2945*0b57cec5SDimitry Andric if (PP && PP->isCodeCompletionEnabled()) { 2946*0b57cec5SDimitry Andric SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 2947*0b57cec5SDimitry Andric return Loc == PP->getCodeCompletionLoc(); 2948*0b57cec5SDimitry Andric } 2949*0b57cec5SDimitry Andric 2950*0b57cec5SDimitry Andric return false; 2951*0b57cec5SDimitry Andric } 
2952*0b57cec5SDimitry Andric 2953*0b57cec5SDimitry Andric uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, 2954*0b57cec5SDimitry Andric Token *Result) { 2955*0b57cec5SDimitry Andric unsigned CharSize; 2956*0b57cec5SDimitry Andric char Kind = getCharAndSize(StartPtr, CharSize); 2957*0b57cec5SDimitry Andric 2958*0b57cec5SDimitry Andric unsigned NumHexDigits; 2959*0b57cec5SDimitry Andric if (Kind == 'u') 2960*0b57cec5SDimitry Andric NumHexDigits = 4; 2961*0b57cec5SDimitry Andric else if (Kind == 'U') 2962*0b57cec5SDimitry Andric NumHexDigits = 8; 2963*0b57cec5SDimitry Andric else 2964*0b57cec5SDimitry Andric return 0; 2965*0b57cec5SDimitry Andric 2966*0b57cec5SDimitry Andric if (!LangOpts.CPlusPlus && !LangOpts.C99) { 2967*0b57cec5SDimitry Andric if (Result && !isLexingRawMode()) 2968*0b57cec5SDimitry Andric Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89); 2969*0b57cec5SDimitry Andric return 0; 2970*0b57cec5SDimitry Andric } 2971*0b57cec5SDimitry Andric 2972*0b57cec5SDimitry Andric const char *CurPtr = StartPtr + CharSize; 2973*0b57cec5SDimitry Andric const char *KindLoc = &CurPtr[-1]; 2974*0b57cec5SDimitry Andric 2975*0b57cec5SDimitry Andric uint32_t CodePoint = 0; 2976*0b57cec5SDimitry Andric for (unsigned i = 0; i < NumHexDigits; ++i) { 2977*0b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, CharSize); 2978*0b57cec5SDimitry Andric 2979*0b57cec5SDimitry Andric unsigned Value = llvm::hexDigitValue(C); 2980*0b57cec5SDimitry Andric if (Value == -1U) { 2981*0b57cec5SDimitry Andric if (Result && !isLexingRawMode()) { 2982*0b57cec5SDimitry Andric if (i == 0) { 2983*0b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_ucn_escape_no_digits) 2984*0b57cec5SDimitry Andric << StringRef(KindLoc, 1); 2985*0b57cec5SDimitry Andric } else { 2986*0b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_ucn_escape_incomplete); 2987*0b57cec5SDimitry Andric 2988*0b57cec5SDimitry Andric // If the user wrote \U1234, suggest a fixit to \u. 
2989*0b57cec5SDimitry Andric if (i == 4 && NumHexDigits == 8) { 2990*0b57cec5SDimitry Andric CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1); 2991*0b57cec5SDimitry Andric Diag(KindLoc, diag::note_ucn_four_not_eight) 2992*0b57cec5SDimitry Andric << FixItHint::CreateReplacement(URange, "u"); 2993*0b57cec5SDimitry Andric } 2994*0b57cec5SDimitry Andric } 2995*0b57cec5SDimitry Andric } 2996*0b57cec5SDimitry Andric 2997*0b57cec5SDimitry Andric return 0; 2998*0b57cec5SDimitry Andric } 2999*0b57cec5SDimitry Andric 3000*0b57cec5SDimitry Andric CodePoint <<= 4; 3001*0b57cec5SDimitry Andric CodePoint += Value; 3002*0b57cec5SDimitry Andric 3003*0b57cec5SDimitry Andric CurPtr += CharSize; 3004*0b57cec5SDimitry Andric } 3005*0b57cec5SDimitry Andric 3006*0b57cec5SDimitry Andric if (Result) { 3007*0b57cec5SDimitry Andric Result->setFlag(Token::HasUCN); 3008*0b57cec5SDimitry Andric if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2) 3009*0b57cec5SDimitry Andric StartPtr = CurPtr; 3010*0b57cec5SDimitry Andric else 3011*0b57cec5SDimitry Andric while (StartPtr != CurPtr) 3012*0b57cec5SDimitry Andric (void)getAndAdvanceChar(StartPtr, *Result); 3013*0b57cec5SDimitry Andric } else { 3014*0b57cec5SDimitry Andric StartPtr = CurPtr; 3015*0b57cec5SDimitry Andric } 3016*0b57cec5SDimitry Andric 3017*0b57cec5SDimitry Andric // Don't apply C family restrictions to UCNs in assembly mode 3018*0b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) 3019*0b57cec5SDimitry Andric return CodePoint; 3020*0b57cec5SDimitry Andric 3021*0b57cec5SDimitry Andric // C99 6.4.3p2: A universal character name shall not specify a character whose 3022*0b57cec5SDimitry Andric // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or 3023*0b57cec5SDimitry Andric // 0060 (`), nor one in the range D800 through DFFF inclusive.) 
3024*0b57cec5SDimitry Andric // C++11 [lex.charset]p2: If the hexadecimal value for a 3025*0b57cec5SDimitry Andric // universal-character-name corresponds to a surrogate code point (in the 3026*0b57cec5SDimitry Andric // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, 3027*0b57cec5SDimitry Andric // if the hexadecimal value for a universal-character-name outside the 3028*0b57cec5SDimitry Andric // c-char-sequence, s-char-sequence, or r-char-sequence of a character or 3029*0b57cec5SDimitry Andric // string literal corresponds to a control character (in either of the 3030*0b57cec5SDimitry Andric // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the 3031*0b57cec5SDimitry Andric // basic source character set, the program is ill-formed. 3032*0b57cec5SDimitry Andric if (CodePoint < 0xA0) { 3033*0b57cec5SDimitry Andric if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60) 3034*0b57cec5SDimitry Andric return CodePoint; 3035*0b57cec5SDimitry Andric 3036*0b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to warn about bad 3037*0b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block. 3038*0b57cec5SDimitry Andric if (Result && PP) { 3039*0b57cec5SDimitry Andric if (CodePoint < 0x20 || CodePoint >= 0x7F) 3040*0b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_control_character); 3041*0b57cec5SDimitry Andric else { 3042*0b57cec5SDimitry Andric char C = static_cast<char>(CodePoint); 3043*0b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 3044*0b57cec5SDimitry Andric } 3045*0b57cec5SDimitry Andric } 3046*0b57cec5SDimitry Andric 3047*0b57cec5SDimitry Andric return 0; 3048*0b57cec5SDimitry Andric } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) { 3049*0b57cec5SDimitry Andric // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 
3050*0b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to diagnose bad 3051*0b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block. 3052*0b57cec5SDimitry Andric if (Result && PP) { 3053*0b57cec5SDimitry Andric if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11) 3054*0b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_ucn_escape_surrogate); 3055*0b57cec5SDimitry Andric else 3056*0b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_invalid); 3057*0b57cec5SDimitry Andric } 3058*0b57cec5SDimitry Andric return 0; 3059*0b57cec5SDimitry Andric } 3060*0b57cec5SDimitry Andric 3061*0b57cec5SDimitry Andric return CodePoint; 3062*0b57cec5SDimitry Andric } 3063*0b57cec5SDimitry Andric 3064*0b57cec5SDimitry Andric bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C, 3065*0b57cec5SDimitry Andric const char *CurPtr) { 3066*0b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars( 3067*0b57cec5SDimitry Andric UnicodeWhitespaceCharRanges); 3068*0b57cec5SDimitry Andric if (!isLexingRawMode() && !PP->isPreprocessedOutput() && 3069*0b57cec5SDimitry Andric UnicodeWhitespaceChars.contains(C)) { 3070*0b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unicode_whitespace) 3071*0b57cec5SDimitry Andric << makeCharRange(*this, BufferPtr, CurPtr); 3072*0b57cec5SDimitry Andric 3073*0b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 3074*0b57cec5SDimitry Andric return true; 3075*0b57cec5SDimitry Andric } 3076*0b57cec5SDimitry Andric return false; 3077*0b57cec5SDimitry Andric } 3078*0b57cec5SDimitry Andric 3079*0b57cec5SDimitry Andric bool Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { 3080*0b57cec5SDimitry Andric if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) { 3081*0b57cec5SDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 3082*0b57cec5SDimitry Andric !PP->isPreprocessedOutput()) { 3083*0b57cec5SDimitry Andric 
maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, 3084*0b57cec5SDimitry Andric makeCharRange(*this, BufferPtr, CurPtr), 3085*0b57cec5SDimitry Andric /*IsFirst=*/true); 3086*0b57cec5SDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, 3087*0b57cec5SDimitry Andric makeCharRange(*this, BufferPtr, CurPtr)); 3088*0b57cec5SDimitry Andric } 3089*0b57cec5SDimitry Andric 3090*0b57cec5SDimitry Andric MIOpt.ReadToken(); 3091*0b57cec5SDimitry Andric return LexIdentifier(Result, CurPtr); 3092*0b57cec5SDimitry Andric } 3093*0b57cec5SDimitry Andric 3094*0b57cec5SDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective && 3095*0b57cec5SDimitry Andric !PP->isPreprocessedOutput() && 3096*0b57cec5SDimitry Andric !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) { 3097*0b57cec5SDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally. 3098*0b57cec5SDimitry Andric // Instead of letting the parser complain about the unknown token, 3099*0b57cec5SDimitry Andric // just drop the character. 3100*0b57cec5SDimitry Andric // Note that we can /only/ do this when the non-ASCII character is actually 3101*0b57cec5SDimitry Andric // spelled as Unicode, not written as a UCN. The standard requires that 3102*0b57cec5SDimitry Andric // we not throw away any possible preprocessor tokens, but there's a 3103*0b57cec5SDimitry Andric // loophole in the mapping of Unicode characters to basic character set 3104*0b57cec5SDimitry Andric // characters that allows us to map these particular characters to, say, 3105*0b57cec5SDimitry Andric // whitespace. 
3106*0b57cec5SDimitry Andric Diag(BufferPtr, diag::err_non_ascii) 3107*0b57cec5SDimitry Andric << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr)); 3108*0b57cec5SDimitry Andric 3109*0b57cec5SDimitry Andric BufferPtr = CurPtr; 3110*0b57cec5SDimitry Andric return false; 3111*0b57cec5SDimitry Andric } 3112*0b57cec5SDimitry Andric 3113*0b57cec5SDimitry Andric // Otherwise, we have an explicit UCN or a character that's unlikely to show 3114*0b57cec5SDimitry Andric // up by accident. 3115*0b57cec5SDimitry Andric MIOpt.ReadToken(); 3116*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 3117*0b57cec5SDimitry Andric return true; 3118*0b57cec5SDimitry Andric } 3119*0b57cec5SDimitry Andric 3120*0b57cec5SDimitry Andric void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) { 3121*0b57cec5SDimitry Andric IsAtStartOfLine = Result.isAtStartOfLine(); 3122*0b57cec5SDimitry Andric HasLeadingSpace = Result.hasLeadingSpace(); 3123*0b57cec5SDimitry Andric HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro(); 3124*0b57cec5SDimitry Andric // Note that this doesn't affect IsAtPhysicalStartOfLine. 3125*0b57cec5SDimitry Andric } 3126*0b57cec5SDimitry Andric 3127*0b57cec5SDimitry Andric bool Lexer::Lex(Token &Result) { 3128*0b57cec5SDimitry Andric // Start a new token. 3129*0b57cec5SDimitry Andric Result.startToken(); 3130*0b57cec5SDimitry Andric 3131*0b57cec5SDimitry Andric // Set up misc whitespace flags for LexTokenInternal. 
3132*0b57cec5SDimitry Andric if (IsAtStartOfLine) { 3133*0b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine); 3134*0b57cec5SDimitry Andric IsAtStartOfLine = false; 3135*0b57cec5SDimitry Andric } 3136*0b57cec5SDimitry Andric 3137*0b57cec5SDimitry Andric if (HasLeadingSpace) { 3138*0b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 3139*0b57cec5SDimitry Andric HasLeadingSpace = false; 3140*0b57cec5SDimitry Andric } 3141*0b57cec5SDimitry Andric 3142*0b57cec5SDimitry Andric if (HasLeadingEmptyMacro) { 3143*0b57cec5SDimitry Andric Result.setFlag(Token::LeadingEmptyMacro); 3144*0b57cec5SDimitry Andric HasLeadingEmptyMacro = false; 3145*0b57cec5SDimitry Andric } 3146*0b57cec5SDimitry Andric 3147*0b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine; 3148*0b57cec5SDimitry Andric IsAtPhysicalStartOfLine = false; 3149*0b57cec5SDimitry Andric bool isRawLex = isLexingRawMode(); 3150*0b57cec5SDimitry Andric (void) isRawLex; 3151*0b57cec5SDimitry Andric bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine); 3152*0b57cec5SDimitry Andric // (After the LexTokenInternal call, the lexer might be destroyed.) 3153*0b57cec5SDimitry Andric assert((returnedToken || !isRawLex) && "Raw lex must succeed"); 3154*0b57cec5SDimitry Andric return returnedToken; 3155*0b57cec5SDimitry Andric } 3156*0b57cec5SDimitry Andric 3157*0b57cec5SDimitry Andric /// LexTokenInternal - This implements a simple C family lexer. It is an 3158*0b57cec5SDimitry Andric /// extremely performance critical piece of code. This assumes that the buffer 3159*0b57cec5SDimitry Andric /// has a null character at the end of the file. This returns a preprocessing 3160*0b57cec5SDimitry Andric /// token, not a normal token, as such, it is an internal interface. It assumes 3161*0b57cec5SDimitry Andric /// that the Flags of result have been cleared before calling this. 
3162*0b57cec5SDimitry Andric bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) { 3163*0b57cec5SDimitry Andric LexNextToken: 3164*0b57cec5SDimitry Andric // New token, can't need cleaning yet. 3165*0b57cec5SDimitry Andric Result.clearFlag(Token::NeedsCleaning); 3166*0b57cec5SDimitry Andric Result.setIdentifierInfo(nullptr); 3167*0b57cec5SDimitry Andric 3168*0b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable. 3169*0b57cec5SDimitry Andric const char *CurPtr = BufferPtr; 3170*0b57cec5SDimitry Andric 3171*0b57cec5SDimitry Andric // Small amounts of horizontal whitespace is very common between tokens. 3172*0b57cec5SDimitry Andric if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 3173*0b57cec5SDimitry Andric ++CurPtr; 3174*0b57cec5SDimitry Andric while ((*CurPtr == ' ') || (*CurPtr == '\t')) 3175*0b57cec5SDimitry Andric ++CurPtr; 3176*0b57cec5SDimitry Andric 3177*0b57cec5SDimitry Andric // If we are keeping whitespace and other tokens, just return what we just 3178*0b57cec5SDimitry Andric // skipped. The next lexer invocation will return the token after the 3179*0b57cec5SDimitry Andric // whitespace. 3180*0b57cec5SDimitry Andric if (isKeepWhitespaceMode()) { 3181*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown); 3182*0b57cec5SDimitry Andric // FIXME: The next token will not have LeadingSpace set. 3183*0b57cec5SDimitry Andric return true; 3184*0b57cec5SDimitry Andric } 3185*0b57cec5SDimitry Andric 3186*0b57cec5SDimitry Andric BufferPtr = CurPtr; 3187*0b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 3188*0b57cec5SDimitry Andric } 3189*0b57cec5SDimitry Andric 3190*0b57cec5SDimitry Andric unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 3191*0b57cec5SDimitry Andric 3192*0b57cec5SDimitry Andric // Read a character, advancing over it. 
3193*0b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Result); 3194*0b57cec5SDimitry Andric tok::TokenKind Kind; 3195*0b57cec5SDimitry Andric 3196*0b57cec5SDimitry Andric switch (Char) { 3197*0b57cec5SDimitry Andric case 0: // Null. 3198*0b57cec5SDimitry Andric // Found end of file? 3199*0b57cec5SDimitry Andric if (CurPtr-1 == BufferEnd) 3200*0b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1); 3201*0b57cec5SDimitry Andric 3202*0b57cec5SDimitry Andric // Check if we are performing code completion. 3203*0b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) { 3204*0b57cec5SDimitry Andric // Return the code-completion token. 3205*0b57cec5SDimitry Andric Result.startToken(); 3206*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::code_completion); 3207*0b57cec5SDimitry Andric return true; 3208*0b57cec5SDimitry Andric } 3209*0b57cec5SDimitry Andric 3210*0b57cec5SDimitry Andric if (!isLexingRawMode()) 3211*0b57cec5SDimitry Andric Diag(CurPtr-1, diag::null_in_file); 3212*0b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 3213*0b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3214*0b57cec5SDimitry Andric return true; // KeepWhitespaceMode 3215*0b57cec5SDimitry Andric 3216*0b57cec5SDimitry Andric // We know the lexer hasn't changed, so just try again with this lexer. 3217*0b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 3218*0b57cec5SDimitry Andric goto LexNextToken; 3219*0b57cec5SDimitry Andric 3220*0b57cec5SDimitry Andric case 26: // DOS & CP/M EOF: "^Z". 3221*0b57cec5SDimitry Andric // If we're in Microsoft extensions mode, treat this as end of file. 
3222*0b57cec5SDimitry Andric if (LangOpts.MicrosoftExt) { 3223*0b57cec5SDimitry Andric if (!isLexingRawMode()) 3224*0b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft); 3225*0b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1); 3226*0b57cec5SDimitry Andric } 3227*0b57cec5SDimitry Andric 3228*0b57cec5SDimitry Andric // If Microsoft extensions are disabled, this is just random garbage. 3229*0b57cec5SDimitry Andric Kind = tok::unknown; 3230*0b57cec5SDimitry Andric break; 3231*0b57cec5SDimitry Andric 3232*0b57cec5SDimitry Andric case '\r': 3233*0b57cec5SDimitry Andric if (CurPtr[0] == '\n') 3234*0b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result); 3235*0b57cec5SDimitry Andric LLVM_FALLTHROUGH; 3236*0b57cec5SDimitry Andric case '\n': 3237*0b57cec5SDimitry Andric // If we are inside a preprocessor directive and we see the end of line, 3238*0b57cec5SDimitry Andric // we know we are done with the directive, so return an EOD token. 3239*0b57cec5SDimitry Andric if (ParsingPreprocessorDirective) { 3240*0b57cec5SDimitry Andric // Done parsing the "line". 3241*0b57cec5SDimitry Andric ParsingPreprocessorDirective = false; 3242*0b57cec5SDimitry Andric 3243*0b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive. 3244*0b57cec5SDimitry Andric if (PP) 3245*0b57cec5SDimitry Andric resetExtendedTokenMode(); 3246*0b57cec5SDimitry Andric 3247*0b57cec5SDimitry Andric // Since we consumed a newline, we are back at the start of a line. 3248*0b57cec5SDimitry Andric IsAtStartOfLine = true; 3249*0b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true; 3250*0b57cec5SDimitry Andric 3251*0b57cec5SDimitry Andric Kind = tok::eod; 3252*0b57cec5SDimitry Andric break; 3253*0b57cec5SDimitry Andric } 3254*0b57cec5SDimitry Andric 3255*0b57cec5SDimitry Andric // No leading whitespace seen so far. 
3256*0b57cec5SDimitry Andric Result.clearFlag(Token::LeadingSpace); 3257*0b57cec5SDimitry Andric 3258*0b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3259*0b57cec5SDimitry Andric return true; // KeepWhitespaceMode 3260*0b57cec5SDimitry Andric 3261*0b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 3262*0b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 3263*0b57cec5SDimitry Andric goto LexNextToken; 3264*0b57cec5SDimitry Andric case ' ': 3265*0b57cec5SDimitry Andric case '\t': 3266*0b57cec5SDimitry Andric case '\f': 3267*0b57cec5SDimitry Andric case '\v': 3268*0b57cec5SDimitry Andric SkipHorizontalWhitespace: 3269*0b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace); 3270*0b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3271*0b57cec5SDimitry Andric return true; // KeepWhitespaceMode 3272*0b57cec5SDimitry Andric 3273*0b57cec5SDimitry Andric SkipIgnoredUnits: 3274*0b57cec5SDimitry Andric CurPtr = BufferPtr; 3275*0b57cec5SDimitry Andric 3276*0b57cec5SDimitry Andric // If the next token is obviously a // or /* */ comment, skip it efficiently 3277*0b57cec5SDimitry Andric // too (without going through the big switch stmt). 3278*0b57cec5SDimitry Andric if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 3279*0b57cec5SDimitry Andric LangOpts.LineComment && 3280*0b57cec5SDimitry Andric (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) { 3281*0b57cec5SDimitry Andric if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 3282*0b57cec5SDimitry Andric return true; // There is a token to return. 
3283*0b57cec5SDimitry Andric goto SkipIgnoredUnits; 3284*0b57cec5SDimitry Andric } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 3285*0b57cec5SDimitry Andric if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine)) 3286*0b57cec5SDimitry Andric return true; // There is a token to return. 3287*0b57cec5SDimitry Andric goto SkipIgnoredUnits; 3288*0b57cec5SDimitry Andric } else if (isHorizontalWhitespace(*CurPtr)) { 3289*0b57cec5SDimitry Andric goto SkipHorizontalWhitespace; 3290*0b57cec5SDimitry Andric } 3291*0b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 3292*0b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 3293*0b57cec5SDimitry Andric goto LexNextToken; 3294*0b57cec5SDimitry Andric 3295*0b57cec5SDimitry Andric // C99 6.4.4.1: Integer Constants. 3296*0b57cec5SDimitry Andric // C99 6.4.4.2: Floating Constants. 3297*0b57cec5SDimitry Andric case '0': case '1': case '2': case '3': case '4': 3298*0b57cec5SDimitry Andric case '5': case '6': case '7': case '8': case '9': 3299*0b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 3300*0b57cec5SDimitry Andric MIOpt.ReadToken(); 3301*0b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr); 3302*0b57cec5SDimitry Andric 3303*0b57cec5SDimitry Andric case 'u': // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal 3304*0b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 
3305*0b57cec5SDimitry Andric MIOpt.ReadToken(); 3306*0b57cec5SDimitry Andric 3307*0b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) { 3308*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3309*0b57cec5SDimitry Andric 3310*0b57cec5SDimitry Andric // UTF-16 string literal 3311*0b57cec5SDimitry Andric if (Char == '"') 3312*0b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3313*0b57cec5SDimitry Andric tok::utf16_string_literal); 3314*0b57cec5SDimitry Andric 3315*0b57cec5SDimitry Andric // UTF-16 character constant 3316*0b57cec5SDimitry Andric if (Char == '\'') 3317*0b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3318*0b57cec5SDimitry Andric tok::utf16_char_constant); 3319*0b57cec5SDimitry Andric 3320*0b57cec5SDimitry Andric // UTF-16 raw string literal 3321*0b57cec5SDimitry Andric if (Char == 'R' && LangOpts.CPlusPlus11 && 3322*0b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3323*0b57cec5SDimitry Andric return LexRawStringLiteral(Result, 3324*0b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3325*0b57cec5SDimitry Andric SizeTmp2, Result), 3326*0b57cec5SDimitry Andric tok::utf16_string_literal); 3327*0b57cec5SDimitry Andric 3328*0b57cec5SDimitry Andric if (Char == '8') { 3329*0b57cec5SDimitry Andric char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 3330*0b57cec5SDimitry Andric 3331*0b57cec5SDimitry Andric // UTF-8 string literal 3332*0b57cec5SDimitry Andric if (Char2 == '"') 3333*0b57cec5SDimitry Andric return LexStringLiteral(Result, 3334*0b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3335*0b57cec5SDimitry Andric SizeTmp2, Result), 3336*0b57cec5SDimitry Andric tok::utf8_string_literal); 3337*0b57cec5SDimitry Andric if (Char2 == '\'' && LangOpts.CPlusPlus17) 3338*0b57cec5SDimitry Andric return LexCharConstant( 3339*0b57cec5SDimitry Andric Result, 
ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3340*0b57cec5SDimitry Andric SizeTmp2, Result), 3341*0b57cec5SDimitry Andric tok::utf8_char_constant); 3342*0b57cec5SDimitry Andric 3343*0b57cec5SDimitry Andric if (Char2 == 'R' && LangOpts.CPlusPlus11) { 3344*0b57cec5SDimitry Andric unsigned SizeTmp3; 3345*0b57cec5SDimitry Andric char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 3346*0b57cec5SDimitry Andric // UTF-8 raw string literal 3347*0b57cec5SDimitry Andric if (Char3 == '"') { 3348*0b57cec5SDimitry Andric return LexRawStringLiteral(Result, 3349*0b57cec5SDimitry Andric ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3350*0b57cec5SDimitry Andric SizeTmp2, Result), 3351*0b57cec5SDimitry Andric SizeTmp3, Result), 3352*0b57cec5SDimitry Andric tok::utf8_string_literal); 3353*0b57cec5SDimitry Andric } 3354*0b57cec5SDimitry Andric } 3355*0b57cec5SDimitry Andric } 3356*0b57cec5SDimitry Andric } 3357*0b57cec5SDimitry Andric 3358*0b57cec5SDimitry Andric // treat u like the start of an identifier. 3359*0b57cec5SDimitry Andric return LexIdentifier(Result, CurPtr); 3360*0b57cec5SDimitry Andric 3361*0b57cec5SDimitry Andric case 'U': // Identifier (Uber) or C11/C++11 UTF-32 string literal 3362*0b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 
3363*0b57cec5SDimitry Andric MIOpt.ReadToken(); 3364*0b57cec5SDimitry Andric 3365*0b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) { 3366*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3367*0b57cec5SDimitry Andric 3368*0b57cec5SDimitry Andric // UTF-32 string literal 3369*0b57cec5SDimitry Andric if (Char == '"') 3370*0b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3371*0b57cec5SDimitry Andric tok::utf32_string_literal); 3372*0b57cec5SDimitry Andric 3373*0b57cec5SDimitry Andric // UTF-32 character constant 3374*0b57cec5SDimitry Andric if (Char == '\'') 3375*0b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3376*0b57cec5SDimitry Andric tok::utf32_char_constant); 3377*0b57cec5SDimitry Andric 3378*0b57cec5SDimitry Andric // UTF-32 raw string literal 3379*0b57cec5SDimitry Andric if (Char == 'R' && LangOpts.CPlusPlus11 && 3380*0b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3381*0b57cec5SDimitry Andric return LexRawStringLiteral(Result, 3382*0b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3383*0b57cec5SDimitry Andric SizeTmp2, Result), 3384*0b57cec5SDimitry Andric tok::utf32_string_literal); 3385*0b57cec5SDimitry Andric } 3386*0b57cec5SDimitry Andric 3387*0b57cec5SDimitry Andric // treat U like the start of an identifier. 3388*0b57cec5SDimitry Andric return LexIdentifier(Result, CurPtr); 3389*0b57cec5SDimitry Andric 3390*0b57cec5SDimitry Andric case 'R': // Identifier or C++0x raw string literal 3391*0b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 
3392*0b57cec5SDimitry Andric MIOpt.ReadToken(); 3393*0b57cec5SDimitry Andric 3394*0b57cec5SDimitry Andric if (LangOpts.CPlusPlus11) { 3395*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3396*0b57cec5SDimitry Andric 3397*0b57cec5SDimitry Andric if (Char == '"') 3398*0b57cec5SDimitry Andric return LexRawStringLiteral(Result, 3399*0b57cec5SDimitry Andric ConsumeChar(CurPtr, SizeTmp, Result), 3400*0b57cec5SDimitry Andric tok::string_literal); 3401*0b57cec5SDimitry Andric } 3402*0b57cec5SDimitry Andric 3403*0b57cec5SDimitry Andric // treat R like the start of an identifier. 3404*0b57cec5SDimitry Andric return LexIdentifier(Result, CurPtr); 3405*0b57cec5SDimitry Andric 3406*0b57cec5SDimitry Andric case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 3407*0b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 3408*0b57cec5SDimitry Andric MIOpt.ReadToken(); 3409*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3410*0b57cec5SDimitry Andric 3411*0b57cec5SDimitry Andric // Wide string literal. 3412*0b57cec5SDimitry Andric if (Char == '"') 3413*0b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3414*0b57cec5SDimitry Andric tok::wide_string_literal); 3415*0b57cec5SDimitry Andric 3416*0b57cec5SDimitry Andric // Wide raw string literal. 3417*0b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 && Char == 'R' && 3418*0b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 3419*0b57cec5SDimitry Andric return LexRawStringLiteral(Result, 3420*0b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3421*0b57cec5SDimitry Andric SizeTmp2, Result), 3422*0b57cec5SDimitry Andric tok::wide_string_literal); 3423*0b57cec5SDimitry Andric 3424*0b57cec5SDimitry Andric // Wide character constant. 
3425*0b57cec5SDimitry Andric if (Char == '\'') 3426*0b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3427*0b57cec5SDimitry Andric tok::wide_char_constant); 3428*0b57cec5SDimitry Andric // FALL THROUGH, treating L like the start of an identifier. 3429*0b57cec5SDimitry Andric LLVM_FALLTHROUGH; 3430*0b57cec5SDimitry Andric 3431*0b57cec5SDimitry Andric // C99 6.4.2: Identifiers. 3432*0b57cec5SDimitry Andric case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 3433*0b57cec5SDimitry Andric case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 3434*0b57cec5SDimitry Andric case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 3435*0b57cec5SDimitry Andric case 'V': case 'W': case 'X': case 'Y': case 'Z': 3436*0b57cec5SDimitry Andric case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 3437*0b57cec5SDimitry Andric case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 3438*0b57cec5SDimitry Andric case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 3439*0b57cec5SDimitry Andric case 'v': case 'w': case 'x': case 'y': case 'z': 3440*0b57cec5SDimitry Andric case '_': 3441*0b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 3442*0b57cec5SDimitry Andric MIOpt.ReadToken(); 3443*0b57cec5SDimitry Andric return LexIdentifier(Result, CurPtr); 3444*0b57cec5SDimitry Andric 3445*0b57cec5SDimitry Andric case '$': // $ in identifiers. 3446*0b57cec5SDimitry Andric if (LangOpts.DollarIdents) { 3447*0b57cec5SDimitry Andric if (!isLexingRawMode()) 3448*0b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_dollar_in_identifier); 3449*0b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 
3450*0b57cec5SDimitry Andric MIOpt.ReadToken(); 3451*0b57cec5SDimitry Andric return LexIdentifier(Result, CurPtr); 3452*0b57cec5SDimitry Andric } 3453*0b57cec5SDimitry Andric 3454*0b57cec5SDimitry Andric Kind = tok::unknown; 3455*0b57cec5SDimitry Andric break; 3456*0b57cec5SDimitry Andric 3457*0b57cec5SDimitry Andric // C99 6.4.4: Character Constants. 3458*0b57cec5SDimitry Andric case '\'': 3459*0b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 3460*0b57cec5SDimitry Andric MIOpt.ReadToken(); 3461*0b57cec5SDimitry Andric return LexCharConstant(Result, CurPtr, tok::char_constant); 3462*0b57cec5SDimitry Andric 3463*0b57cec5SDimitry Andric // C99 6.4.5: String Literals. 3464*0b57cec5SDimitry Andric case '"': 3465*0b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 3466*0b57cec5SDimitry Andric MIOpt.ReadToken(); 3467*0b57cec5SDimitry Andric return LexStringLiteral(Result, CurPtr, 3468*0b57cec5SDimitry Andric ParsingFilename ? tok::header_name 3469*0b57cec5SDimitry Andric : tok::string_literal); 3470*0b57cec5SDimitry Andric 3471*0b57cec5SDimitry Andric // C99 6.4.6: Punctuators. 
3472*0b57cec5SDimitry Andric case '?': 3473*0b57cec5SDimitry Andric Kind = tok::question; 3474*0b57cec5SDimitry Andric break; 3475*0b57cec5SDimitry Andric case '[': 3476*0b57cec5SDimitry Andric Kind = tok::l_square; 3477*0b57cec5SDimitry Andric break; 3478*0b57cec5SDimitry Andric case ']': 3479*0b57cec5SDimitry Andric Kind = tok::r_square; 3480*0b57cec5SDimitry Andric break; 3481*0b57cec5SDimitry Andric case '(': 3482*0b57cec5SDimitry Andric Kind = tok::l_paren; 3483*0b57cec5SDimitry Andric break; 3484*0b57cec5SDimitry Andric case ')': 3485*0b57cec5SDimitry Andric Kind = tok::r_paren; 3486*0b57cec5SDimitry Andric break; 3487*0b57cec5SDimitry Andric case '{': 3488*0b57cec5SDimitry Andric Kind = tok::l_brace; 3489*0b57cec5SDimitry Andric break; 3490*0b57cec5SDimitry Andric case '}': 3491*0b57cec5SDimitry Andric Kind = tok::r_brace; 3492*0b57cec5SDimitry Andric break; 3493*0b57cec5SDimitry Andric case '.': 3494*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3495*0b57cec5SDimitry Andric if (Char >= '0' && Char <= '9') { 3496*0b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 3497*0b57cec5SDimitry Andric MIOpt.ReadToken(); 3498*0b57cec5SDimitry Andric 3499*0b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 3500*0b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus && Char == '*') { 3501*0b57cec5SDimitry Andric Kind = tok::periodstar; 3502*0b57cec5SDimitry Andric CurPtr += SizeTmp; 3503*0b57cec5SDimitry Andric } else if (Char == '.' 
&& 3504*0b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 3505*0b57cec5SDimitry Andric Kind = tok::ellipsis; 3506*0b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3507*0b57cec5SDimitry Andric SizeTmp2, Result); 3508*0b57cec5SDimitry Andric } else { 3509*0b57cec5SDimitry Andric Kind = tok::period; 3510*0b57cec5SDimitry Andric } 3511*0b57cec5SDimitry Andric break; 3512*0b57cec5SDimitry Andric case '&': 3513*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3514*0b57cec5SDimitry Andric if (Char == '&') { 3515*0b57cec5SDimitry Andric Kind = tok::ampamp; 3516*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3517*0b57cec5SDimitry Andric } else if (Char == '=') { 3518*0b57cec5SDimitry Andric Kind = tok::ampequal; 3519*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3520*0b57cec5SDimitry Andric } else { 3521*0b57cec5SDimitry Andric Kind = tok::amp; 3522*0b57cec5SDimitry Andric } 3523*0b57cec5SDimitry Andric break; 3524*0b57cec5SDimitry Andric case '*': 3525*0b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') { 3526*0b57cec5SDimitry Andric Kind = tok::starequal; 3527*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3528*0b57cec5SDimitry Andric } else { 3529*0b57cec5SDimitry Andric Kind = tok::star; 3530*0b57cec5SDimitry Andric } 3531*0b57cec5SDimitry Andric break; 3532*0b57cec5SDimitry Andric case '+': 3533*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3534*0b57cec5SDimitry Andric if (Char == '+') { 3535*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3536*0b57cec5SDimitry Andric Kind = tok::plusplus; 3537*0b57cec5SDimitry Andric } else if (Char == '=') { 3538*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3539*0b57cec5SDimitry Andric Kind = tok::plusequal; 3540*0b57cec5SDimitry Andric } else { 3541*0b57cec5SDimitry Andric Kind = tok::plus; 
3542*0b57cec5SDimitry Andric } 3543*0b57cec5SDimitry Andric break; 3544*0b57cec5SDimitry Andric case '-': 3545*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3546*0b57cec5SDimitry Andric if (Char == '-') { // -- 3547*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3548*0b57cec5SDimitry Andric Kind = tok::minusminus; 3549*0b57cec5SDimitry Andric } else if (Char == '>' && LangOpts.CPlusPlus && 3550*0b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 3551*0b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3552*0b57cec5SDimitry Andric SizeTmp2, Result); 3553*0b57cec5SDimitry Andric Kind = tok::arrowstar; 3554*0b57cec5SDimitry Andric } else if (Char == '>') { // -> 3555*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3556*0b57cec5SDimitry Andric Kind = tok::arrow; 3557*0b57cec5SDimitry Andric } else if (Char == '=') { // -= 3558*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3559*0b57cec5SDimitry Andric Kind = tok::minusequal; 3560*0b57cec5SDimitry Andric } else { 3561*0b57cec5SDimitry Andric Kind = tok::minus; 3562*0b57cec5SDimitry Andric } 3563*0b57cec5SDimitry Andric break; 3564*0b57cec5SDimitry Andric case '~': 3565*0b57cec5SDimitry Andric Kind = tok::tilde; 3566*0b57cec5SDimitry Andric break; 3567*0b57cec5SDimitry Andric case '!': 3568*0b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') { 3569*0b57cec5SDimitry Andric Kind = tok::exclaimequal; 3570*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3571*0b57cec5SDimitry Andric } else { 3572*0b57cec5SDimitry Andric Kind = tok::exclaim; 3573*0b57cec5SDimitry Andric } 3574*0b57cec5SDimitry Andric break; 3575*0b57cec5SDimitry Andric case '/': 3576*0b57cec5SDimitry Andric // 6.4.9: Comments 3577*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3578*0b57cec5SDimitry Andric if (Char == '/') { // Line comment. 
3579*0b57cec5SDimitry Andric // Even if Line comments are disabled (e.g. in C89 mode), we generally 3580*0b57cec5SDimitry Andric // want to lex this as a comment. There is one problem with this though, 3581*0b57cec5SDimitry Andric // that in one particular corner case, this can change the behavior of the 3582*0b57cec5SDimitry Andric // resultant program. For example, In "foo //**/ bar", C89 would lex 3583*0b57cec5SDimitry Andric // this as "foo / bar" and languages with Line comments would lex it as 3584*0b57cec5SDimitry Andric // "foo". Check to see if the character after the second slash is a '*'. 3585*0b57cec5SDimitry Andric // If so, we will lex that as a "/" instead of the start of a comment. 3586*0b57cec5SDimitry Andric // However, we never do this if we are just preprocessing. 3587*0b57cec5SDimitry Andric bool TreatAsComment = LangOpts.LineComment && 3588*0b57cec5SDimitry Andric (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP); 3589*0b57cec5SDimitry Andric if (!TreatAsComment) 3590*0b57cec5SDimitry Andric if (!(PP && PP->isPreprocessedOutput())) 3591*0b57cec5SDimitry Andric TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*'; 3592*0b57cec5SDimitry Andric 3593*0b57cec5SDimitry Andric if (TreatAsComment) { 3594*0b57cec5SDimitry Andric if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3595*0b57cec5SDimitry Andric TokAtPhysicalStartOfLine)) 3596*0b57cec5SDimitry Andric return true; // There is a token to return. 3597*0b57cec5SDimitry Andric 3598*0b57cec5SDimitry Andric // It is common for the tokens immediately after a // comment to be 3599*0b57cec5SDimitry Andric // whitespace (indentation for the next line). Instead of going through 3600*0b57cec5SDimitry Andric // the big switch, handle it efficiently now. 3601*0b57cec5SDimitry Andric goto SkipIgnoredUnits; 3602*0b57cec5SDimitry Andric } 3603*0b57cec5SDimitry Andric } 3604*0b57cec5SDimitry Andric 3605*0b57cec5SDimitry Andric if (Char == '*') { // /**/ comment. 
3606*0b57cec5SDimitry Andric if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result), 3607*0b57cec5SDimitry Andric TokAtPhysicalStartOfLine)) 3608*0b57cec5SDimitry Andric return true; // There is a token to return. 3609*0b57cec5SDimitry Andric 3610*0b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 3611*0b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 3612*0b57cec5SDimitry Andric goto LexNextToken; 3613*0b57cec5SDimitry Andric } 3614*0b57cec5SDimitry Andric 3615*0b57cec5SDimitry Andric if (Char == '=') { 3616*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3617*0b57cec5SDimitry Andric Kind = tok::slashequal; 3618*0b57cec5SDimitry Andric } else { 3619*0b57cec5SDimitry Andric Kind = tok::slash; 3620*0b57cec5SDimitry Andric } 3621*0b57cec5SDimitry Andric break; 3622*0b57cec5SDimitry Andric case '%': 3623*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3624*0b57cec5SDimitry Andric if (Char == '=') { 3625*0b57cec5SDimitry Andric Kind = tok::percentequal; 3626*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3627*0b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '>') { 3628*0b57cec5SDimitry Andric Kind = tok::r_brace; // '%>' -> '}' 3629*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3630*0b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') { 3631*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3632*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3633*0b57cec5SDimitry Andric if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 3634*0b57cec5SDimitry Andric Kind = tok::hashhash; // '%:%:' -> '##' 3635*0b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3636*0b57cec5SDimitry Andric SizeTmp2, Result); 3637*0b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> 
Charize 3638*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3639*0b57cec5SDimitry Andric if (!isLexingRawMode()) 3640*0b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft); 3641*0b57cec5SDimitry Andric Kind = tok::hashat; 3642*0b57cec5SDimitry Andric } else { // '%:' -> '#' 3643*0b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line, 3644*0b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to 3645*0b57cec5SDimitry Andric // the preprocessor to handle it. 3646*0b57cec5SDimitry Andric // TODO: -fpreprocessed mode?? 3647*0b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 3648*0b57cec5SDimitry Andric goto HandleDirective; 3649*0b57cec5SDimitry Andric 3650*0b57cec5SDimitry Andric Kind = tok::hash; 3651*0b57cec5SDimitry Andric } 3652*0b57cec5SDimitry Andric } else { 3653*0b57cec5SDimitry Andric Kind = tok::percent; 3654*0b57cec5SDimitry Andric } 3655*0b57cec5SDimitry Andric break; 3656*0b57cec5SDimitry Andric case '<': 3657*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3658*0b57cec5SDimitry Andric if (ParsingFilename) { 3659*0b57cec5SDimitry Andric return LexAngledStringLiteral(Result, CurPtr); 3660*0b57cec5SDimitry Andric } else if (Char == '<') { 3661*0b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3662*0b57cec5SDimitry Andric if (After == '=') { 3663*0b57cec5SDimitry Andric Kind = tok::lesslessequal; 3664*0b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3665*0b57cec5SDimitry Andric SizeTmp2, Result); 3666*0b57cec5SDimitry Andric } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 3667*0b57cec5SDimitry Andric // If this is actually a '<<<<<<<' version control conflict marker, 3668*0b57cec5SDimitry Andric // recognize it as such and recover nicely. 
3669*0b57cec5SDimitry Andric goto LexNextToken; 3670*0b57cec5SDimitry Andric } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 3671*0b57cec5SDimitry Andric // If this is '<<<<' and we're in a Perforce-style conflict marker, 3672*0b57cec5SDimitry Andric // ignore it. 3673*0b57cec5SDimitry Andric goto LexNextToken; 3674*0b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '<') { 3675*0b57cec5SDimitry Andric Kind = tok::lesslessless; 3676*0b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3677*0b57cec5SDimitry Andric SizeTmp2, Result); 3678*0b57cec5SDimitry Andric } else { 3679*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3680*0b57cec5SDimitry Andric Kind = tok::lessless; 3681*0b57cec5SDimitry Andric } 3682*0b57cec5SDimitry Andric } else if (Char == '=') { 3683*0b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3684*0b57cec5SDimitry Andric if (After == '>') { 3685*0b57cec5SDimitry Andric if (getLangOpts().CPlusPlus2a) { 3686*0b57cec5SDimitry Andric if (!isLexingRawMode()) 3687*0b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx17_compat_spaceship); 3688*0b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3689*0b57cec5SDimitry Andric SizeTmp2, Result); 3690*0b57cec5SDimitry Andric Kind = tok::spaceship; 3691*0b57cec5SDimitry Andric break; 3692*0b57cec5SDimitry Andric } 3693*0b57cec5SDimitry Andric // Suggest adding a space between the '<=' and the '>' to avoid a 3694*0b57cec5SDimitry Andric // change in semantics if this turns up in C++ <=17 mode. 
3695*0b57cec5SDimitry Andric if (getLangOpts().CPlusPlus && !isLexingRawMode()) { 3696*0b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx2a_compat_spaceship) 3697*0b57cec5SDimitry Andric << FixItHint::CreateInsertion( 3698*0b57cec5SDimitry Andric getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " "); 3699*0b57cec5SDimitry Andric } 3700*0b57cec5SDimitry Andric } 3701*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3702*0b57cec5SDimitry Andric Kind = tok::lessequal; 3703*0b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' 3704*0b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 && 3705*0b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 3706*0b57cec5SDimitry Andric // C++0x [lex.pptoken]p3: 3707*0b57cec5SDimitry Andric // Otherwise, if the next three characters are <:: and the subsequent 3708*0b57cec5SDimitry Andric // character is neither : nor >, the < is treated as a preprocessor 3709*0b57cec5SDimitry Andric // token by itself and not as the first character of the alternative 3710*0b57cec5SDimitry Andric // token <:. 
3711*0b57cec5SDimitry Andric unsigned SizeTmp3; 3712*0b57cec5SDimitry Andric char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 3713*0b57cec5SDimitry Andric if (After != ':' && After != '>') { 3714*0b57cec5SDimitry Andric Kind = tok::less; 3715*0b57cec5SDimitry Andric if (!isLexingRawMode()) 3716*0b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 3717*0b57cec5SDimitry Andric break; 3718*0b57cec5SDimitry Andric } 3719*0b57cec5SDimitry Andric } 3720*0b57cec5SDimitry Andric 3721*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3722*0b57cec5SDimitry Andric Kind = tok::l_square; 3723*0b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' 3724*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3725*0b57cec5SDimitry Andric Kind = tok::l_brace; 3726*0b57cec5SDimitry Andric } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 && 3727*0b57cec5SDimitry Andric lexEditorPlaceholder(Result, CurPtr)) { 3728*0b57cec5SDimitry Andric return true; 3729*0b57cec5SDimitry Andric } else { 3730*0b57cec5SDimitry Andric Kind = tok::less; 3731*0b57cec5SDimitry Andric } 3732*0b57cec5SDimitry Andric break; 3733*0b57cec5SDimitry Andric case '>': 3734*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3735*0b57cec5SDimitry Andric if (Char == '=') { 3736*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3737*0b57cec5SDimitry Andric Kind = tok::greaterequal; 3738*0b57cec5SDimitry Andric } else if (Char == '>') { 3739*0b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3740*0b57cec5SDimitry Andric if (After == '=') { 3741*0b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3742*0b57cec5SDimitry Andric SizeTmp2, Result); 3743*0b57cec5SDimitry Andric Kind = tok::greatergreaterequal; 3744*0b57cec5SDimitry Andric } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 
3745*0b57cec5SDimitry Andric // If this is actually a '>>>>' conflict marker, recognize it as such 3746*0b57cec5SDimitry Andric // and recover nicely. 3747*0b57cec5SDimitry Andric goto LexNextToken; 3748*0b57cec5SDimitry Andric } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 3749*0b57cec5SDimitry Andric // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 3750*0b57cec5SDimitry Andric goto LexNextToken; 3751*0b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '>') { 3752*0b57cec5SDimitry Andric Kind = tok::greatergreatergreater; 3753*0b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3754*0b57cec5SDimitry Andric SizeTmp2, Result); 3755*0b57cec5SDimitry Andric } else { 3756*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3757*0b57cec5SDimitry Andric Kind = tok::greatergreater; 3758*0b57cec5SDimitry Andric } 3759*0b57cec5SDimitry Andric } else { 3760*0b57cec5SDimitry Andric Kind = tok::greater; 3761*0b57cec5SDimitry Andric } 3762*0b57cec5SDimitry Andric break; 3763*0b57cec5SDimitry Andric case '^': 3764*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3765*0b57cec5SDimitry Andric if (Char == '=') { 3766*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3767*0b57cec5SDimitry Andric Kind = tok::caretequal; 3768*0b57cec5SDimitry Andric } else if (LangOpts.OpenCL && Char == '^') { 3769*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3770*0b57cec5SDimitry Andric Kind = tok::caretcaret; 3771*0b57cec5SDimitry Andric } else { 3772*0b57cec5SDimitry Andric Kind = tok::caret; 3773*0b57cec5SDimitry Andric } 3774*0b57cec5SDimitry Andric break; 3775*0b57cec5SDimitry Andric case '|': 3776*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3777*0b57cec5SDimitry Andric if (Char == '=') { 3778*0b57cec5SDimitry Andric Kind = tok::pipeequal; 3779*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 
3780*0b57cec5SDimitry Andric } else if (Char == '|') { 3781*0b57cec5SDimitry Andric // If this is '|||||||' and we're in a conflict marker, ignore it. 3782*0b57cec5SDimitry Andric if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 3783*0b57cec5SDimitry Andric goto LexNextToken; 3784*0b57cec5SDimitry Andric Kind = tok::pipepipe; 3785*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3786*0b57cec5SDimitry Andric } else { 3787*0b57cec5SDimitry Andric Kind = tok::pipe; 3788*0b57cec5SDimitry Andric } 3789*0b57cec5SDimitry Andric break; 3790*0b57cec5SDimitry Andric case ':': 3791*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3792*0b57cec5SDimitry Andric if (LangOpts.Digraphs && Char == '>') { 3793*0b57cec5SDimitry Andric Kind = tok::r_square; // ':>' -> ']' 3794*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3795*0b57cec5SDimitry Andric } else if ((LangOpts.CPlusPlus || 3796*0b57cec5SDimitry Andric LangOpts.DoubleSquareBracketAttributes) && 3797*0b57cec5SDimitry Andric Char == ':') { 3798*0b57cec5SDimitry Andric Kind = tok::coloncolon; 3799*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3800*0b57cec5SDimitry Andric } else { 3801*0b57cec5SDimitry Andric Kind = tok::colon; 3802*0b57cec5SDimitry Andric } 3803*0b57cec5SDimitry Andric break; 3804*0b57cec5SDimitry Andric case ';': 3805*0b57cec5SDimitry Andric Kind = tok::semi; 3806*0b57cec5SDimitry Andric break; 3807*0b57cec5SDimitry Andric case '=': 3808*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3809*0b57cec5SDimitry Andric if (Char == '=') { 3810*0b57cec5SDimitry Andric // If this is '====' and we're in a conflict marker, ignore it. 
3811*0b57cec5SDimitry Andric if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 3812*0b57cec5SDimitry Andric goto LexNextToken; 3813*0b57cec5SDimitry Andric 3814*0b57cec5SDimitry Andric Kind = tok::equalequal; 3815*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3816*0b57cec5SDimitry Andric } else { 3817*0b57cec5SDimitry Andric Kind = tok::equal; 3818*0b57cec5SDimitry Andric } 3819*0b57cec5SDimitry Andric break; 3820*0b57cec5SDimitry Andric case ',': 3821*0b57cec5SDimitry Andric Kind = tok::comma; 3822*0b57cec5SDimitry Andric break; 3823*0b57cec5SDimitry Andric case '#': 3824*0b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp); 3825*0b57cec5SDimitry Andric if (Char == '#') { 3826*0b57cec5SDimitry Andric Kind = tok::hashhash; 3827*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3828*0b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize 3829*0b57cec5SDimitry Andric Kind = tok::hashat; 3830*0b57cec5SDimitry Andric if (!isLexingRawMode()) 3831*0b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft); 3832*0b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3833*0b57cec5SDimitry Andric } else { 3834*0b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line, 3835*0b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to 3836*0b57cec5SDimitry Andric // the preprocessor to handle it. 3837*0b57cec5SDimitry Andric // TODO: -fpreprocessed mode?? 3838*0b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer) 3839*0b57cec5SDimitry Andric goto HandleDirective; 3840*0b57cec5SDimitry Andric 3841*0b57cec5SDimitry Andric Kind = tok::hash; 3842*0b57cec5SDimitry Andric } 3843*0b57cec5SDimitry Andric break; 3844*0b57cec5SDimitry Andric 3845*0b57cec5SDimitry Andric case '@': 3846*0b57cec5SDimitry Andric // Objective C support. 
3847*0b57cec5SDimitry Andric if (CurPtr[-1] == '@' && LangOpts.ObjC) 3848*0b57cec5SDimitry Andric Kind = tok::at; 3849*0b57cec5SDimitry Andric else 3850*0b57cec5SDimitry Andric Kind = tok::unknown; 3851*0b57cec5SDimitry Andric break; 3852*0b57cec5SDimitry Andric 3853*0b57cec5SDimitry Andric // UCNs (C99 6.4.3, C++11 [lex.charset]p2) 3854*0b57cec5SDimitry Andric case '\\': 3855*0b57cec5SDimitry Andric if (!LangOpts.AsmPreprocessor) { 3856*0b57cec5SDimitry Andric if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) { 3857*0b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 3858*0b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3859*0b57cec5SDimitry Andric return true; // KeepWhitespaceMode 3860*0b57cec5SDimitry Andric 3861*0b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 3862*0b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 3863*0b57cec5SDimitry Andric goto LexNextToken; 3864*0b57cec5SDimitry Andric } 3865*0b57cec5SDimitry Andric 3866*0b57cec5SDimitry Andric return LexUnicode(Result, CodePoint, CurPtr); 3867*0b57cec5SDimitry Andric } 3868*0b57cec5SDimitry Andric } 3869*0b57cec5SDimitry Andric 3870*0b57cec5SDimitry Andric Kind = tok::unknown; 3871*0b57cec5SDimitry Andric break; 3872*0b57cec5SDimitry Andric 3873*0b57cec5SDimitry Andric default: { 3874*0b57cec5SDimitry Andric if (isASCII(Char)) { 3875*0b57cec5SDimitry Andric Kind = tok::unknown; 3876*0b57cec5SDimitry Andric break; 3877*0b57cec5SDimitry Andric } 3878*0b57cec5SDimitry Andric 3879*0b57cec5SDimitry Andric llvm::UTF32 CodePoint; 3880*0b57cec5SDimitry Andric 3881*0b57cec5SDimitry Andric // We can't just reset CurPtr to BufferPtr because BufferPtr may point to 3882*0b57cec5SDimitry Andric // an escaped newline. 
3883*0b57cec5SDimitry Andric --CurPtr; 3884*0b57cec5SDimitry Andric llvm::ConversionResult Status = 3885*0b57cec5SDimitry Andric llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr, 3886*0b57cec5SDimitry Andric (const llvm::UTF8 *)BufferEnd, 3887*0b57cec5SDimitry Andric &CodePoint, 3888*0b57cec5SDimitry Andric llvm::strictConversion); 3889*0b57cec5SDimitry Andric if (Status == llvm::conversionOK) { 3890*0b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) { 3891*0b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine)) 3892*0b57cec5SDimitry Andric return true; // KeepWhitespaceMode 3893*0b57cec5SDimitry Andric 3894*0b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer. 3895*0b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 3896*0b57cec5SDimitry Andric goto LexNextToken; 3897*0b57cec5SDimitry Andric } 3898*0b57cec5SDimitry Andric return LexUnicode(Result, CodePoint, CurPtr); 3899*0b57cec5SDimitry Andric } 3900*0b57cec5SDimitry Andric 3901*0b57cec5SDimitry Andric if (isLexingRawMode() || ParsingPreprocessorDirective || 3902*0b57cec5SDimitry Andric PP->isPreprocessedOutput()) { 3903*0b57cec5SDimitry Andric ++CurPtr; 3904*0b57cec5SDimitry Andric Kind = tok::unknown; 3905*0b57cec5SDimitry Andric break; 3906*0b57cec5SDimitry Andric } 3907*0b57cec5SDimitry Andric 3908*0b57cec5SDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally. 3909*0b57cec5SDimitry Andric // Instead of letting the parser complain about the unknown token, 3910*0b57cec5SDimitry Andric // just diagnose the invalid UTF-8, then drop the character. 3911*0b57cec5SDimitry Andric Diag(CurPtr, diag::err_invalid_utf8); 3912*0b57cec5SDimitry Andric 3913*0b57cec5SDimitry Andric BufferPtr = CurPtr+1; 3914*0b57cec5SDimitry Andric // We're pretending the character didn't exist, so just try again with 3915*0b57cec5SDimitry Andric // this lexer. 
3916*0b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.) 3917*0b57cec5SDimitry Andric goto LexNextToken; 3918*0b57cec5SDimitry Andric } 3919*0b57cec5SDimitry Andric } 3920*0b57cec5SDimitry Andric 3921*0b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token. 3922*0b57cec5SDimitry Andric MIOpt.ReadToken(); 3923*0b57cec5SDimitry Andric 3924*0b57cec5SDimitry Andric // Update the location of token as well as BufferPtr. 3925*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind); 3926*0b57cec5SDimitry Andric return true; 3927*0b57cec5SDimitry Andric 3928*0b57cec5SDimitry Andric HandleDirective: 3929*0b57cec5SDimitry Andric // We parsed a # character and it's the start of a preprocessing directive. 3930*0b57cec5SDimitry Andric 3931*0b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::hash); 3932*0b57cec5SDimitry Andric PP->HandleDirective(Result); 3933*0b57cec5SDimitry Andric 3934*0b57cec5SDimitry Andric if (PP->hadModuleLoaderFatalFailure()) { 3935*0b57cec5SDimitry Andric // With a fatal failure in the module loader, we abort parsing. 3936*0b57cec5SDimitry Andric assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof"); 3937*0b57cec5SDimitry Andric return true; 3938*0b57cec5SDimitry Andric } 3939*0b57cec5SDimitry Andric 3940*0b57cec5SDimitry Andric // We parsed the directive; lex a token with the new state. 3941*0b57cec5SDimitry Andric return false; 3942*0b57cec5SDimitry Andric } 3943