//===- Lexer.cpp - C Language Family Lexer --------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the Lexer and Token interfaces.
//
//===----------------------------------------------------------------------===//
120b57cec5SDimitry Andric
130b57cec5SDimitry Andric #include "clang/Lex/Lexer.h"
140b57cec5SDimitry Andric #include "UnicodeCharSets.h"
150b57cec5SDimitry Andric #include "clang/Basic/CharInfo.h"
16e8d8bef9SDimitry Andric #include "clang/Basic/Diagnostic.h"
170b57cec5SDimitry Andric #include "clang/Basic/IdentifierTable.h"
18e8d8bef9SDimitry Andric #include "clang/Basic/LLVM.h"
190b57cec5SDimitry Andric #include "clang/Basic/LangOptions.h"
200b57cec5SDimitry Andric #include "clang/Basic/SourceLocation.h"
210b57cec5SDimitry Andric #include "clang/Basic/SourceManager.h"
220b57cec5SDimitry Andric #include "clang/Basic/TokenKinds.h"
230b57cec5SDimitry Andric #include "clang/Lex/LexDiagnostic.h"
240b57cec5SDimitry Andric #include "clang/Lex/LiteralSupport.h"
250b57cec5SDimitry Andric #include "clang/Lex/MultipleIncludeOpt.h"
260b57cec5SDimitry Andric #include "clang/Lex/Preprocessor.h"
270b57cec5SDimitry Andric #include "clang/Lex/PreprocessorOptions.h"
280b57cec5SDimitry Andric #include "clang/Lex/Token.h"
295ffd83dbSDimitry Andric #include "llvm/ADT/STLExtras.h"
300b57cec5SDimitry Andric #include "llvm/ADT/StringExtras.h"
310b57cec5SDimitry Andric #include "llvm/ADT/StringRef.h"
32e8d8bef9SDimitry Andric #include "llvm/ADT/StringSwitch.h"
330b57cec5SDimitry Andric #include "llvm/Support/Compiler.h"
340b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h"
350b57cec5SDimitry Andric #include "llvm/Support/MathExtras.h"
36e8d8bef9SDimitry Andric #include "llvm/Support/MemoryBufferRef.h"
370b57cec5SDimitry Andric #include "llvm/Support/NativeFormatting.h"
3881ad6265SDimitry Andric #include "llvm/Support/Unicode.h"
390b57cec5SDimitry Andric #include "llvm/Support/UnicodeCharRanges.h"
400b57cec5SDimitry Andric #include <algorithm>
410b57cec5SDimitry Andric #include <cassert>
420b57cec5SDimitry Andric #include <cstddef>
430b57cec5SDimitry Andric #include <cstdint>
440b57cec5SDimitry Andric #include <cstring>
45bdd1243dSDimitry Andric #include <optional>
460b57cec5SDimitry Andric #include <string>
470b57cec5SDimitry Andric #include <tuple>
480b57cec5SDimitry Andric #include <utility>
490b57cec5SDimitry Andric
505f757f3fSDimitry Andric #ifdef __SSE4_2__
515f757f3fSDimitry Andric #include <nmmintrin.h>
525f757f3fSDimitry Andric #endif
535f757f3fSDimitry Andric
540b57cec5SDimitry Andric using namespace clang;
550b57cec5SDimitry Andric
//===----------------------------------------------------------------------===//
// Token Class Implementation
//===----------------------------------------------------------------------===//
590b57cec5SDimitry Andric
600b57cec5SDimitry Andric /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const610b57cec5SDimitry Andric bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
620b57cec5SDimitry Andric if (isAnnotation())
630b57cec5SDimitry Andric return false;
645f757f3fSDimitry Andric if (const IdentifierInfo *II = getIdentifierInfo())
650b57cec5SDimitry Andric return II->getObjCKeywordID() == objcKey;
660b57cec5SDimitry Andric return false;
670b57cec5SDimitry Andric }
680b57cec5SDimitry Andric
690b57cec5SDimitry Andric /// getObjCKeywordID - Return the ObjC keyword kind.
getObjCKeywordID() const700b57cec5SDimitry Andric tok::ObjCKeywordKind Token::getObjCKeywordID() const {
710b57cec5SDimitry Andric if (isAnnotation())
720b57cec5SDimitry Andric return tok::objc_not_keyword;
735f757f3fSDimitry Andric const IdentifierInfo *specId = getIdentifierInfo();
740b57cec5SDimitry Andric return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
750b57cec5SDimitry Andric }
760b57cec5SDimitry Andric
77*0fca6ea1SDimitry Andric /// Determine whether the token kind starts a simple-type-specifier.
isSimpleTypeSpecifier(const LangOptions & LangOpts) const78*0fca6ea1SDimitry Andric bool Token::isSimpleTypeSpecifier(const LangOptions &LangOpts) const {
79*0fca6ea1SDimitry Andric switch (getKind()) {
80*0fca6ea1SDimitry Andric case tok::annot_typename:
81*0fca6ea1SDimitry Andric case tok::annot_decltype:
82*0fca6ea1SDimitry Andric case tok::annot_pack_indexing_type:
83*0fca6ea1SDimitry Andric return true;
84*0fca6ea1SDimitry Andric
85*0fca6ea1SDimitry Andric case tok::kw_short:
86*0fca6ea1SDimitry Andric case tok::kw_long:
87*0fca6ea1SDimitry Andric case tok::kw___int64:
88*0fca6ea1SDimitry Andric case tok::kw___int128:
89*0fca6ea1SDimitry Andric case tok::kw_signed:
90*0fca6ea1SDimitry Andric case tok::kw_unsigned:
91*0fca6ea1SDimitry Andric case tok::kw_void:
92*0fca6ea1SDimitry Andric case tok::kw_char:
93*0fca6ea1SDimitry Andric case tok::kw_int:
94*0fca6ea1SDimitry Andric case tok::kw_half:
95*0fca6ea1SDimitry Andric case tok::kw_float:
96*0fca6ea1SDimitry Andric case tok::kw_double:
97*0fca6ea1SDimitry Andric case tok::kw___bf16:
98*0fca6ea1SDimitry Andric case tok::kw__Float16:
99*0fca6ea1SDimitry Andric case tok::kw___float128:
100*0fca6ea1SDimitry Andric case tok::kw___ibm128:
101*0fca6ea1SDimitry Andric case tok::kw_wchar_t:
102*0fca6ea1SDimitry Andric case tok::kw_bool:
103*0fca6ea1SDimitry Andric case tok::kw__Bool:
104*0fca6ea1SDimitry Andric case tok::kw__Accum:
105*0fca6ea1SDimitry Andric case tok::kw__Fract:
106*0fca6ea1SDimitry Andric case tok::kw__Sat:
107*0fca6ea1SDimitry Andric #define TRANSFORM_TYPE_TRAIT_DEF(_, Trait) case tok::kw___##Trait:
108*0fca6ea1SDimitry Andric #include "clang/Basic/TransformTypeTraits.def"
109*0fca6ea1SDimitry Andric case tok::kw___auto_type:
110*0fca6ea1SDimitry Andric case tok::kw_char16_t:
111*0fca6ea1SDimitry Andric case tok::kw_char32_t:
112*0fca6ea1SDimitry Andric case tok::kw_typeof:
113*0fca6ea1SDimitry Andric case tok::kw_decltype:
114*0fca6ea1SDimitry Andric case tok::kw_char8_t:
115*0fca6ea1SDimitry Andric return getIdentifierInfo()->isKeyword(LangOpts);
116*0fca6ea1SDimitry Andric
117*0fca6ea1SDimitry Andric default:
118*0fca6ea1SDimitry Andric return false;
119*0fca6ea1SDimitry Andric }
120*0fca6ea1SDimitry Andric }
121*0fca6ea1SDimitry Andric
//===----------------------------------------------------------------------===//
// Lexer Class Implementation
//===----------------------------------------------------------------------===//
1250b57cec5SDimitry Andric
anchor()1260b57cec5SDimitry Andric void Lexer::anchor() {}
1270b57cec5SDimitry Andric
InitLexer(const char * BufStart,const char * BufPtr,const char * BufEnd)1280b57cec5SDimitry Andric void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
1290b57cec5SDimitry Andric const char *BufEnd) {
1300b57cec5SDimitry Andric BufferStart = BufStart;
1310b57cec5SDimitry Andric BufferPtr = BufPtr;
1320b57cec5SDimitry Andric BufferEnd = BufEnd;
1330b57cec5SDimitry Andric
1340b57cec5SDimitry Andric assert(BufEnd[0] == 0 &&
1350b57cec5SDimitry Andric "We assume that the input buffer has a null character at the end"
1360b57cec5SDimitry Andric " to simplify lexing!");
1370b57cec5SDimitry Andric
1380b57cec5SDimitry Andric // Check whether we have a BOM in the beginning of the buffer. If yes - act
1390b57cec5SDimitry Andric // accordingly. Right now we support only UTF-8 with and without BOM, so, just
1400b57cec5SDimitry Andric // skip the UTF-8 BOM if it's present.
1410b57cec5SDimitry Andric if (BufferStart == BufferPtr) {
1420b57cec5SDimitry Andric // Determine the size of the BOM.
1430b57cec5SDimitry Andric StringRef Buf(BufferStart, BufferEnd - BufferStart);
1440b57cec5SDimitry Andric size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
1450b57cec5SDimitry Andric .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
1460b57cec5SDimitry Andric .Default(0);
1470b57cec5SDimitry Andric
1480b57cec5SDimitry Andric // Skip the BOM.
1490b57cec5SDimitry Andric BufferPtr += BOMLength;
1500b57cec5SDimitry Andric }
1510b57cec5SDimitry Andric
1520b57cec5SDimitry Andric Is_PragmaLexer = false;
1530b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None;
1540b57cec5SDimitry Andric
1550b57cec5SDimitry Andric // Start of the file is a start of line.
1560b57cec5SDimitry Andric IsAtStartOfLine = true;
1570b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true;
1580b57cec5SDimitry Andric
1590b57cec5SDimitry Andric HasLeadingSpace = false;
1600b57cec5SDimitry Andric HasLeadingEmptyMacro = false;
1610b57cec5SDimitry Andric
1620b57cec5SDimitry Andric // We are not after parsing a #.
1630b57cec5SDimitry Andric ParsingPreprocessorDirective = false;
1640b57cec5SDimitry Andric
1650b57cec5SDimitry Andric // We are not after parsing #include.
1660b57cec5SDimitry Andric ParsingFilename = false;
1670b57cec5SDimitry Andric
1680b57cec5SDimitry Andric // We are not in raw mode. Raw mode disables diagnostics and interpretation
1690b57cec5SDimitry Andric // of tokens (e.g. identifiers, thus disabling macro expansion). It is used
1700b57cec5SDimitry Andric // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
1710b57cec5SDimitry Andric // or otherwise skipping over tokens.
1720b57cec5SDimitry Andric LexingRawMode = false;
1730b57cec5SDimitry Andric
1740b57cec5SDimitry Andric // Default to not keeping comments.
1750b57cec5SDimitry Andric ExtendedTokenMode = 0;
176e8d8bef9SDimitry Andric
177e8d8bef9SDimitry Andric NewLinePtr = nullptr;
1780b57cec5SDimitry Andric }
1790b57cec5SDimitry Andric
1800b57cec5SDimitry Andric /// Lexer constructor - Create a new lexer object for the specified buffer
1810b57cec5SDimitry Andric /// with the specified preprocessor managing the lexing process. This lexer
1820b57cec5SDimitry Andric /// assumes that the associated file buffer and Preprocessor objects will
1830b57cec5SDimitry Andric /// outlive it, so it doesn't take ownership of either of them.
Lexer(FileID FID,const llvm::MemoryBufferRef & InputFile,Preprocessor & PP,bool IsFirstIncludeOfFile)184e8d8bef9SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
185349cc55cSDimitry Andric Preprocessor &PP, bool IsFirstIncludeOfFile)
1860b57cec5SDimitry Andric : PreprocessorLexer(&PP, FID),
1870b57cec5SDimitry Andric FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
18881ad6265SDimitry Andric LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
18981ad6265SDimitry Andric IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
190e8d8bef9SDimitry Andric InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
191e8d8bef9SDimitry Andric InputFile.getBufferEnd());
1920b57cec5SDimitry Andric
1930b57cec5SDimitry Andric resetExtendedTokenMode();
1940b57cec5SDimitry Andric }
1950b57cec5SDimitry Andric
1960b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only
1970b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
1980b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it.
Lexer(SourceLocation fileloc,const LangOptions & langOpts,const char * BufStart,const char * BufPtr,const char * BufEnd,bool IsFirstIncludeOfFile)1990b57cec5SDimitry Andric Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
200349cc55cSDimitry Andric const char *BufStart, const char *BufPtr, const char *BufEnd,
201349cc55cSDimitry Andric bool IsFirstIncludeOfFile)
20281ad6265SDimitry Andric : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
203349cc55cSDimitry Andric IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
2040b57cec5SDimitry Andric InitLexer(BufStart, BufPtr, BufEnd);
2050b57cec5SDimitry Andric
2060b57cec5SDimitry Andric // We *are* in raw mode.
2070b57cec5SDimitry Andric LexingRawMode = true;
2080b57cec5SDimitry Andric }
2090b57cec5SDimitry Andric
2100b57cec5SDimitry Andric /// Lexer constructor - Create a new raw lexer object. This object is only
2110b57cec5SDimitry Andric /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text
2120b57cec5SDimitry Andric /// range will outlive it, so it doesn't take ownership of it.
Lexer(FileID FID,const llvm::MemoryBufferRef & FromFile,const SourceManager & SM,const LangOptions & langOpts,bool IsFirstIncludeOfFile)213e8d8bef9SDimitry Andric Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
214349cc55cSDimitry Andric const SourceManager &SM, const LangOptions &langOpts,
215349cc55cSDimitry Andric bool IsFirstIncludeOfFile)
216e8d8bef9SDimitry Andric : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
217349cc55cSDimitry Andric FromFile.getBufferStart(), FromFile.getBufferEnd(),
218349cc55cSDimitry Andric IsFirstIncludeOfFile) {}
2190b57cec5SDimitry Andric
resetExtendedTokenMode()2200b57cec5SDimitry Andric void Lexer::resetExtendedTokenMode() {
2210b57cec5SDimitry Andric assert(PP && "Cannot reset token mode without a preprocessor");
2220b57cec5SDimitry Andric if (LangOpts.TraditionalCPP)
2230b57cec5SDimitry Andric SetKeepWhitespaceMode(true);
2240b57cec5SDimitry Andric else
2250b57cec5SDimitry Andric SetCommentRetentionState(PP->getCommentRetentionState());
2260b57cec5SDimitry Andric }
2270b57cec5SDimitry Andric
2280b57cec5SDimitry Andric /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
2290b57cec5SDimitry Andric /// _Pragma expansion. This has a variety of magic semantics that this method
2300b57cec5SDimitry Andric /// sets up. It returns a new'd Lexer that must be delete'd when done.
2310b57cec5SDimitry Andric ///
2320b57cec5SDimitry Andric /// On entrance to this routine, TokStartLoc is a macro location which has a
2330b57cec5SDimitry Andric /// spelling loc that indicates the bytes to be lexed for the token and an
2340b57cec5SDimitry Andric /// expansion location that indicates where all lexed tokens should be
2350b57cec5SDimitry Andric /// "expanded from".
2360b57cec5SDimitry Andric ///
2370b57cec5SDimitry Andric /// TODO: It would really be nice to make _Pragma just be a wrapper around a
2380b57cec5SDimitry Andric /// normal lexer that remaps tokens as they fly by. This would require making
2390b57cec5SDimitry Andric /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer
2400b57cec5SDimitry Andric /// interface that could handle this stuff. This would pull GetMappedTokenLoc
2410b57cec5SDimitry Andric /// out of the critical path of the lexer!
2420b57cec5SDimitry Andric ///
Create_PragmaLexer(SourceLocation SpellingLoc,SourceLocation ExpansionLocStart,SourceLocation ExpansionLocEnd,unsigned TokLen,Preprocessor & PP)2430b57cec5SDimitry Andric Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
2440b57cec5SDimitry Andric SourceLocation ExpansionLocStart,
2450b57cec5SDimitry Andric SourceLocation ExpansionLocEnd,
2460b57cec5SDimitry Andric unsigned TokLen, Preprocessor &PP) {
2470b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager();
2480b57cec5SDimitry Andric
2490b57cec5SDimitry Andric // Create the lexer as if we were going to lex the file normally.
2500b57cec5SDimitry Andric FileID SpellingFID = SM.getFileID(SpellingLoc);
251e8d8bef9SDimitry Andric llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
2520b57cec5SDimitry Andric Lexer *L = new Lexer(SpellingFID, InputFile, PP);
2530b57cec5SDimitry Andric
2540b57cec5SDimitry Andric // Now that the lexer is created, change the start/end locations so that we
2550b57cec5SDimitry Andric // just lex the subsection of the file that we want. This is lexing from a
2560b57cec5SDimitry Andric // scratch buffer.
2570b57cec5SDimitry Andric const char *StrData = SM.getCharacterData(SpellingLoc);
2580b57cec5SDimitry Andric
2590b57cec5SDimitry Andric L->BufferPtr = StrData;
2600b57cec5SDimitry Andric L->BufferEnd = StrData+TokLen;
2610b57cec5SDimitry Andric assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
2620b57cec5SDimitry Andric
2630b57cec5SDimitry Andric // Set the SourceLocation with the remapping information. This ensures that
2640b57cec5SDimitry Andric // GetMappedTokenLoc will remap the tokens as they are lexed.
2650b57cec5SDimitry Andric L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
2660b57cec5SDimitry Andric ExpansionLocStart,
2670b57cec5SDimitry Andric ExpansionLocEnd, TokLen);
2680b57cec5SDimitry Andric
2690b57cec5SDimitry Andric // Ensure that the lexer thinks it is inside a directive, so that end \n will
2700b57cec5SDimitry Andric // return an EOD token.
2710b57cec5SDimitry Andric L->ParsingPreprocessorDirective = true;
2720b57cec5SDimitry Andric
2730b57cec5SDimitry Andric // This lexer really is for _Pragma.
2740b57cec5SDimitry Andric L->Is_PragmaLexer = true;
2750b57cec5SDimitry Andric return L;
2760b57cec5SDimitry Andric }
2770b57cec5SDimitry Andric
seek(unsigned Offset,bool IsAtStartOfLine)27881ad6265SDimitry Andric void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
27981ad6265SDimitry Andric this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
28081ad6265SDimitry Andric this->IsAtStartOfLine = IsAtStartOfLine;
28181ad6265SDimitry Andric assert((BufferStart + Offset) <= BufferEnd);
28281ad6265SDimitry Andric BufferPtr = BufferStart + Offset;
283a7dea167SDimitry Andric }
284a7dea167SDimitry Andric
/// Escape \p Str in place so it can be placed between \p Quote characters.
///
/// - A backslash or a \p Quote character gets a backslash inserted before it.
/// - A two-character line ending ("\r\n" or "\n\r") is overwritten with the
///   two characters backslash + 'n'.
/// - A lone '\n' or '\r' is replaced with backslash + 'n' (one insertion).
///
/// \tparam T Character container providing size(), operator[], begin() and
///           single-element insert() (used here with std::string and
///           SmallVectorImpl<char>).
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  typename T::size_type i = 0, e = Str.size();
  while (i < e) {
    if (Str[i] == '\\' || Str[i] == Quote) {
      Str.insert(Str.begin() + i, '\\');
      // Skip both the inserted backslash and the character it escapes.
      i += 2;
      ++e;
    } else if (Str[i] == '\n' || Str[i] == '\r') {
      // Replace '\r\n' and '\n\r' to '\\' followed by 'n'.
      if ((i + 1 < e) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') &&
          Str[i] != Str[i + 1]) {
        Str[i] = '\\';
        Str[i + 1] = 'n';
      } else {
        // Replace '\n' and '\r' to '\\' followed by 'n'.
        Str[i] = '\\';
        Str.insert(Str.begin() + i + 1, 'n');
        ++e;
      }
      i += 2;
    } else
      ++i;
  }
}
3090b57cec5SDimitry Andric
Stringify(StringRef Str,bool Charify)3100b57cec5SDimitry Andric std::string Lexer::Stringify(StringRef Str, bool Charify) {
3115ffd83dbSDimitry Andric std::string Result = std::string(Str);
3120b57cec5SDimitry Andric char Quote = Charify ? '\'' : '"';
3130b57cec5SDimitry Andric StringifyImpl(Result, Quote);
3140b57cec5SDimitry Andric return Result;
3150b57cec5SDimitry Andric }
3160b57cec5SDimitry Andric
Stringify(SmallVectorImpl<char> & Str)3170b57cec5SDimitry Andric void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
3180b57cec5SDimitry Andric
//===----------------------------------------------------------------------===//
// Token Spelling
//===----------------------------------------------------------------------===//
3220b57cec5SDimitry Andric
3230b57cec5SDimitry Andric /// Slow case of getSpelling. Extract the characters comprising the
3240b57cec5SDimitry Andric /// spelling of this token from the provided input buffer.
getSpellingSlow(const Token & Tok,const char * BufPtr,const LangOptions & LangOpts,char * Spelling)3250b57cec5SDimitry Andric static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
3260b57cec5SDimitry Andric const LangOptions &LangOpts, char *Spelling) {
3270b57cec5SDimitry Andric assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
3280b57cec5SDimitry Andric
3290b57cec5SDimitry Andric size_t Length = 0;
3300b57cec5SDimitry Andric const char *BufEnd = BufPtr + Tok.getLength();
3310b57cec5SDimitry Andric
3320b57cec5SDimitry Andric if (tok::isStringLiteral(Tok.getKind())) {
3330b57cec5SDimitry Andric // Munch the encoding-prefix and opening double-quote.
3340b57cec5SDimitry Andric while (BufPtr < BufEnd) {
3355f757f3fSDimitry Andric auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
3365f757f3fSDimitry Andric Spelling[Length++] = CharAndSize.Char;
3375f757f3fSDimitry Andric BufPtr += CharAndSize.Size;
3380b57cec5SDimitry Andric
3390b57cec5SDimitry Andric if (Spelling[Length - 1] == '"')
3400b57cec5SDimitry Andric break;
3410b57cec5SDimitry Andric }
3420b57cec5SDimitry Andric
3430b57cec5SDimitry Andric // Raw string literals need special handling; trigraph expansion and line
3440b57cec5SDimitry Andric // splicing do not occur within their d-char-sequence nor within their
3450b57cec5SDimitry Andric // r-char-sequence.
3460b57cec5SDimitry Andric if (Length >= 2 &&
3470b57cec5SDimitry Andric Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
3480b57cec5SDimitry Andric // Search backwards from the end of the token to find the matching closing
3490b57cec5SDimitry Andric // quote.
3500b57cec5SDimitry Andric const char *RawEnd = BufEnd;
3510b57cec5SDimitry Andric do --RawEnd; while (*RawEnd != '"');
3520b57cec5SDimitry Andric size_t RawLength = RawEnd - BufPtr + 1;
3530b57cec5SDimitry Andric
3540b57cec5SDimitry Andric // Everything between the quotes is included verbatim in the spelling.
3550b57cec5SDimitry Andric memcpy(Spelling + Length, BufPtr, RawLength);
3560b57cec5SDimitry Andric Length += RawLength;
3570b57cec5SDimitry Andric BufPtr += RawLength;
3580b57cec5SDimitry Andric
3590b57cec5SDimitry Andric // The rest of the token is lexed normally.
3600b57cec5SDimitry Andric }
3610b57cec5SDimitry Andric }
3620b57cec5SDimitry Andric
3630b57cec5SDimitry Andric while (BufPtr < BufEnd) {
3645f757f3fSDimitry Andric auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
3655f757f3fSDimitry Andric Spelling[Length++] = CharAndSize.Char;
3665f757f3fSDimitry Andric BufPtr += CharAndSize.Size;
3670b57cec5SDimitry Andric }
3680b57cec5SDimitry Andric
3690b57cec5SDimitry Andric assert(Length < Tok.getLength() &&
3700b57cec5SDimitry Andric "NeedsCleaning flag set on token that didn't need cleaning!");
3710b57cec5SDimitry Andric return Length;
3720b57cec5SDimitry Andric }
3730b57cec5SDimitry Andric
3740b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a
3750b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file
3760b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this
3770b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs
3780b57cec5SDimitry Andric /// UCNs, etc.
getSpelling(SourceLocation loc,SmallVectorImpl<char> & buffer,const SourceManager & SM,const LangOptions & options,bool * invalid)3790b57cec5SDimitry Andric StringRef Lexer::getSpelling(SourceLocation loc,
3800b57cec5SDimitry Andric SmallVectorImpl<char> &buffer,
3810b57cec5SDimitry Andric const SourceManager &SM,
3820b57cec5SDimitry Andric const LangOptions &options,
3830b57cec5SDimitry Andric bool *invalid) {
3840b57cec5SDimitry Andric // Break down the source location.
3850b57cec5SDimitry Andric std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
3860b57cec5SDimitry Andric
3870b57cec5SDimitry Andric // Try to the load the file buffer.
3880b57cec5SDimitry Andric bool invalidTemp = false;
3890b57cec5SDimitry Andric StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
3900b57cec5SDimitry Andric if (invalidTemp) {
3910b57cec5SDimitry Andric if (invalid) *invalid = true;
3920b57cec5SDimitry Andric return {};
3930b57cec5SDimitry Andric }
3940b57cec5SDimitry Andric
3950b57cec5SDimitry Andric const char *tokenBegin = file.data() + locInfo.second;
3960b57cec5SDimitry Andric
3970b57cec5SDimitry Andric // Lex from the start of the given location.
3980b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
3990b57cec5SDimitry Andric file.begin(), tokenBegin, file.end());
4000b57cec5SDimitry Andric Token token;
4010b57cec5SDimitry Andric lexer.LexFromRawLexer(token);
4020b57cec5SDimitry Andric
4030b57cec5SDimitry Andric unsigned length = token.getLength();
4040b57cec5SDimitry Andric
4050b57cec5SDimitry Andric // Common case: no need for cleaning.
4060b57cec5SDimitry Andric if (!token.needsCleaning())
4070b57cec5SDimitry Andric return StringRef(tokenBegin, length);
4080b57cec5SDimitry Andric
4090b57cec5SDimitry Andric // Hard case, we need to relex the characters into the string.
4100b57cec5SDimitry Andric buffer.resize(length);
4110b57cec5SDimitry Andric buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
4120b57cec5SDimitry Andric return StringRef(buffer.data(), buffer.size());
4130b57cec5SDimitry Andric }
4140b57cec5SDimitry Andric
4150b57cec5SDimitry Andric /// getSpelling() - Return the 'spelling' of this token. The spelling of a
4160b57cec5SDimitry Andric /// token are the characters used to represent the token in the source file
4170b57cec5SDimitry Andric /// after trigraph expansion and escaped-newline folding. In particular, this
4180b57cec5SDimitry Andric /// wants to get the true, uncanonicalized, spelling of things like digraphs
4190b57cec5SDimitry Andric /// UCNs, etc.
getSpelling(const Token & Tok,const SourceManager & SourceMgr,const LangOptions & LangOpts,bool * Invalid)4200b57cec5SDimitry Andric std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
4210b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) {
4220b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
4230b57cec5SDimitry Andric
4240b57cec5SDimitry Andric bool CharDataInvalid = false;
4250b57cec5SDimitry Andric const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
4260b57cec5SDimitry Andric &CharDataInvalid);
4270b57cec5SDimitry Andric if (Invalid)
4280b57cec5SDimitry Andric *Invalid = CharDataInvalid;
4290b57cec5SDimitry Andric if (CharDataInvalid)
4300b57cec5SDimitry Andric return {};
4310b57cec5SDimitry Andric
4320b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly.
4330b57cec5SDimitry Andric if (!Tok.needsCleaning())
4340b57cec5SDimitry Andric return std::string(TokStart, TokStart + Tok.getLength());
4350b57cec5SDimitry Andric
4360b57cec5SDimitry Andric std::string Result;
4370b57cec5SDimitry Andric Result.resize(Tok.getLength());
4380b57cec5SDimitry Andric Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
4390b57cec5SDimitry Andric return Result;
4400b57cec5SDimitry Andric }
4410b57cec5SDimitry Andric
4420b57cec5SDimitry Andric /// getSpelling - This method is used to get the spelling of a token into a
4430b57cec5SDimitry Andric /// preallocated buffer, instead of as an std::string. The caller is required
4440b57cec5SDimitry Andric /// to allocate enough space for the token, which is guaranteed to be at least
4450b57cec5SDimitry Andric /// Tok.getLength() bytes long. The actual length of the token is returned.
4460b57cec5SDimitry Andric ///
4470b57cec5SDimitry Andric /// Note that this method may do two possible things: it may either fill in
4480b57cec5SDimitry Andric /// the buffer specified with characters, or it may *change the input pointer*
4490b57cec5SDimitry Andric /// to point to a constant buffer with the data already in it (avoiding a
4500b57cec5SDimitry Andric /// copy). The caller is not allowed to modify the returned buffer pointer
4510b57cec5SDimitry Andric /// if an internal buffer is returned.
getSpelling(const Token & Tok,const char * & Buffer,const SourceManager & SourceMgr,const LangOptions & LangOpts,bool * Invalid)4520b57cec5SDimitry Andric unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
4530b57cec5SDimitry Andric const SourceManager &SourceMgr,
4540b57cec5SDimitry Andric const LangOptions &LangOpts, bool *Invalid) {
4550b57cec5SDimitry Andric assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
4560b57cec5SDimitry Andric
4570b57cec5SDimitry Andric const char *TokStart = nullptr;
4580b57cec5SDimitry Andric // NOTE: this has to be checked *before* testing for an IdentifierInfo.
4590b57cec5SDimitry Andric if (Tok.is(tok::raw_identifier))
4600b57cec5SDimitry Andric TokStart = Tok.getRawIdentifier().data();
4610b57cec5SDimitry Andric else if (!Tok.hasUCN()) {
4620b57cec5SDimitry Andric if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
4630b57cec5SDimitry Andric // Just return the string from the identifier table, which is very quick.
4640b57cec5SDimitry Andric Buffer = II->getNameStart();
4650b57cec5SDimitry Andric return II->getLength();
4660b57cec5SDimitry Andric }
4670b57cec5SDimitry Andric }
4680b57cec5SDimitry Andric
4690b57cec5SDimitry Andric // NOTE: this can be checked even after testing for an IdentifierInfo.
4700b57cec5SDimitry Andric if (Tok.isLiteral())
4710b57cec5SDimitry Andric TokStart = Tok.getLiteralData();
4720b57cec5SDimitry Andric
4730b57cec5SDimitry Andric if (!TokStart) {
4740b57cec5SDimitry Andric // Compute the start of the token in the input lexer buffer.
4750b57cec5SDimitry Andric bool CharDataInvalid = false;
4760b57cec5SDimitry Andric TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
4770b57cec5SDimitry Andric if (Invalid)
4780b57cec5SDimitry Andric *Invalid = CharDataInvalid;
4790b57cec5SDimitry Andric if (CharDataInvalid) {
4800b57cec5SDimitry Andric Buffer = "";
4810b57cec5SDimitry Andric return 0;
4820b57cec5SDimitry Andric }
4830b57cec5SDimitry Andric }
4840b57cec5SDimitry Andric
4850b57cec5SDimitry Andric // If this token contains nothing interesting, return it directly.
4860b57cec5SDimitry Andric if (!Tok.needsCleaning()) {
4870b57cec5SDimitry Andric Buffer = TokStart;
4880b57cec5SDimitry Andric return Tok.getLength();
4890b57cec5SDimitry Andric }
4900b57cec5SDimitry Andric
4910b57cec5SDimitry Andric // Otherwise, hard case, relex the characters into the string.
4920b57cec5SDimitry Andric return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
4930b57cec5SDimitry Andric }
4940b57cec5SDimitry Andric
4950b57cec5SDimitry Andric /// MeasureTokenLength - Relex the token at the specified location and return
4960b57cec5SDimitry Andric /// its length in bytes in the input file. If the token needs cleaning (e.g.
4970b57cec5SDimitry Andric /// includes a trigraph or an escaped newline) then this count includes bytes
4980b57cec5SDimitry Andric /// that are part of that.
MeasureTokenLength(SourceLocation Loc,const SourceManager & SM,const LangOptions & LangOpts)4990b57cec5SDimitry Andric unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
5000b57cec5SDimitry Andric const SourceManager &SM,
5010b57cec5SDimitry Andric const LangOptions &LangOpts) {
5020b57cec5SDimitry Andric Token TheTok;
5030b57cec5SDimitry Andric if (getRawToken(Loc, TheTok, SM, LangOpts))
5040b57cec5SDimitry Andric return 0;
5050b57cec5SDimitry Andric return TheTok.getLength();
5060b57cec5SDimitry Andric }
5070b57cec5SDimitry Andric
5080b57cec5SDimitry Andric /// Relex the token at the specified location.
5090b57cec5SDimitry Andric /// \returns true if there was a failure, false on success.
getRawToken(SourceLocation Loc,Token & Result,const SourceManager & SM,const LangOptions & LangOpts,bool IgnoreWhiteSpace)5100b57cec5SDimitry Andric bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
5110b57cec5SDimitry Andric const SourceManager &SM,
5120b57cec5SDimitry Andric const LangOptions &LangOpts,
5130b57cec5SDimitry Andric bool IgnoreWhiteSpace) {
5140b57cec5SDimitry Andric // TODO: this could be special cased for common tokens like identifiers, ')',
5150b57cec5SDimitry Andric // etc to make this faster, if it mattered. Just look at StrData[0] to handle
5160b57cec5SDimitry Andric // all obviously single-char tokens. This could use
5170b57cec5SDimitry Andric // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
5180b57cec5SDimitry Andric // something.
5190b57cec5SDimitry Andric
5200b57cec5SDimitry Andric // If this comes from a macro expansion, we really do want the macro name, not
5210b57cec5SDimitry Andric // the token this macro expanded to.
5220b57cec5SDimitry Andric Loc = SM.getExpansionLoc(Loc);
5230b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
5240b57cec5SDimitry Andric bool Invalid = false;
5250b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
5260b57cec5SDimitry Andric if (Invalid)
5270b57cec5SDimitry Andric return true;
5280b57cec5SDimitry Andric
5290b57cec5SDimitry Andric const char *StrData = Buffer.data()+LocInfo.second;
5300b57cec5SDimitry Andric
5310b57cec5SDimitry Andric if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
5320b57cec5SDimitry Andric return true;
5330b57cec5SDimitry Andric
5340b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token.
5350b57cec5SDimitry Andric Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
5360b57cec5SDimitry Andric Buffer.begin(), StrData, Buffer.end());
5370b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true);
5380b57cec5SDimitry Andric TheLexer.LexFromRawLexer(Result);
5390b57cec5SDimitry Andric return false;
5400b57cec5SDimitry Andric }
5410b57cec5SDimitry Andric
5420b57cec5SDimitry Andric /// Returns the pointer that points to the beginning of line that contains
5430b57cec5SDimitry Andric /// the given offset, or null if the offset if invalid.
findBeginningOfLine(StringRef Buffer,unsigned Offset)5440b57cec5SDimitry Andric static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
5450b57cec5SDimitry Andric const char *BufStart = Buffer.data();
5460b57cec5SDimitry Andric if (Offset >= Buffer.size())
5470b57cec5SDimitry Andric return nullptr;
5480b57cec5SDimitry Andric
5490b57cec5SDimitry Andric const char *LexStart = BufStart + Offset;
5500b57cec5SDimitry Andric for (; LexStart != BufStart; --LexStart) {
5510b57cec5SDimitry Andric if (isVerticalWhitespace(LexStart[0]) &&
5520b57cec5SDimitry Andric !Lexer::isNewLineEscaped(BufStart, LexStart)) {
5530b57cec5SDimitry Andric // LexStart should point at first character of logical line.
5540b57cec5SDimitry Andric ++LexStart;
5550b57cec5SDimitry Andric break;
5560b57cec5SDimitry Andric }
5570b57cec5SDimitry Andric }
5580b57cec5SDimitry Andric return LexStart;
5590b57cec5SDimitry Andric }
5600b57cec5SDimitry Andric
getBeginningOfFileToken(SourceLocation Loc,const SourceManager & SM,const LangOptions & LangOpts)5610b57cec5SDimitry Andric static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
5620b57cec5SDimitry Andric const SourceManager &SM,
5630b57cec5SDimitry Andric const LangOptions &LangOpts) {
5640b57cec5SDimitry Andric assert(Loc.isFileID());
5650b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
5660b57cec5SDimitry Andric if (LocInfo.first.isInvalid())
5670b57cec5SDimitry Andric return Loc;
5680b57cec5SDimitry Andric
5690b57cec5SDimitry Andric bool Invalid = false;
5700b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
5710b57cec5SDimitry Andric if (Invalid)
5720b57cec5SDimitry Andric return Loc;
5730b57cec5SDimitry Andric
5740b57cec5SDimitry Andric // Back up from the current location until we hit the beginning of a line
5750b57cec5SDimitry Andric // (or the buffer). We'll relex from that point.
5760b57cec5SDimitry Andric const char *StrData = Buffer.data() + LocInfo.second;
5770b57cec5SDimitry Andric const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
5780b57cec5SDimitry Andric if (!LexStart || LexStart == StrData)
5790b57cec5SDimitry Andric return Loc;
5800b57cec5SDimitry Andric
5810b57cec5SDimitry Andric // Create a lexer starting at the beginning of this token.
5820b57cec5SDimitry Andric SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
5830b57cec5SDimitry Andric Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
5840b57cec5SDimitry Andric Buffer.end());
5850b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true);
5860b57cec5SDimitry Andric
5870b57cec5SDimitry Andric // Lex tokens until we find the token that contains the source location.
5880b57cec5SDimitry Andric Token TheTok;
5890b57cec5SDimitry Andric do {
5900b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok);
5910b57cec5SDimitry Andric
5920b57cec5SDimitry Andric if (TheLexer.getBufferLocation() > StrData) {
5930b57cec5SDimitry Andric // Lexing this token has taken the lexer past the source location we're
5940b57cec5SDimitry Andric // looking for. If the current token encompasses our source location,
5950b57cec5SDimitry Andric // return the beginning of that token.
5960b57cec5SDimitry Andric if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
5970b57cec5SDimitry Andric return TheTok.getLocation();
5980b57cec5SDimitry Andric
5990b57cec5SDimitry Andric // We ended up skipping over the source location entirely, which means
6000b57cec5SDimitry Andric // that it points into whitespace. We're done here.
6010b57cec5SDimitry Andric break;
6020b57cec5SDimitry Andric }
6030b57cec5SDimitry Andric } while (TheTok.getKind() != tok::eof);
6040b57cec5SDimitry Andric
6050b57cec5SDimitry Andric // We've passed our source location; just return the original source location.
6060b57cec5SDimitry Andric return Loc;
6070b57cec5SDimitry Andric }
6080b57cec5SDimitry Andric
GetBeginningOfToken(SourceLocation Loc,const SourceManager & SM,const LangOptions & LangOpts)6090b57cec5SDimitry Andric SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
6100b57cec5SDimitry Andric const SourceManager &SM,
6110b57cec5SDimitry Andric const LangOptions &LangOpts) {
6120b57cec5SDimitry Andric if (Loc.isFileID())
6130b57cec5SDimitry Andric return getBeginningOfFileToken(Loc, SM, LangOpts);
6140b57cec5SDimitry Andric
6150b57cec5SDimitry Andric if (!SM.isMacroArgExpansion(Loc))
6160b57cec5SDimitry Andric return Loc;
6170b57cec5SDimitry Andric
6180b57cec5SDimitry Andric SourceLocation FileLoc = SM.getSpellingLoc(Loc);
6190b57cec5SDimitry Andric SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
6200b57cec5SDimitry Andric std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
6210b57cec5SDimitry Andric std::pair<FileID, unsigned> BeginFileLocInfo =
6220b57cec5SDimitry Andric SM.getDecomposedLoc(BeginFileLoc);
6230b57cec5SDimitry Andric assert(FileLocInfo.first == BeginFileLocInfo.first &&
6240b57cec5SDimitry Andric FileLocInfo.second >= BeginFileLocInfo.second);
6250b57cec5SDimitry Andric return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
6260b57cec5SDimitry Andric }
6270b57cec5SDimitry Andric
namespace {

/// Classification of a preprocessor directive name while computing the
/// preamble.
enum PreambleDirectiveKind {
  /// A recognized directive; the preamble scan skips over it.
  PDK_Skipped,
  /// Any directive name that is not recognized.
  PDK_Unknown
};

} // namespace
6360b57cec5SDimitry Andric
ComputePreamble(StringRef Buffer,const LangOptions & LangOpts,unsigned MaxLines)6370b57cec5SDimitry Andric PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
6380b57cec5SDimitry Andric const LangOptions &LangOpts,
6390b57cec5SDimitry Andric unsigned MaxLines) {
6400b57cec5SDimitry Andric // Create a lexer starting at the beginning of the file. Note that we use a
6410b57cec5SDimitry Andric // "fake" file source location at offset 1 so that the lexer will track our
6420b57cec5SDimitry Andric // position within the file.
643fe6060f1SDimitry Andric const SourceLocation::UIntTy StartOffset = 1;
6440b57cec5SDimitry Andric SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
6450b57cec5SDimitry Andric Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
6460b57cec5SDimitry Andric Buffer.end());
6470b57cec5SDimitry Andric TheLexer.SetCommentRetentionState(true);
6480b57cec5SDimitry Andric
6490b57cec5SDimitry Andric bool InPreprocessorDirective = false;
6500b57cec5SDimitry Andric Token TheTok;
6510b57cec5SDimitry Andric SourceLocation ActiveCommentLoc;
6520b57cec5SDimitry Andric
6530b57cec5SDimitry Andric unsigned MaxLineOffset = 0;
6540b57cec5SDimitry Andric if (MaxLines) {
6550b57cec5SDimitry Andric const char *CurPtr = Buffer.begin();
6560b57cec5SDimitry Andric unsigned CurLine = 0;
6570b57cec5SDimitry Andric while (CurPtr != Buffer.end()) {
6580b57cec5SDimitry Andric char ch = *CurPtr++;
6590b57cec5SDimitry Andric if (ch == '\n') {
6600b57cec5SDimitry Andric ++CurLine;
6610b57cec5SDimitry Andric if (CurLine == MaxLines)
6620b57cec5SDimitry Andric break;
6630b57cec5SDimitry Andric }
6640b57cec5SDimitry Andric }
6650b57cec5SDimitry Andric if (CurPtr != Buffer.end())
6660b57cec5SDimitry Andric MaxLineOffset = CurPtr - Buffer.begin();
6670b57cec5SDimitry Andric }
6680b57cec5SDimitry Andric
6690b57cec5SDimitry Andric do {
6700b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok);
6710b57cec5SDimitry Andric
6720b57cec5SDimitry Andric if (InPreprocessorDirective) {
6730b57cec5SDimitry Andric // If we've hit the end of the file, we're done.
6740b57cec5SDimitry Andric if (TheTok.getKind() == tok::eof) {
6750b57cec5SDimitry Andric break;
6760b57cec5SDimitry Andric }
6770b57cec5SDimitry Andric
6780b57cec5SDimitry Andric // If we haven't hit the end of the preprocessor directive, skip this
6790b57cec5SDimitry Andric // token.
6800b57cec5SDimitry Andric if (!TheTok.isAtStartOfLine())
6810b57cec5SDimitry Andric continue;
6820b57cec5SDimitry Andric
6830b57cec5SDimitry Andric // We've passed the end of the preprocessor directive, and will look
6840b57cec5SDimitry Andric // at this token again below.
6850b57cec5SDimitry Andric InPreprocessorDirective = false;
6860b57cec5SDimitry Andric }
6870b57cec5SDimitry Andric
6880b57cec5SDimitry Andric // Keep track of the # of lines in the preamble.
6890b57cec5SDimitry Andric if (TheTok.isAtStartOfLine()) {
6900b57cec5SDimitry Andric unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
6910b57cec5SDimitry Andric
6920b57cec5SDimitry Andric // If we were asked to limit the number of lines in the preamble,
6930b57cec5SDimitry Andric // and we're about to exceed that limit, we're done.
6940b57cec5SDimitry Andric if (MaxLineOffset && TokOffset >= MaxLineOffset)
6950b57cec5SDimitry Andric break;
6960b57cec5SDimitry Andric }
6970b57cec5SDimitry Andric
6980b57cec5SDimitry Andric // Comments are okay; skip over them.
6990b57cec5SDimitry Andric if (TheTok.getKind() == tok::comment) {
7000b57cec5SDimitry Andric if (ActiveCommentLoc.isInvalid())
7010b57cec5SDimitry Andric ActiveCommentLoc = TheTok.getLocation();
7020b57cec5SDimitry Andric continue;
7030b57cec5SDimitry Andric }
7040b57cec5SDimitry Andric
7050b57cec5SDimitry Andric if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
7060b57cec5SDimitry Andric // This is the start of a preprocessor directive.
7070b57cec5SDimitry Andric Token HashTok = TheTok;
7080b57cec5SDimitry Andric InPreprocessorDirective = true;
7090b57cec5SDimitry Andric ActiveCommentLoc = SourceLocation();
7100b57cec5SDimitry Andric
7110b57cec5SDimitry Andric // Figure out which directive this is. Since we're lexing raw tokens,
7120b57cec5SDimitry Andric // we don't have an identifier table available. Instead, just look at
7130b57cec5SDimitry Andric // the raw identifier to recognize and categorize preprocessor directives.
7140b57cec5SDimitry Andric TheLexer.LexFromRawLexer(TheTok);
7150b57cec5SDimitry Andric if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
7160b57cec5SDimitry Andric StringRef Keyword = TheTok.getRawIdentifier();
7170b57cec5SDimitry Andric PreambleDirectiveKind PDK
7180b57cec5SDimitry Andric = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
7190b57cec5SDimitry Andric .Case("include", PDK_Skipped)
7200b57cec5SDimitry Andric .Case("__include_macros", PDK_Skipped)
7210b57cec5SDimitry Andric .Case("define", PDK_Skipped)
7220b57cec5SDimitry Andric .Case("undef", PDK_Skipped)
7230b57cec5SDimitry Andric .Case("line", PDK_Skipped)
7240b57cec5SDimitry Andric .Case("error", PDK_Skipped)
7250b57cec5SDimitry Andric .Case("pragma", PDK_Skipped)
7260b57cec5SDimitry Andric .Case("import", PDK_Skipped)
7270b57cec5SDimitry Andric .Case("include_next", PDK_Skipped)
7280b57cec5SDimitry Andric .Case("warning", PDK_Skipped)
7290b57cec5SDimitry Andric .Case("ident", PDK_Skipped)
7300b57cec5SDimitry Andric .Case("sccs", PDK_Skipped)
7310b57cec5SDimitry Andric .Case("assert", PDK_Skipped)
7320b57cec5SDimitry Andric .Case("unassert", PDK_Skipped)
7330b57cec5SDimitry Andric .Case("if", PDK_Skipped)
7340b57cec5SDimitry Andric .Case("ifdef", PDK_Skipped)
7350b57cec5SDimitry Andric .Case("ifndef", PDK_Skipped)
7360b57cec5SDimitry Andric .Case("elif", PDK_Skipped)
737fe6060f1SDimitry Andric .Case("elifdef", PDK_Skipped)
738fe6060f1SDimitry Andric .Case("elifndef", PDK_Skipped)
7390b57cec5SDimitry Andric .Case("else", PDK_Skipped)
7400b57cec5SDimitry Andric .Case("endif", PDK_Skipped)
7410b57cec5SDimitry Andric .Default(PDK_Unknown);
7420b57cec5SDimitry Andric
7430b57cec5SDimitry Andric switch (PDK) {
7440b57cec5SDimitry Andric case PDK_Skipped:
7450b57cec5SDimitry Andric continue;
7460b57cec5SDimitry Andric
7470b57cec5SDimitry Andric case PDK_Unknown:
7480b57cec5SDimitry Andric // We don't know what this directive is; stop at the '#'.
7490b57cec5SDimitry Andric break;
7500b57cec5SDimitry Andric }
7510b57cec5SDimitry Andric }
7520b57cec5SDimitry Andric
7530b57cec5SDimitry Andric // We only end up here if we didn't recognize the preprocessor
7540b57cec5SDimitry Andric // directive or it was one that can't occur in the preamble at this
7550b57cec5SDimitry Andric // point. Roll back the current token to the location of the '#'.
7560b57cec5SDimitry Andric TheTok = HashTok;
7575f757f3fSDimitry Andric } else if (TheTok.isAtStartOfLine() &&
7585f757f3fSDimitry Andric TheTok.getKind() == tok::raw_identifier &&
7595f757f3fSDimitry Andric TheTok.getRawIdentifier() == "module" &&
7605f757f3fSDimitry Andric LangOpts.CPlusPlusModules) {
7615f757f3fSDimitry Andric // The initial global module fragment introducer "module;" is part of
7625f757f3fSDimitry Andric // the preamble, which runs up to the module declaration "module foo;".
7635f757f3fSDimitry Andric Token ModuleTok = TheTok;
7645f757f3fSDimitry Andric do {
7655f757f3fSDimitry Andric TheLexer.LexFromRawLexer(TheTok);
7665f757f3fSDimitry Andric } while (TheTok.getKind() == tok::comment);
7675f757f3fSDimitry Andric if (TheTok.getKind() != tok::semi) {
7685f757f3fSDimitry Andric // Not global module fragment, roll back.
7695f757f3fSDimitry Andric TheTok = ModuleTok;
7705f757f3fSDimitry Andric break;
7715f757f3fSDimitry Andric }
7725f757f3fSDimitry Andric continue;
7730b57cec5SDimitry Andric }
7740b57cec5SDimitry Andric
7750b57cec5SDimitry Andric // We hit a token that we don't recognize as being in the
7760b57cec5SDimitry Andric // "preprocessing only" part of the file, so we're no longer in
7770b57cec5SDimitry Andric // the preamble.
7780b57cec5SDimitry Andric break;
7790b57cec5SDimitry Andric } while (true);
7800b57cec5SDimitry Andric
7810b57cec5SDimitry Andric SourceLocation End;
7820b57cec5SDimitry Andric if (ActiveCommentLoc.isValid())
7830b57cec5SDimitry Andric End = ActiveCommentLoc; // don't truncate a decl comment.
7840b57cec5SDimitry Andric else
7850b57cec5SDimitry Andric End = TheTok.getLocation();
7860b57cec5SDimitry Andric
7870b57cec5SDimitry Andric return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
7880b57cec5SDimitry Andric TheTok.isAtStartOfLine());
7890b57cec5SDimitry Andric }
7900b57cec5SDimitry Andric
getTokenPrefixLength(SourceLocation TokStart,unsigned CharNo,const SourceManager & SM,const LangOptions & LangOpts)7910b57cec5SDimitry Andric unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
7920b57cec5SDimitry Andric const SourceManager &SM,
7930b57cec5SDimitry Andric const LangOptions &LangOpts) {
7940b57cec5SDimitry Andric // Figure out how many physical characters away the specified expansion
7950b57cec5SDimitry Andric // character is. This needs to take into consideration newlines and
7960b57cec5SDimitry Andric // trigraphs.
7970b57cec5SDimitry Andric bool Invalid = false;
7980b57cec5SDimitry Andric const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
7990b57cec5SDimitry Andric
8000b57cec5SDimitry Andric // If they request the first char of the token, we're trivially done.
8010b57cec5SDimitry Andric if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
8020b57cec5SDimitry Andric return 0;
8030b57cec5SDimitry Andric
8040b57cec5SDimitry Andric unsigned PhysOffset = 0;
8050b57cec5SDimitry Andric
8060b57cec5SDimitry Andric // The usual case is that tokens don't contain anything interesting. Skip
8070b57cec5SDimitry Andric // over the uninteresting characters. If a token only consists of simple
8080b57cec5SDimitry Andric // chars, this method is extremely fast.
8090b57cec5SDimitry Andric while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
8100b57cec5SDimitry Andric if (CharNo == 0)
8110b57cec5SDimitry Andric return PhysOffset;
8120b57cec5SDimitry Andric ++TokPtr;
8130b57cec5SDimitry Andric --CharNo;
8140b57cec5SDimitry Andric ++PhysOffset;
8150b57cec5SDimitry Andric }
8160b57cec5SDimitry Andric
8170b57cec5SDimitry Andric // If we have a character that may be a trigraph or escaped newline, use a
8180b57cec5SDimitry Andric // lexer to parse it correctly.
8190b57cec5SDimitry Andric for (; CharNo; --CharNo) {
8205f757f3fSDimitry Andric auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
8215f757f3fSDimitry Andric TokPtr += CharAndSize.Size;
8225f757f3fSDimitry Andric PhysOffset += CharAndSize.Size;
8230b57cec5SDimitry Andric }
8240b57cec5SDimitry Andric
8250b57cec5SDimitry Andric // Final detail: if we end up on an escaped newline, we want to return the
8260b57cec5SDimitry Andric // location of the actual byte of the token. For example foo\<newline>bar
8270b57cec5SDimitry Andric // advanced by 3 should return the location of b, not of \\. One compounding
8280b57cec5SDimitry Andric // detail of this is that the escape may be made by a trigraph.
8290b57cec5SDimitry Andric if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
8300b57cec5SDimitry Andric PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
8310b57cec5SDimitry Andric
8320b57cec5SDimitry Andric return PhysOffset;
8330b57cec5SDimitry Andric }
8340b57cec5SDimitry Andric
8350b57cec5SDimitry Andric /// Computes the source location just past the end of the
8360b57cec5SDimitry Andric /// token at this source location.
8370b57cec5SDimitry Andric ///
8380b57cec5SDimitry Andric /// This routine can be used to produce a source location that
8390b57cec5SDimitry Andric /// points just past the end of the token referenced by \p Loc, and
8400b57cec5SDimitry Andric /// is generally used when a diagnostic needs to point just after a
8410b57cec5SDimitry Andric /// token where it expected something different that it received. If
8420b57cec5SDimitry Andric /// the returned source location would not be meaningful (e.g., if
8430b57cec5SDimitry Andric /// it points into a macro), this routine returns an invalid
8440b57cec5SDimitry Andric /// source location.
8450b57cec5SDimitry Andric ///
8460b57cec5SDimitry Andric /// \param Offset an offset from the end of the token, where the source
8470b57cec5SDimitry Andric /// location should refer to. The default offset (0) produces a source
8480b57cec5SDimitry Andric /// location pointing just past the end of the token; an offset of 1 produces
8490b57cec5SDimitry Andric /// a source location pointing to the last character in the token, etc.
getLocForEndOfToken(SourceLocation Loc,unsigned Offset,const SourceManager & SM,const LangOptions & LangOpts)8500b57cec5SDimitry Andric SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
8510b57cec5SDimitry Andric const SourceManager &SM,
8520b57cec5SDimitry Andric const LangOptions &LangOpts) {
8530b57cec5SDimitry Andric if (Loc.isInvalid())
8540b57cec5SDimitry Andric return {};
8550b57cec5SDimitry Andric
8560b57cec5SDimitry Andric if (Loc.isMacroID()) {
8570b57cec5SDimitry Andric if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
8580b57cec5SDimitry Andric return {}; // Points inside the macro expansion.
8590b57cec5SDimitry Andric }
8600b57cec5SDimitry Andric
8610b57cec5SDimitry Andric unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
8620b57cec5SDimitry Andric if (Len > Offset)
8630b57cec5SDimitry Andric Len = Len - Offset;
8640b57cec5SDimitry Andric else
8650b57cec5SDimitry Andric return Loc;
8660b57cec5SDimitry Andric
8670b57cec5SDimitry Andric return Loc.getLocWithOffset(Len);
8680b57cec5SDimitry Andric }
8690b57cec5SDimitry Andric
8700b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the first
8710b57cec5SDimitry Andric /// token of the macro expansion.
isAtStartOfMacroExpansion(SourceLocation loc,const SourceManager & SM,const LangOptions & LangOpts,SourceLocation * MacroBegin)8720b57cec5SDimitry Andric bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
8730b57cec5SDimitry Andric const SourceManager &SM,
8740b57cec5SDimitry Andric const LangOptions &LangOpts,
8750b57cec5SDimitry Andric SourceLocation *MacroBegin) {
8760b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
8770b57cec5SDimitry Andric
8780b57cec5SDimitry Andric SourceLocation expansionLoc;
8790b57cec5SDimitry Andric if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
8800b57cec5SDimitry Andric return false;
8810b57cec5SDimitry Andric
8820b57cec5SDimitry Andric if (expansionLoc.isFileID()) {
8830b57cec5SDimitry Andric // No other macro expansions, this is the first.
8840b57cec5SDimitry Andric if (MacroBegin)
8850b57cec5SDimitry Andric *MacroBegin = expansionLoc;
8860b57cec5SDimitry Andric return true;
8870b57cec5SDimitry Andric }
8880b57cec5SDimitry Andric
8890b57cec5SDimitry Andric return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
8900b57cec5SDimitry Andric }
8910b57cec5SDimitry Andric
8920b57cec5SDimitry Andric /// Returns true if the given MacroID location points at the last
8930b57cec5SDimitry Andric /// token of the macro expansion.
isAtEndOfMacroExpansion(SourceLocation loc,const SourceManager & SM,const LangOptions & LangOpts,SourceLocation * MacroEnd)8940b57cec5SDimitry Andric bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
8950b57cec5SDimitry Andric const SourceManager &SM,
8960b57cec5SDimitry Andric const LangOptions &LangOpts,
8970b57cec5SDimitry Andric SourceLocation *MacroEnd) {
8980b57cec5SDimitry Andric assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
8990b57cec5SDimitry Andric
9000b57cec5SDimitry Andric SourceLocation spellLoc = SM.getSpellingLoc(loc);
9010b57cec5SDimitry Andric unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
9020b57cec5SDimitry Andric if (tokLen == 0)
9030b57cec5SDimitry Andric return false;
9040b57cec5SDimitry Andric
9050b57cec5SDimitry Andric SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
9060b57cec5SDimitry Andric SourceLocation expansionLoc;
9070b57cec5SDimitry Andric if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
9080b57cec5SDimitry Andric return false;
9090b57cec5SDimitry Andric
9100b57cec5SDimitry Andric if (expansionLoc.isFileID()) {
9110b57cec5SDimitry Andric // No other macro expansions.
9120b57cec5SDimitry Andric if (MacroEnd)
9130b57cec5SDimitry Andric *MacroEnd = expansionLoc;
9140b57cec5SDimitry Andric return true;
9150b57cec5SDimitry Andric }
9160b57cec5SDimitry Andric
9170b57cec5SDimitry Andric return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
9180b57cec5SDimitry Andric }
9190b57cec5SDimitry Andric
makeRangeFromFileLocs(CharSourceRange Range,const SourceManager & SM,const LangOptions & LangOpts)9200b57cec5SDimitry Andric static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
9210b57cec5SDimitry Andric const SourceManager &SM,
9220b57cec5SDimitry Andric const LangOptions &LangOpts) {
9230b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin();
9240b57cec5SDimitry Andric SourceLocation End = Range.getEnd();
9250b57cec5SDimitry Andric assert(Begin.isFileID() && End.isFileID());
9260b57cec5SDimitry Andric if (Range.isTokenRange()) {
9270b57cec5SDimitry Andric End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
9280b57cec5SDimitry Andric if (End.isInvalid())
9290b57cec5SDimitry Andric return {};
9300b57cec5SDimitry Andric }
9310b57cec5SDimitry Andric
9320b57cec5SDimitry Andric // Break down the source locations.
9330b57cec5SDimitry Andric FileID FID;
9340b57cec5SDimitry Andric unsigned BeginOffs;
9350b57cec5SDimitry Andric std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
9360b57cec5SDimitry Andric if (FID.isInvalid())
9370b57cec5SDimitry Andric return {};
9380b57cec5SDimitry Andric
9390b57cec5SDimitry Andric unsigned EndOffs;
9400b57cec5SDimitry Andric if (!SM.isInFileID(End, FID, &EndOffs) ||
9410b57cec5SDimitry Andric BeginOffs > EndOffs)
9420b57cec5SDimitry Andric return {};
9430b57cec5SDimitry Andric
9440b57cec5SDimitry Andric return CharSourceRange::getCharRange(Begin, End);
9450b57cec5SDimitry Andric }
9460b57cec5SDimitry Andric
947fe6060f1SDimitry Andric // Assumes that `Loc` is in an expansion.
isInExpansionTokenRange(const SourceLocation Loc,const SourceManager & SM)948fe6060f1SDimitry Andric static bool isInExpansionTokenRange(const SourceLocation Loc,
949fe6060f1SDimitry Andric const SourceManager &SM) {
950fe6060f1SDimitry Andric return SM.getSLocEntry(SM.getFileID(Loc))
951fe6060f1SDimitry Andric .getExpansion()
952fe6060f1SDimitry Andric .isExpansionTokenRange();
953fe6060f1SDimitry Andric }
954fe6060f1SDimitry Andric
makeFileCharRange(CharSourceRange Range,const SourceManager & SM,const LangOptions & LangOpts)9550b57cec5SDimitry Andric CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
9560b57cec5SDimitry Andric const SourceManager &SM,
9570b57cec5SDimitry Andric const LangOptions &LangOpts) {
9580b57cec5SDimitry Andric SourceLocation Begin = Range.getBegin();
9590b57cec5SDimitry Andric SourceLocation End = Range.getEnd();
9600b57cec5SDimitry Andric if (Begin.isInvalid() || End.isInvalid())
9610b57cec5SDimitry Andric return {};
9620b57cec5SDimitry Andric
9630b57cec5SDimitry Andric if (Begin.isFileID() && End.isFileID())
9640b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts);
9650b57cec5SDimitry Andric
9660b57cec5SDimitry Andric if (Begin.isMacroID() && End.isFileID()) {
9670b57cec5SDimitry Andric if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
9680b57cec5SDimitry Andric return {};
9690b57cec5SDimitry Andric Range.setBegin(Begin);
9700b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts);
9710b57cec5SDimitry Andric }
9720b57cec5SDimitry Andric
9730b57cec5SDimitry Andric if (Begin.isFileID() && End.isMacroID()) {
974fe6060f1SDimitry Andric if (Range.isTokenRange()) {
975fe6060f1SDimitry Andric if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
976fe6060f1SDimitry Andric return {};
977fe6060f1SDimitry Andric // Use the *original* end, not the expanded one in `End`.
978fe6060f1SDimitry Andric Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
979fe6060f1SDimitry Andric } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
9800b57cec5SDimitry Andric return {};
9810b57cec5SDimitry Andric Range.setEnd(End);
9820b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts);
9830b57cec5SDimitry Andric }
9840b57cec5SDimitry Andric
9850b57cec5SDimitry Andric assert(Begin.isMacroID() && End.isMacroID());
9860b57cec5SDimitry Andric SourceLocation MacroBegin, MacroEnd;
9870b57cec5SDimitry Andric if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
9880b57cec5SDimitry Andric ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
9890b57cec5SDimitry Andric &MacroEnd)) ||
9900b57cec5SDimitry Andric (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
9910b57cec5SDimitry Andric &MacroEnd)))) {
9920b57cec5SDimitry Andric Range.setBegin(MacroBegin);
9930b57cec5SDimitry Andric Range.setEnd(MacroEnd);
994fe6060f1SDimitry Andric // Use the *original* `End`, not the expanded one in `MacroEnd`.
995fe6060f1SDimitry Andric if (Range.isTokenRange())
996fe6060f1SDimitry Andric Range.setTokenRange(isInExpansionTokenRange(End, SM));
9970b57cec5SDimitry Andric return makeRangeFromFileLocs(Range, SM, LangOpts);
9980b57cec5SDimitry Andric }
9990b57cec5SDimitry Andric
10000b57cec5SDimitry Andric bool Invalid = false;
10010b57cec5SDimitry Andric const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
10020b57cec5SDimitry Andric &Invalid);
10030b57cec5SDimitry Andric if (Invalid)
10040b57cec5SDimitry Andric return {};
10050b57cec5SDimitry Andric
10060b57cec5SDimitry Andric if (BeginEntry.getExpansion().isMacroArgExpansion()) {
10070b57cec5SDimitry Andric const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
10080b57cec5SDimitry Andric &Invalid);
10090b57cec5SDimitry Andric if (Invalid)
10100b57cec5SDimitry Andric return {};
10110b57cec5SDimitry Andric
10120b57cec5SDimitry Andric if (EndEntry.getExpansion().isMacroArgExpansion() &&
10130b57cec5SDimitry Andric BeginEntry.getExpansion().getExpansionLocStart() ==
10140b57cec5SDimitry Andric EndEntry.getExpansion().getExpansionLocStart()) {
10150b57cec5SDimitry Andric Range.setBegin(SM.getImmediateSpellingLoc(Begin));
10160b57cec5SDimitry Andric Range.setEnd(SM.getImmediateSpellingLoc(End));
10170b57cec5SDimitry Andric return makeFileCharRange(Range, SM, LangOpts);
10180b57cec5SDimitry Andric }
10190b57cec5SDimitry Andric }
10200b57cec5SDimitry Andric
10210b57cec5SDimitry Andric return {};
10220b57cec5SDimitry Andric }
10230b57cec5SDimitry Andric
getSourceText(CharSourceRange Range,const SourceManager & SM,const LangOptions & LangOpts,bool * Invalid)10240b57cec5SDimitry Andric StringRef Lexer::getSourceText(CharSourceRange Range,
10250b57cec5SDimitry Andric const SourceManager &SM,
10260b57cec5SDimitry Andric const LangOptions &LangOpts,
10270b57cec5SDimitry Andric bool *Invalid) {
10280b57cec5SDimitry Andric Range = makeFileCharRange(Range, SM, LangOpts);
10290b57cec5SDimitry Andric if (Range.isInvalid()) {
10300b57cec5SDimitry Andric if (Invalid) *Invalid = true;
10310b57cec5SDimitry Andric return {};
10320b57cec5SDimitry Andric }
10330b57cec5SDimitry Andric
10340b57cec5SDimitry Andric // Break down the source location.
10350b57cec5SDimitry Andric std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
10360b57cec5SDimitry Andric if (beginInfo.first.isInvalid()) {
10370b57cec5SDimitry Andric if (Invalid) *Invalid = true;
10380b57cec5SDimitry Andric return {};
10390b57cec5SDimitry Andric }
10400b57cec5SDimitry Andric
10410b57cec5SDimitry Andric unsigned EndOffs;
10420b57cec5SDimitry Andric if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
10430b57cec5SDimitry Andric beginInfo.second > EndOffs) {
10440b57cec5SDimitry Andric if (Invalid) *Invalid = true;
10450b57cec5SDimitry Andric return {};
10460b57cec5SDimitry Andric }
10470b57cec5SDimitry Andric
10480b57cec5SDimitry Andric // Try to the load the file buffer.
10490b57cec5SDimitry Andric bool invalidTemp = false;
10500b57cec5SDimitry Andric StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
10510b57cec5SDimitry Andric if (invalidTemp) {
10520b57cec5SDimitry Andric if (Invalid) *Invalid = true;
10530b57cec5SDimitry Andric return {};
10540b57cec5SDimitry Andric }
10550b57cec5SDimitry Andric
10560b57cec5SDimitry Andric if (Invalid) *Invalid = false;
10570b57cec5SDimitry Andric return file.substr(beginInfo.second, EndOffs - beginInfo.second);
10580b57cec5SDimitry Andric }
10590b57cec5SDimitry Andric
getImmediateMacroName(SourceLocation Loc,const SourceManager & SM,const LangOptions & LangOpts)10600b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
10610b57cec5SDimitry Andric const SourceManager &SM,
10620b57cec5SDimitry Andric const LangOptions &LangOpts) {
10630b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros");
10640b57cec5SDimitry Andric
10650b57cec5SDimitry Andric // Find the location of the immediate macro expansion.
10660b57cec5SDimitry Andric while (true) {
10670b57cec5SDimitry Andric FileID FID = SM.getFileID(Loc);
10680b57cec5SDimitry Andric const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
10690b57cec5SDimitry Andric const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
10700b57cec5SDimitry Andric Loc = Expansion.getExpansionLocStart();
10710b57cec5SDimitry Andric if (!Expansion.isMacroArgExpansion())
10720b57cec5SDimitry Andric break;
10730b57cec5SDimitry Andric
10740b57cec5SDimitry Andric // For macro arguments we need to check that the argument did not come
10750b57cec5SDimitry Andric // from an inner macro, e.g: "MAC1( MAC2(foo) )"
10760b57cec5SDimitry Andric
10770b57cec5SDimitry Andric // Loc points to the argument id of the macro definition, move to the
10780b57cec5SDimitry Andric // macro expansion.
10790b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin();
10800b57cec5SDimitry Andric SourceLocation SpellLoc = Expansion.getSpellingLoc();
10810b57cec5SDimitry Andric if (SpellLoc.isFileID())
10820b57cec5SDimitry Andric break; // No inner macro.
10830b57cec5SDimitry Andric
10840b57cec5SDimitry Andric // If spelling location resides in the same FileID as macro expansion
10850b57cec5SDimitry Andric // location, it means there is no inner macro.
10860b57cec5SDimitry Andric FileID MacroFID = SM.getFileID(Loc);
10870b57cec5SDimitry Andric if (SM.isInFileID(SpellLoc, MacroFID))
10880b57cec5SDimitry Andric break;
10890b57cec5SDimitry Andric
10900b57cec5SDimitry Andric // Argument came from inner macro.
10910b57cec5SDimitry Andric Loc = SpellLoc;
10920b57cec5SDimitry Andric }
10930b57cec5SDimitry Andric
10940b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion
10950b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin
10960b57cec5SDimitry Andric // expanding this macro.
10970b57cec5SDimitry Andric Loc = SM.getSpellingLoc(Loc);
10980b57cec5SDimitry Andric
10990b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the
11000b57cec5SDimitry Andric // name so that we can render it into the expansion note.
11010b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
11020b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
11030b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
11040b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
11050b57cec5SDimitry Andric }
11060b57cec5SDimitry Andric
getImmediateMacroNameForDiagnostics(SourceLocation Loc,const SourceManager & SM,const LangOptions & LangOpts)11070b57cec5SDimitry Andric StringRef Lexer::getImmediateMacroNameForDiagnostics(
11080b57cec5SDimitry Andric SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
11090b57cec5SDimitry Andric assert(Loc.isMacroID() && "Only reasonable to call this on macros");
11100b57cec5SDimitry Andric // Walk past macro argument expansions.
11110b57cec5SDimitry Andric while (SM.isMacroArgExpansion(Loc))
11120b57cec5SDimitry Andric Loc = SM.getImmediateExpansionRange(Loc).getBegin();
11130b57cec5SDimitry Andric
1114bdd1243dSDimitry Andric // If the macro's spelling isn't FileID or from scratch space, then it's
1115bdd1243dSDimitry Andric // actually a token paste or stringization (or similar) and not a macro at
1116bdd1243dSDimitry Andric // all.
1117bdd1243dSDimitry Andric SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
1118bdd1243dSDimitry Andric if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
11190b57cec5SDimitry Andric return {};
11200b57cec5SDimitry Andric
11210b57cec5SDimitry Andric // Find the spelling location of the start of the non-argument expansion
11220b57cec5SDimitry Andric // range. This is where the macro name was spelled in order to begin
11230b57cec5SDimitry Andric // expanding this macro.
11240b57cec5SDimitry Andric Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
11250b57cec5SDimitry Andric
11260b57cec5SDimitry Andric // Dig out the buffer where the macro name was spelled and the extents of the
11270b57cec5SDimitry Andric // name so that we can render it into the expansion note.
11280b57cec5SDimitry Andric std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
11290b57cec5SDimitry Andric unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
11300b57cec5SDimitry Andric StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
11310b57cec5SDimitry Andric return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
11320b57cec5SDimitry Andric }
11330b57cec5SDimitry Andric
isAsciiIdentifierContinueChar(char c,const LangOptions & LangOpts)1134349cc55cSDimitry Andric bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
1135349cc55cSDimitry Andric return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
11360b57cec5SDimitry Andric }
11370b57cec5SDimitry Andric
isNewLineEscaped(const char * BufferStart,const char * Str)11380b57cec5SDimitry Andric bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
11390b57cec5SDimitry Andric assert(isVerticalWhitespace(Str[0]));
11400b57cec5SDimitry Andric if (Str - 1 < BufferStart)
11410b57cec5SDimitry Andric return false;
11420b57cec5SDimitry Andric
11430b57cec5SDimitry Andric if ((Str[0] == '\n' && Str[-1] == '\r') ||
11440b57cec5SDimitry Andric (Str[0] == '\r' && Str[-1] == '\n')) {
11450b57cec5SDimitry Andric if (Str - 2 < BufferStart)
11460b57cec5SDimitry Andric return false;
11470b57cec5SDimitry Andric --Str;
11480b57cec5SDimitry Andric }
11490b57cec5SDimitry Andric --Str;
11500b57cec5SDimitry Andric
11510b57cec5SDimitry Andric // Rewind to first non-space character:
11520b57cec5SDimitry Andric while (Str > BufferStart && isHorizontalWhitespace(*Str))
11530b57cec5SDimitry Andric --Str;
11540b57cec5SDimitry Andric
11550b57cec5SDimitry Andric return *Str == '\\';
11560b57cec5SDimitry Andric }
11570b57cec5SDimitry Andric
getIndentationForLine(SourceLocation Loc,const SourceManager & SM)11580b57cec5SDimitry Andric StringRef Lexer::getIndentationForLine(SourceLocation Loc,
11590b57cec5SDimitry Andric const SourceManager &SM) {
11600b57cec5SDimitry Andric if (Loc.isInvalid() || Loc.isMacroID())
11610b57cec5SDimitry Andric return {};
11620b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
11630b57cec5SDimitry Andric if (LocInfo.first.isInvalid())
11640b57cec5SDimitry Andric return {};
11650b57cec5SDimitry Andric bool Invalid = false;
11660b57cec5SDimitry Andric StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
11670b57cec5SDimitry Andric if (Invalid)
11680b57cec5SDimitry Andric return {};
11690b57cec5SDimitry Andric const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
11700b57cec5SDimitry Andric if (!Line)
11710b57cec5SDimitry Andric return {};
11720b57cec5SDimitry Andric StringRef Rest = Buffer.substr(Line - Buffer.data());
11730b57cec5SDimitry Andric size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
11740b57cec5SDimitry Andric return NumWhitespaceChars == StringRef::npos
11750b57cec5SDimitry Andric ? ""
11760b57cec5SDimitry Andric : Rest.take_front(NumWhitespaceChars);
11770b57cec5SDimitry Andric }
11780b57cec5SDimitry Andric
11790b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
11800b57cec5SDimitry Andric // Diagnostics forwarding code.
11810b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
11820b57cec5SDimitry Andric
11830b57cec5SDimitry Andric /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
11840b57cec5SDimitry Andric /// lexer buffer was all expanded at a single point, perform the mapping.
11850b57cec5SDimitry Andric /// This is currently only used for _Pragma implementation, so it is the slow
11860b57cec5SDimitry Andric /// path of the hot getSourceLocation method. Do not allow it to be inlined.
11870b57cec5SDimitry Andric static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
11880b57cec5SDimitry Andric Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
GetMappedTokenLoc(Preprocessor & PP,SourceLocation FileLoc,unsigned CharNo,unsigned TokLen)11890b57cec5SDimitry Andric static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
11900b57cec5SDimitry Andric SourceLocation FileLoc,
11910b57cec5SDimitry Andric unsigned CharNo, unsigned TokLen) {
11920b57cec5SDimitry Andric assert(FileLoc.isMacroID() && "Must be a macro expansion");
11930b57cec5SDimitry Andric
11940b57cec5SDimitry Andric // Otherwise, we're lexing "mapped tokens". This is used for things like
11950b57cec5SDimitry Andric // _Pragma handling. Combine the expansion location of FileLoc with the
11960b57cec5SDimitry Andric // spelling location.
11970b57cec5SDimitry Andric SourceManager &SM = PP.getSourceManager();
11980b57cec5SDimitry Andric
11990b57cec5SDimitry Andric // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
12000b57cec5SDimitry Andric // characters come from spelling(FileLoc)+Offset.
12010b57cec5SDimitry Andric SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
12020b57cec5SDimitry Andric SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
12030b57cec5SDimitry Andric
12040b57cec5SDimitry Andric // Figure out the expansion loc range, which is the range covered by the
12050b57cec5SDimitry Andric // original _Pragma(...) sequence.
12060b57cec5SDimitry Andric CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
12070b57cec5SDimitry Andric
12080b57cec5SDimitry Andric return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
12090b57cec5SDimitry Andric }
12100b57cec5SDimitry Andric
12110b57cec5SDimitry Andric /// getSourceLocation - Return a source location identifier for the specified
12120b57cec5SDimitry Andric /// offset in the current file.
getSourceLocation(const char * Loc,unsigned TokLen) const12130b57cec5SDimitry Andric SourceLocation Lexer::getSourceLocation(const char *Loc,
12140b57cec5SDimitry Andric unsigned TokLen) const {
12150b57cec5SDimitry Andric assert(Loc >= BufferStart && Loc <= BufferEnd &&
12160b57cec5SDimitry Andric "Location out of range for this buffer!");
12170b57cec5SDimitry Andric
12180b57cec5SDimitry Andric // In the normal case, we're just lexing from a simple file buffer, return
12190b57cec5SDimitry Andric // the file id from FileLoc with the offset specified.
12200b57cec5SDimitry Andric unsigned CharNo = Loc-BufferStart;
12210b57cec5SDimitry Andric if (FileLoc.isFileID())
12220b57cec5SDimitry Andric return FileLoc.getLocWithOffset(CharNo);
12230b57cec5SDimitry Andric
12240b57cec5SDimitry Andric // Otherwise, this is the _Pragma lexer case, which pretends that all of the
12250b57cec5SDimitry Andric // tokens are lexed from where the _Pragma was defined.
12260b57cec5SDimitry Andric assert(PP && "This doesn't work on raw lexers");
12270b57cec5SDimitry Andric return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
12280b57cec5SDimitry Andric }
12290b57cec5SDimitry Andric
12300b57cec5SDimitry Andric /// Diag - Forwarding function for diagnostics. This translate a source
12310b57cec5SDimitry Andric /// position in the current buffer into a SourceLocation object for rendering.
Diag(const char * Loc,unsigned DiagID) const12320b57cec5SDimitry Andric DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
12330b57cec5SDimitry Andric return PP->Diag(getSourceLocation(Loc), DiagID);
12340b57cec5SDimitry Andric }
12350b57cec5SDimitry Andric
12360b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
12370b57cec5SDimitry Andric // Trigraph and Escaped Newline Handling Code.
12380b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
12390b57cec5SDimitry Andric
/// GetTrigraphCharForLetter - Given the character that follows a "??" pair,
/// return the character the trigraph decodes to, or '\0' if the sequence is
/// not a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  char Decoded = 0;
  switch (Letter) {
  case '=':
    Decoded = '#';
    break;
  case '(':
    Decoded = '[';
    break;
  case ')':
    Decoded = ']';
    break;
  case '<':
    Decoded = '{';
    break;
  case '>':
    Decoded = '}';
    break;
  case '/':
    Decoded = '\\';
    break;
  case '\'':
    Decoded = '^';
    break;
  case '!':
    Decoded = '|';
    break;
  case '-':
    Decoded = '~';
    break;
  default:
    break;
  }
  return Decoded;
}
12560b57cec5SDimitry Andric
12570b57cec5SDimitry Andric /// DecodeTrigraphChar - If the specified character is a legal trigraph when
12580b57cec5SDimitry Andric /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled,
12590b57cec5SDimitry Andric /// return the result character. Finally, emit a warning about trigraph use
12600b57cec5SDimitry Andric /// whether trigraphs are enabled or not.
DecodeTrigraphChar(const char * CP,Lexer * L,bool Trigraphs)126181ad6265SDimitry Andric static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
12620b57cec5SDimitry Andric char Res = GetTrigraphCharForLetter(*CP);
1263bdd1243dSDimitry Andric if (!Res)
1264bdd1243dSDimitry Andric return Res;
12650b57cec5SDimitry Andric
126681ad6265SDimitry Andric if (!Trigraphs) {
1267bdd1243dSDimitry Andric if (L && !L->isLexingRawMode())
12680b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_ignored);
12690b57cec5SDimitry Andric return 0;
12700b57cec5SDimitry Andric }
12710b57cec5SDimitry Andric
1272bdd1243dSDimitry Andric if (L && !L->isLexingRawMode())
12730b57cec5SDimitry Andric L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
12740b57cec5SDimitry Andric return Res;
12750b57cec5SDimitry Andric }
12760b57cec5SDimitry Andric
12770b57cec5SDimitry Andric /// getEscapedNewLineSize - Return the size of the specified escaped newline,
12780b57cec5SDimitry Andric /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
12790b57cec5SDimitry Andric /// trigraph equivalent on entry to this function.
getEscapedNewLineSize(const char * Ptr)12800b57cec5SDimitry Andric unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
12810b57cec5SDimitry Andric unsigned Size = 0;
12820b57cec5SDimitry Andric while (isWhitespace(Ptr[Size])) {
12830b57cec5SDimitry Andric ++Size;
12840b57cec5SDimitry Andric
12850b57cec5SDimitry Andric if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
12860b57cec5SDimitry Andric continue;
12870b57cec5SDimitry Andric
12880b57cec5SDimitry Andric // If this is a \r\n or \n\r, skip the other half.
12890b57cec5SDimitry Andric if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
12900b57cec5SDimitry Andric Ptr[Size-1] != Ptr[Size])
12910b57cec5SDimitry Andric ++Size;
12920b57cec5SDimitry Andric
12930b57cec5SDimitry Andric return Size;
12940b57cec5SDimitry Andric }
12950b57cec5SDimitry Andric
12960b57cec5SDimitry Andric // Not an escaped newline, must be a \t or something else.
12970b57cec5SDimitry Andric return 0;
12980b57cec5SDimitry Andric }
12990b57cec5SDimitry Andric
13000b57cec5SDimitry Andric /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
13010b57cec5SDimitry Andric /// them), skip over them and return the first non-escaped-newline found,
13020b57cec5SDimitry Andric /// otherwise return P.
SkipEscapedNewLines(const char * P)13030b57cec5SDimitry Andric const char *Lexer::SkipEscapedNewLines(const char *P) {
13040b57cec5SDimitry Andric while (true) {
13050b57cec5SDimitry Andric const char *AfterEscape;
13060b57cec5SDimitry Andric if (*P == '\\') {
13070b57cec5SDimitry Andric AfterEscape = P+1;
13080b57cec5SDimitry Andric } else if (*P == '?') {
13090b57cec5SDimitry Andric // If not a trigraph for escape, bail out.
13100b57cec5SDimitry Andric if (P[1] != '?' || P[2] != '/')
13110b57cec5SDimitry Andric return P;
13120b57cec5SDimitry Andric // FIXME: Take LangOpts into account; the language might not
13130b57cec5SDimitry Andric // support trigraphs.
13140b57cec5SDimitry Andric AfterEscape = P+3;
13150b57cec5SDimitry Andric } else {
13160b57cec5SDimitry Andric return P;
13170b57cec5SDimitry Andric }
13180b57cec5SDimitry Andric
13190b57cec5SDimitry Andric unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
13200b57cec5SDimitry Andric if (NewLineSize == 0) return P;
13210b57cec5SDimitry Andric P = AfterEscape+NewLineSize;
13220b57cec5SDimitry Andric }
13230b57cec5SDimitry Andric }
13240b57cec5SDimitry Andric
findNextToken(SourceLocation Loc,const SourceManager & SM,const LangOptions & LangOpts)1325bdd1243dSDimitry Andric std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
13260b57cec5SDimitry Andric const SourceManager &SM,
13270b57cec5SDimitry Andric const LangOptions &LangOpts) {
13280b57cec5SDimitry Andric if (Loc.isMacroID()) {
13290b57cec5SDimitry Andric if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1330bdd1243dSDimitry Andric return std::nullopt;
13310b57cec5SDimitry Andric }
13320b57cec5SDimitry Andric Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
13330b57cec5SDimitry Andric
13340b57cec5SDimitry Andric // Break down the source location.
13350b57cec5SDimitry Andric std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
13360b57cec5SDimitry Andric
13370b57cec5SDimitry Andric // Try to load the file buffer.
13380b57cec5SDimitry Andric bool InvalidTemp = false;
13390b57cec5SDimitry Andric StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
13400b57cec5SDimitry Andric if (InvalidTemp)
1341bdd1243dSDimitry Andric return std::nullopt;
13420b57cec5SDimitry Andric
13430b57cec5SDimitry Andric const char *TokenBegin = File.data() + LocInfo.second;
13440b57cec5SDimitry Andric
13450b57cec5SDimitry Andric // Lex from the start of the given location.
13460b57cec5SDimitry Andric Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
13470b57cec5SDimitry Andric TokenBegin, File.end());
13480b57cec5SDimitry Andric // Find the token.
13490b57cec5SDimitry Andric Token Tok;
13500b57cec5SDimitry Andric lexer.LexFromRawLexer(Tok);
13510b57cec5SDimitry Andric return Tok;
13520b57cec5SDimitry Andric }
13530b57cec5SDimitry Andric
13540b57cec5SDimitry Andric /// Checks that the given token is the first token that occurs after the
13550b57cec5SDimitry Andric /// given location (this excludes comments and whitespace). Returns the location
13560b57cec5SDimitry Andric /// immediately after the specified token. If the token is not found or the
13570b57cec5SDimitry Andric /// location is inside a macro, the returned source location will be invalid.
findLocationAfterToken(SourceLocation Loc,tok::TokenKind TKind,const SourceManager & SM,const LangOptions & LangOpts,bool SkipTrailingWhitespaceAndNewLine)13580b57cec5SDimitry Andric SourceLocation Lexer::findLocationAfterToken(
13590b57cec5SDimitry Andric SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
13600b57cec5SDimitry Andric const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1361bdd1243dSDimitry Andric std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
13620b57cec5SDimitry Andric if (!Tok || Tok->isNot(TKind))
13630b57cec5SDimitry Andric return {};
13640b57cec5SDimitry Andric SourceLocation TokenLoc = Tok->getLocation();
13650b57cec5SDimitry Andric
13660b57cec5SDimitry Andric // Calculate how much whitespace needs to be skipped if any.
13670b57cec5SDimitry Andric unsigned NumWhitespaceChars = 0;
13680b57cec5SDimitry Andric if (SkipTrailingWhitespaceAndNewLine) {
13690b57cec5SDimitry Andric const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
13700b57cec5SDimitry Andric unsigned char C = *TokenEnd;
13710b57cec5SDimitry Andric while (isHorizontalWhitespace(C)) {
13720b57cec5SDimitry Andric C = *(++TokenEnd);
13730b57cec5SDimitry Andric NumWhitespaceChars++;
13740b57cec5SDimitry Andric }
13750b57cec5SDimitry Andric
13760b57cec5SDimitry Andric // Skip \r, \n, \r\n, or \n\r
13770b57cec5SDimitry Andric if (C == '\n' || C == '\r') {
13780b57cec5SDimitry Andric char PrevC = C;
13790b57cec5SDimitry Andric C = *(++TokenEnd);
13800b57cec5SDimitry Andric NumWhitespaceChars++;
13810b57cec5SDimitry Andric if ((C == '\n' || C == '\r') && C != PrevC)
13820b57cec5SDimitry Andric NumWhitespaceChars++;
13830b57cec5SDimitry Andric }
13840b57cec5SDimitry Andric }
13850b57cec5SDimitry Andric
13860b57cec5SDimitry Andric return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
13870b57cec5SDimitry Andric }
13880b57cec5SDimitry Andric
13890b57cec5SDimitry Andric /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
13900b57cec5SDimitry Andric /// get its size, and return it. This is tricky in several cases:
13910b57cec5SDimitry Andric /// 1. If currently at the start of a trigraph, we warn about the trigraph,
13920b57cec5SDimitry Andric /// then either return the trigraph (skipping 3 chars) or the '?',
13930b57cec5SDimitry Andric /// depending on whether trigraphs are enabled or not.
13940b57cec5SDimitry Andric /// 2. If this is an escaped newline (potentially with whitespace between
13950b57cec5SDimitry Andric /// the backslash and newline), implicitly skip the newline and return
13960b57cec5SDimitry Andric /// the char after it.
13970b57cec5SDimitry Andric ///
13980b57cec5SDimitry Andric /// This handles the slow/uncommon case of the getCharAndSize method. Here we
13990b57cec5SDimitry Andric /// know that we can accumulate into Size, and that we have already incremented
14000b57cec5SDimitry Andric /// Ptr by Size bytes.
14010b57cec5SDimitry Andric ///
14020b57cec5SDimitry Andric /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
14030b57cec5SDimitry Andric /// be updated to match.
getCharAndSizeSlow(const char * Ptr,Token * Tok)14045f757f3fSDimitry Andric Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
14055f757f3fSDimitry Andric unsigned Size = 0;
14060b57cec5SDimitry Andric // If we have a slash, look for an escaped newline.
14070b57cec5SDimitry Andric if (Ptr[0] == '\\') {
14080b57cec5SDimitry Andric ++Size;
14090b57cec5SDimitry Andric ++Ptr;
14100b57cec5SDimitry Andric Slash:
14110b57cec5SDimitry Andric // Common case, backslash-char where the char is not whitespace.
14125f757f3fSDimitry Andric if (!isWhitespace(Ptr[0]))
14135f757f3fSDimitry Andric return {'\\', Size};
14140b57cec5SDimitry Andric
14150b57cec5SDimitry Andric // See if we have optional whitespace characters between the slash and
14160b57cec5SDimitry Andric // newline.
14170b57cec5SDimitry Andric if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
14180b57cec5SDimitry Andric // Remember that this token needs to be cleaned.
14190b57cec5SDimitry Andric if (Tok) Tok->setFlag(Token::NeedsCleaning);
14200b57cec5SDimitry Andric
14210b57cec5SDimitry Andric // Warn if there was whitespace between the backslash and newline.
14220b57cec5SDimitry Andric if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
14230b57cec5SDimitry Andric Diag(Ptr, diag::backslash_newline_space);
14240b57cec5SDimitry Andric
14250b57cec5SDimitry Andric // Found backslash<whitespace><newline>. Parse the char after it.
14260b57cec5SDimitry Andric Size += EscapedNewLineSize;
14270b57cec5SDimitry Andric Ptr += EscapedNewLineSize;
14280b57cec5SDimitry Andric
14290b57cec5SDimitry Andric // Use slow version to accumulate a correct size field.
14305f757f3fSDimitry Andric auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
14315f757f3fSDimitry Andric CharAndSize.Size += Size;
14325f757f3fSDimitry Andric return CharAndSize;
14330b57cec5SDimitry Andric }
14340b57cec5SDimitry Andric
14350b57cec5SDimitry Andric // Otherwise, this is not an escaped newline, just return the slash.
14365f757f3fSDimitry Andric return {'\\', Size};
14370b57cec5SDimitry Andric }
14380b57cec5SDimitry Andric
14390b57cec5SDimitry Andric // If this is a trigraph, process it.
14400b57cec5SDimitry Andric if (Ptr[0] == '?' && Ptr[1] == '?') {
14410b57cec5SDimitry Andric // If this is actually a legal trigraph (not something like "??x"), emit
14420b57cec5SDimitry Andric // a trigraph warning. If so, and if trigraphs are enabled, return it.
144381ad6265SDimitry Andric if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
144481ad6265SDimitry Andric LangOpts.Trigraphs)) {
14450b57cec5SDimitry Andric // Remember that this token needs to be cleaned.
14460b57cec5SDimitry Andric if (Tok) Tok->setFlag(Token::NeedsCleaning);
14470b57cec5SDimitry Andric
14480b57cec5SDimitry Andric Ptr += 3;
14490b57cec5SDimitry Andric Size += 3;
14500b57cec5SDimitry Andric if (C == '\\') goto Slash;
14515f757f3fSDimitry Andric return {C, Size};
14520b57cec5SDimitry Andric }
14530b57cec5SDimitry Andric }
14540b57cec5SDimitry Andric
14550b57cec5SDimitry Andric // If this is neither, return a single character.
14565f757f3fSDimitry Andric return {*Ptr, Size + 1u};
14570b57cec5SDimitry Andric }
14580b57cec5SDimitry Andric
14590b57cec5SDimitry Andric /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
14600b57cec5SDimitry Andric /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size,
14610b57cec5SDimitry Andric /// and that we have already incremented Ptr by Size bytes.
14620b57cec5SDimitry Andric ///
14630b57cec5SDimitry Andric /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
14640b57cec5SDimitry Andric /// be updated to match.
getCharAndSizeSlowNoWarn(const char * Ptr,const LangOptions & LangOpts)14655f757f3fSDimitry Andric Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
14660b57cec5SDimitry Andric const LangOptions &LangOpts) {
14675f757f3fSDimitry Andric
14685f757f3fSDimitry Andric unsigned Size = 0;
14690b57cec5SDimitry Andric // If we have a slash, look for an escaped newline.
14700b57cec5SDimitry Andric if (Ptr[0] == '\\') {
14710b57cec5SDimitry Andric ++Size;
14720b57cec5SDimitry Andric ++Ptr;
14730b57cec5SDimitry Andric Slash:
14740b57cec5SDimitry Andric // Common case, backslash-char where the char is not whitespace.
14755f757f3fSDimitry Andric if (!isWhitespace(Ptr[0]))
14765f757f3fSDimitry Andric return {'\\', Size};
14770b57cec5SDimitry Andric
14780b57cec5SDimitry Andric // See if we have optional whitespace characters followed by a newline.
14790b57cec5SDimitry Andric if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
14800b57cec5SDimitry Andric // Found backslash<whitespace><newline>. Parse the char after it.
14810b57cec5SDimitry Andric Size += EscapedNewLineSize;
14820b57cec5SDimitry Andric Ptr += EscapedNewLineSize;
14830b57cec5SDimitry Andric
14840b57cec5SDimitry Andric // Use slow version to accumulate a correct size field.
14855f757f3fSDimitry Andric auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
14865f757f3fSDimitry Andric CharAndSize.Size += Size;
14875f757f3fSDimitry Andric return CharAndSize;
14880b57cec5SDimitry Andric }
14890b57cec5SDimitry Andric
14900b57cec5SDimitry Andric // Otherwise, this is not an escaped newline, just return the slash.
14915f757f3fSDimitry Andric return {'\\', Size};
14920b57cec5SDimitry Andric }
14930b57cec5SDimitry Andric
14940b57cec5SDimitry Andric // If this is a trigraph, process it.
14950b57cec5SDimitry Andric if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
14960b57cec5SDimitry Andric // If this is actually a legal trigraph (not something like "??x"), return
14970b57cec5SDimitry Andric // it.
14980b57cec5SDimitry Andric if (char C = GetTrigraphCharForLetter(Ptr[2])) {
14990b57cec5SDimitry Andric Ptr += 3;
15000b57cec5SDimitry Andric Size += 3;
15010b57cec5SDimitry Andric if (C == '\\') goto Slash;
15025f757f3fSDimitry Andric return {C, Size};
15030b57cec5SDimitry Andric }
15040b57cec5SDimitry Andric }
15050b57cec5SDimitry Andric
15060b57cec5SDimitry Andric // If this is neither, return a single character.
15075f757f3fSDimitry Andric return {*Ptr, Size + 1u};
15080b57cec5SDimitry Andric }
15090b57cec5SDimitry Andric
15100b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
15110b57cec5SDimitry Andric // Helper methods for lexing.
15120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
15130b57cec5SDimitry Andric
15140b57cec5SDimitry Andric /// Routine that indiscriminately sets the offset into the source file.
SetByteOffset(unsigned Offset,bool StartOfLine)15150b57cec5SDimitry Andric void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
15160b57cec5SDimitry Andric BufferPtr = BufferStart + Offset;
15170b57cec5SDimitry Andric if (BufferPtr > BufferEnd)
15180b57cec5SDimitry Andric BufferPtr = BufferEnd;
15190b57cec5SDimitry Andric // FIXME: What exactly does the StartOfLine bit mean? There are two
15200b57cec5SDimitry Andric // possible meanings for the "start" of the line: the first token on the
15210b57cec5SDimitry Andric // unexpanded line, or the first token on the expanded line.
15220b57cec5SDimitry Andric IsAtStartOfLine = StartOfLine;
15230b57cec5SDimitry Andric IsAtPhysicalStartOfLine = StartOfLine;
15240b57cec5SDimitry Andric }
15250b57cec5SDimitry Andric
isUnicodeWhitespace(uint32_t Codepoint)1526349cc55cSDimitry Andric static bool isUnicodeWhitespace(uint32_t Codepoint) {
1527349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1528349cc55cSDimitry Andric UnicodeWhitespaceCharRanges);
1529349cc55cSDimitry Andric return UnicodeWhitespaceChars.contains(Codepoint);
1530349cc55cSDimitry Andric }
1531349cc55cSDimitry Andric
codepointAsHexString(uint32_t C)1532bdd1243dSDimitry Andric static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
1533bdd1243dSDimitry Andric llvm::SmallString<5> CharBuf;
1534bdd1243dSDimitry Andric llvm::raw_svector_ostream CharOS(CharBuf);
1535bdd1243dSDimitry Andric llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1536bdd1243dSDimitry Andric return CharBuf;
1537bdd1243dSDimitry Andric }
1538bdd1243dSDimitry Andric
1539bdd1243dSDimitry Andric // To mitigate https://github.com/llvm/llvm-project/issues/54732,
1540bdd1243dSDimitry Andric // we allow "Mathematical Notation Characters" in identifiers.
1541bdd1243dSDimitry Andric // This is a proposed profile that extends the XID_Start/XID_continue
1542bdd1243dSDimitry Andric // with mathematical symbols, superscipts and subscripts digits
1543bdd1243dSDimitry Andric // found in some production software.
1544bdd1243dSDimitry Andric // https://www.unicode.org/L2/L2022/22230-math-profile.pdf
isMathematicalExtensionID(uint32_t C,const LangOptions & LangOpts,bool IsStart,bool & IsExtension)1545bdd1243dSDimitry Andric static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1546bdd1243dSDimitry Andric bool IsStart, bool &IsExtension) {
1547bdd1243dSDimitry Andric static const llvm::sys::UnicodeCharSet MathStartChars(
1548bdd1243dSDimitry Andric MathematicalNotationProfileIDStartRanges);
1549bdd1243dSDimitry Andric static const llvm::sys::UnicodeCharSet MathContinueChars(
1550bdd1243dSDimitry Andric MathematicalNotationProfileIDContinueRanges);
1551bdd1243dSDimitry Andric if (MathStartChars.contains(C) ||
1552bdd1243dSDimitry Andric (!IsStart && MathContinueChars.contains(C))) {
1553bdd1243dSDimitry Andric IsExtension = true;
1554bdd1243dSDimitry Andric return true;
1555bdd1243dSDimitry Andric }
1556bdd1243dSDimitry Andric return false;
1557bdd1243dSDimitry Andric }
1558bdd1243dSDimitry Andric
isAllowedIDChar(uint32_t C,const LangOptions & LangOpts,bool & IsExtension)1559bdd1243dSDimitry Andric static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1560bdd1243dSDimitry Andric bool &IsExtension) {
15610b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) {
15620b57cec5SDimitry Andric return false;
1563480093f4SDimitry Andric } else if (LangOpts.DollarIdents && '$' == C) {
1564480093f4SDimitry Andric return true;
15655f757f3fSDimitry Andric } else if (LangOpts.CPlusPlus || LangOpts.C23) {
1566349cc55cSDimitry Andric // A non-leading codepoint must have the XID_Continue property.
1567349cc55cSDimitry Andric // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
1568349cc55cSDimitry Andric // so we need to check both tables.
1569fcaf7f86SDimitry Andric // '_' doesn't have the XID_Continue property but is allowed in C and C++.
1570349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1571349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1572bdd1243dSDimitry Andric if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
1573bdd1243dSDimitry Andric return true;
1574bdd1243dSDimitry Andric return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
1575bdd1243dSDimitry Andric IsExtension);
1576349cc55cSDimitry Andric } else if (LangOpts.C11) {
15770b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
15780b57cec5SDimitry Andric C11AllowedIDCharRanges);
15790b57cec5SDimitry Andric return C11AllowedIDChars.contains(C);
15800b57cec5SDimitry Andric } else {
15810b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
15820b57cec5SDimitry Andric C99AllowedIDCharRanges);
15830b57cec5SDimitry Andric return C99AllowedIDChars.contains(C);
15840b57cec5SDimitry Andric }
15850b57cec5SDimitry Andric }
15860b57cec5SDimitry Andric
isAllowedInitiallyIDChar(uint32_t C,const LangOptions & LangOpts,bool & IsExtension)1587bdd1243dSDimitry Andric static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1588bdd1243dSDimitry Andric bool &IsExtension) {
1589bdd1243dSDimitry Andric assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1590bdd1243dSDimitry Andric IsExtension = false;
15910b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor) {
15920b57cec5SDimitry Andric return false;
1593349cc55cSDimitry Andric }
15945f757f3fSDimitry Andric if (LangOpts.CPlusPlus || LangOpts.C23) {
1595349cc55cSDimitry Andric static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1596bdd1243dSDimitry Andric if (XIDStartChars.contains(C))
1597bdd1243dSDimitry Andric return true;
1598bdd1243dSDimitry Andric return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
1599bdd1243dSDimitry Andric IsExtension);
1600349cc55cSDimitry Andric }
1601bdd1243dSDimitry Andric if (!isAllowedIDChar(C, LangOpts, IsExtension))
1602349cc55cSDimitry Andric return false;
1603349cc55cSDimitry Andric if (LangOpts.C11) {
16040b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
16050b57cec5SDimitry Andric C11DisallowedInitialIDCharRanges);
16060b57cec5SDimitry Andric return !C11DisallowedInitialIDChars.contains(C);
1607349cc55cSDimitry Andric }
16080b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
16090b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges);
16100b57cec5SDimitry Andric return !C99DisallowedInitialIDChars.contains(C);
16110b57cec5SDimitry Andric }
16120b57cec5SDimitry Andric
diagnoseExtensionInIdentifier(DiagnosticsEngine & Diags,uint32_t C,CharSourceRange Range)1613bdd1243dSDimitry Andric static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
1614bdd1243dSDimitry Andric CharSourceRange Range) {
1615bdd1243dSDimitry Andric
1616bdd1243dSDimitry Andric static const llvm::sys::UnicodeCharSet MathStartChars(
1617bdd1243dSDimitry Andric MathematicalNotationProfileIDStartRanges);
1618bdd1243dSDimitry Andric static const llvm::sys::UnicodeCharSet MathContinueChars(
1619bdd1243dSDimitry Andric MathematicalNotationProfileIDContinueRanges);
1620bdd1243dSDimitry Andric
1621bdd1243dSDimitry Andric (void)MathStartChars;
1622bdd1243dSDimitry Andric (void)MathContinueChars;
1623bdd1243dSDimitry Andric assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
1624bdd1243dSDimitry Andric "Unexpected mathematical notation codepoint");
1625bdd1243dSDimitry Andric Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1626bdd1243dSDimitry Andric << codepointAsHexString(C) << Range;
1627bdd1243dSDimitry Andric }
1628bdd1243dSDimitry Andric
makeCharRange(Lexer & L,const char * Begin,const char * End)16290b57cec5SDimitry Andric static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
16300b57cec5SDimitry Andric const char *End) {
16310b57cec5SDimitry Andric return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
16320b57cec5SDimitry Andric L.getSourceLocation(End));
16330b57cec5SDimitry Andric }
16340b57cec5SDimitry Andric
maybeDiagnoseIDCharCompat(DiagnosticsEngine & Diags,uint32_t C,CharSourceRange Range,bool IsFirst)16350b57cec5SDimitry Andric static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
16360b57cec5SDimitry Andric CharSourceRange Range, bool IsFirst) {
16370b57cec5SDimitry Andric // Check C99 compatibility.
16380b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
16390b57cec5SDimitry Andric enum {
16400b57cec5SDimitry Andric CannotAppearInIdentifier = 0,
16410b57cec5SDimitry Andric CannotStartIdentifier
16420b57cec5SDimitry Andric };
16430b57cec5SDimitry Andric
16440b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
16450b57cec5SDimitry Andric C99AllowedIDCharRanges);
16460b57cec5SDimitry Andric static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
16470b57cec5SDimitry Andric C99DisallowedInitialIDCharRanges);
16480b57cec5SDimitry Andric if (!C99AllowedIDChars.contains(C)) {
16490b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
16500b57cec5SDimitry Andric << Range
16510b57cec5SDimitry Andric << CannotAppearInIdentifier;
16520b57cec5SDimitry Andric } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
16530b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
16540b57cec5SDimitry Andric << Range
16550b57cec5SDimitry Andric << CannotStartIdentifier;
16560b57cec5SDimitry Andric }
16570b57cec5SDimitry Andric }
16580b57cec5SDimitry Andric }
16590b57cec5SDimitry Andric
16600b57cec5SDimitry Andric /// After encountering UTF-8 character C and interpreting it as an identifier
16610b57cec5SDimitry Andric /// character, check whether it's a homoglyph for a common non-identifier
16620b57cec5SDimitry Andric /// source character that is unlikely to be an intentional identifier
16630b57cec5SDimitry Andric /// character and warn if so.
maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine & Diags,uint32_t C,CharSourceRange Range)16640b57cec5SDimitry Andric static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
16650b57cec5SDimitry Andric CharSourceRange Range) {
16660b57cec5SDimitry Andric // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
16670b57cec5SDimitry Andric struct HomoglyphPair {
16680b57cec5SDimitry Andric uint32_t Character;
16690b57cec5SDimitry Andric char LooksLike;
16700b57cec5SDimitry Andric bool operator<(HomoglyphPair R) const { return Character < R.Character; }
16710b57cec5SDimitry Andric };
16720b57cec5SDimitry Andric static constexpr HomoglyphPair SortedHomoglyphs[] = {
16730b57cec5SDimitry Andric {U'\u00ad', 0}, // SOFT HYPHEN
16740b57cec5SDimitry Andric {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
16750b57cec5SDimitry Andric {U'\u037e', ';'}, // GREEK QUESTION MARK
16760b57cec5SDimitry Andric {U'\u200b', 0}, // ZERO WIDTH SPACE
16770b57cec5SDimitry Andric {U'\u200c', 0}, // ZERO WIDTH NON-JOINER
16780b57cec5SDimitry Andric {U'\u200d', 0}, // ZERO WIDTH JOINER
16790b57cec5SDimitry Andric {U'\u2060', 0}, // WORD JOINER
16800b57cec5SDimitry Andric {U'\u2061', 0}, // FUNCTION APPLICATION
16810b57cec5SDimitry Andric {U'\u2062', 0}, // INVISIBLE TIMES
16820b57cec5SDimitry Andric {U'\u2063', 0}, // INVISIBLE SEPARATOR
16830b57cec5SDimitry Andric {U'\u2064', 0}, // INVISIBLE PLUS
16840b57cec5SDimitry Andric {U'\u2212', '-'}, // MINUS SIGN
16850b57cec5SDimitry Andric {U'\u2215', '/'}, // DIVISION SLASH
16860b57cec5SDimitry Andric {U'\u2216', '\\'}, // SET MINUS
16870b57cec5SDimitry Andric {U'\u2217', '*'}, // ASTERISK OPERATOR
16880b57cec5SDimitry Andric {U'\u2223', '|'}, // DIVIDES
16890b57cec5SDimitry Andric {U'\u2227', '^'}, // LOGICAL AND
16900b57cec5SDimitry Andric {U'\u2236', ':'}, // RATIO
16910b57cec5SDimitry Andric {U'\u223c', '~'}, // TILDE OPERATOR
16920b57cec5SDimitry Andric {U'\ua789', ':'}, // MODIFIER LETTER COLON
16930b57cec5SDimitry Andric {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE
16940b57cec5SDimitry Andric {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
16950b57cec5SDimitry Andric {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
16960b57cec5SDimitry Andric {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
16970b57cec5SDimitry Andric {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
16980b57cec5SDimitry Andric {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
16990b57cec5SDimitry Andric {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
17000b57cec5SDimitry Andric {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
17010b57cec5SDimitry Andric {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
17020b57cec5SDimitry Andric {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
17030b57cec5SDimitry Andric {U'\uff0c', ','}, // FULLWIDTH COMMA
17040b57cec5SDimitry Andric {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
17050b57cec5SDimitry Andric {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
17060b57cec5SDimitry Andric {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
17070b57cec5SDimitry Andric {U'\uff1a', ':'}, // FULLWIDTH COLON
17080b57cec5SDimitry Andric {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
17090b57cec5SDimitry Andric {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
17100b57cec5SDimitry Andric {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
17110b57cec5SDimitry Andric {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
17120b57cec5SDimitry Andric {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
17130b57cec5SDimitry Andric {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
17140b57cec5SDimitry Andric {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
17150b57cec5SDimitry Andric {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
17160b57cec5SDimitry Andric {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
17170b57cec5SDimitry Andric {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
17180b57cec5SDimitry Andric {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
17190b57cec5SDimitry Andric {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
17200b57cec5SDimitry Andric {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
17210b57cec5SDimitry Andric {U'\uff5e', '~'}, // FULLWIDTH TILDE
17220b57cec5SDimitry Andric {0, 0}
17230b57cec5SDimitry Andric };
17240b57cec5SDimitry Andric auto Homoglyph =
17250b57cec5SDimitry Andric std::lower_bound(std::begin(SortedHomoglyphs),
17260b57cec5SDimitry Andric std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
17270b57cec5SDimitry Andric if (Homoglyph->Character == C) {
17280b57cec5SDimitry Andric if (Homoglyph->LooksLike) {
17290b57cec5SDimitry Andric const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
17300b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1731bdd1243dSDimitry Andric << Range << codepointAsHexString(C) << LooksLikeStr;
17320b57cec5SDimitry Andric } else {
17330b57cec5SDimitry Andric Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1734bdd1243dSDimitry Andric << Range << codepointAsHexString(C);
17350b57cec5SDimitry Andric }
17360b57cec5SDimitry Andric }
17370b57cec5SDimitry Andric }
17380b57cec5SDimitry Andric
diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine & Diags,const LangOptions & LangOpts,uint32_t CodePoint,CharSourceRange Range,bool IsFirst)1739349cc55cSDimitry Andric static void diagnoseInvalidUnicodeCodepointInIdentifier(
1740349cc55cSDimitry Andric DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1741349cc55cSDimitry Andric CharSourceRange Range, bool IsFirst) {
1742349cc55cSDimitry Andric if (isASCII(CodePoint))
1743349cc55cSDimitry Andric return;
1744349cc55cSDimitry Andric
1745bdd1243dSDimitry Andric bool IsExtension;
1746bdd1243dSDimitry Andric bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
1747bdd1243dSDimitry Andric bool IsIDContinue =
1748bdd1243dSDimitry Andric IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
1749349cc55cSDimitry Andric
1750349cc55cSDimitry Andric if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1751349cc55cSDimitry Andric return;
1752349cc55cSDimitry Andric
1753349cc55cSDimitry Andric bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1754349cc55cSDimitry Andric
1755349cc55cSDimitry Andric if (!IsFirst || InvalidOnlyAtStart) {
1756349cc55cSDimitry Andric Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1757bdd1243dSDimitry Andric << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
1758349cc55cSDimitry Andric << FixItHint::CreateRemoval(Range);
1759349cc55cSDimitry Andric } else {
1760349cc55cSDimitry Andric Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1761bdd1243dSDimitry Andric << Range << codepointAsHexString(CodePoint)
1762bdd1243dSDimitry Andric << FixItHint::CreateRemoval(Range);
1763349cc55cSDimitry Andric }
1764349cc55cSDimitry Andric }
1765349cc55cSDimitry Andric
tryConsumeIdentifierUCN(const char * & CurPtr,unsigned Size,Token & Result)17660b57cec5SDimitry Andric bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
17670b57cec5SDimitry Andric Token &Result) {
17680b57cec5SDimitry Andric const char *UCNPtr = CurPtr + Size;
17690b57cec5SDimitry Andric uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1770349cc55cSDimitry Andric if (CodePoint == 0) {
17710b57cec5SDimitry Andric return false;
1772349cc55cSDimitry Andric }
1773bdd1243dSDimitry Andric bool IsExtension = false;
1774bdd1243dSDimitry Andric if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
1775349cc55cSDimitry Andric if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1776349cc55cSDimitry Andric return false;
1777349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1778349cc55cSDimitry Andric !PP->isPreprocessedOutput())
1779349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier(
1780349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, CodePoint,
1781349cc55cSDimitry Andric makeCharRange(*this, CurPtr, UCNPtr),
1782349cc55cSDimitry Andric /*IsFirst=*/false);
1783349cc55cSDimitry Andric
1784349cc55cSDimitry Andric // We got a unicode codepoint that is neither a space nor a
1785349cc55cSDimitry Andric // a valid identifier part.
1786349cc55cSDimitry Andric // Carry on as if the codepoint was valid for recovery purposes.
1787bdd1243dSDimitry Andric } else if (!isLexingRawMode()) {
1788bdd1243dSDimitry Andric if (IsExtension)
1789bdd1243dSDimitry Andric diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
1790bdd1243dSDimitry Andric makeCharRange(*this, CurPtr, UCNPtr));
1791bdd1243dSDimitry Andric
17920b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
17930b57cec5SDimitry Andric makeCharRange(*this, CurPtr, UCNPtr),
17940b57cec5SDimitry Andric /*IsFirst=*/false);
1795bdd1243dSDimitry Andric }
17960b57cec5SDimitry Andric
17970b57cec5SDimitry Andric Result.setFlag(Token::HasUCN);
17980b57cec5SDimitry Andric if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
17990b57cec5SDimitry Andric (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
18000b57cec5SDimitry Andric CurPtr = UCNPtr;
18010b57cec5SDimitry Andric else
18020b57cec5SDimitry Andric while (CurPtr != UCNPtr)
18030b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result);
18040b57cec5SDimitry Andric return true;
18050b57cec5SDimitry Andric }
18060b57cec5SDimitry Andric
tryConsumeIdentifierUTF8Char(const char * & CurPtr,Token & Result)18075f757f3fSDimitry Andric bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
18080b57cec5SDimitry Andric llvm::UTF32 CodePoint;
18095f757f3fSDimitry Andric
18105f757f3fSDimitry Andric // If a UTF-8 codepoint appears immediately after an escaped new line,
18115f757f3fSDimitry Andric // CurPtr may point to the splicing \ on the preceding line,
18125f757f3fSDimitry Andric // so we need to skip it.
18135f757f3fSDimitry Andric unsigned FirstCodeUnitSize;
18145f757f3fSDimitry Andric getCharAndSize(CurPtr, FirstCodeUnitSize);
18155f757f3fSDimitry Andric const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
18165f757f3fSDimitry Andric const char *UnicodePtr = CharStart;
18175f757f3fSDimitry Andric
18185f757f3fSDimitry Andric llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
18195f757f3fSDimitry Andric (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
18205f757f3fSDimitry Andric &CodePoint, llvm::strictConversion);
18215f757f3fSDimitry Andric if (ConvResult != llvm::conversionOK)
18220b57cec5SDimitry Andric return false;
18230b57cec5SDimitry Andric
1824bdd1243dSDimitry Andric bool IsExtension = false;
1825bdd1243dSDimitry Andric if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
1826bdd1243dSDimitry Andric IsExtension)) {
1827349cc55cSDimitry Andric if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1828349cc55cSDimitry Andric return false;
1829349cc55cSDimitry Andric
1830349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1831349cc55cSDimitry Andric !PP->isPreprocessedOutput())
1832349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier(
1833349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, CodePoint,
18345f757f3fSDimitry Andric makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
1835349cc55cSDimitry Andric // We got a unicode codepoint that is neither a space nor a
1836349cc55cSDimitry Andric // a valid identifier part. Carry on as if the codepoint was
1837349cc55cSDimitry Andric // valid for recovery purposes.
1838349cc55cSDimitry Andric } else if (!isLexingRawMode()) {
1839bdd1243dSDimitry Andric if (IsExtension)
18405f757f3fSDimitry Andric diagnoseExtensionInIdentifier(
18415f757f3fSDimitry Andric PP->getDiagnostics(), CodePoint,
18425f757f3fSDimitry Andric makeCharRange(*this, CharStart, UnicodePtr));
18430b57cec5SDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
18445f757f3fSDimitry Andric makeCharRange(*this, CharStart, UnicodePtr),
18450b57cec5SDimitry Andric /*IsFirst=*/false);
18460b57cec5SDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
18475f757f3fSDimitry Andric makeCharRange(*this, CharStart, UnicodePtr));
18480b57cec5SDimitry Andric }
18490b57cec5SDimitry Andric
18505f757f3fSDimitry Andric // Once we sucessfully parsed some UTF-8,
18515f757f3fSDimitry Andric // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
18525f757f3fSDimitry Andric // being lexed, and that warnings about trailing spaces are emitted.
18535f757f3fSDimitry Andric ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
18540b57cec5SDimitry Andric CurPtr = UnicodePtr;
18550b57cec5SDimitry Andric return true;
18560b57cec5SDimitry Andric }
18570b57cec5SDimitry Andric
LexUnicodeIdentifierStart(Token & Result,uint32_t C,const char * CurPtr)1858349cc55cSDimitry Andric bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1859349cc55cSDimitry Andric const char *CurPtr) {
1860bdd1243dSDimitry Andric bool IsExtension = false;
1861bdd1243dSDimitry Andric if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
1862349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1863349cc55cSDimitry Andric !PP->isPreprocessedOutput()) {
1864bdd1243dSDimitry Andric if (IsExtension)
1865bdd1243dSDimitry Andric diagnoseExtensionInIdentifier(PP->getDiagnostics(), C,
1866bdd1243dSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr));
1867349cc55cSDimitry Andric maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
1868349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr),
1869349cc55cSDimitry Andric /*IsFirst=*/true);
1870349cc55cSDimitry Andric maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
1871349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr));
1872349cc55cSDimitry Andric }
1873349cc55cSDimitry Andric
1874349cc55cSDimitry Andric MIOpt.ReadToken();
1875349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr);
1876349cc55cSDimitry Andric }
1877349cc55cSDimitry Andric
1878349cc55cSDimitry Andric if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1879349cc55cSDimitry Andric !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
1880bdd1243dSDimitry Andric !isUnicodeWhitespace(C)) {
1881349cc55cSDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally.
1882349cc55cSDimitry Andric // Instead of letting the parser complain about the unknown token,
1883349cc55cSDimitry Andric // just drop the character.
1884349cc55cSDimitry Andric // Note that we can /only/ do this when the non-ASCII character is actually
1885349cc55cSDimitry Andric // spelled as Unicode, not written as a UCN. The standard requires that
1886349cc55cSDimitry Andric // we not throw away any possible preprocessor tokens, but there's a
1887349cc55cSDimitry Andric // loophole in the mapping of Unicode characters to basic character set
1888349cc55cSDimitry Andric // characters that allows us to map these particular characters to, say,
1889349cc55cSDimitry Andric // whitespace.
1890349cc55cSDimitry Andric diagnoseInvalidUnicodeCodepointInIdentifier(
1891349cc55cSDimitry Andric PP->getDiagnostics(), LangOpts, C,
1892349cc55cSDimitry Andric makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
1893349cc55cSDimitry Andric BufferPtr = CurPtr;
1894349cc55cSDimitry Andric return false;
1895349cc55cSDimitry Andric }
1896349cc55cSDimitry Andric
1897349cc55cSDimitry Andric // Otherwise, we have an explicit UCN or a character that's unlikely to show
1898349cc55cSDimitry Andric // up by accident.
1899349cc55cSDimitry Andric MIOpt.ReadToken();
1900349cc55cSDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown);
1901349cc55cSDimitry Andric return true;
1902349cc55cSDimitry Andric }
1903349cc55cSDimitry Andric
19045f757f3fSDimitry Andric static const char *
fastParseASCIIIdentifier(const char * CurPtr,const char * BufferEnd)19055f757f3fSDimitry Andric fastParseASCIIIdentifier(const char *CurPtr,
19065f757f3fSDimitry Andric [[maybe_unused]] const char *BufferEnd) {
19075f757f3fSDimitry Andric #ifdef __SSE4_2__
19085f757f3fSDimitry Andric alignas(16) static constexpr char AsciiIdentifierRange[16] = {
19095f757f3fSDimitry Andric '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
19105f757f3fSDimitry Andric };
19115f757f3fSDimitry Andric constexpr ssize_t BytesPerRegister = 16;
19125f757f3fSDimitry Andric
19135f757f3fSDimitry Andric __m128i AsciiIdentifierRangeV =
19145f757f3fSDimitry Andric _mm_load_si128((const __m128i *)AsciiIdentifierRange);
19155f757f3fSDimitry Andric
19165f757f3fSDimitry Andric while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
19175f757f3fSDimitry Andric __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));
19185f757f3fSDimitry Andric
19195f757f3fSDimitry Andric int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
19205f757f3fSDimitry Andric _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES |
19215f757f3fSDimitry Andric _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY);
19225f757f3fSDimitry Andric CurPtr += Consumed;
19235f757f3fSDimitry Andric if (Consumed == BytesPerRegister)
19245f757f3fSDimitry Andric continue;
19255f757f3fSDimitry Andric return CurPtr;
19265f757f3fSDimitry Andric }
19275f757f3fSDimitry Andric #endif
19285f757f3fSDimitry Andric
19295f757f3fSDimitry Andric unsigned char C = *CurPtr;
19305f757f3fSDimitry Andric while (isAsciiIdentifierContinue(C))
19315f757f3fSDimitry Andric C = *++CurPtr;
19325f757f3fSDimitry Andric return CurPtr;
19335f757f3fSDimitry Andric }
19345f757f3fSDimitry Andric
LexIdentifierContinue(Token & Result,const char * CurPtr)1935349cc55cSDimitry Andric bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1936349cc55cSDimitry Andric // Match [_A-Za-z0-9]*, we have already matched an identifier start.
19375f757f3fSDimitry Andric
1938349cc55cSDimitry Andric while (true) {
19395f757f3fSDimitry Andric
19405f757f3fSDimitry Andric CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);
1941349cc55cSDimitry Andric
19420b57cec5SDimitry Andric unsigned Size;
1943349cc55cSDimitry Andric // Slow path: handle trigraph, unicode codepoints, UCNs.
19445f757f3fSDimitry Andric unsigned char C = getCharAndSize(CurPtr, Size);
1945349cc55cSDimitry Andric if (isAsciiIdentifierContinue(C)) {
1946349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result);
1947349cc55cSDimitry Andric continue;
1948349cc55cSDimitry Andric }
1949349cc55cSDimitry Andric if (C == '$') {
1950349cc55cSDimitry Andric // If we hit a $ and they are not supported in identifiers, we are done.
1951349cc55cSDimitry Andric if (!LangOpts.DollarIdents)
1952349cc55cSDimitry Andric break;
1953349cc55cSDimitry Andric // Otherwise, emit a diagnostic and continue.
1954349cc55cSDimitry Andric if (!isLexingRawMode())
1955349cc55cSDimitry Andric Diag(CurPtr, diag::ext_dollar_in_identifier);
1956349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result);
1957349cc55cSDimitry Andric continue;
1958349cc55cSDimitry Andric }
1959349cc55cSDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1960349cc55cSDimitry Andric continue;
19615f757f3fSDimitry Andric if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
1962349cc55cSDimitry Andric continue;
1963349cc55cSDimitry Andric // Neither an expected Unicode codepoint nor a UCN.
1964349cc55cSDimitry Andric break;
1965349cc55cSDimitry Andric }
19660b57cec5SDimitry Andric
19670b57cec5SDimitry Andric const char *IdStart = BufferPtr;
19680b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
19690b57cec5SDimitry Andric Result.setRawIdentifierData(IdStart);
19700b57cec5SDimitry Andric
19710b57cec5SDimitry Andric // If we are in raw mode, return this identifier raw. There is no need to
19720b57cec5SDimitry Andric // look up identifier information or attempt to macro expand it.
19730b57cec5SDimitry Andric if (LexingRawMode)
19740b57cec5SDimitry Andric return true;
19750b57cec5SDimitry Andric
19760b57cec5SDimitry Andric // Fill in Result.IdentifierInfo and update the token kind,
19770b57cec5SDimitry Andric // looking up the identifier in the identifier table.
19785f757f3fSDimitry Andric const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
19790b57cec5SDimitry Andric // Note that we have to call PP->LookUpIdentifierInfo() even for code
19800b57cec5SDimitry Andric // completion, it writes IdentifierInfo into Result, and callers rely on it.
19810b57cec5SDimitry Andric
19820b57cec5SDimitry Andric // If the completion point is at the end of an identifier, we want to treat
19830b57cec5SDimitry Andric // the identifier as incomplete even if it resolves to a macro or a keyword.
19840b57cec5SDimitry Andric // This allows e.g. 'class^' to complete to 'classifier'.
19850b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr)) {
19860b57cec5SDimitry Andric // Return the code-completion token.
19870b57cec5SDimitry Andric Result.setKind(tok::code_completion);
19880b57cec5SDimitry Andric // Skip the code-completion char and all immediate identifier characters.
19890b57cec5SDimitry Andric // This ensures we get consistent behavior when completing at any point in
19900b57cec5SDimitry Andric // an identifier (i.e. at the start, in the middle, at the end). Note that
19910b57cec5SDimitry Andric // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
19920b57cec5SDimitry Andric // simpler.
19930b57cec5SDimitry Andric assert(*CurPtr == 0 && "Completion character must be 0");
19940b57cec5SDimitry Andric ++CurPtr;
19950b57cec5SDimitry Andric // Note that code completion token is not added as a separate character
19960b57cec5SDimitry Andric // when the completion point is at the end of the buffer. Therefore, we need
19970b57cec5SDimitry Andric // to check if the buffer has ended.
19980b57cec5SDimitry Andric if (CurPtr < BufferEnd) {
1999349cc55cSDimitry Andric while (isAsciiIdentifierContinue(*CurPtr))
20000b57cec5SDimitry Andric ++CurPtr;
20010b57cec5SDimitry Andric }
20020b57cec5SDimitry Andric BufferPtr = CurPtr;
20030b57cec5SDimitry Andric return true;
20040b57cec5SDimitry Andric }
20050b57cec5SDimitry Andric
20060b57cec5SDimitry Andric // Finally, now that we know we have an identifier, pass this off to the
20070b57cec5SDimitry Andric // preprocessor, which may macro expand it or something.
20080b57cec5SDimitry Andric if (II->isHandleIdentifierCase())
20090b57cec5SDimitry Andric return PP->HandleIdentifier(Result);
20100b57cec5SDimitry Andric
20110b57cec5SDimitry Andric return true;
20120b57cec5SDimitry Andric }
20130b57cec5SDimitry Andric
20140b57cec5SDimitry Andric /// isHexaLiteral - Return true if Start points to a hex constant.
20150b57cec5SDimitry Andric /// in microsoft mode (where this is supposed to be several different tokens).
isHexaLiteral(const char * Start,const LangOptions & LangOpts)20160b57cec5SDimitry Andric bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
20175f757f3fSDimitry Andric auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
20185f757f3fSDimitry Andric char C1 = CharAndSize1.Char;
20190b57cec5SDimitry Andric if (C1 != '0')
20200b57cec5SDimitry Andric return false;
20215f757f3fSDimitry Andric
20225f757f3fSDimitry Andric auto CharAndSize2 =
20235f757f3fSDimitry Andric Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
20245f757f3fSDimitry Andric char C2 = CharAndSize2.Char;
20250b57cec5SDimitry Andric return (C2 == 'x' || C2 == 'X');
20260b57cec5SDimitry Andric }
20270b57cec5SDimitry Andric
20280b57cec5SDimitry Andric /// LexNumericConstant - Lex the remainder of a integer or floating point
20290b57cec5SDimitry Andric /// constant. From[-1] is the first character lexed. Return the end of the
20300b57cec5SDimitry Andric /// constant.
LexNumericConstant(Token & Result,const char * CurPtr)20310b57cec5SDimitry Andric bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
20320b57cec5SDimitry Andric unsigned Size;
20330b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size);
20340b57cec5SDimitry Andric char PrevCh = 0;
20350b57cec5SDimitry Andric while (isPreprocessingNumberBody(C)) {
20360b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result);
20370b57cec5SDimitry Andric PrevCh = C;
20385f757f3fSDimitry Andric if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
20395f757f3fSDimitry Andric CurPtr -= Size;
20405f757f3fSDimitry Andric break;
20415f757f3fSDimitry Andric }
20420b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size);
20430b57cec5SDimitry Andric }
20440b57cec5SDimitry Andric
20450b57cec5SDimitry Andric // If we fell out, check for a sign, due to 1e+12. If we have one, continue.
20460b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
20470b57cec5SDimitry Andric // If we are in Microsoft mode, don't continue if the constant is hex.
20480b57cec5SDimitry Andric // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
20490b57cec5SDimitry Andric if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
20500b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
20510b57cec5SDimitry Andric }
20520b57cec5SDimitry Andric
20530b57cec5SDimitry Andric // If we have a hex FP constant, continue.
20540b57cec5SDimitry Andric if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
20550b57cec5SDimitry Andric // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
20560b57cec5SDimitry Andric // not-quite-conforming extension. Only do so if this looks like it's
20570b57cec5SDimitry Andric // actually meant to be a hexfloat, and not if it has a ud-suffix.
20580b57cec5SDimitry Andric bool IsHexFloat = true;
20590b57cec5SDimitry Andric if (!LangOpts.C99) {
20600b57cec5SDimitry Andric if (!isHexaLiteral(BufferPtr, LangOpts))
20610b57cec5SDimitry Andric IsHexFloat = false;
206281ad6265SDimitry Andric else if (!LangOpts.CPlusPlus17 &&
20630b57cec5SDimitry Andric std::find(BufferPtr, CurPtr, '_') != CurPtr)
20640b57cec5SDimitry Andric IsHexFloat = false;
20650b57cec5SDimitry Andric }
20660b57cec5SDimitry Andric if (IsHexFloat)
20670b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
20680b57cec5SDimitry Andric }
20690b57cec5SDimitry Andric
20700b57cec5SDimitry Andric // If we have a digit separator, continue.
20715f757f3fSDimitry Andric if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
20725f757f3fSDimitry Andric auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
2073349cc55cSDimitry Andric if (isAsciiIdentifierContinue(Next)) {
20740b57cec5SDimitry Andric if (!isLexingRawMode())
207581ad6265SDimitry Andric Diag(CurPtr, LangOpts.CPlusPlus
2076fe6060f1SDimitry Andric ? diag::warn_cxx11_compat_digit_separator
20775f757f3fSDimitry Andric : diag::warn_c23_compat_digit_separator);
20780b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result);
20790b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, NextSize, Result);
20800b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr);
20810b57cec5SDimitry Andric }
20820b57cec5SDimitry Andric }
20830b57cec5SDimitry Andric
20840b57cec5SDimitry Andric // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
20850b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
20860b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr);
20875f757f3fSDimitry Andric if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
20880b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr);
20890b57cec5SDimitry Andric
20900b57cec5SDimitry Andric // Update the location of token as well as BufferPtr.
20910b57cec5SDimitry Andric const char *TokStart = BufferPtr;
20920b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
20930b57cec5SDimitry Andric Result.setLiteralData(TokStart);
20940b57cec5SDimitry Andric return true;
20950b57cec5SDimitry Andric }
20960b57cec5SDimitry Andric
20970b57cec5SDimitry Andric /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
20980b57cec5SDimitry Andric /// in C++11, or warn on a ud-suffix in C++98.
LexUDSuffix(Token & Result,const char * CurPtr,bool IsStringLiteral)20990b57cec5SDimitry Andric const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
21000b57cec5SDimitry Andric bool IsStringLiteral) {
210181ad6265SDimitry Andric assert(LangOpts.CPlusPlus);
21020b57cec5SDimitry Andric
21030b57cec5SDimitry Andric // Maximally munch an identifier.
21040b57cec5SDimitry Andric unsigned Size;
21050b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, Size);
21060b57cec5SDimitry Andric bool Consumed = false;
21070b57cec5SDimitry Andric
2108349cc55cSDimitry Andric if (!isAsciiIdentifierStart(C)) {
21090b57cec5SDimitry Andric if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
21100b57cec5SDimitry Andric Consumed = true;
21115f757f3fSDimitry Andric else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
21120b57cec5SDimitry Andric Consumed = true;
21130b57cec5SDimitry Andric else
21140b57cec5SDimitry Andric return CurPtr;
21150b57cec5SDimitry Andric }
21160b57cec5SDimitry Andric
211781ad6265SDimitry Andric if (!LangOpts.CPlusPlus11) {
21180b57cec5SDimitry Andric if (!isLexingRawMode())
21190b57cec5SDimitry Andric Diag(CurPtr,
21200b57cec5SDimitry Andric C == '_' ? diag::warn_cxx11_compat_user_defined_literal
21210b57cec5SDimitry Andric : diag::warn_cxx11_compat_reserved_user_defined_literal)
21220b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
21230b57cec5SDimitry Andric return CurPtr;
21240b57cec5SDimitry Andric }
21250b57cec5SDimitry Andric
21260b57cec5SDimitry Andric // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
21270b57cec5SDimitry Andric // that does not start with an underscore is ill-formed. As a conforming
21280b57cec5SDimitry Andric // extension, we treat all such suffixes as if they had whitespace before
21290b57cec5SDimitry Andric // them. We assume a suffix beginning with a UCN or UTF-8 character is more
21300b57cec5SDimitry Andric // likely to be a ud-suffix than a macro, however, and accept that.
21310b57cec5SDimitry Andric if (!Consumed) {
21320b57cec5SDimitry Andric bool IsUDSuffix = false;
21330b57cec5SDimitry Andric if (C == '_')
21340b57cec5SDimitry Andric IsUDSuffix = true;
213581ad6265SDimitry Andric else if (IsStringLiteral && LangOpts.CPlusPlus14) {
21360b57cec5SDimitry Andric // In C++1y, we need to look ahead a few characters to see if this is a
21370b57cec5SDimitry Andric // valid suffix for a string literal or a numeric literal (this could be
21380b57cec5SDimitry Andric // the 'operator""if' defining a numeric literal operator).
21390b57cec5SDimitry Andric const unsigned MaxStandardSuffixLength = 3;
21400b57cec5SDimitry Andric char Buffer[MaxStandardSuffixLength] = { C };
21410b57cec5SDimitry Andric unsigned Consumed = Size;
21420b57cec5SDimitry Andric unsigned Chars = 1;
21430b57cec5SDimitry Andric while (true) {
21445f757f3fSDimitry Andric auto [Next, NextSize] =
21455f757f3fSDimitry Andric getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
2146349cc55cSDimitry Andric if (!isAsciiIdentifierContinue(Next)) {
21475ffd83dbSDimitry Andric // End of suffix. Check whether this is on the allowed list.
21480b57cec5SDimitry Andric const StringRef CompleteSuffix(Buffer, Chars);
214981ad6265SDimitry Andric IsUDSuffix =
215081ad6265SDimitry Andric StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
21510b57cec5SDimitry Andric break;
21520b57cec5SDimitry Andric }
21530b57cec5SDimitry Andric
21540b57cec5SDimitry Andric if (Chars == MaxStandardSuffixLength)
21550b57cec5SDimitry Andric // Too long: can't be a standard suffix.
21560b57cec5SDimitry Andric break;
21570b57cec5SDimitry Andric
21580b57cec5SDimitry Andric Buffer[Chars++] = Next;
21590b57cec5SDimitry Andric Consumed += NextSize;
21600b57cec5SDimitry Andric }
21610b57cec5SDimitry Andric }
21620b57cec5SDimitry Andric
21630b57cec5SDimitry Andric if (!IsUDSuffix) {
21640b57cec5SDimitry Andric if (!isLexingRawMode())
216581ad6265SDimitry Andric Diag(CurPtr, LangOpts.MSVCCompat
21660b57cec5SDimitry Andric ? diag::ext_ms_reserved_user_defined_literal
21670b57cec5SDimitry Andric : diag::ext_reserved_user_defined_literal)
21680b57cec5SDimitry Andric << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
21690b57cec5SDimitry Andric return CurPtr;
21700b57cec5SDimitry Andric }
21710b57cec5SDimitry Andric
21720b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result);
21730b57cec5SDimitry Andric }
21740b57cec5SDimitry Andric
21750b57cec5SDimitry Andric Result.setFlag(Token::HasUDSuffix);
21760b57cec5SDimitry Andric while (true) {
21770b57cec5SDimitry Andric C = getCharAndSize(CurPtr, Size);
2178349cc55cSDimitry Andric if (isAsciiIdentifierContinue(C)) {
2179349cc55cSDimitry Andric CurPtr = ConsumeChar(CurPtr, Size, Result);
2180349cc55cSDimitry Andric } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
21815f757f3fSDimitry Andric } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
2182349cc55cSDimitry Andric } else
2183349cc55cSDimitry Andric break;
21840b57cec5SDimitry Andric }
21850b57cec5SDimitry Andric
21860b57cec5SDimitry Andric return CurPtr;
21870b57cec5SDimitry Andric }
21880b57cec5SDimitry Andric
21890b57cec5SDimitry Andric /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
21900b57cec5SDimitry Andric /// either " or L" or u8" or u" or U".
LexStringLiteral(Token & Result,const char * CurPtr,tok::TokenKind Kind)21910b57cec5SDimitry Andric bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
21920b57cec5SDimitry Andric tok::TokenKind Kind) {
21930b57cec5SDimitry Andric const char *AfterQuote = CurPtr;
21940b57cec5SDimitry Andric // Does this string contain the \0 character?
21950b57cec5SDimitry Andric const char *NulCharacter = nullptr;
21960b57cec5SDimitry Andric
21970b57cec5SDimitry Andric if (!isLexingRawMode() &&
21980b57cec5SDimitry Andric (Kind == tok::utf8_string_literal ||
21990b57cec5SDimitry Andric Kind == tok::utf16_string_literal ||
22000b57cec5SDimitry Andric Kind == tok::utf32_string_literal))
220181ad6265SDimitry Andric Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
22020b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal);
22030b57cec5SDimitry Andric
22040b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result);
22050b57cec5SDimitry Andric while (C != '"') {
22060b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by
22070b57cec5SDimitry Andric // getAndAdvanceChar.
22080b57cec5SDimitry Andric if (C == '\\')
22090b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result);
22100b57cec5SDimitry Andric
22110b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline.
22120b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
22130b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
22140b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
22150b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown);
22160b57cec5SDimitry Andric return true;
22170b57cec5SDimitry Andric }
22180b57cec5SDimitry Andric
22190b57cec5SDimitry Andric if (C == 0) {
22200b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) {
22210b57cec5SDimitry Andric if (ParsingFilename)
22220b57cec5SDimitry Andric codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
22230b57cec5SDimitry Andric else
22240b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage();
22250b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
22260b57cec5SDimitry Andric cutOffLexing();
22270b57cec5SDimitry Andric return true;
22280b57cec5SDimitry Andric }
22290b57cec5SDimitry Andric
22300b57cec5SDimitry Andric NulCharacter = CurPtr-1;
22310b57cec5SDimitry Andric }
22320b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result);
22330b57cec5SDimitry Andric }
22340b57cec5SDimitry Andric
22350b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix.
223681ad6265SDimitry Andric if (LangOpts.CPlusPlus)
22370b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true);
22380b57cec5SDimitry Andric
22390b57cec5SDimitry Andric // If a nul character existed in the string, warn about it.
22400b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode())
22410b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1;
22420b57cec5SDimitry Andric
22430b57cec5SDimitry Andric // Update the location of the token as well as the BufferPtr instance var.
22440b57cec5SDimitry Andric const char *TokStart = BufferPtr;
22450b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind);
22460b57cec5SDimitry Andric Result.setLiteralData(TokStart);
22470b57cec5SDimitry Andric return true;
22480b57cec5SDimitry Andric }
22490b57cec5SDimitry Andric
22500b57cec5SDimitry Andric /// LexRawStringLiteral - Lex the remainder of a raw string literal, after
22510b57cec5SDimitry Andric /// having lexed R", LR", u8R", uR", or UR".
LexRawStringLiteral(Token & Result,const char * CurPtr,tok::TokenKind Kind)22520b57cec5SDimitry Andric bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
22530b57cec5SDimitry Andric tok::TokenKind Kind) {
22540b57cec5SDimitry Andric // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
22550b57cec5SDimitry Andric // Between the initial and final double quote characters of the raw string,
22560b57cec5SDimitry Andric // any transformations performed in phases 1 and 2 (trigraphs,
22570b57cec5SDimitry Andric // universal-character-names, and line splicing) are reverted.
22580b57cec5SDimitry Andric
22590b57cec5SDimitry Andric if (!isLexingRawMode())
22600b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
22610b57cec5SDimitry Andric
22620b57cec5SDimitry Andric unsigned PrefixLen = 0;
22630b57cec5SDimitry Andric
2264*0fca6ea1SDimitry Andric while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) {
2265*0fca6ea1SDimitry Andric if (!isLexingRawMode() &&
2266*0fca6ea1SDimitry Andric llvm::is_contained({'$', '@', '`'}, CurPtr[PrefixLen])) {
2267*0fca6ea1SDimitry Andric const char *Pos = &CurPtr[PrefixLen];
2268*0fca6ea1SDimitry Andric Diag(Pos, LangOpts.CPlusPlus26
2269*0fca6ea1SDimitry Andric ? diag::warn_cxx26_compat_raw_string_literal_character_set
2270*0fca6ea1SDimitry Andric : diag::ext_cxx26_raw_string_literal_character_set)
2271*0fca6ea1SDimitry Andric << StringRef(Pos, 1);
2272*0fca6ea1SDimitry Andric }
22730b57cec5SDimitry Andric ++PrefixLen;
2274*0fca6ea1SDimitry Andric }
22750b57cec5SDimitry Andric
22760b57cec5SDimitry Andric // If the last character was not a '(', then we didn't lex a valid delimiter.
22770b57cec5SDimitry Andric if (CurPtr[PrefixLen] != '(') {
22780b57cec5SDimitry Andric if (!isLexingRawMode()) {
22790b57cec5SDimitry Andric const char *PrefixEnd = &CurPtr[PrefixLen];
22800b57cec5SDimitry Andric if (PrefixLen == 16) {
22810b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_raw_delim_too_long);
2282*0fca6ea1SDimitry Andric } else if (*PrefixEnd == '\n') {
2283*0fca6ea1SDimitry Andric Diag(PrefixEnd, diag::err_invalid_newline_raw_delim);
22840b57cec5SDimitry Andric } else {
22850b57cec5SDimitry Andric Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
22860b57cec5SDimitry Andric << StringRef(PrefixEnd, 1);
22870b57cec5SDimitry Andric }
22880b57cec5SDimitry Andric }
22890b57cec5SDimitry Andric
22900b57cec5SDimitry Andric // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
22910b57cec5SDimitry Andric // it's possible the '"' was intended to be part of the raw string, but
22920b57cec5SDimitry Andric // there's not much we can do about that.
22930b57cec5SDimitry Andric while (true) {
22940b57cec5SDimitry Andric char C = *CurPtr++;
22950b57cec5SDimitry Andric
22960b57cec5SDimitry Andric if (C == '"')
22970b57cec5SDimitry Andric break;
22980b57cec5SDimitry Andric if (C == 0 && CurPtr-1 == BufferEnd) {
22990b57cec5SDimitry Andric --CurPtr;
23000b57cec5SDimitry Andric break;
23010b57cec5SDimitry Andric }
23020b57cec5SDimitry Andric }
23030b57cec5SDimitry Andric
23040b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown);
23050b57cec5SDimitry Andric return true;
23060b57cec5SDimitry Andric }
23070b57cec5SDimitry Andric
23080b57cec5SDimitry Andric // Save prefix and move CurPtr past it
23090b57cec5SDimitry Andric const char *Prefix = CurPtr;
23100b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '('
23110b57cec5SDimitry Andric
23120b57cec5SDimitry Andric while (true) {
23130b57cec5SDimitry Andric char C = *CurPtr++;
23140b57cec5SDimitry Andric
23150b57cec5SDimitry Andric if (C == ')') {
23160b57cec5SDimitry Andric // Check for prefix match and closing quote.
23170b57cec5SDimitry Andric if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
23180b57cec5SDimitry Andric CurPtr += PrefixLen + 1; // skip over prefix and '"'
23190b57cec5SDimitry Andric break;
23200b57cec5SDimitry Andric }
23210b57cec5SDimitry Andric } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
23220b57cec5SDimitry Andric if (!isLexingRawMode())
23230b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_raw_string)
23240b57cec5SDimitry Andric << StringRef(Prefix, PrefixLen);
23250b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown);
23260b57cec5SDimitry Andric return true;
23270b57cec5SDimitry Andric }
23280b57cec5SDimitry Andric }
23290b57cec5SDimitry Andric
23300b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix.
233181ad6265SDimitry Andric if (LangOpts.CPlusPlus)
23320b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, true);
23330b57cec5SDimitry Andric
23340b57cec5SDimitry Andric // Update the location of token as well as BufferPtr.
23350b57cec5SDimitry Andric const char *TokStart = BufferPtr;
23360b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind);
23370b57cec5SDimitry Andric Result.setLiteralData(TokStart);
23380b57cec5SDimitry Andric return true;
23390b57cec5SDimitry Andric }
23400b57cec5SDimitry Andric
23410b57cec5SDimitry Andric /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
23420b57cec5SDimitry Andric /// after having lexed the '<' character. This is used for #include filenames.
LexAngledStringLiteral(Token & Result,const char * CurPtr)23430b57cec5SDimitry Andric bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
23440b57cec5SDimitry Andric // Does this string contain the \0 character?
23450b57cec5SDimitry Andric const char *NulCharacter = nullptr;
23460b57cec5SDimitry Andric const char *AfterLessPos = CurPtr;
23470b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result);
23480b57cec5SDimitry Andric while (C != '>') {
23490b57cec5SDimitry Andric // Skip escaped characters. Escaped newlines will already be processed by
23500b57cec5SDimitry Andric // getAndAdvanceChar.
23510b57cec5SDimitry Andric if (C == '\\')
23520b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result);
23530b57cec5SDimitry Andric
2354fe6060f1SDimitry Andric if (isVerticalWhitespace(C) || // Newline.
23550b57cec5SDimitry Andric (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
23560b57cec5SDimitry Andric // If the filename is unterminated, then it must just be a lone <
23570b57cec5SDimitry Andric // character. Return this as such.
23580b57cec5SDimitry Andric FormTokenWithChars(Result, AfterLessPos, tok::less);
23590b57cec5SDimitry Andric return true;
23600b57cec5SDimitry Andric }
23610b57cec5SDimitry Andric
23620b57cec5SDimitry Andric if (C == 0) {
23630b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr - 1)) {
23640b57cec5SDimitry Andric codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
23650b57cec5SDimitry Andric cutOffLexing();
23660b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
23670b57cec5SDimitry Andric return true;
23680b57cec5SDimitry Andric }
23690b57cec5SDimitry Andric NulCharacter = CurPtr-1;
23700b57cec5SDimitry Andric }
23710b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result);
23720b57cec5SDimitry Andric }
23730b57cec5SDimitry Andric
23740b57cec5SDimitry Andric // If a nul character existed in the string, warn about it.
23750b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode())
23760b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 1;
23770b57cec5SDimitry Andric
23780b57cec5SDimitry Andric // Update the location of token as well as BufferPtr.
23790b57cec5SDimitry Andric const char *TokStart = BufferPtr;
23800b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::header_name);
23810b57cec5SDimitry Andric Result.setLiteralData(TokStart);
23820b57cec5SDimitry Andric return true;
23830b57cec5SDimitry Andric }
23840b57cec5SDimitry Andric
codeCompleteIncludedFile(const char * PathStart,const char * CompletionPoint,bool IsAngled)23850b57cec5SDimitry Andric void Lexer::codeCompleteIncludedFile(const char *PathStart,
23860b57cec5SDimitry Andric const char *CompletionPoint,
23870b57cec5SDimitry Andric bool IsAngled) {
23880b57cec5SDimitry Andric // Completion only applies to the filename, after the last slash.
23890b57cec5SDimitry Andric StringRef PartialPath(PathStart, CompletionPoint - PathStart);
23905ffd83dbSDimitry Andric llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
23915ffd83dbSDimitry Andric auto Slash = PartialPath.find_last_of(SlashChars);
23920b57cec5SDimitry Andric StringRef Dir =
23930b57cec5SDimitry Andric (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
23940b57cec5SDimitry Andric const char *StartOfFilename =
23950b57cec5SDimitry Andric (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
23960b57cec5SDimitry Andric // Code completion filter range is the filename only, up to completion point.
23970b57cec5SDimitry Andric PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
23980b57cec5SDimitry Andric StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
23995ffd83dbSDimitry Andric // We should replace the characters up to the closing quote or closest slash,
24005ffd83dbSDimitry Andric // if any.
24010b57cec5SDimitry Andric while (CompletionPoint < BufferEnd) {
24020b57cec5SDimitry Andric char Next = *(CompletionPoint + 1);
24030b57cec5SDimitry Andric if (Next == 0 || Next == '\r' || Next == '\n')
24040b57cec5SDimitry Andric break;
24050b57cec5SDimitry Andric ++CompletionPoint;
24060b57cec5SDimitry Andric if (Next == (IsAngled ? '>' : '"'))
24070b57cec5SDimitry Andric break;
240806c3fb27SDimitry Andric if (SlashChars.contains(Next))
24095ffd83dbSDimitry Andric break;
24100b57cec5SDimitry Andric }
24115ffd83dbSDimitry Andric
24120b57cec5SDimitry Andric PP->setCodeCompletionTokenRange(
24130b57cec5SDimitry Andric FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
24140b57cec5SDimitry Andric FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
24150b57cec5SDimitry Andric PP->CodeCompleteIncludedFile(Dir, IsAngled);
24160b57cec5SDimitry Andric }
24170b57cec5SDimitry Andric
24180b57cec5SDimitry Andric /// LexCharConstant - Lex the remainder of a character constant, after having
24190b57cec5SDimitry Andric /// lexed either ' or L' or u8' or u' or U'.
LexCharConstant(Token & Result,const char * CurPtr,tok::TokenKind Kind)24200b57cec5SDimitry Andric bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
24210b57cec5SDimitry Andric tok::TokenKind Kind) {
24220b57cec5SDimitry Andric // Does this character contain the \0 character?
24230b57cec5SDimitry Andric const char *NulCharacter = nullptr;
24240b57cec5SDimitry Andric
24250b57cec5SDimitry Andric if (!isLexingRawMode()) {
24260b57cec5SDimitry Andric if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
242781ad6265SDimitry Andric Diag(BufferPtr, LangOpts.CPlusPlus
24280b57cec5SDimitry Andric ? diag::warn_cxx98_compat_unicode_literal
24290b57cec5SDimitry Andric : diag::warn_c99_compat_unicode_literal);
24300b57cec5SDimitry Andric else if (Kind == tok::utf8_char_constant)
24310b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
24320b57cec5SDimitry Andric }
24330b57cec5SDimitry Andric
24340b57cec5SDimitry Andric char C = getAndAdvanceChar(CurPtr, Result);
24350b57cec5SDimitry Andric if (C == '\'') {
24360b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
24370b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_empty_character);
24380b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown);
24390b57cec5SDimitry Andric return true;
24400b57cec5SDimitry Andric }
24410b57cec5SDimitry Andric
24420b57cec5SDimitry Andric while (C != '\'') {
24430b57cec5SDimitry Andric // Skip escaped characters.
24440b57cec5SDimitry Andric if (C == '\\')
24450b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result);
24460b57cec5SDimitry Andric
24470b57cec5SDimitry Andric if (C == '\n' || C == '\r' || // Newline.
24480b57cec5SDimitry Andric (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
24490b57cec5SDimitry Andric if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
24500b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
24510b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown);
24520b57cec5SDimitry Andric return true;
24530b57cec5SDimitry Andric }
24540b57cec5SDimitry Andric
24550b57cec5SDimitry Andric if (C == 0) {
24560b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) {
24570b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage();
24580b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr-1, tok::unknown);
24590b57cec5SDimitry Andric cutOffLexing();
24600b57cec5SDimitry Andric return true;
24610b57cec5SDimitry Andric }
24620b57cec5SDimitry Andric
24630b57cec5SDimitry Andric NulCharacter = CurPtr-1;
24640b57cec5SDimitry Andric }
24650b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result);
24660b57cec5SDimitry Andric }
24670b57cec5SDimitry Andric
24680b57cec5SDimitry Andric // If we are in C++11, lex the optional ud-suffix.
246981ad6265SDimitry Andric if (LangOpts.CPlusPlus)
24700b57cec5SDimitry Andric CurPtr = LexUDSuffix(Result, CurPtr, false);
24710b57cec5SDimitry Andric
24720b57cec5SDimitry Andric // If a nul character existed in the character, warn about it.
24730b57cec5SDimitry Andric if (NulCharacter && !isLexingRawMode())
24740b57cec5SDimitry Andric Diag(NulCharacter, diag::null_in_char_or_string) << 0;
24750b57cec5SDimitry Andric
24760b57cec5SDimitry Andric // Update the location of token as well as BufferPtr.
24770b57cec5SDimitry Andric const char *TokStart = BufferPtr;
24780b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind);
24790b57cec5SDimitry Andric Result.setLiteralData(TokStart);
24800b57cec5SDimitry Andric return true;
24810b57cec5SDimitry Andric }
24820b57cec5SDimitry Andric
24830b57cec5SDimitry Andric /// SkipWhitespace - Efficiently skip over a series of whitespace characters.
24840b57cec5SDimitry Andric /// Update BufferPtr to point to the next non-whitespace character and return.
24850b57cec5SDimitry Andric ///
24860b57cec5SDimitry Andric /// This method forms a token and returns true if KeepWhitespaceMode is enabled.
SkipWhitespace(Token & Result,const char * CurPtr,bool & TokAtPhysicalStartOfLine)24870b57cec5SDimitry Andric bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
24880b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) {
24890b57cec5SDimitry Andric // Whitespace - Skip it, then return the token after the whitespace.
24900b57cec5SDimitry Andric bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
24910b57cec5SDimitry Andric
24920b57cec5SDimitry Andric unsigned char Char = *CurPtr;
24930b57cec5SDimitry Andric
2494e8d8bef9SDimitry Andric const char *lastNewLine = nullptr;
2495e8d8bef9SDimitry Andric auto setLastNewLine = [&](const char *Ptr) {
2496e8d8bef9SDimitry Andric lastNewLine = Ptr;
2497e8d8bef9SDimitry Andric if (!NewLinePtr)
2498e8d8bef9SDimitry Andric NewLinePtr = Ptr;
2499e8d8bef9SDimitry Andric };
2500e8d8bef9SDimitry Andric if (SawNewline)
2501e8d8bef9SDimitry Andric setLastNewLine(CurPtr - 1);
2502e8d8bef9SDimitry Andric
25030b57cec5SDimitry Andric // Skip consecutive spaces efficiently.
25040b57cec5SDimitry Andric while (true) {
25050b57cec5SDimitry Andric // Skip horizontal whitespace very aggressively.
25060b57cec5SDimitry Andric while (isHorizontalWhitespace(Char))
25070b57cec5SDimitry Andric Char = *++CurPtr;
25080b57cec5SDimitry Andric
25090b57cec5SDimitry Andric // Otherwise if we have something other than whitespace, we're done.
25100b57cec5SDimitry Andric if (!isVerticalWhitespace(Char))
25110b57cec5SDimitry Andric break;
25120b57cec5SDimitry Andric
25130b57cec5SDimitry Andric if (ParsingPreprocessorDirective) {
25140b57cec5SDimitry Andric // End of preprocessor directive line, let LexTokenInternal handle this.
25150b57cec5SDimitry Andric BufferPtr = CurPtr;
25160b57cec5SDimitry Andric return false;
25170b57cec5SDimitry Andric }
25180b57cec5SDimitry Andric
25190b57cec5SDimitry Andric // OK, but handle newline.
2520e8d8bef9SDimitry Andric if (*CurPtr == '\n')
2521e8d8bef9SDimitry Andric setLastNewLine(CurPtr);
25220b57cec5SDimitry Andric SawNewline = true;
25230b57cec5SDimitry Andric Char = *++CurPtr;
25240b57cec5SDimitry Andric }
25250b57cec5SDimitry Andric
25260b57cec5SDimitry Andric // If the client wants us to return whitespace, return it now.
25270b57cec5SDimitry Andric if (isKeepWhitespaceMode()) {
25280b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown);
25290b57cec5SDimitry Andric if (SawNewline) {
25300b57cec5SDimitry Andric IsAtStartOfLine = true;
25310b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true;
25320b57cec5SDimitry Andric }
25330b57cec5SDimitry Andric // FIXME: The next token will not have LeadingSpace set.
25340b57cec5SDimitry Andric return true;
25350b57cec5SDimitry Andric }
25360b57cec5SDimitry Andric
25370b57cec5SDimitry Andric // If this isn't immediately after a newline, there is leading space.
25380b57cec5SDimitry Andric char PrevChar = CurPtr[-1];
25390b57cec5SDimitry Andric bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
25400b57cec5SDimitry Andric
25410b57cec5SDimitry Andric Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
25420b57cec5SDimitry Andric if (SawNewline) {
25430b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine);
25440b57cec5SDimitry Andric TokAtPhysicalStartOfLine = true;
2545e8d8bef9SDimitry Andric
2546e8d8bef9SDimitry Andric if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2547e8d8bef9SDimitry Andric if (auto *Handler = PP->getEmptylineHandler())
2548e8d8bef9SDimitry Andric Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2549e8d8bef9SDimitry Andric getSourceLocation(lastNewLine)));
2550e8d8bef9SDimitry Andric }
25510b57cec5SDimitry Andric }
25520b57cec5SDimitry Andric
25530b57cec5SDimitry Andric BufferPtr = CurPtr;
25540b57cec5SDimitry Andric return false;
25550b57cec5SDimitry Andric }
25560b57cec5SDimitry Andric
25570b57cec5SDimitry Andric /// We have just read the // characters from input. Skip until we find the
25580b57cec5SDimitry Andric /// newline character that terminates the comment. Then update BufferPtr and
25590b57cec5SDimitry Andric /// return.
25600b57cec5SDimitry Andric ///
25610b57cec5SDimitry Andric /// If we're in KeepCommentMode or any CommentHandler has inserted
25620b57cec5SDimitry Andric /// some tokens, this will store the first token and return true.
SkipLineComment(Token & Result,const char * CurPtr,bool & TokAtPhysicalStartOfLine)25630b57cec5SDimitry Andric bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
25640b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) {
25650b57cec5SDimitry Andric // If Line comments aren't explicitly enabled for this language, emit an
25660b57cec5SDimitry Andric // extension warning.
256781ad6265SDimitry Andric if (!LineComment) {
25681fd87a68SDimitry Andric if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
25690b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_line_comment);
25700b57cec5SDimitry Andric
25710b57cec5SDimitry Andric // Mark them enabled so we only emit one warning for this translation
25720b57cec5SDimitry Andric // unit.
257381ad6265SDimitry Andric LineComment = true;
25740b57cec5SDimitry Andric }
25750b57cec5SDimitry Andric
25760b57cec5SDimitry Andric // Scan over the body of the comment. The common case, when scanning, is that
25770b57cec5SDimitry Andric // the comment contains normal ascii characters with nothing interesting in
25780b57cec5SDimitry Andric // them. As such, optimize for this case with the inner loop.
25790b57cec5SDimitry Andric //
25800b57cec5SDimitry Andric // This loop terminates with CurPtr pointing at the newline (or end of buffer)
25810b57cec5SDimitry Andric // character that ends the line comment.
2582753f127fSDimitry Andric
2583753f127fSDimitry Andric // C++23 [lex.phases] p1
2584753f127fSDimitry Andric // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2585753f127fSDimitry Andric // diagnostic only once per entire ill-formed subsequence to avoid
2586753f127fSDimitry Andric // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2587753f127fSDimitry Andric bool UnicodeDecodingAlreadyDiagnosed = false;
2588753f127fSDimitry Andric
25890b57cec5SDimitry Andric char C;
25900b57cec5SDimitry Andric while (true) {
25910b57cec5SDimitry Andric C = *CurPtr;
25920b57cec5SDimitry Andric // Skip over characters in the fast loop.
2593753f127fSDimitry Andric while (isASCII(C) && C != 0 && // Potentially EOF.
2594753f127fSDimitry Andric C != '\n' && C != '\r') { // Newline or DOS-style newline.
25950b57cec5SDimitry Andric C = *++CurPtr;
2596753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = false;
2597753f127fSDimitry Andric }
2598753f127fSDimitry Andric
2599753f127fSDimitry Andric if (!isASCII(C)) {
2600753f127fSDimitry Andric unsigned Length = llvm::getUTF8SequenceSize(
2601753f127fSDimitry Andric (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
2602753f127fSDimitry Andric if (Length == 0) {
2603753f127fSDimitry Andric if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2604753f127fSDimitry Andric Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
2605753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = true;
2606753f127fSDimitry Andric ++CurPtr;
2607753f127fSDimitry Andric } else {
2608753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = false;
2609753f127fSDimitry Andric CurPtr += Length;
2610753f127fSDimitry Andric }
2611753f127fSDimitry Andric continue;
2612753f127fSDimitry Andric }
26130b57cec5SDimitry Andric
26140b57cec5SDimitry Andric const char *NextLine = CurPtr;
26150b57cec5SDimitry Andric if (C != 0) {
26160b57cec5SDimitry Andric // We found a newline, see if it's escaped.
26170b57cec5SDimitry Andric const char *EscapePtr = CurPtr-1;
26180b57cec5SDimitry Andric bool HasSpace = false;
26190b57cec5SDimitry Andric while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
26200b57cec5SDimitry Andric --EscapePtr;
26210b57cec5SDimitry Andric HasSpace = true;
26220b57cec5SDimitry Andric }
26230b57cec5SDimitry Andric
26240b57cec5SDimitry Andric if (*EscapePtr == '\\')
26250b57cec5SDimitry Andric // Escaped newline.
26260b57cec5SDimitry Andric CurPtr = EscapePtr;
26270b57cec5SDimitry Andric else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
26280b57cec5SDimitry Andric EscapePtr[-2] == '?' && LangOpts.Trigraphs)
26290b57cec5SDimitry Andric // Trigraph-escaped newline.
26300b57cec5SDimitry Andric CurPtr = EscapePtr-2;
26310b57cec5SDimitry Andric else
26320b57cec5SDimitry Andric break; // This is a newline, we're done.
26330b57cec5SDimitry Andric
26340b57cec5SDimitry Andric // If there was space between the backslash and newline, warn about it.
26350b57cec5SDimitry Andric if (HasSpace && !isLexingRawMode())
26360b57cec5SDimitry Andric Diag(EscapePtr, diag::backslash_newline_space);
26370b57cec5SDimitry Andric }
26380b57cec5SDimitry Andric
26390b57cec5SDimitry Andric // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to
26400b57cec5SDimitry Andric // properly decode the character. Read it in raw mode to avoid emitting
26410b57cec5SDimitry Andric // diagnostics about things like trigraphs. If we see an escaped newline,
26420b57cec5SDimitry Andric // we'll handle it below.
26430b57cec5SDimitry Andric const char *OldPtr = CurPtr;
26440b57cec5SDimitry Andric bool OldRawMode = isLexingRawMode();
26450b57cec5SDimitry Andric LexingRawMode = true;
26460b57cec5SDimitry Andric C = getAndAdvanceChar(CurPtr, Result);
26470b57cec5SDimitry Andric LexingRawMode = OldRawMode;
26480b57cec5SDimitry Andric
26490b57cec5SDimitry Andric // If we only read only one character, then no special handling is needed.
26500b57cec5SDimitry Andric // We're done and can skip forward to the newline.
26510b57cec5SDimitry Andric if (C != 0 && CurPtr == OldPtr+1) {
26520b57cec5SDimitry Andric CurPtr = NextLine;
26530b57cec5SDimitry Andric break;
26540b57cec5SDimitry Andric }
26550b57cec5SDimitry Andric
26560b57cec5SDimitry Andric // If we read multiple characters, and one of those characters was a \r or
26570b57cec5SDimitry Andric // \n, then we had an escaped newline within the comment. Emit diagnostic
26580b57cec5SDimitry Andric // unless the next line is also a // comment.
26590b57cec5SDimitry Andric if (CurPtr != OldPtr + 1 && C != '/' &&
26600b57cec5SDimitry Andric (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
26610b57cec5SDimitry Andric for (; OldPtr != CurPtr; ++OldPtr)
26620b57cec5SDimitry Andric if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
26630b57cec5SDimitry Andric // Okay, we found a // comment that ends in a newline, if the next
26640b57cec5SDimitry Andric // line is also a // comment, but has spaces, don't emit a diagnostic.
26650b57cec5SDimitry Andric if (isWhitespace(C)) {
26660b57cec5SDimitry Andric const char *ForwardPtr = CurPtr;
26670b57cec5SDimitry Andric while (isWhitespace(*ForwardPtr)) // Skip whitespace.
26680b57cec5SDimitry Andric ++ForwardPtr;
26690b57cec5SDimitry Andric if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
26700b57cec5SDimitry Andric break;
26710b57cec5SDimitry Andric }
26720b57cec5SDimitry Andric
26730b57cec5SDimitry Andric if (!isLexingRawMode())
26740b57cec5SDimitry Andric Diag(OldPtr-1, diag::ext_multi_line_line_comment);
26750b57cec5SDimitry Andric break;
26760b57cec5SDimitry Andric }
26770b57cec5SDimitry Andric }
26780b57cec5SDimitry Andric
26790b57cec5SDimitry Andric if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
26800b57cec5SDimitry Andric --CurPtr;
26810b57cec5SDimitry Andric break;
26820b57cec5SDimitry Andric }
26830b57cec5SDimitry Andric
26840b57cec5SDimitry Andric if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
26850b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage();
26860b57cec5SDimitry Andric cutOffLexing();
26870b57cec5SDimitry Andric return false;
26880b57cec5SDimitry Andric }
26890b57cec5SDimitry Andric }
26900b57cec5SDimitry Andric
26910b57cec5SDimitry Andric // Found but did not consume the newline. Notify comment handlers about the
26920b57cec5SDimitry Andric // comment unless we're in a #if 0 block.
26930b57cec5SDimitry Andric if (PP && !isLexingRawMode() &&
26940b57cec5SDimitry Andric PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
26950b57cec5SDimitry Andric getSourceLocation(CurPtr)))) {
26960b57cec5SDimitry Andric BufferPtr = CurPtr;
26970b57cec5SDimitry Andric return true; // A token has to be returned.
26980b57cec5SDimitry Andric }
26990b57cec5SDimitry Andric
27000b57cec5SDimitry Andric // If we are returning comments as tokens, return this comment as a token.
27010b57cec5SDimitry Andric if (inKeepCommentMode())
27020b57cec5SDimitry Andric return SaveLineComment(Result, CurPtr);
27030b57cec5SDimitry Andric
27040b57cec5SDimitry Andric // If we are inside a preprocessor directive and we see the end of line,
27050b57cec5SDimitry Andric // return immediately, so that the lexer can return this as an EOD token.
27060b57cec5SDimitry Andric if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
27070b57cec5SDimitry Andric BufferPtr = CurPtr;
27080b57cec5SDimitry Andric return false;
27090b57cec5SDimitry Andric }
27100b57cec5SDimitry Andric
27110b57cec5SDimitry Andric // Otherwise, eat the \n character. We don't care if this is a \n\r or
27120b57cec5SDimitry Andric // \r\n sequence. This is an efficiency hack (because we know the \n can't
27130b57cec5SDimitry Andric // contribute to another token), it isn't needed for correctness. Note that
27140b57cec5SDimitry Andric // this is ok even in KeepWhitespaceMode, because we would have returned the
27155f757f3fSDimitry Andric // comment above in that mode.
2716e8d8bef9SDimitry Andric NewLinePtr = CurPtr++;
27170b57cec5SDimitry Andric
27180b57cec5SDimitry Andric // The next returned token is at the start of the line.
27190b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine);
27200b57cec5SDimitry Andric TokAtPhysicalStartOfLine = true;
27210b57cec5SDimitry Andric // No leading whitespace seen so far.
27220b57cec5SDimitry Andric Result.clearFlag(Token::LeadingSpace);
27230b57cec5SDimitry Andric BufferPtr = CurPtr;
27240b57cec5SDimitry Andric return false;
27250b57cec5SDimitry Andric }
27260b57cec5SDimitry Andric
27270b57cec5SDimitry Andric /// If in save-comment mode, package up this Line comment in an appropriate
27280b57cec5SDimitry Andric /// way and return it.
SaveLineComment(Token & Result,const char * CurPtr)27290b57cec5SDimitry Andric bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
27300b57cec5SDimitry Andric // If we're not in a preprocessor directive, just return the // comment
27310b57cec5SDimitry Andric // directly.
27320b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::comment);
27330b57cec5SDimitry Andric
27340b57cec5SDimitry Andric if (!ParsingPreprocessorDirective || LexingRawMode)
27350b57cec5SDimitry Andric return true;
27360b57cec5SDimitry Andric
27370b57cec5SDimitry Andric // If this Line-style comment is in a macro definition, transmogrify it into
27380b57cec5SDimitry Andric // a C-style block comment.
27390b57cec5SDimitry Andric bool Invalid = false;
27400b57cec5SDimitry Andric std::string Spelling = PP->getSpelling(Result, &Invalid);
27410b57cec5SDimitry Andric if (Invalid)
27420b57cec5SDimitry Andric return true;
27430b57cec5SDimitry Andric
27440b57cec5SDimitry Andric assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
27450b57cec5SDimitry Andric Spelling[1] = '*'; // Change prefix to "/*".
27460b57cec5SDimitry Andric Spelling += "*/"; // add suffix.
27470b57cec5SDimitry Andric
27480b57cec5SDimitry Andric Result.setKind(tok::comment);
27490b57cec5SDimitry Andric PP->CreateString(Spelling, Result,
27500b57cec5SDimitry Andric Result.getLocation(), Result.getLocation());
27510b57cec5SDimitry Andric return true;
27520b57cec5SDimitry Andric }
27530b57cec5SDimitry Andric
27540b57cec5SDimitry Andric /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
27550b57cec5SDimitry Andric /// character (either \\n or \\r) is part of an escaped newline sequence. Issue
27560b57cec5SDimitry Andric /// a diagnostic if so. We know that the newline is inside of a block comment.
isEndOfBlockCommentWithEscapedNewLine(const char * CurPtr,Lexer * L,bool Trigraphs)275781ad6265SDimitry Andric static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
275881ad6265SDimitry Andric bool Trigraphs) {
27590b57cec5SDimitry Andric assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
27600b57cec5SDimitry Andric
2761fe6060f1SDimitry Andric // Position of the first trigraph in the ending sequence.
276204eeddc0SDimitry Andric const char *TrigraphPos = nullptr;
2763fe6060f1SDimitry Andric // Position of the first whitespace after a '\' in the ending sequence.
276404eeddc0SDimitry Andric const char *SpacePos = nullptr;
2765fe6060f1SDimitry Andric
2766fe6060f1SDimitry Andric while (true) {
27670b57cec5SDimitry Andric // Back up off the newline.
27680b57cec5SDimitry Andric --CurPtr;
27690b57cec5SDimitry Andric
27700b57cec5SDimitry Andric // If this is a two-character newline sequence, skip the other character.
27710b57cec5SDimitry Andric if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
27720b57cec5SDimitry Andric // \n\n or \r\r -> not escaped newline.
27730b57cec5SDimitry Andric if (CurPtr[0] == CurPtr[1])
27740b57cec5SDimitry Andric return false;
27750b57cec5SDimitry Andric // \n\r or \r\n -> skip the newline.
27760b57cec5SDimitry Andric --CurPtr;
27770b57cec5SDimitry Andric }
27780b57cec5SDimitry Andric
27790b57cec5SDimitry Andric // If we have horizontal whitespace, skip over it. We allow whitespace
27800b57cec5SDimitry Andric // between the slash and newline.
27810b57cec5SDimitry Andric while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2782fe6060f1SDimitry Andric SpacePos = CurPtr;
27830b57cec5SDimitry Andric --CurPtr;
27840b57cec5SDimitry Andric }
27850b57cec5SDimitry Andric
2786fe6060f1SDimitry Andric // If we have a slash, this is an escaped newline.
27870b57cec5SDimitry Andric if (*CurPtr == '\\') {
2788fe6060f1SDimitry Andric --CurPtr;
2789fe6060f1SDimitry Andric } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
2790fe6060f1SDimitry Andric // This is a trigraph encoding of a slash.
2791fe6060f1SDimitry Andric TrigraphPos = CurPtr - 2;
2792fe6060f1SDimitry Andric CurPtr -= 3;
27930b57cec5SDimitry Andric } else {
27940b57cec5SDimitry Andric return false;
2795fe6060f1SDimitry Andric }
27960b57cec5SDimitry Andric
2797fe6060f1SDimitry Andric // If the character preceding the escaped newline is a '*', then after line
2798fe6060f1SDimitry Andric // splicing we have a '*/' ending the comment.
2799fe6060f1SDimitry Andric if (*CurPtr == '*')
2800fe6060f1SDimitry Andric break;
28010b57cec5SDimitry Andric
2802fe6060f1SDimitry Andric if (*CurPtr != '\n' && *CurPtr != '\r')
2803fe6060f1SDimitry Andric return false;
2804fe6060f1SDimitry Andric }
2805fe6060f1SDimitry Andric
2806fe6060f1SDimitry Andric if (TrigraphPos) {
28070b57cec5SDimitry Andric // If no trigraphs are enabled, warn that we ignored this trigraph and
28080b57cec5SDimitry Andric // ignore this * character.
280981ad6265SDimitry Andric if (!Trigraphs) {
28100b57cec5SDimitry Andric if (!L->isLexingRawMode())
2811fe6060f1SDimitry Andric L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
28120b57cec5SDimitry Andric return false;
28130b57cec5SDimitry Andric }
28140b57cec5SDimitry Andric if (!L->isLexingRawMode())
2815fe6060f1SDimitry Andric L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
28160b57cec5SDimitry Andric }
28170b57cec5SDimitry Andric
28180b57cec5SDimitry Andric // Warn about having an escaped newline between the */ characters.
28190b57cec5SDimitry Andric if (!L->isLexingRawMode())
2820fe6060f1SDimitry Andric L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);
28210b57cec5SDimitry Andric
28220b57cec5SDimitry Andric // If there was space between the backslash and newline, warn about it.
2823fe6060f1SDimitry Andric if (SpacePos && !L->isLexingRawMode())
2824fe6060f1SDimitry Andric L->Diag(SpacePos, diag::backslash_newline_space);
28250b57cec5SDimitry Andric
28260b57cec5SDimitry Andric return true;
28270b57cec5SDimitry Andric }
28280b57cec5SDimitry Andric
28290b57cec5SDimitry Andric #ifdef __SSE2__
28300b57cec5SDimitry Andric #include <emmintrin.h>
28310b57cec5SDimitry Andric #elif __ALTIVEC__
28320b57cec5SDimitry Andric #include <altivec.h>
28330b57cec5SDimitry Andric #undef bool
28340b57cec5SDimitry Andric #endif
28350b57cec5SDimitry Andric
28360b57cec5SDimitry Andric /// We have just read from input the / and * characters that started a comment.
28370b57cec5SDimitry Andric /// Read until we find the * and / characters that terminate the comment.
28380b57cec5SDimitry Andric /// Note that we don't bother decoding trigraphs or escaped newlines in block
28390b57cec5SDimitry Andric /// comments, because they cannot cause the comment to end. The only thing
28400b57cec5SDimitry Andric /// that can happen is the comment could end with an escaped newline between
28410b57cec5SDimitry Andric /// the terminating * and /.
28420b57cec5SDimitry Andric ///
28430b57cec5SDimitry Andric /// If we're in KeepCommentMode or any CommentHandler has inserted
28440b57cec5SDimitry Andric /// some tokens, this will store the first token and return true.
SkipBlockComment(Token & Result,const char * CurPtr,bool & TokAtPhysicalStartOfLine)28450b57cec5SDimitry Andric bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
28460b57cec5SDimitry Andric bool &TokAtPhysicalStartOfLine) {
28470b57cec5SDimitry Andric // Scan one character past where we should, looking for a '/' character. Once
28480b57cec5SDimitry Andric // we find it, check to see if it was preceded by a *. This common
28490b57cec5SDimitry Andric // optimization helps people who like to put a lot of * characters in their
28500b57cec5SDimitry Andric // comments.
28510b57cec5SDimitry Andric
28520b57cec5SDimitry Andric // The first character we get with newlines and trigraphs skipped to handle
28530b57cec5SDimitry Andric // the degenerate /*/ case below correctly if the * has an escaped newline
28540b57cec5SDimitry Andric // after it.
28550b57cec5SDimitry Andric unsigned CharSize;
28560b57cec5SDimitry Andric unsigned char C = getCharAndSize(CurPtr, CharSize);
28570b57cec5SDimitry Andric CurPtr += CharSize;
28580b57cec5SDimitry Andric if (C == 0 && CurPtr == BufferEnd+1) {
28590b57cec5SDimitry Andric if (!isLexingRawMode())
28600b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment);
28610b57cec5SDimitry Andric --CurPtr;
28620b57cec5SDimitry Andric
28630b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. Since
28640b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token.
28650b57cec5SDimitry Andric if (isKeepWhitespaceMode()) {
28660b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown);
28670b57cec5SDimitry Andric return true;
28680b57cec5SDimitry Andric }
28690b57cec5SDimitry Andric
28700b57cec5SDimitry Andric BufferPtr = CurPtr;
28710b57cec5SDimitry Andric return false;
28720b57cec5SDimitry Andric }
28730b57cec5SDimitry Andric
28740b57cec5SDimitry Andric // Check to see if the first character after the '/*' is another /. If so,
28750b57cec5SDimitry Andric // then this slash does not end the block comment, it is part of it.
28760b57cec5SDimitry Andric if (C == '/')
28770b57cec5SDimitry Andric C = *CurPtr++;
28780b57cec5SDimitry Andric
2879753f127fSDimitry Andric // C++23 [lex.phases] p1
2880753f127fSDimitry Andric // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2881753f127fSDimitry Andric // diagnostic only once per entire ill-formed subsequence to avoid
2882753f127fSDimitry Andric // emiting to many diagnostics (see http://unicode.org/review/pr-121.html).
2883753f127fSDimitry Andric bool UnicodeDecodingAlreadyDiagnosed = false;
2884753f127fSDimitry Andric
28850b57cec5SDimitry Andric while (true) {
28860b57cec5SDimitry Andric // Skip over all non-interesting characters until we find end of buffer or a
28870b57cec5SDimitry Andric // (probably ending) '/' character.
28880b57cec5SDimitry Andric if (CurPtr + 24 < BufferEnd &&
28890b57cec5SDimitry Andric // If there is a code-completion point avoid the fast scan because it
28900b57cec5SDimitry Andric // doesn't check for '\0'.
28910b57cec5SDimitry Andric !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
28920b57cec5SDimitry Andric // While not aligned to a 16-byte boundary.
2893753f127fSDimitry Andric while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
2894753f127fSDimitry Andric if (!isASCII(C))
2895753f127fSDimitry Andric goto MultiByteUTF8;
28960b57cec5SDimitry Andric C = *CurPtr++;
2897753f127fSDimitry Andric }
28980b57cec5SDimitry Andric if (C == '/') goto FoundSlash;
28990b57cec5SDimitry Andric
29000b57cec5SDimitry Andric #ifdef __SSE2__
29010b57cec5SDimitry Andric __m128i Slashes = _mm_set1_epi8('/');
2902753f127fSDimitry Andric while (CurPtr + 16 < BufferEnd) {
2903753f127fSDimitry Andric int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
2904753f127fSDimitry Andric if (LLVM_UNLIKELY(Mask != 0)) {
2905753f127fSDimitry Andric goto MultiByteUTF8;
2906753f127fSDimitry Andric }
2907753f127fSDimitry Andric // look for slashes
29080b57cec5SDimitry Andric int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
29090b57cec5SDimitry Andric Slashes));
29100b57cec5SDimitry Andric if (cmp != 0) {
29110b57cec5SDimitry Andric // Adjust the pointer to point directly after the first slash. It's
29120b57cec5SDimitry Andric // not necessary to set C here, it will be overwritten at the end of
29130b57cec5SDimitry Andric // the outer loop.
291406c3fb27SDimitry Andric CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
29150b57cec5SDimitry Andric goto FoundSlash;
29160b57cec5SDimitry Andric }
29170b57cec5SDimitry Andric CurPtr += 16;
29180b57cec5SDimitry Andric }
29190b57cec5SDimitry Andric #elif __ALTIVEC__
2920753f127fSDimitry Andric __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2921753f127fSDimitry Andric 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2922753f127fSDimitry Andric 0x80, 0x80, 0x80, 0x80};
29230b57cec5SDimitry Andric __vector unsigned char Slashes = {
29240b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/',
29250b57cec5SDimitry Andric '/', '/', '/', '/', '/', '/', '/', '/'
29260b57cec5SDimitry Andric };
2927753f127fSDimitry Andric while (CurPtr + 16 < BufferEnd) {
2928753f127fSDimitry Andric if (LLVM_UNLIKELY(
2929753f127fSDimitry Andric vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
2930753f127fSDimitry Andric goto MultiByteUTF8;
2931753f127fSDimitry Andric if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2932753f127fSDimitry Andric break;
2933753f127fSDimitry Andric }
29340b57cec5SDimitry Andric CurPtr += 16;
2935753f127fSDimitry Andric }
2936753f127fSDimitry Andric
29370b57cec5SDimitry Andric #else
2938753f127fSDimitry Andric while (CurPtr + 16 < BufferEnd) {
2939753f127fSDimitry Andric bool HasNonASCII = false;
2940753f127fSDimitry Andric for (unsigned I = 0; I < 16; ++I)
2941753f127fSDimitry Andric HasNonASCII |= !isASCII(CurPtr[I]);
2942753f127fSDimitry Andric
2943753f127fSDimitry Andric if (LLVM_UNLIKELY(HasNonASCII))
2944753f127fSDimitry Andric goto MultiByteUTF8;
2945753f127fSDimitry Andric
2946753f127fSDimitry Andric bool HasSlash = false;
2947753f127fSDimitry Andric for (unsigned I = 0; I < 16; ++I)
2948753f127fSDimitry Andric HasSlash |= CurPtr[I] == '/';
2949753f127fSDimitry Andric if (HasSlash)
2950753f127fSDimitry Andric break;
2951753f127fSDimitry Andric CurPtr += 16;
29520b57cec5SDimitry Andric }
29530b57cec5SDimitry Andric #endif
29540b57cec5SDimitry Andric
29550b57cec5SDimitry Andric // It has to be one of the bytes scanned, increment to it and read one.
29560b57cec5SDimitry Andric C = *CurPtr++;
29570b57cec5SDimitry Andric }
29580b57cec5SDimitry Andric
2959753f127fSDimitry Andric // Loop to scan the remainder, warning on invalid UTF-8
2960753f127fSDimitry Andric // if the corresponding warning is enabled, emitting a diagnostic only once
2961753f127fSDimitry Andric // per sequence that cannot be decoded.
2962753f127fSDimitry Andric while (C != '/' && C != '\0') {
2963753f127fSDimitry Andric if (isASCII(C)) {
2964753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = false;
29650b57cec5SDimitry Andric C = *CurPtr++;
2966753f127fSDimitry Andric continue;
2967753f127fSDimitry Andric }
2968753f127fSDimitry Andric MultiByteUTF8:
2969753f127fSDimitry Andric // CurPtr is 1 code unit past C, so to decode
2970753f127fSDimitry Andric // the codepoint, we need to read from the previous position.
2971753f127fSDimitry Andric unsigned Length = llvm::getUTF8SequenceSize(
2972753f127fSDimitry Andric (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
2973753f127fSDimitry Andric if (Length == 0) {
2974753f127fSDimitry Andric if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2975753f127fSDimitry Andric Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
2976753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = true;
2977753f127fSDimitry Andric } else {
2978753f127fSDimitry Andric UnicodeDecodingAlreadyDiagnosed = false;
2979753f127fSDimitry Andric CurPtr += Length - 1;
2980753f127fSDimitry Andric }
2981753f127fSDimitry Andric C = *CurPtr++;
2982753f127fSDimitry Andric }
29830b57cec5SDimitry Andric
29840b57cec5SDimitry Andric if (C == '/') {
29850b57cec5SDimitry Andric FoundSlash:
29860b57cec5SDimitry Andric if (CurPtr[-2] == '*') // We found the final */. We're done!
29870b57cec5SDimitry Andric break;
29880b57cec5SDimitry Andric
29890b57cec5SDimitry Andric if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
299081ad6265SDimitry Andric if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
299181ad6265SDimitry Andric LangOpts.Trigraphs)) {
29920b57cec5SDimitry Andric // We found the final */, though it had an escaped newline between the
29930b57cec5SDimitry Andric // * and /. We're done!
29940b57cec5SDimitry Andric break;
29950b57cec5SDimitry Andric }
29960b57cec5SDimitry Andric }
29970b57cec5SDimitry Andric if (CurPtr[0] == '*' && CurPtr[1] != '/') {
29980b57cec5SDimitry Andric // If this is a /* inside of the comment, emit a warning. Don't do this
29990b57cec5SDimitry Andric // if this is a /*/, which will end the comment. This misses cases with
30000b57cec5SDimitry Andric // embedded escaped newlines, but oh well.
30010b57cec5SDimitry Andric if (!isLexingRawMode())
30020b57cec5SDimitry Andric Diag(CurPtr-1, diag::warn_nested_block_comment);
30030b57cec5SDimitry Andric }
30040b57cec5SDimitry Andric } else if (C == 0 && CurPtr == BufferEnd+1) {
30050b57cec5SDimitry Andric if (!isLexingRawMode())
30060b57cec5SDimitry Andric Diag(BufferPtr, diag::err_unterminated_block_comment);
30070b57cec5SDimitry Andric // Note: the user probably forgot a */. We could continue immediately
30080b57cec5SDimitry Andric // after the /*, but this would involve lexing a lot of what really is the
30090b57cec5SDimitry Andric // comment, which surely would confuse the parser.
30100b57cec5SDimitry Andric --CurPtr;
30110b57cec5SDimitry Andric
30120b57cec5SDimitry Andric // KeepWhitespaceMode should return this broken comment as a token. Since
30130b57cec5SDimitry Andric // it isn't a well formed comment, just return it as an 'unknown' token.
30140b57cec5SDimitry Andric if (isKeepWhitespaceMode()) {
30150b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown);
30160b57cec5SDimitry Andric return true;
30170b57cec5SDimitry Andric }
30180b57cec5SDimitry Andric
30190b57cec5SDimitry Andric BufferPtr = CurPtr;
30200b57cec5SDimitry Andric return false;
30210b57cec5SDimitry Andric } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
30220b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage();
30230b57cec5SDimitry Andric cutOffLexing();
30240b57cec5SDimitry Andric return false;
30250b57cec5SDimitry Andric }
30260b57cec5SDimitry Andric
30270b57cec5SDimitry Andric C = *CurPtr++;
30280b57cec5SDimitry Andric }
30290b57cec5SDimitry Andric
30300b57cec5SDimitry Andric // Notify comment handlers about the comment unless we're in a #if 0 block.
30310b57cec5SDimitry Andric if (PP && !isLexingRawMode() &&
30320b57cec5SDimitry Andric PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
30330b57cec5SDimitry Andric getSourceLocation(CurPtr)))) {
30340b57cec5SDimitry Andric BufferPtr = CurPtr;
30350b57cec5SDimitry Andric return true; // A token has to be returned.
30360b57cec5SDimitry Andric }
30370b57cec5SDimitry Andric
30380b57cec5SDimitry Andric // If we are returning comments as tokens, return this comment as a token.
30390b57cec5SDimitry Andric if (inKeepCommentMode()) {
30400b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::comment);
30410b57cec5SDimitry Andric return true;
30420b57cec5SDimitry Andric }
30430b57cec5SDimitry Andric
30440b57cec5SDimitry Andric // It is common for the tokens immediately after a /**/ comment to be
30450b57cec5SDimitry Andric // whitespace. Instead of going through the big switch, handle it
30460b57cec5SDimitry Andric // efficiently now. This is safe even in KeepWhitespaceMode because we would
30470b57cec5SDimitry Andric // have already returned above with the comment as a token.
30480b57cec5SDimitry Andric if (isHorizontalWhitespace(*CurPtr)) {
30490b57cec5SDimitry Andric SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
30500b57cec5SDimitry Andric return false;
30510b57cec5SDimitry Andric }
30520b57cec5SDimitry Andric
30530b57cec5SDimitry Andric // Otherwise, just return so that the next character will be lexed as a token.
30540b57cec5SDimitry Andric BufferPtr = CurPtr;
30550b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace);
30560b57cec5SDimitry Andric return false;
30570b57cec5SDimitry Andric }
30580b57cec5SDimitry Andric
30590b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
30600b57cec5SDimitry Andric // Primary Lexing Entry Points
30610b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
30620b57cec5SDimitry Andric
30630b57cec5SDimitry Andric /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
30640b57cec5SDimitry Andric /// uninterpreted string. This switches the lexer out of directive mode.
ReadToEndOfLine(SmallVectorImpl<char> * Result)30650b57cec5SDimitry Andric void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
30660b57cec5SDimitry Andric assert(ParsingPreprocessorDirective && ParsingFilename == false &&
30670b57cec5SDimitry Andric "Must be in a preprocessing directive!");
30680b57cec5SDimitry Andric Token Tmp;
3069480093f4SDimitry Andric Tmp.startToken();
30700b57cec5SDimitry Andric
30710b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable.
30720b57cec5SDimitry Andric const char *CurPtr = BufferPtr;
30730b57cec5SDimitry Andric while (true) {
30740b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Tmp);
30750b57cec5SDimitry Andric switch (Char) {
30760b57cec5SDimitry Andric default:
30770b57cec5SDimitry Andric if (Result)
30780b57cec5SDimitry Andric Result->push_back(Char);
30790b57cec5SDimitry Andric break;
30800b57cec5SDimitry Andric case 0: // Null.
30810b57cec5SDimitry Andric // Found end of file?
30820b57cec5SDimitry Andric if (CurPtr-1 != BufferEnd) {
30830b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) {
30840b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage();
30850b57cec5SDimitry Andric cutOffLexing();
30860b57cec5SDimitry Andric return;
30870b57cec5SDimitry Andric }
30880b57cec5SDimitry Andric
30890b57cec5SDimitry Andric // Nope, normal character, continue.
30900b57cec5SDimitry Andric if (Result)
30910b57cec5SDimitry Andric Result->push_back(Char);
30920b57cec5SDimitry Andric break;
30930b57cec5SDimitry Andric }
30940b57cec5SDimitry Andric // FALL THROUGH.
3095bdd1243dSDimitry Andric [[fallthrough]];
30960b57cec5SDimitry Andric case '\r':
30970b57cec5SDimitry Andric case '\n':
30980b57cec5SDimitry Andric // Okay, we found the end of the line. First, back up past the \0, \r, \n.
30990b57cec5SDimitry Andric assert(CurPtr[-1] == Char && "Trigraphs for newline?");
31000b57cec5SDimitry Andric BufferPtr = CurPtr-1;
31010b57cec5SDimitry Andric
31020b57cec5SDimitry Andric // Next, lex the character, which should handle the EOD transition.
31030b57cec5SDimitry Andric Lex(Tmp);
31040b57cec5SDimitry Andric if (Tmp.is(tok::code_completion)) {
31050b57cec5SDimitry Andric if (PP)
31060b57cec5SDimitry Andric PP->CodeCompleteNaturalLanguage();
31070b57cec5SDimitry Andric Lex(Tmp);
31080b57cec5SDimitry Andric }
31090b57cec5SDimitry Andric assert(Tmp.is(tok::eod) && "Unexpected token!");
31100b57cec5SDimitry Andric
31110b57cec5SDimitry Andric // Finally, we're done;
31120b57cec5SDimitry Andric return;
31130b57cec5SDimitry Andric }
31140b57cec5SDimitry Andric }
31150b57cec5SDimitry Andric }
31160b57cec5SDimitry Andric
31170b57cec5SDimitry Andric /// LexEndOfFile - CurPtr points to the end of this file. Handle this
31180b57cec5SDimitry Andric /// condition, reporting diagnostics and handling other edge cases as required.
31190b57cec5SDimitry Andric /// This returns true if Result contains a token, false if PP.Lex should be
31200b57cec5SDimitry Andric /// called again.
LexEndOfFile(Token & Result,const char * CurPtr)31210b57cec5SDimitry Andric bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
31220b57cec5SDimitry Andric // If we hit the end of the file while parsing a preprocessor directive,
31230b57cec5SDimitry Andric // end the preprocessor directive first. The next token returned will
31240b57cec5SDimitry Andric // then be the end of file.
31250b57cec5SDimitry Andric if (ParsingPreprocessorDirective) {
31260b57cec5SDimitry Andric // Done parsing the "line".
31270b57cec5SDimitry Andric ParsingPreprocessorDirective = false;
31280b57cec5SDimitry Andric // Update the location of token as well as BufferPtr.
31290b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::eod);
31300b57cec5SDimitry Andric
31310b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive.
31320b57cec5SDimitry Andric if (PP)
31330b57cec5SDimitry Andric resetExtendedTokenMode();
31340b57cec5SDimitry Andric return true; // Have a token.
31350b57cec5SDimitry Andric }
31360b57cec5SDimitry Andric
31370b57cec5SDimitry Andric // If we are in raw mode, return this event as an EOF token. Let the caller
31380b57cec5SDimitry Andric // that put us in raw mode handle the event.
31390b57cec5SDimitry Andric if (isLexingRawMode()) {
31400b57cec5SDimitry Andric Result.startToken();
31410b57cec5SDimitry Andric BufferPtr = BufferEnd;
31420b57cec5SDimitry Andric FormTokenWithChars(Result, BufferEnd, tok::eof);
31430b57cec5SDimitry Andric return true;
31440b57cec5SDimitry Andric }
31450b57cec5SDimitry Andric
31460b57cec5SDimitry Andric if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
31470b57cec5SDimitry Andric PP->setRecordedPreambleConditionalStack(ConditionalStack);
3148fe6060f1SDimitry Andric // If the preamble cuts off the end of a header guard, consider it guarded.
3149fe6060f1SDimitry Andric // The guard is valid for the preamble content itself, and for tools the
3150fe6060f1SDimitry Andric // most useful answer is "yes, this file has a header guard".
3151fe6060f1SDimitry Andric if (!ConditionalStack.empty())
3152fe6060f1SDimitry Andric MIOpt.ExitTopLevelConditional();
31530b57cec5SDimitry Andric ConditionalStack.clear();
31540b57cec5SDimitry Andric }
31550b57cec5SDimitry Andric
31560b57cec5SDimitry Andric // Issue diagnostics for unterminated #if and missing newline.
31570b57cec5SDimitry Andric
31580b57cec5SDimitry Andric // If we are in a #if directive, emit an error.
31590b57cec5SDimitry Andric while (!ConditionalStack.empty()) {
31600b57cec5SDimitry Andric if (PP->getCodeCompletionFileLoc() != FileLoc)
31610b57cec5SDimitry Andric PP->Diag(ConditionalStack.back().IfLoc,
31620b57cec5SDimitry Andric diag::err_pp_unterminated_conditional);
31630b57cec5SDimitry Andric ConditionalStack.pop_back();
31640b57cec5SDimitry Andric }
31650b57cec5SDimitry Andric
31660b57cec5SDimitry Andric // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
31670b57cec5SDimitry Andric // a pedwarn.
31680b57cec5SDimitry Andric if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
31690b57cec5SDimitry Andric DiagnosticsEngine &Diags = PP->getDiagnostics();
317081ad6265SDimitry Andric SourceLocation EndLoc = getSourceLocation(BufferEnd);
31710b57cec5SDimitry Andric unsigned DiagID;
31720b57cec5SDimitry Andric
31730b57cec5SDimitry Andric if (LangOpts.CPlusPlus11) {
31740b57cec5SDimitry Andric // C++11 [lex.phases] 2.2 p2
31750b57cec5SDimitry Andric // Prefer the C++98 pedantic compatibility warning over the generic,
31760b57cec5SDimitry Andric // non-extension, user-requested "missing newline at EOF" warning.
31770b57cec5SDimitry Andric if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
31780b57cec5SDimitry Andric DiagID = diag::warn_cxx98_compat_no_newline_eof;
31790b57cec5SDimitry Andric } else {
31800b57cec5SDimitry Andric DiagID = diag::warn_no_newline_eof;
31810b57cec5SDimitry Andric }
31820b57cec5SDimitry Andric } else {
31830b57cec5SDimitry Andric DiagID = diag::ext_no_newline_eof;
31840b57cec5SDimitry Andric }
31850b57cec5SDimitry Andric
31860b57cec5SDimitry Andric Diag(BufferEnd, DiagID)
31870b57cec5SDimitry Andric << FixItHint::CreateInsertion(EndLoc, "\n");
31880b57cec5SDimitry Andric }
31890b57cec5SDimitry Andric
31900b57cec5SDimitry Andric BufferPtr = CurPtr;
31910b57cec5SDimitry Andric
31920b57cec5SDimitry Andric // Finally, let the preprocessor handle this.
319381ad6265SDimitry Andric return PP->HandleEndOfFile(Result, isPragmaLexer());
31940b57cec5SDimitry Andric }
31950b57cec5SDimitry Andric
31960b57cec5SDimitry Andric /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
31970b57cec5SDimitry Andric /// the specified lexer will return a tok::l_paren token, 0 if it is something
31980b57cec5SDimitry Andric /// else and 2 if there are no more tokens in the buffer controlled by the
31990b57cec5SDimitry Andric /// lexer.
isNextPPTokenLParen()32000b57cec5SDimitry Andric unsigned Lexer::isNextPPTokenLParen() {
32010b57cec5SDimitry Andric assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
32020b57cec5SDimitry Andric
320381ad6265SDimitry Andric if (isDependencyDirectivesLexer()) {
320481ad6265SDimitry Andric if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
320581ad6265SDimitry Andric return 2;
320681ad6265SDimitry Andric return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
320781ad6265SDimitry Andric tok::l_paren);
320881ad6265SDimitry Andric }
320981ad6265SDimitry Andric
32100b57cec5SDimitry Andric // Switch to 'skipping' mode. This will ensure that we can lex a token
32110b57cec5SDimitry Andric // without emitting diagnostics, disables macro expansion, and will cause EOF
32120b57cec5SDimitry Andric // to return an EOF token instead of popping the include stack.
32130b57cec5SDimitry Andric LexingRawMode = true;
32140b57cec5SDimitry Andric
32150b57cec5SDimitry Andric // Save state that can be changed while lexing so that we can restore it.
32160b57cec5SDimitry Andric const char *TmpBufferPtr = BufferPtr;
32170b57cec5SDimitry Andric bool inPPDirectiveMode = ParsingPreprocessorDirective;
32180b57cec5SDimitry Andric bool atStartOfLine = IsAtStartOfLine;
32190b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
32200b57cec5SDimitry Andric bool leadingSpace = HasLeadingSpace;
32210b57cec5SDimitry Andric
32220b57cec5SDimitry Andric Token Tok;
32230b57cec5SDimitry Andric Lex(Tok);
32240b57cec5SDimitry Andric
32250b57cec5SDimitry Andric // Restore state that may have changed.
32260b57cec5SDimitry Andric BufferPtr = TmpBufferPtr;
32270b57cec5SDimitry Andric ParsingPreprocessorDirective = inPPDirectiveMode;
32280b57cec5SDimitry Andric HasLeadingSpace = leadingSpace;
32290b57cec5SDimitry Andric IsAtStartOfLine = atStartOfLine;
32300b57cec5SDimitry Andric IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
32310b57cec5SDimitry Andric
32320b57cec5SDimitry Andric // Restore the lexer back to non-skipping mode.
32330b57cec5SDimitry Andric LexingRawMode = false;
32340b57cec5SDimitry Andric
32350b57cec5SDimitry Andric if (Tok.is(tok::eof))
32360b57cec5SDimitry Andric return 2;
32370b57cec5SDimitry Andric return Tok.is(tok::l_paren);
32380b57cec5SDimitry Andric }
32390b57cec5SDimitry Andric
32400b57cec5SDimitry Andric /// Find the end of a version control conflict marker.
FindConflictEnd(const char * CurPtr,const char * BufferEnd,ConflictMarkerKind CMK)32410b57cec5SDimitry Andric static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
32420b57cec5SDimitry Andric ConflictMarkerKind CMK) {
32430b57cec5SDimitry Andric const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
32440b57cec5SDimitry Andric size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
32450b57cec5SDimitry Andric auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
32460b57cec5SDimitry Andric size_t Pos = RestOfBuffer.find(Terminator);
32470b57cec5SDimitry Andric while (Pos != StringRef::npos) {
32480b57cec5SDimitry Andric // Must occur at start of line.
32490b57cec5SDimitry Andric if (Pos == 0 ||
32500b57cec5SDimitry Andric (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
32510b57cec5SDimitry Andric RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
32520b57cec5SDimitry Andric Pos = RestOfBuffer.find(Terminator);
32530b57cec5SDimitry Andric continue;
32540b57cec5SDimitry Andric }
32550b57cec5SDimitry Andric return RestOfBuffer.data()+Pos;
32560b57cec5SDimitry Andric }
32570b57cec5SDimitry Andric return nullptr;
32580b57cec5SDimitry Andric }
32590b57cec5SDimitry Andric
32600b57cec5SDimitry Andric /// IsStartOfConflictMarker - If the specified pointer is the start of a version
32610b57cec5SDimitry Andric /// control conflict marker like '<<<<<<<', recognize it as such, emit an error
32620b57cec5SDimitry Andric /// and recover nicely. This returns true if it is a conflict marker and false
32630b57cec5SDimitry Andric /// if not.
IsStartOfConflictMarker(const char * CurPtr)32640b57cec5SDimitry Andric bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
32650b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line.
32660b57cec5SDimitry Andric if (CurPtr != BufferStart &&
32670b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
32680b57cec5SDimitry Andric return false;
32690b57cec5SDimitry Andric
32700b57cec5SDimitry Andric // Check to see if we have <<<<<<< or >>>>.
32715f757f3fSDimitry Andric if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
32725f757f3fSDimitry Andric !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
32730b57cec5SDimitry Andric return false;
32740b57cec5SDimitry Andric
32750b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore
32760b57cec5SDimitry Andric // it.
32770b57cec5SDimitry Andric if (CurrentConflictMarkerState || isLexingRawMode())
32780b57cec5SDimitry Andric return false;
32790b57cec5SDimitry Andric
32800b57cec5SDimitry Andric ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
32810b57cec5SDimitry Andric
32820b57cec5SDimitry Andric // Check to see if there is an ending marker somewhere in the buffer at the
32830b57cec5SDimitry Andric // start of a line to terminate this conflict marker.
32840b57cec5SDimitry Andric if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
32850b57cec5SDimitry Andric // We found a match. We are really in a conflict marker.
32860b57cec5SDimitry Andric // Diagnose this, and ignore to the end of line.
32870b57cec5SDimitry Andric Diag(CurPtr, diag::err_conflict_marker);
32880b57cec5SDimitry Andric CurrentConflictMarkerState = Kind;
32890b57cec5SDimitry Andric
32900b57cec5SDimitry Andric // Skip ahead to the end of line. We know this exists because the
32910b57cec5SDimitry Andric // end-of-conflict marker starts with \r or \n.
32920b57cec5SDimitry Andric while (*CurPtr != '\r' && *CurPtr != '\n') {
32930b57cec5SDimitry Andric assert(CurPtr != BufferEnd && "Didn't find end of line");
32940b57cec5SDimitry Andric ++CurPtr;
32950b57cec5SDimitry Andric }
32960b57cec5SDimitry Andric BufferPtr = CurPtr;
32970b57cec5SDimitry Andric return true;
32980b57cec5SDimitry Andric }
32990b57cec5SDimitry Andric
33000b57cec5SDimitry Andric // No end of conflict marker found.
33010b57cec5SDimitry Andric return false;
33020b57cec5SDimitry Andric }
33030b57cec5SDimitry Andric
33040b57cec5SDimitry Andric /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
33050b57cec5SDimitry Andric /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
33060b57cec5SDimitry Andric /// is the end of a conflict marker. Handle it by ignoring up until the end of
33070b57cec5SDimitry Andric /// the line. This returns true if it is a conflict marker and false if not.
HandleEndOfConflictMarker(const char * CurPtr)33080b57cec5SDimitry Andric bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
33090b57cec5SDimitry Andric // Only a conflict marker if it starts at the beginning of a line.
33100b57cec5SDimitry Andric if (CurPtr != BufferStart &&
33110b57cec5SDimitry Andric CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
33120b57cec5SDimitry Andric return false;
33130b57cec5SDimitry Andric
33140b57cec5SDimitry Andric // If we have a situation where we don't care about conflict markers, ignore
33150b57cec5SDimitry Andric // it.
33160b57cec5SDimitry Andric if (!CurrentConflictMarkerState || isLexingRawMode())
33170b57cec5SDimitry Andric return false;
33180b57cec5SDimitry Andric
33190b57cec5SDimitry Andric // Check to see if we have the marker (4 characters in a row).
33200b57cec5SDimitry Andric for (unsigned i = 1; i != 4; ++i)
33210b57cec5SDimitry Andric if (CurPtr[i] != CurPtr[0])
33220b57cec5SDimitry Andric return false;
33230b57cec5SDimitry Andric
33240b57cec5SDimitry Andric // If we do have it, search for the end of the conflict marker. This could
33250b57cec5SDimitry Andric // fail if it got skipped with a '#if 0' or something. Note that CurPtr might
33260b57cec5SDimitry Andric // be the end of conflict marker.
33270b57cec5SDimitry Andric if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
33280b57cec5SDimitry Andric CurrentConflictMarkerState)) {
33290b57cec5SDimitry Andric CurPtr = End;
33300b57cec5SDimitry Andric
33310b57cec5SDimitry Andric // Skip ahead to the end of line.
33320b57cec5SDimitry Andric while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
33330b57cec5SDimitry Andric ++CurPtr;
33340b57cec5SDimitry Andric
33350b57cec5SDimitry Andric BufferPtr = CurPtr;
33360b57cec5SDimitry Andric
33370b57cec5SDimitry Andric // No longer in the conflict marker.
33380b57cec5SDimitry Andric CurrentConflictMarkerState = CMK_None;
33390b57cec5SDimitry Andric return true;
33400b57cec5SDimitry Andric }
33410b57cec5SDimitry Andric
33420b57cec5SDimitry Andric return false;
33430b57cec5SDimitry Andric }
33440b57cec5SDimitry Andric
/// Scan [\p CurPtr, \p BufferEnd) for the two-character sequence "#>".
/// Returns a pointer just past the '>' of the first occurrence, or nullptr if
/// the range is empty or contains no such sequence.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  if (CurPtr == BufferEnd)
    return nullptr;

  // A match needs two characters, so the last position that can start one is
  // the second-to-last character of the range.
  const char *const LastChar = BufferEnd - 1;
  while (CurPtr != LastChar) {
    if (CurPtr[0] == '#' && CurPtr[1] == '>')
      return CurPtr + 2;
    ++CurPtr;
  }
  return nullptr;
}
33560b57cec5SDimitry Andric
lexEditorPlaceholder(Token & Result,const char * CurPtr)33570b57cec5SDimitry Andric bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
33580b57cec5SDimitry Andric assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
33590b57cec5SDimitry Andric if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
33600b57cec5SDimitry Andric return false;
33610b57cec5SDimitry Andric const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
33620b57cec5SDimitry Andric if (!End)
33630b57cec5SDimitry Andric return false;
33640b57cec5SDimitry Andric const char *Start = CurPtr - 1;
33650b57cec5SDimitry Andric if (!LangOpts.AllowEditorPlaceholders)
33660b57cec5SDimitry Andric Diag(Start, diag::err_placeholder_in_source);
33670b57cec5SDimitry Andric Result.startToken();
33680b57cec5SDimitry Andric FormTokenWithChars(Result, End, tok::raw_identifier);
33690b57cec5SDimitry Andric Result.setRawIdentifierData(Start);
33700b57cec5SDimitry Andric PP->LookUpIdentifierInfo(Result);
33710b57cec5SDimitry Andric Result.setFlag(Token::IsEditorPlaceholder);
33720b57cec5SDimitry Andric BufferPtr = End;
33730b57cec5SDimitry Andric return true;
33740b57cec5SDimitry Andric }
33750b57cec5SDimitry Andric
isCodeCompletionPoint(const char * CurPtr) const33760b57cec5SDimitry Andric bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
33770b57cec5SDimitry Andric if (PP && PP->isCodeCompletionEnabled()) {
33780b57cec5SDimitry Andric SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
33790b57cec5SDimitry Andric return Loc == PP->getCodeCompletionLoc();
33800b57cec5SDimitry Andric }
33810b57cec5SDimitry Andric
33820b57cec5SDimitry Andric return false;
33830b57cec5SDimitry Andric }
33840b57cec5SDimitry Andric
tryReadNumericUCN(const char * & StartPtr,const char * SlashLoc,Token * Result)3385bdd1243dSDimitry Andric std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
338681ad6265SDimitry Andric const char *SlashLoc,
33870b57cec5SDimitry Andric Token *Result) {
33880b57cec5SDimitry Andric unsigned CharSize;
33890b57cec5SDimitry Andric char Kind = getCharAndSize(StartPtr, CharSize);
339081ad6265SDimitry Andric assert((Kind == 'u' || Kind == 'U') && "expected a UCN");
33910b57cec5SDimitry Andric
33920b57cec5SDimitry Andric unsigned NumHexDigits;
33930b57cec5SDimitry Andric if (Kind == 'u')
33940b57cec5SDimitry Andric NumHexDigits = 4;
33950b57cec5SDimitry Andric else if (Kind == 'U')
33960b57cec5SDimitry Andric NumHexDigits = 8;
339781ad6265SDimitry Andric
339881ad6265SDimitry Andric bool Delimited = false;
339981ad6265SDimitry Andric bool FoundEndDelimiter = false;
340081ad6265SDimitry Andric unsigned Count = 0;
340181ad6265SDimitry Andric bool Diagnose = Result && !isLexingRawMode();
34020b57cec5SDimitry Andric
34030b57cec5SDimitry Andric if (!LangOpts.CPlusPlus && !LangOpts.C99) {
3404349cc55cSDimitry Andric if (Diagnose)
34050b57cec5SDimitry Andric Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
3406bdd1243dSDimitry Andric return std::nullopt;
34070b57cec5SDimitry Andric }
34080b57cec5SDimitry Andric
34090b57cec5SDimitry Andric const char *CurPtr = StartPtr + CharSize;
34100b57cec5SDimitry Andric const char *KindLoc = &CurPtr[-1];
34110b57cec5SDimitry Andric
34120b57cec5SDimitry Andric uint32_t CodePoint = 0;
3413349cc55cSDimitry Andric while (Count != NumHexDigits || Delimited) {
34140b57cec5SDimitry Andric char C = getCharAndSize(CurPtr, CharSize);
3415bdd1243dSDimitry Andric if (!Delimited && Count == 0 && C == '{') {
3416349cc55cSDimitry Andric Delimited = true;
3417349cc55cSDimitry Andric CurPtr += CharSize;
3418349cc55cSDimitry Andric continue;
3419349cc55cSDimitry Andric }
3420349cc55cSDimitry Andric
3421349cc55cSDimitry Andric if (Delimited && C == '}') {
3422349cc55cSDimitry Andric CurPtr += CharSize;
3423349cc55cSDimitry Andric FoundEndDelimiter = true;
3424349cc55cSDimitry Andric break;
3425349cc55cSDimitry Andric }
34260b57cec5SDimitry Andric
34270b57cec5SDimitry Andric unsigned Value = llvm::hexDigitValue(C);
34280b57cec5SDimitry Andric if (Value == -1U) {
3429349cc55cSDimitry Andric if (!Delimited)
3430349cc55cSDimitry Andric break;
3431349cc55cSDimitry Andric if (Diagnose)
3432bdd1243dSDimitry Andric Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
343381ad6265SDimitry Andric << StringRef(KindLoc, 1);
3434bdd1243dSDimitry Andric return std::nullopt;
3435349cc55cSDimitry Andric }
34360b57cec5SDimitry Andric
3437349cc55cSDimitry Andric if (CodePoint & 0xF000'0000) {
3438349cc55cSDimitry Andric if (Diagnose)
3439349cc55cSDimitry Andric Diag(KindLoc, diag::err_escape_too_large) << 0;
3440bdd1243dSDimitry Andric return std::nullopt;
3441349cc55cSDimitry Andric }
3442349cc55cSDimitry Andric
3443349cc55cSDimitry Andric CodePoint <<= 4;
3444349cc55cSDimitry Andric CodePoint |= Value;
3445349cc55cSDimitry Andric CurPtr += CharSize;
3446349cc55cSDimitry Andric Count++;
3447349cc55cSDimitry Andric }
3448349cc55cSDimitry Andric
3449349cc55cSDimitry Andric if (Count == 0) {
3450349cc55cSDimitry Andric if (Diagnose)
3451bdd1243dSDimitry Andric Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
3452349cc55cSDimitry Andric : diag::warn_ucn_escape_no_digits)
3453349cc55cSDimitry Andric << StringRef(KindLoc, 1);
3454bdd1243dSDimitry Andric return std::nullopt;
345581ad6265SDimitry Andric }
345681ad6265SDimitry Andric
345781ad6265SDimitry Andric if (Delimited && Kind == 'U') {
345881ad6265SDimitry Andric if (Diagnose)
3459bdd1243dSDimitry Andric Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
3460bdd1243dSDimitry Andric return std::nullopt;
3461349cc55cSDimitry Andric }
3462349cc55cSDimitry Andric
3463349cc55cSDimitry Andric if (!Delimited && Count != NumHexDigits) {
3464349cc55cSDimitry Andric if (Diagnose) {
3465bdd1243dSDimitry Andric Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
34660b57cec5SDimitry Andric // If the user wrote \U1234, suggest a fixit to \u.
3467349cc55cSDimitry Andric if (Count == 4 && NumHexDigits == 8) {
34680b57cec5SDimitry Andric CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
34690b57cec5SDimitry Andric Diag(KindLoc, diag::note_ucn_four_not_eight)
34700b57cec5SDimitry Andric << FixItHint::CreateReplacement(URange, "u");
34710b57cec5SDimitry Andric }
34720b57cec5SDimitry Andric }
3473bdd1243dSDimitry Andric return std::nullopt;
34740b57cec5SDimitry Andric }
34750b57cec5SDimitry Andric
3476349cc55cSDimitry Andric if (Delimited && PP) {
347706c3fb27SDimitry Andric Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
347806c3fb27SDimitry Andric ? diag::warn_cxx23_delimited_escape_sequence
3479753f127fSDimitry Andric : diag::ext_delimited_escape_sequence)
3480753f127fSDimitry Andric << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
34810b57cec5SDimitry Andric }
34820b57cec5SDimitry Andric
34830b57cec5SDimitry Andric if (Result) {
34840b57cec5SDimitry Andric Result->setFlag(Token::HasUCN);
3485bdd1243dSDimitry Andric // If the UCN contains either a trigraph or a line splicing,
3486bdd1243dSDimitry Andric // we need to call getAndAdvanceChar again to set the appropriate flags
3487bdd1243dSDimitry Andric // on Result.
3488bdd1243dSDimitry Andric if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
34890b57cec5SDimitry Andric StartPtr = CurPtr;
34900b57cec5SDimitry Andric else
34910b57cec5SDimitry Andric while (StartPtr != CurPtr)
34920b57cec5SDimitry Andric (void)getAndAdvanceChar(StartPtr, *Result);
34930b57cec5SDimitry Andric } else {
34940b57cec5SDimitry Andric StartPtr = CurPtr;
34950b57cec5SDimitry Andric }
349681ad6265SDimitry Andric return CodePoint;
349781ad6265SDimitry Andric }
349881ad6265SDimitry Andric
tryReadNamedUCN(const char * & StartPtr,const char * SlashLoc,Token * Result)3499bdd1243dSDimitry Andric std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
3500bdd1243dSDimitry Andric const char *SlashLoc,
350181ad6265SDimitry Andric Token *Result) {
350281ad6265SDimitry Andric unsigned CharSize;
350381ad6265SDimitry Andric bool Diagnose = Result && !isLexingRawMode();
350481ad6265SDimitry Andric
350581ad6265SDimitry Andric char C = getCharAndSize(StartPtr, CharSize);
350681ad6265SDimitry Andric assert(C == 'N' && "expected \\N{...}");
350781ad6265SDimitry Andric
350881ad6265SDimitry Andric const char *CurPtr = StartPtr + CharSize;
350981ad6265SDimitry Andric const char *KindLoc = &CurPtr[-1];
351081ad6265SDimitry Andric
351181ad6265SDimitry Andric C = getCharAndSize(CurPtr, CharSize);
351281ad6265SDimitry Andric if (C != '{') {
351381ad6265SDimitry Andric if (Diagnose)
3514bdd1243dSDimitry Andric Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
3515bdd1243dSDimitry Andric return std::nullopt;
351681ad6265SDimitry Andric }
351781ad6265SDimitry Andric CurPtr += CharSize;
351881ad6265SDimitry Andric const char *StartName = CurPtr;
351981ad6265SDimitry Andric bool FoundEndDelimiter = false;
352081ad6265SDimitry Andric llvm::SmallVector<char, 30> Buffer;
352181ad6265SDimitry Andric while (C) {
352281ad6265SDimitry Andric C = getCharAndSize(CurPtr, CharSize);
352381ad6265SDimitry Andric CurPtr += CharSize;
352481ad6265SDimitry Andric if (C == '}') {
352581ad6265SDimitry Andric FoundEndDelimiter = true;
352681ad6265SDimitry Andric break;
352781ad6265SDimitry Andric }
352881ad6265SDimitry Andric
3529bdd1243dSDimitry Andric if (isVerticalWhitespace(C))
353081ad6265SDimitry Andric break;
353181ad6265SDimitry Andric Buffer.push_back(C);
353281ad6265SDimitry Andric }
353381ad6265SDimitry Andric
353481ad6265SDimitry Andric if (!FoundEndDelimiter || Buffer.empty()) {
353581ad6265SDimitry Andric if (Diagnose)
3536bdd1243dSDimitry Andric Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
353781ad6265SDimitry Andric : diag::warn_delimited_ucn_incomplete)
353881ad6265SDimitry Andric << StringRef(KindLoc, 1);
3539bdd1243dSDimitry Andric return std::nullopt;
354081ad6265SDimitry Andric }
354181ad6265SDimitry Andric
354281ad6265SDimitry Andric StringRef Name(Buffer.data(), Buffer.size());
3543bdd1243dSDimitry Andric std::optional<char32_t> Match =
354481ad6265SDimitry Andric llvm::sys::unicode::nameToCodepointStrict(Name);
3545bdd1243dSDimitry Andric std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
3546bdd1243dSDimitry Andric if (!Match) {
354781ad6265SDimitry Andric LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
3548bdd1243dSDimitry Andric if (Diagnose) {
3549bdd1243dSDimitry Andric Diag(StartName, diag::err_invalid_ucn_name)
3550bdd1243dSDimitry Andric << StringRef(Buffer.data(), Buffer.size())
3551bdd1243dSDimitry Andric << makeCharRange(*this, StartName, CurPtr - CharSize);
355281ad6265SDimitry Andric if (LooseMatch) {
355381ad6265SDimitry Andric Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
355481ad6265SDimitry Andric << FixItHint::CreateReplacement(
355581ad6265SDimitry Andric makeCharRange(*this, StartName, CurPtr - CharSize),
355681ad6265SDimitry Andric LooseMatch->Name);
355781ad6265SDimitry Andric }
355881ad6265SDimitry Andric }
3559bdd1243dSDimitry Andric // We do not offer misspelled character names suggestions here
356081ad6265SDimitry Andric // as the set of what would be a valid suggestion depends on context,
356181ad6265SDimitry Andric // and we should not make invalid suggestions.
356281ad6265SDimitry Andric }
356381ad6265SDimitry Andric
3564bdd1243dSDimitry Andric if (Diagnose && Match)
356506c3fb27SDimitry Andric Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
356606c3fb27SDimitry Andric ? diag::warn_cxx23_delimited_escape_sequence
3567753f127fSDimitry Andric : diag::ext_delimited_escape_sequence)
3568753f127fSDimitry Andric << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
356981ad6265SDimitry Andric
3570bdd1243dSDimitry Andric // If no diagnostic has been emitted yet, likely because we are doing a
3571bdd1243dSDimitry Andric // tentative lexing, we do not want to recover here to make sure the token
3572bdd1243dSDimitry Andric // will not be incorrectly considered valid. This function will be called
3573bdd1243dSDimitry Andric // again and a diagnostic emitted then.
3574bdd1243dSDimitry Andric if (LooseMatch && Diagnose)
3575bdd1243dSDimitry Andric Match = LooseMatch->CodePoint;
357681ad6265SDimitry Andric
357781ad6265SDimitry Andric if (Result) {
357881ad6265SDimitry Andric Result->setFlag(Token::HasUCN);
3579bdd1243dSDimitry Andric // If the UCN contains either a trigraph or a line splicing,
3580bdd1243dSDimitry Andric // we need to call getAndAdvanceChar again to set the appropriate flags
3581bdd1243dSDimitry Andric // on Result.
3582bdd1243dSDimitry Andric if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
358381ad6265SDimitry Andric StartPtr = CurPtr;
358481ad6265SDimitry Andric else
358581ad6265SDimitry Andric while (StartPtr != CurPtr)
358681ad6265SDimitry Andric (void)getAndAdvanceChar(StartPtr, *Result);
358781ad6265SDimitry Andric } else {
358881ad6265SDimitry Andric StartPtr = CurPtr;
358981ad6265SDimitry Andric }
3590bdd1243dSDimitry Andric return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
359181ad6265SDimitry Andric }
359281ad6265SDimitry Andric
tryReadUCN(const char * & StartPtr,const char * SlashLoc,Token * Result)359381ad6265SDimitry Andric uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
359481ad6265SDimitry Andric Token *Result) {
359581ad6265SDimitry Andric
359681ad6265SDimitry Andric unsigned CharSize;
3597bdd1243dSDimitry Andric std::optional<uint32_t> CodePointOpt;
359881ad6265SDimitry Andric char Kind = getCharAndSize(StartPtr, CharSize);
359981ad6265SDimitry Andric if (Kind == 'u' || Kind == 'U')
360081ad6265SDimitry Andric CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
360181ad6265SDimitry Andric else if (Kind == 'N')
3602bdd1243dSDimitry Andric CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
360381ad6265SDimitry Andric
360481ad6265SDimitry Andric if (!CodePointOpt)
360581ad6265SDimitry Andric return 0;
360681ad6265SDimitry Andric
360781ad6265SDimitry Andric uint32_t CodePoint = *CodePointOpt;
36080b57cec5SDimitry Andric
36090b57cec5SDimitry Andric // Don't apply C family restrictions to UCNs in assembly mode
36100b57cec5SDimitry Andric if (LangOpts.AsmPreprocessor)
36110b57cec5SDimitry Andric return CodePoint;
36120b57cec5SDimitry Andric
36135f757f3fSDimitry Andric // C23 6.4.3p2: A universal character name shall not designate a code point
361406c3fb27SDimitry Andric // where the hexadecimal value is:
361506c3fb27SDimitry Andric // - in the range D800 through DFFF inclusive; or
361606c3fb27SDimitry Andric // - greater than 10FFFF.
361706c3fb27SDimitry Andric // A universal-character-name outside the c-char-sequence of a character
361806c3fb27SDimitry Andric // constant, or the s-char-sequence of a string-literal shall not designate
361906c3fb27SDimitry Andric // a control character or a character in the basic character set.
362006c3fb27SDimitry Andric
36210b57cec5SDimitry Andric // C++11 [lex.charset]p2: If the hexadecimal value for a
36220b57cec5SDimitry Andric // universal-character-name corresponds to a surrogate code point (in the
36230b57cec5SDimitry Andric // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
36240b57cec5SDimitry Andric // if the hexadecimal value for a universal-character-name outside the
36250b57cec5SDimitry Andric // c-char-sequence, s-char-sequence, or r-char-sequence of a character or
36260b57cec5SDimitry Andric // string literal corresponds to a control character (in either of the
36270b57cec5SDimitry Andric // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
36280b57cec5SDimitry Andric // basic source character set, the program is ill-formed.
36290b57cec5SDimitry Andric if (CodePoint < 0xA0) {
36300b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to warn about bad
36310b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block.
36320b57cec5SDimitry Andric if (Result && PP) {
36330b57cec5SDimitry Andric if (CodePoint < 0x20 || CodePoint >= 0x7F)
36340b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_control_character);
36350b57cec5SDimitry Andric else {
36360b57cec5SDimitry Andric char C = static_cast<char>(CodePoint);
36370b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
36380b57cec5SDimitry Andric }
36390b57cec5SDimitry Andric }
36400b57cec5SDimitry Andric
36410b57cec5SDimitry Andric return 0;
36420b57cec5SDimitry Andric } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
36430b57cec5SDimitry Andric // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
36440b57cec5SDimitry Andric // We don't use isLexingRawMode() here because we need to diagnose bad
36450b57cec5SDimitry Andric // UCNs even when skipping preprocessing tokens in a #if block.
36460b57cec5SDimitry Andric if (Result && PP) {
36470b57cec5SDimitry Andric if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
36480b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
36490b57cec5SDimitry Andric else
36500b57cec5SDimitry Andric Diag(BufferPtr, diag::err_ucn_escape_invalid);
36510b57cec5SDimitry Andric }
36520b57cec5SDimitry Andric return 0;
36530b57cec5SDimitry Andric }
36540b57cec5SDimitry Andric
36550b57cec5SDimitry Andric return CodePoint;
36560b57cec5SDimitry Andric }
36570b57cec5SDimitry Andric
CheckUnicodeWhitespace(Token & Result,uint32_t C,const char * CurPtr)36580b57cec5SDimitry Andric bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
36590b57cec5SDimitry Andric const char *CurPtr) {
36600b57cec5SDimitry Andric if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3661349cc55cSDimitry Andric isUnicodeWhitespace(C)) {
36620b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_unicode_whitespace)
36630b57cec5SDimitry Andric << makeCharRange(*this, BufferPtr, CurPtr);
36640b57cec5SDimitry Andric
36650b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace);
36660b57cec5SDimitry Andric return true;
36670b57cec5SDimitry Andric }
36680b57cec5SDimitry Andric return false;
36690b57cec5SDimitry Andric }
36700b57cec5SDimitry Andric
PropagateLineStartLeadingSpaceInfo(Token & Result)36710b57cec5SDimitry Andric void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
36720b57cec5SDimitry Andric IsAtStartOfLine = Result.isAtStartOfLine();
36730b57cec5SDimitry Andric HasLeadingSpace = Result.hasLeadingSpace();
36740b57cec5SDimitry Andric HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
36750b57cec5SDimitry Andric // Note that this doesn't affect IsAtPhysicalStartOfLine.
36760b57cec5SDimitry Andric }
36770b57cec5SDimitry Andric
Lex(Token & Result)36780b57cec5SDimitry Andric bool Lexer::Lex(Token &Result) {
367981ad6265SDimitry Andric assert(!isDependencyDirectivesLexer());
368081ad6265SDimitry Andric
36810b57cec5SDimitry Andric // Start a new token.
36820b57cec5SDimitry Andric Result.startToken();
36830b57cec5SDimitry Andric
36840b57cec5SDimitry Andric // Set up misc whitespace flags for LexTokenInternal.
36850b57cec5SDimitry Andric if (IsAtStartOfLine) {
36860b57cec5SDimitry Andric Result.setFlag(Token::StartOfLine);
36870b57cec5SDimitry Andric IsAtStartOfLine = false;
36880b57cec5SDimitry Andric }
36890b57cec5SDimitry Andric
36900b57cec5SDimitry Andric if (HasLeadingSpace) {
36910b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace);
36920b57cec5SDimitry Andric HasLeadingSpace = false;
36930b57cec5SDimitry Andric }
36940b57cec5SDimitry Andric
36950b57cec5SDimitry Andric if (HasLeadingEmptyMacro) {
36960b57cec5SDimitry Andric Result.setFlag(Token::LeadingEmptyMacro);
36970b57cec5SDimitry Andric HasLeadingEmptyMacro = false;
36980b57cec5SDimitry Andric }
36990b57cec5SDimitry Andric
37000b57cec5SDimitry Andric bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
37010b57cec5SDimitry Andric IsAtPhysicalStartOfLine = false;
37020b57cec5SDimitry Andric bool isRawLex = isLexingRawMode();
37030b57cec5SDimitry Andric (void) isRawLex;
37040b57cec5SDimitry Andric bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
37050b57cec5SDimitry Andric // (After the LexTokenInternal call, the lexer might be destroyed.)
37060b57cec5SDimitry Andric assert((returnedToken || !isRawLex) && "Raw lex must succeed");
37070b57cec5SDimitry Andric return returnedToken;
37080b57cec5SDimitry Andric }
37090b57cec5SDimitry Andric
37100b57cec5SDimitry Andric /// LexTokenInternal - This implements a simple C family lexer. It is an
37110b57cec5SDimitry Andric /// extremely performance critical piece of code. This assumes that the buffer
37120b57cec5SDimitry Andric /// has a null character at the end of the file. This returns a preprocessing
37130b57cec5SDimitry Andric /// token, not a normal token, as such, it is an internal interface. It assumes
37140b57cec5SDimitry Andric /// that the Flags of result have been cleared before calling this.
LexTokenInternal(Token & Result,bool TokAtPhysicalStartOfLine)37150b57cec5SDimitry Andric bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3716bdd1243dSDimitry Andric LexStart:
3717bdd1243dSDimitry Andric assert(!Result.needsCleaning() && "Result needs cleaning");
3718bdd1243dSDimitry Andric assert(!Result.hasPtrData() && "Result has not been reset");
37190b57cec5SDimitry Andric
37200b57cec5SDimitry Andric // CurPtr - Cache BufferPtr in an automatic variable.
37210b57cec5SDimitry Andric const char *CurPtr = BufferPtr;
37220b57cec5SDimitry Andric
37230b57cec5SDimitry Andric // Small amounts of horizontal whitespace is very common between tokens.
3724fe6060f1SDimitry Andric if (isHorizontalWhitespace(*CurPtr)) {
3725fe6060f1SDimitry Andric do {
37260b57cec5SDimitry Andric ++CurPtr;
3727fe6060f1SDimitry Andric } while (isHorizontalWhitespace(*CurPtr));
37280b57cec5SDimitry Andric
37290b57cec5SDimitry Andric // If we are keeping whitespace and other tokens, just return what we just
37300b57cec5SDimitry Andric // skipped. The next lexer invocation will return the token after the
37310b57cec5SDimitry Andric // whitespace.
37320b57cec5SDimitry Andric if (isKeepWhitespaceMode()) {
37330b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::unknown);
37340b57cec5SDimitry Andric // FIXME: The next token will not have LeadingSpace set.
37350b57cec5SDimitry Andric return true;
37360b57cec5SDimitry Andric }
37370b57cec5SDimitry Andric
37380b57cec5SDimitry Andric BufferPtr = CurPtr;
37390b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace);
37400b57cec5SDimitry Andric }
37410b57cec5SDimitry Andric
37420b57cec5SDimitry Andric unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below.
37430b57cec5SDimitry Andric
37440b57cec5SDimitry Andric // Read a character, advancing over it.
37450b57cec5SDimitry Andric char Char = getAndAdvanceChar(CurPtr, Result);
37460b57cec5SDimitry Andric tok::TokenKind Kind;
37470b57cec5SDimitry Andric
3748e8d8bef9SDimitry Andric if (!isVerticalWhitespace(Char))
3749e8d8bef9SDimitry Andric NewLinePtr = nullptr;
3750e8d8bef9SDimitry Andric
37510b57cec5SDimitry Andric switch (Char) {
37520b57cec5SDimitry Andric case 0: // Null.
37530b57cec5SDimitry Andric // Found end of file?
37540b57cec5SDimitry Andric if (CurPtr-1 == BufferEnd)
37550b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1);
37560b57cec5SDimitry Andric
37570b57cec5SDimitry Andric // Check if we are performing code completion.
37580b57cec5SDimitry Andric if (isCodeCompletionPoint(CurPtr-1)) {
37590b57cec5SDimitry Andric // Return the code-completion token.
37600b57cec5SDimitry Andric Result.startToken();
37610b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::code_completion);
37620b57cec5SDimitry Andric return true;
37630b57cec5SDimitry Andric }
37640b57cec5SDimitry Andric
37650b57cec5SDimitry Andric if (!isLexingRawMode())
37660b57cec5SDimitry Andric Diag(CurPtr-1, diag::null_in_file);
37670b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace);
37680b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
37690b57cec5SDimitry Andric return true; // KeepWhitespaceMode
37700b57cec5SDimitry Andric
37710b57cec5SDimitry Andric // We know the lexer hasn't changed, so just try again with this lexer.
37720b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.)
37730b57cec5SDimitry Andric goto LexNextToken;
37740b57cec5SDimitry Andric
37750b57cec5SDimitry Andric case 26: // DOS & CP/M EOF: "^Z".
37760b57cec5SDimitry Andric // If we're in Microsoft extensions mode, treat this as end of file.
37770b57cec5SDimitry Andric if (LangOpts.MicrosoftExt) {
37780b57cec5SDimitry Andric if (!isLexingRawMode())
37790b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
37800b57cec5SDimitry Andric return LexEndOfFile(Result, CurPtr-1);
37810b57cec5SDimitry Andric }
37820b57cec5SDimitry Andric
37830b57cec5SDimitry Andric // If Microsoft extensions are disabled, this is just random garbage.
37840b57cec5SDimitry Andric Kind = tok::unknown;
37850b57cec5SDimitry Andric break;
37860b57cec5SDimitry Andric
37870b57cec5SDimitry Andric case '\r':
37880b57cec5SDimitry Andric if (CurPtr[0] == '\n')
37890b57cec5SDimitry Andric (void)getAndAdvanceChar(CurPtr, Result);
3790bdd1243dSDimitry Andric [[fallthrough]];
37910b57cec5SDimitry Andric case '\n':
37920b57cec5SDimitry Andric // If we are inside a preprocessor directive and we see the end of line,
37930b57cec5SDimitry Andric // we know we are done with the directive, so return an EOD token.
37940b57cec5SDimitry Andric if (ParsingPreprocessorDirective) {
37950b57cec5SDimitry Andric // Done parsing the "line".
37960b57cec5SDimitry Andric ParsingPreprocessorDirective = false;
37970b57cec5SDimitry Andric
37980b57cec5SDimitry Andric // Restore comment saving mode, in case it was disabled for directive.
37990b57cec5SDimitry Andric if (PP)
38000b57cec5SDimitry Andric resetExtendedTokenMode();
38010b57cec5SDimitry Andric
38020b57cec5SDimitry Andric // Since we consumed a newline, we are back at the start of a line.
38030b57cec5SDimitry Andric IsAtStartOfLine = true;
38040b57cec5SDimitry Andric IsAtPhysicalStartOfLine = true;
3805e8d8bef9SDimitry Andric NewLinePtr = CurPtr - 1;
38060b57cec5SDimitry Andric
38070b57cec5SDimitry Andric Kind = tok::eod;
38080b57cec5SDimitry Andric break;
38090b57cec5SDimitry Andric }
38100b57cec5SDimitry Andric
38110b57cec5SDimitry Andric // No leading whitespace seen so far.
38120b57cec5SDimitry Andric Result.clearFlag(Token::LeadingSpace);
38130b57cec5SDimitry Andric
38140b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
38150b57cec5SDimitry Andric return true; // KeepWhitespaceMode
38160b57cec5SDimitry Andric
38170b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer.
38180b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.)
38190b57cec5SDimitry Andric goto LexNextToken;
38200b57cec5SDimitry Andric case ' ':
38210b57cec5SDimitry Andric case '\t':
38220b57cec5SDimitry Andric case '\f':
38230b57cec5SDimitry Andric case '\v':
38240b57cec5SDimitry Andric SkipHorizontalWhitespace:
38250b57cec5SDimitry Andric Result.setFlag(Token::LeadingSpace);
38260b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
38270b57cec5SDimitry Andric return true; // KeepWhitespaceMode
38280b57cec5SDimitry Andric
38290b57cec5SDimitry Andric SkipIgnoredUnits:
38300b57cec5SDimitry Andric CurPtr = BufferPtr;
38310b57cec5SDimitry Andric
38320b57cec5SDimitry Andric // If the next token is obviously a // or /* */ comment, skip it efficiently
38330b57cec5SDimitry Andric // too (without going through the big switch stmt).
38340b57cec5SDimitry Andric if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
383581ad6265SDimitry Andric LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
38360b57cec5SDimitry Andric if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
38370b57cec5SDimitry Andric return true; // There is a token to return.
38380b57cec5SDimitry Andric goto SkipIgnoredUnits;
38390b57cec5SDimitry Andric } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
38400b57cec5SDimitry Andric if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
38410b57cec5SDimitry Andric return true; // There is a token to return.
38420b57cec5SDimitry Andric goto SkipIgnoredUnits;
38430b57cec5SDimitry Andric } else if (isHorizontalWhitespace(*CurPtr)) {
38440b57cec5SDimitry Andric goto SkipHorizontalWhitespace;
38450b57cec5SDimitry Andric }
38460b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer.
38470b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.)
38480b57cec5SDimitry Andric goto LexNextToken;
38490b57cec5SDimitry Andric
38500b57cec5SDimitry Andric // C99 6.4.4.1: Integer Constants.
38510b57cec5SDimitry Andric // C99 6.4.4.2: Floating Constants.
38520b57cec5SDimitry Andric case '0': case '1': case '2': case '3': case '4':
38530b57cec5SDimitry Andric case '5': case '6': case '7': case '8': case '9':
38540b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token.
38550b57cec5SDimitry Andric MIOpt.ReadToken();
38560b57cec5SDimitry Andric return LexNumericConstant(Result, CurPtr);
38570b57cec5SDimitry Andric
385881ad6265SDimitry Andric // Identifier (e.g., uber), or
38595f757f3fSDimitry Andric // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
386081ad6265SDimitry Andric // UTF-8 or UTF-16 string literal (C11/C++11).
386181ad6265SDimitry Andric case 'u':
38620b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token.
38630b57cec5SDimitry Andric MIOpt.ReadToken();
38640b57cec5SDimitry Andric
38650b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) {
38660b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
38670b57cec5SDimitry Andric
38680b57cec5SDimitry Andric // UTF-16 string literal
38690b57cec5SDimitry Andric if (Char == '"')
38700b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
38710b57cec5SDimitry Andric tok::utf16_string_literal);
38720b57cec5SDimitry Andric
38730b57cec5SDimitry Andric // UTF-16 character constant
38740b57cec5SDimitry Andric if (Char == '\'')
38750b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
38760b57cec5SDimitry Andric tok::utf16_char_constant);
38770b57cec5SDimitry Andric
38780b57cec5SDimitry Andric // UTF-16 raw string literal
3879*0fca6ea1SDimitry Andric if (Char == 'R' && LangOpts.RawStringLiterals &&
38800b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
38810b57cec5SDimitry Andric return LexRawStringLiteral(Result,
38820b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
38830b57cec5SDimitry Andric SizeTmp2, Result),
38840b57cec5SDimitry Andric tok::utf16_string_literal);
38850b57cec5SDimitry Andric
38860b57cec5SDimitry Andric if (Char == '8') {
38870b57cec5SDimitry Andric char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
38880b57cec5SDimitry Andric
38890b57cec5SDimitry Andric // UTF-8 string literal
38900b57cec5SDimitry Andric if (Char2 == '"')
38910b57cec5SDimitry Andric return LexStringLiteral(Result,
38920b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
38930b57cec5SDimitry Andric SizeTmp2, Result),
38940b57cec5SDimitry Andric tok::utf8_string_literal);
38955f757f3fSDimitry Andric if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
38960b57cec5SDimitry Andric return LexCharConstant(
38970b57cec5SDimitry Andric Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
38980b57cec5SDimitry Andric SizeTmp2, Result),
38990b57cec5SDimitry Andric tok::utf8_char_constant);
39000b57cec5SDimitry Andric
3901*0fca6ea1SDimitry Andric if (Char2 == 'R' && LangOpts.RawStringLiterals) {
39020b57cec5SDimitry Andric unsigned SizeTmp3;
39030b57cec5SDimitry Andric char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
39040b57cec5SDimitry Andric // UTF-8 raw string literal
39050b57cec5SDimitry Andric if (Char3 == '"') {
39060b57cec5SDimitry Andric return LexRawStringLiteral(Result,
39070b57cec5SDimitry Andric ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
39080b57cec5SDimitry Andric SizeTmp2, Result),
39090b57cec5SDimitry Andric SizeTmp3, Result),
39100b57cec5SDimitry Andric tok::utf8_string_literal);
39110b57cec5SDimitry Andric }
39120b57cec5SDimitry Andric }
39130b57cec5SDimitry Andric }
39140b57cec5SDimitry Andric }
39150b57cec5SDimitry Andric
39160b57cec5SDimitry Andric // treat u like the start of an identifier.
3917349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr);
39180b57cec5SDimitry Andric
391981ad6265SDimitry Andric case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
39200b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token.
39210b57cec5SDimitry Andric MIOpt.ReadToken();
39220b57cec5SDimitry Andric
39230b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 || LangOpts.C11) {
39240b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
39250b57cec5SDimitry Andric
39260b57cec5SDimitry Andric // UTF-32 string literal
39270b57cec5SDimitry Andric if (Char == '"')
39280b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
39290b57cec5SDimitry Andric tok::utf32_string_literal);
39300b57cec5SDimitry Andric
39310b57cec5SDimitry Andric // UTF-32 character constant
39320b57cec5SDimitry Andric if (Char == '\'')
39330b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
39340b57cec5SDimitry Andric tok::utf32_char_constant);
39350b57cec5SDimitry Andric
39360b57cec5SDimitry Andric // UTF-32 raw string literal
3937*0fca6ea1SDimitry Andric if (Char == 'R' && LangOpts.RawStringLiterals &&
39380b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
39390b57cec5SDimitry Andric return LexRawStringLiteral(Result,
39400b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
39410b57cec5SDimitry Andric SizeTmp2, Result),
39420b57cec5SDimitry Andric tok::utf32_string_literal);
39430b57cec5SDimitry Andric }
39440b57cec5SDimitry Andric
39450b57cec5SDimitry Andric // treat U like the start of an identifier.
3946349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr);
39470b57cec5SDimitry Andric
39480b57cec5SDimitry Andric case 'R': // Identifier or C++0x raw string literal
39490b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token.
39500b57cec5SDimitry Andric MIOpt.ReadToken();
39510b57cec5SDimitry Andric
3952*0fca6ea1SDimitry Andric if (LangOpts.RawStringLiterals) {
39530b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
39540b57cec5SDimitry Andric
39550b57cec5SDimitry Andric if (Char == '"')
39560b57cec5SDimitry Andric return LexRawStringLiteral(Result,
39570b57cec5SDimitry Andric ConsumeChar(CurPtr, SizeTmp, Result),
39580b57cec5SDimitry Andric tok::string_literal);
39590b57cec5SDimitry Andric }
39600b57cec5SDimitry Andric
39610b57cec5SDimitry Andric // treat R like the start of an identifier.
3962349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr);
39630b57cec5SDimitry Andric
39640b57cec5SDimitry Andric case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
39650b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token.
39660b57cec5SDimitry Andric MIOpt.ReadToken();
39670b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
39680b57cec5SDimitry Andric
39690b57cec5SDimitry Andric // Wide string literal.
39700b57cec5SDimitry Andric if (Char == '"')
39710b57cec5SDimitry Andric return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
39720b57cec5SDimitry Andric tok::wide_string_literal);
39730b57cec5SDimitry Andric
39740b57cec5SDimitry Andric // Wide raw string literal.
3975*0fca6ea1SDimitry Andric if (LangOpts.RawStringLiterals && Char == 'R' &&
39760b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
39770b57cec5SDimitry Andric return LexRawStringLiteral(Result,
39780b57cec5SDimitry Andric ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
39790b57cec5SDimitry Andric SizeTmp2, Result),
39800b57cec5SDimitry Andric tok::wide_string_literal);
39810b57cec5SDimitry Andric
39820b57cec5SDimitry Andric // Wide character constant.
39830b57cec5SDimitry Andric if (Char == '\'')
39840b57cec5SDimitry Andric return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
39850b57cec5SDimitry Andric tok::wide_char_constant);
39860b57cec5SDimitry Andric // FALL THROUGH, treating L like the start of an identifier.
3987bdd1243dSDimitry Andric [[fallthrough]];
39880b57cec5SDimitry Andric
39890b57cec5SDimitry Andric // C99 6.4.2: Identifiers.
39900b57cec5SDimitry Andric case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
39910b57cec5SDimitry Andric case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
39920b57cec5SDimitry Andric case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/
39930b57cec5SDimitry Andric case 'V': case 'W': case 'X': case 'Y': case 'Z':
39940b57cec5SDimitry Andric case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
39950b57cec5SDimitry Andric case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
39960b57cec5SDimitry Andric case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
39970b57cec5SDimitry Andric case 'v': case 'w': case 'x': case 'y': case 'z':
39980b57cec5SDimitry Andric case '_':
39990b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token.
40000b57cec5SDimitry Andric MIOpt.ReadToken();
4001349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr);
40020b57cec5SDimitry Andric
40030b57cec5SDimitry Andric case '$': // $ in identifiers.
40040b57cec5SDimitry Andric if (LangOpts.DollarIdents) {
40050b57cec5SDimitry Andric if (!isLexingRawMode())
40060b57cec5SDimitry Andric Diag(CurPtr-1, diag::ext_dollar_in_identifier);
40070b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token.
40080b57cec5SDimitry Andric MIOpt.ReadToken();
4009349cc55cSDimitry Andric return LexIdentifierContinue(Result, CurPtr);
40100b57cec5SDimitry Andric }
40110b57cec5SDimitry Andric
40120b57cec5SDimitry Andric Kind = tok::unknown;
40130b57cec5SDimitry Andric break;
40140b57cec5SDimitry Andric
40150b57cec5SDimitry Andric // C99 6.4.4: Character Constants.
40160b57cec5SDimitry Andric case '\'':
40170b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token.
40180b57cec5SDimitry Andric MIOpt.ReadToken();
40190b57cec5SDimitry Andric return LexCharConstant(Result, CurPtr, tok::char_constant);
40200b57cec5SDimitry Andric
40210b57cec5SDimitry Andric // C99 6.4.5: String Literals.
40220b57cec5SDimitry Andric case '"':
40230b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token.
40240b57cec5SDimitry Andric MIOpt.ReadToken();
40250b57cec5SDimitry Andric return LexStringLiteral(Result, CurPtr,
40260b57cec5SDimitry Andric ParsingFilename ? tok::header_name
40270b57cec5SDimitry Andric : tok::string_literal);
40280b57cec5SDimitry Andric
40290b57cec5SDimitry Andric // C99 6.4.6: Punctuators.
40300b57cec5SDimitry Andric case '?':
40310b57cec5SDimitry Andric Kind = tok::question;
40320b57cec5SDimitry Andric break;
40330b57cec5SDimitry Andric case '[':
40340b57cec5SDimitry Andric Kind = tok::l_square;
40350b57cec5SDimitry Andric break;
40360b57cec5SDimitry Andric case ']':
40370b57cec5SDimitry Andric Kind = tok::r_square;
40380b57cec5SDimitry Andric break;
40390b57cec5SDimitry Andric case '(':
40400b57cec5SDimitry Andric Kind = tok::l_paren;
40410b57cec5SDimitry Andric break;
40420b57cec5SDimitry Andric case ')':
40430b57cec5SDimitry Andric Kind = tok::r_paren;
40440b57cec5SDimitry Andric break;
40450b57cec5SDimitry Andric case '{':
40460b57cec5SDimitry Andric Kind = tok::l_brace;
40470b57cec5SDimitry Andric break;
40480b57cec5SDimitry Andric case '}':
40490b57cec5SDimitry Andric Kind = tok::r_brace;
40500b57cec5SDimitry Andric break;
40510b57cec5SDimitry Andric case '.':
40520b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
40530b57cec5SDimitry Andric if (Char >= '0' && Char <= '9') {
40540b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token.
40550b57cec5SDimitry Andric MIOpt.ReadToken();
40560b57cec5SDimitry Andric
40570b57cec5SDimitry Andric return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
40580b57cec5SDimitry Andric } else if (LangOpts.CPlusPlus && Char == '*') {
40590b57cec5SDimitry Andric Kind = tok::periodstar;
40600b57cec5SDimitry Andric CurPtr += SizeTmp;
40610b57cec5SDimitry Andric } else if (Char == '.' &&
40620b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
40630b57cec5SDimitry Andric Kind = tok::ellipsis;
40640b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
40650b57cec5SDimitry Andric SizeTmp2, Result);
40660b57cec5SDimitry Andric } else {
40670b57cec5SDimitry Andric Kind = tok::period;
40680b57cec5SDimitry Andric }
40690b57cec5SDimitry Andric break;
40700b57cec5SDimitry Andric case '&':
40710b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
40720b57cec5SDimitry Andric if (Char == '&') {
40730b57cec5SDimitry Andric Kind = tok::ampamp;
40740b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
40750b57cec5SDimitry Andric } else if (Char == '=') {
40760b57cec5SDimitry Andric Kind = tok::ampequal;
40770b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
40780b57cec5SDimitry Andric } else {
40790b57cec5SDimitry Andric Kind = tok::amp;
40800b57cec5SDimitry Andric }
40810b57cec5SDimitry Andric break;
40820b57cec5SDimitry Andric case '*':
40830b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') {
40840b57cec5SDimitry Andric Kind = tok::starequal;
40850b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
40860b57cec5SDimitry Andric } else {
40870b57cec5SDimitry Andric Kind = tok::star;
40880b57cec5SDimitry Andric }
40890b57cec5SDimitry Andric break;
40900b57cec5SDimitry Andric case '+':
40910b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
40920b57cec5SDimitry Andric if (Char == '+') {
40930b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
40940b57cec5SDimitry Andric Kind = tok::plusplus;
40950b57cec5SDimitry Andric } else if (Char == '=') {
40960b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
40970b57cec5SDimitry Andric Kind = tok::plusequal;
40980b57cec5SDimitry Andric } else {
40990b57cec5SDimitry Andric Kind = tok::plus;
41000b57cec5SDimitry Andric }
41010b57cec5SDimitry Andric break;
41020b57cec5SDimitry Andric case '-':
41030b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
41040b57cec5SDimitry Andric if (Char == '-') { // --
41050b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
41060b57cec5SDimitry Andric Kind = tok::minusminus;
41070b57cec5SDimitry Andric } else if (Char == '>' && LangOpts.CPlusPlus &&
41080b57cec5SDimitry Andric getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->*
41090b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
41100b57cec5SDimitry Andric SizeTmp2, Result);
41110b57cec5SDimitry Andric Kind = tok::arrowstar;
41120b57cec5SDimitry Andric } else if (Char == '>') { // ->
41130b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
41140b57cec5SDimitry Andric Kind = tok::arrow;
41150b57cec5SDimitry Andric } else if (Char == '=') { // -=
41160b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
41170b57cec5SDimitry Andric Kind = tok::minusequal;
41180b57cec5SDimitry Andric } else {
41190b57cec5SDimitry Andric Kind = tok::minus;
41200b57cec5SDimitry Andric }
41210b57cec5SDimitry Andric break;
41220b57cec5SDimitry Andric case '~':
41230b57cec5SDimitry Andric Kind = tok::tilde;
41240b57cec5SDimitry Andric break;
41250b57cec5SDimitry Andric case '!':
41260b57cec5SDimitry Andric if (getCharAndSize(CurPtr, SizeTmp) == '=') {
41270b57cec5SDimitry Andric Kind = tok::exclaimequal;
41280b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
41290b57cec5SDimitry Andric } else {
41300b57cec5SDimitry Andric Kind = tok::exclaim;
41310b57cec5SDimitry Andric }
41320b57cec5SDimitry Andric break;
41330b57cec5SDimitry Andric case '/':
41340b57cec5SDimitry Andric // 6.4.9: Comments
41350b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
41360b57cec5SDimitry Andric if (Char == '/') { // Line comment.
41370b57cec5SDimitry Andric // Even if Line comments are disabled (e.g. in C89 mode), we generally
41380b57cec5SDimitry Andric // want to lex this as a comment. There is one problem with this though,
41390b57cec5SDimitry Andric // that in one particular corner case, this can change the behavior of the
41400b57cec5SDimitry Andric // resultant program. For example, In "foo //**/ bar", C89 would lex
41410b57cec5SDimitry Andric // this as "foo / bar" and languages with Line comments would lex it as
41420b57cec5SDimitry Andric // "foo". Check to see if the character after the second slash is a '*'.
41430b57cec5SDimitry Andric // If so, we will lex that as a "/" instead of the start of a comment.
41440b57cec5SDimitry Andric // However, we never do this if we are just preprocessing.
414581ad6265SDimitry Andric bool TreatAsComment =
414681ad6265SDimitry Andric LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
41470b57cec5SDimitry Andric if (!TreatAsComment)
41480b57cec5SDimitry Andric if (!(PP && PP->isPreprocessedOutput()))
41490b57cec5SDimitry Andric TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
41500b57cec5SDimitry Andric
41510b57cec5SDimitry Andric if (TreatAsComment) {
41520b57cec5SDimitry Andric if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
41530b57cec5SDimitry Andric TokAtPhysicalStartOfLine))
41540b57cec5SDimitry Andric return true; // There is a token to return.
41550b57cec5SDimitry Andric
41560b57cec5SDimitry Andric // It is common for the tokens immediately after a // comment to be
41570b57cec5SDimitry Andric // whitespace (indentation for the next line). Instead of going through
41580b57cec5SDimitry Andric // the big switch, handle it efficiently now.
41590b57cec5SDimitry Andric goto SkipIgnoredUnits;
41600b57cec5SDimitry Andric }
41610b57cec5SDimitry Andric }
41620b57cec5SDimitry Andric
41630b57cec5SDimitry Andric if (Char == '*') { // /**/ comment.
41640b57cec5SDimitry Andric if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
41650b57cec5SDimitry Andric TokAtPhysicalStartOfLine))
41660b57cec5SDimitry Andric return true; // There is a token to return.
41670b57cec5SDimitry Andric
41680b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer.
41690b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.)
41700b57cec5SDimitry Andric goto LexNextToken;
41710b57cec5SDimitry Andric }
41720b57cec5SDimitry Andric
41730b57cec5SDimitry Andric if (Char == '=') {
41740b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
41750b57cec5SDimitry Andric Kind = tok::slashequal;
41760b57cec5SDimitry Andric } else {
41770b57cec5SDimitry Andric Kind = tok::slash;
41780b57cec5SDimitry Andric }
41790b57cec5SDimitry Andric break;
41800b57cec5SDimitry Andric case '%':
41810b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
41820b57cec5SDimitry Andric if (Char == '=') {
41830b57cec5SDimitry Andric Kind = tok::percentequal;
41840b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
41850b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '>') {
41860b57cec5SDimitry Andric Kind = tok::r_brace; // '%>' -> '}'
41870b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
41880b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') {
41890b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
41900b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
41910b57cec5SDimitry Andric if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
41920b57cec5SDimitry Andric Kind = tok::hashhash; // '%:%:' -> '##'
41930b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
41940b57cec5SDimitry Andric SizeTmp2, Result);
41950b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
41960b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
41970b57cec5SDimitry Andric if (!isLexingRawMode())
41980b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft);
41990b57cec5SDimitry Andric Kind = tok::hashat;
42000b57cec5SDimitry Andric } else { // '%:' -> '#'
42010b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line,
42020b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to
42030b57cec5SDimitry Andric // the preprocessor to handle it.
42040b57cec5SDimitry Andric // TODO: -fpreprocessed mode??
42050b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
42060b57cec5SDimitry Andric goto HandleDirective;
42070b57cec5SDimitry Andric
42080b57cec5SDimitry Andric Kind = tok::hash;
42090b57cec5SDimitry Andric }
42100b57cec5SDimitry Andric } else {
42110b57cec5SDimitry Andric Kind = tok::percent;
42120b57cec5SDimitry Andric }
42130b57cec5SDimitry Andric break;
42140b57cec5SDimitry Andric case '<':
42150b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
42160b57cec5SDimitry Andric if (ParsingFilename) {
42170b57cec5SDimitry Andric return LexAngledStringLiteral(Result, CurPtr);
42180b57cec5SDimitry Andric } else if (Char == '<') {
42190b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
42200b57cec5SDimitry Andric if (After == '=') {
42210b57cec5SDimitry Andric Kind = tok::lesslessequal;
42220b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
42230b57cec5SDimitry Andric SizeTmp2, Result);
42240b57cec5SDimitry Andric } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
42250b57cec5SDimitry Andric // If this is actually a '<<<<<<<' version control conflict marker,
42260b57cec5SDimitry Andric // recognize it as such and recover nicely.
42270b57cec5SDimitry Andric goto LexNextToken;
42280b57cec5SDimitry Andric } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
42290b57cec5SDimitry Andric // If this is '<<<<' and we're in a Perforce-style conflict marker,
42300b57cec5SDimitry Andric // ignore it.
42310b57cec5SDimitry Andric goto LexNextToken;
42320b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '<') {
42330b57cec5SDimitry Andric Kind = tok::lesslessless;
42340b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
42350b57cec5SDimitry Andric SizeTmp2, Result);
42360b57cec5SDimitry Andric } else {
42370b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
42380b57cec5SDimitry Andric Kind = tok::lessless;
42390b57cec5SDimitry Andric }
42400b57cec5SDimitry Andric } else if (Char == '=') {
42410b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
42420b57cec5SDimitry Andric if (After == '>') {
424381ad6265SDimitry Andric if (LangOpts.CPlusPlus20) {
42440b57cec5SDimitry Andric if (!isLexingRawMode())
42450b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
42460b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
42470b57cec5SDimitry Andric SizeTmp2, Result);
42480b57cec5SDimitry Andric Kind = tok::spaceship;
42490b57cec5SDimitry Andric break;
42500b57cec5SDimitry Andric }
42510b57cec5SDimitry Andric // Suggest adding a space between the '<=' and the '>' to avoid a
42520b57cec5SDimitry Andric // change in semantics if this turns up in C++ <=17 mode.
425381ad6265SDimitry Andric if (LangOpts.CPlusPlus && !isLexingRawMode()) {
42545ffd83dbSDimitry Andric Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
42550b57cec5SDimitry Andric << FixItHint::CreateInsertion(
42560b57cec5SDimitry Andric getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
42570b57cec5SDimitry Andric }
42580b57cec5SDimitry Andric }
42590b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
42600b57cec5SDimitry Andric Kind = tok::lessequal;
42610b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '['
42620b57cec5SDimitry Andric if (LangOpts.CPlusPlus11 &&
42630b57cec5SDimitry Andric getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
42640b57cec5SDimitry Andric // C++0x [lex.pptoken]p3:
42650b57cec5SDimitry Andric // Otherwise, if the next three characters are <:: and the subsequent
42660b57cec5SDimitry Andric // character is neither : nor >, the < is treated as a preprocessor
42670b57cec5SDimitry Andric // token by itself and not as the first character of the alternative
42680b57cec5SDimitry Andric // token <:.
42690b57cec5SDimitry Andric unsigned SizeTmp3;
42700b57cec5SDimitry Andric char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
42710b57cec5SDimitry Andric if (After != ':' && After != '>') {
42720b57cec5SDimitry Andric Kind = tok::less;
42730b57cec5SDimitry Andric if (!isLexingRawMode())
42740b57cec5SDimitry Andric Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
42750b57cec5SDimitry Andric break;
42760b57cec5SDimitry Andric }
42770b57cec5SDimitry Andric }
42780b57cec5SDimitry Andric
42790b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
42800b57cec5SDimitry Andric Kind = tok::l_square;
42810b57cec5SDimitry Andric } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{'
42820b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
42830b57cec5SDimitry Andric Kind = tok::l_brace;
42840b57cec5SDimitry Andric } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
42850b57cec5SDimitry Andric lexEditorPlaceholder(Result, CurPtr)) {
42860b57cec5SDimitry Andric return true;
42870b57cec5SDimitry Andric } else {
42880b57cec5SDimitry Andric Kind = tok::less;
42890b57cec5SDimitry Andric }
42900b57cec5SDimitry Andric break;
42910b57cec5SDimitry Andric case '>':
42920b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
42930b57cec5SDimitry Andric if (Char == '=') {
42940b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
42950b57cec5SDimitry Andric Kind = tok::greaterequal;
42960b57cec5SDimitry Andric } else if (Char == '>') {
42970b57cec5SDimitry Andric char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
42980b57cec5SDimitry Andric if (After == '=') {
42990b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
43000b57cec5SDimitry Andric SizeTmp2, Result);
43010b57cec5SDimitry Andric Kind = tok::greatergreaterequal;
43020b57cec5SDimitry Andric } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
43030b57cec5SDimitry Andric // If this is actually a '>>>>' conflict marker, recognize it as such
43040b57cec5SDimitry Andric // and recover nicely.
43050b57cec5SDimitry Andric goto LexNextToken;
43060b57cec5SDimitry Andric } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
43070b57cec5SDimitry Andric // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
43080b57cec5SDimitry Andric goto LexNextToken;
43090b57cec5SDimitry Andric } else if (LangOpts.CUDA && After == '>') {
43100b57cec5SDimitry Andric Kind = tok::greatergreatergreater;
43110b57cec5SDimitry Andric CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
43120b57cec5SDimitry Andric SizeTmp2, Result);
43130b57cec5SDimitry Andric } else {
43140b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
43150b57cec5SDimitry Andric Kind = tok::greatergreater;
43160b57cec5SDimitry Andric }
43170b57cec5SDimitry Andric } else {
43180b57cec5SDimitry Andric Kind = tok::greater;
43190b57cec5SDimitry Andric }
43200b57cec5SDimitry Andric break;
43210b57cec5SDimitry Andric case '^':
43220b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
43230b57cec5SDimitry Andric if (Char == '=') {
43240b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
43250b57cec5SDimitry Andric Kind = tok::caretequal;
43260b57cec5SDimitry Andric } else if (LangOpts.OpenCL && Char == '^') {
43270b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
43280b57cec5SDimitry Andric Kind = tok::caretcaret;
43290b57cec5SDimitry Andric } else {
43300b57cec5SDimitry Andric Kind = tok::caret;
43310b57cec5SDimitry Andric }
43320b57cec5SDimitry Andric break;
43330b57cec5SDimitry Andric case '|':
43340b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
43350b57cec5SDimitry Andric if (Char == '=') {
43360b57cec5SDimitry Andric Kind = tok::pipeequal;
43370b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
43380b57cec5SDimitry Andric } else if (Char == '|') {
43390b57cec5SDimitry Andric // If this is '|||||||' and we're in a conflict marker, ignore it.
43400b57cec5SDimitry Andric if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
43410b57cec5SDimitry Andric goto LexNextToken;
43420b57cec5SDimitry Andric Kind = tok::pipepipe;
43430b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
43440b57cec5SDimitry Andric } else {
43450b57cec5SDimitry Andric Kind = tok::pipe;
43460b57cec5SDimitry Andric }
43470b57cec5SDimitry Andric break;
43480b57cec5SDimitry Andric case ':':
43490b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
43500b57cec5SDimitry Andric if (LangOpts.Digraphs && Char == '>') {
43510b57cec5SDimitry Andric Kind = tok::r_square; // ':>' -> ']'
43520b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
435306c3fb27SDimitry Andric } else if (Char == ':') {
43540b57cec5SDimitry Andric Kind = tok::coloncolon;
43550b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
43560b57cec5SDimitry Andric } else {
43570b57cec5SDimitry Andric Kind = tok::colon;
43580b57cec5SDimitry Andric }
43590b57cec5SDimitry Andric break;
43600b57cec5SDimitry Andric case ';':
43610b57cec5SDimitry Andric Kind = tok::semi;
43620b57cec5SDimitry Andric break;
43630b57cec5SDimitry Andric case '=':
43640b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
43650b57cec5SDimitry Andric if (Char == '=') {
43660b57cec5SDimitry Andric // If this is '====' and we're in a conflict marker, ignore it.
43670b57cec5SDimitry Andric if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
43680b57cec5SDimitry Andric goto LexNextToken;
43690b57cec5SDimitry Andric
43700b57cec5SDimitry Andric Kind = tok::equalequal;
43710b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
43720b57cec5SDimitry Andric } else {
43730b57cec5SDimitry Andric Kind = tok::equal;
43740b57cec5SDimitry Andric }
43750b57cec5SDimitry Andric break;
43760b57cec5SDimitry Andric case ',':
43770b57cec5SDimitry Andric Kind = tok::comma;
43780b57cec5SDimitry Andric break;
43790b57cec5SDimitry Andric case '#':
43800b57cec5SDimitry Andric Char = getCharAndSize(CurPtr, SizeTmp);
43810b57cec5SDimitry Andric if (Char == '#') {
43820b57cec5SDimitry Andric Kind = tok::hashhash;
43830b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
43840b57cec5SDimitry Andric } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize
43850b57cec5SDimitry Andric Kind = tok::hashat;
43860b57cec5SDimitry Andric if (!isLexingRawMode())
43870b57cec5SDimitry Andric Diag(BufferPtr, diag::ext_charize_microsoft);
43880b57cec5SDimitry Andric CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
43890b57cec5SDimitry Andric } else {
43900b57cec5SDimitry Andric // We parsed a # character. If this occurs at the start of the line,
43910b57cec5SDimitry Andric // it's actually the start of a preprocessing directive. Callback to
43920b57cec5SDimitry Andric // the preprocessor to handle it.
43930b57cec5SDimitry Andric // TODO: -fpreprocessed mode??
43940b57cec5SDimitry Andric if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
43950b57cec5SDimitry Andric goto HandleDirective;
43960b57cec5SDimitry Andric
43970b57cec5SDimitry Andric Kind = tok::hash;
43980b57cec5SDimitry Andric }
43990b57cec5SDimitry Andric break;
44000b57cec5SDimitry Andric
44010b57cec5SDimitry Andric case '@':
44020b57cec5SDimitry Andric // Objective C support.
44030b57cec5SDimitry Andric if (CurPtr[-1] == '@' && LangOpts.ObjC)
44040b57cec5SDimitry Andric Kind = tok::at;
44050b57cec5SDimitry Andric else
44060b57cec5SDimitry Andric Kind = tok::unknown;
44070b57cec5SDimitry Andric break;
44080b57cec5SDimitry Andric
44090b57cec5SDimitry Andric // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
44100b57cec5SDimitry Andric case '\\':
44110b57cec5SDimitry Andric if (!LangOpts.AsmPreprocessor) {
44120b57cec5SDimitry Andric if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
44130b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
44140b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
44150b57cec5SDimitry Andric return true; // KeepWhitespaceMode
44160b57cec5SDimitry Andric
44170b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer.
44180b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.)
44190b57cec5SDimitry Andric goto LexNextToken;
44200b57cec5SDimitry Andric }
44210b57cec5SDimitry Andric
4422349cc55cSDimitry Andric return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
44230b57cec5SDimitry Andric }
44240b57cec5SDimitry Andric }
44250b57cec5SDimitry Andric
44260b57cec5SDimitry Andric Kind = tok::unknown;
44270b57cec5SDimitry Andric break;
44280b57cec5SDimitry Andric
44290b57cec5SDimitry Andric default: {
44300b57cec5SDimitry Andric if (isASCII(Char)) {
44310b57cec5SDimitry Andric Kind = tok::unknown;
44320b57cec5SDimitry Andric break;
44330b57cec5SDimitry Andric }
44340b57cec5SDimitry Andric
44350b57cec5SDimitry Andric llvm::UTF32 CodePoint;
44360b57cec5SDimitry Andric
44370b57cec5SDimitry Andric // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
44380b57cec5SDimitry Andric // an escaped newline.
44390b57cec5SDimitry Andric --CurPtr;
44400b57cec5SDimitry Andric llvm::ConversionResult Status =
44410b57cec5SDimitry Andric llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
44420b57cec5SDimitry Andric (const llvm::UTF8 *)BufferEnd,
44430b57cec5SDimitry Andric &CodePoint,
44440b57cec5SDimitry Andric llvm::strictConversion);
44450b57cec5SDimitry Andric if (Status == llvm::conversionOK) {
44460b57cec5SDimitry Andric if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
44470b57cec5SDimitry Andric if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
44480b57cec5SDimitry Andric return true; // KeepWhitespaceMode
44490b57cec5SDimitry Andric
44500b57cec5SDimitry Andric // We only saw whitespace, so just try again with this lexer.
44510b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.)
44520b57cec5SDimitry Andric goto LexNextToken;
44530b57cec5SDimitry Andric }
4454349cc55cSDimitry Andric return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
44550b57cec5SDimitry Andric }
44560b57cec5SDimitry Andric
44570b57cec5SDimitry Andric if (isLexingRawMode() || ParsingPreprocessorDirective ||
44580b57cec5SDimitry Andric PP->isPreprocessedOutput()) {
44590b57cec5SDimitry Andric ++CurPtr;
44600b57cec5SDimitry Andric Kind = tok::unknown;
44610b57cec5SDimitry Andric break;
44620b57cec5SDimitry Andric }
44630b57cec5SDimitry Andric
44640b57cec5SDimitry Andric // Non-ASCII characters tend to creep into source code unintentionally.
44650b57cec5SDimitry Andric // Instead of letting the parser complain about the unknown token,
44660b57cec5SDimitry Andric // just diagnose the invalid UTF-8, then drop the character.
44670b57cec5SDimitry Andric Diag(CurPtr, diag::err_invalid_utf8);
44680b57cec5SDimitry Andric
44690b57cec5SDimitry Andric BufferPtr = CurPtr+1;
44700b57cec5SDimitry Andric // We're pretending the character didn't exist, so just try again with
44710b57cec5SDimitry Andric // this lexer.
44720b57cec5SDimitry Andric // (We manually eliminate the tail call to avoid recursion.)
44730b57cec5SDimitry Andric goto LexNextToken;
44740b57cec5SDimitry Andric }
44750b57cec5SDimitry Andric }
44760b57cec5SDimitry Andric
44770b57cec5SDimitry Andric // Notify MIOpt that we read a non-whitespace/non-comment token.
44780b57cec5SDimitry Andric MIOpt.ReadToken();
44790b57cec5SDimitry Andric
44800b57cec5SDimitry Andric // Update the location of token as well as BufferPtr.
44810b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, Kind);
44820b57cec5SDimitry Andric return true;
44830b57cec5SDimitry Andric
44840b57cec5SDimitry Andric HandleDirective:
44850b57cec5SDimitry Andric // We parsed a # character and it's the start of a preprocessing directive.
44860b57cec5SDimitry Andric
44870b57cec5SDimitry Andric FormTokenWithChars(Result, CurPtr, tok::hash);
44880b57cec5SDimitry Andric PP->HandleDirective(Result);
44890b57cec5SDimitry Andric
449006c3fb27SDimitry Andric if (PP->hadModuleLoaderFatalFailure())
44910b57cec5SDimitry Andric // With a fatal failure in the module loader, we abort parsing.
44920b57cec5SDimitry Andric return true;
44930b57cec5SDimitry Andric
44940b57cec5SDimitry Andric // We parsed the directive; lex a token with the new state.
44950b57cec5SDimitry Andric return false;
4496bdd1243dSDimitry Andric
4497bdd1243dSDimitry Andric LexNextToken:
4498bdd1243dSDimitry Andric Result.clearFlag(Token::NeedsCleaning);
4499bdd1243dSDimitry Andric goto LexStart;
45000b57cec5SDimitry Andric }
450181ad6265SDimitry Andric
convertDependencyDirectiveToken(const dependency_directives_scan::Token & DDTok,Token & Result)450281ad6265SDimitry Andric const char *Lexer::convertDependencyDirectiveToken(
450381ad6265SDimitry Andric const dependency_directives_scan::Token &DDTok, Token &Result) {
450481ad6265SDimitry Andric const char *TokPtr = BufferStart + DDTok.Offset;
450581ad6265SDimitry Andric Result.startToken();
450681ad6265SDimitry Andric Result.setLocation(getSourceLocation(TokPtr));
450781ad6265SDimitry Andric Result.setKind(DDTok.Kind);
450881ad6265SDimitry Andric Result.setFlag((Token::TokenFlags)DDTok.Flags);
450981ad6265SDimitry Andric Result.setLength(DDTok.Length);
451081ad6265SDimitry Andric BufferPtr = TokPtr + DDTok.Length;
451181ad6265SDimitry Andric return TokPtr;
451281ad6265SDimitry Andric }
451381ad6265SDimitry Andric
LexDependencyDirectiveToken(Token & Result)451481ad6265SDimitry Andric bool Lexer::LexDependencyDirectiveToken(Token &Result) {
451581ad6265SDimitry Andric assert(isDependencyDirectivesLexer());
451681ad6265SDimitry Andric
451781ad6265SDimitry Andric using namespace dependency_directives_scan;
451881ad6265SDimitry Andric
451981ad6265SDimitry Andric while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
452081ad6265SDimitry Andric if (DepDirectives.front().Kind == pp_eof)
452181ad6265SDimitry Andric return LexEndOfFile(Result, BufferEnd);
4522bdd1243dSDimitry Andric if (DepDirectives.front().Kind == tokens_present_before_eof)
4523bdd1243dSDimitry Andric MIOpt.ReadToken();
452481ad6265SDimitry Andric NextDepDirectiveTokenIndex = 0;
452581ad6265SDimitry Andric DepDirectives = DepDirectives.drop_front();
452681ad6265SDimitry Andric }
452781ad6265SDimitry Andric
452881ad6265SDimitry Andric const dependency_directives_scan::Token &DDTok =
452981ad6265SDimitry Andric DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
453081ad6265SDimitry Andric if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
453181ad6265SDimitry Andric // Read something other than a preprocessor directive hash.
453281ad6265SDimitry Andric MIOpt.ReadToken();
453381ad6265SDimitry Andric }
453481ad6265SDimitry Andric
4535bdd1243dSDimitry Andric if (ParsingFilename && DDTok.is(tok::less)) {
4536bdd1243dSDimitry Andric BufferPtr = BufferStart + DDTok.Offset;
4537bdd1243dSDimitry Andric LexAngledStringLiteral(Result, BufferPtr + 1);
4538bdd1243dSDimitry Andric if (Result.isNot(tok::header_name))
4539bdd1243dSDimitry Andric return true;
4540bdd1243dSDimitry Andric // Advance the index of lexed tokens.
4541bdd1243dSDimitry Andric while (true) {
4542bdd1243dSDimitry Andric const dependency_directives_scan::Token &NextTok =
4543bdd1243dSDimitry Andric DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
4544bdd1243dSDimitry Andric if (BufferStart + NextTok.Offset >= BufferPtr)
4545bdd1243dSDimitry Andric break;
4546bdd1243dSDimitry Andric ++NextDepDirectiveTokenIndex;
4547bdd1243dSDimitry Andric }
4548bdd1243dSDimitry Andric return true;
4549bdd1243dSDimitry Andric }
4550bdd1243dSDimitry Andric
455181ad6265SDimitry Andric const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
455281ad6265SDimitry Andric
455381ad6265SDimitry Andric if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
455481ad6265SDimitry Andric PP->HandleDirective(Result);
455581ad6265SDimitry Andric return false;
455681ad6265SDimitry Andric }
455781ad6265SDimitry Andric if (Result.is(tok::raw_identifier)) {
455881ad6265SDimitry Andric Result.setRawIdentifierData(TokPtr);
455981ad6265SDimitry Andric if (!isLexingRawMode()) {
45605f757f3fSDimitry Andric const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
456181ad6265SDimitry Andric if (II->isHandleIdentifierCase())
456281ad6265SDimitry Andric return PP->HandleIdentifier(Result);
456381ad6265SDimitry Andric }
456481ad6265SDimitry Andric return true;
456581ad6265SDimitry Andric }
456681ad6265SDimitry Andric if (Result.isLiteral()) {
456781ad6265SDimitry Andric Result.setLiteralData(TokPtr);
456881ad6265SDimitry Andric return true;
456981ad6265SDimitry Andric }
457006c3fb27SDimitry Andric if (Result.is(tok::colon)) {
457181ad6265SDimitry Andric // Convert consecutive colons to 'tok::coloncolon'.
457281ad6265SDimitry Andric if (*BufferPtr == ':') {
457381ad6265SDimitry Andric assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
457481ad6265SDimitry Andric tok::colon));
457581ad6265SDimitry Andric ++NextDepDirectiveTokenIndex;
457681ad6265SDimitry Andric Result.setKind(tok::coloncolon);
457781ad6265SDimitry Andric }
457881ad6265SDimitry Andric return true;
457981ad6265SDimitry Andric }
458081ad6265SDimitry Andric if (Result.is(tok::eod))
458181ad6265SDimitry Andric ParsingPreprocessorDirective = false;
458281ad6265SDimitry Andric
458381ad6265SDimitry Andric return true;
458481ad6265SDimitry Andric }
458581ad6265SDimitry Andric
LexDependencyDirectiveTokenWhileSkipping(Token & Result)458681ad6265SDimitry Andric bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
458781ad6265SDimitry Andric assert(isDependencyDirectivesLexer());
458881ad6265SDimitry Andric
458981ad6265SDimitry Andric using namespace dependency_directives_scan;
459081ad6265SDimitry Andric
459181ad6265SDimitry Andric bool Stop = false;
459281ad6265SDimitry Andric unsigned NestedIfs = 0;
459381ad6265SDimitry Andric do {
459481ad6265SDimitry Andric DepDirectives = DepDirectives.drop_front();
459581ad6265SDimitry Andric switch (DepDirectives.front().Kind) {
459681ad6265SDimitry Andric case pp_none:
459781ad6265SDimitry Andric llvm_unreachable("unexpected 'pp_none'");
459881ad6265SDimitry Andric case pp_include:
459981ad6265SDimitry Andric case pp___include_macros:
460081ad6265SDimitry Andric case pp_define:
460181ad6265SDimitry Andric case pp_undef:
460281ad6265SDimitry Andric case pp_import:
460381ad6265SDimitry Andric case pp_pragma_import:
460481ad6265SDimitry Andric case pp_pragma_once:
460581ad6265SDimitry Andric case pp_pragma_push_macro:
460681ad6265SDimitry Andric case pp_pragma_pop_macro:
460781ad6265SDimitry Andric case pp_pragma_include_alias:
460806c3fb27SDimitry Andric case pp_pragma_system_header:
460981ad6265SDimitry Andric case pp_include_next:
461081ad6265SDimitry Andric case decl_at_import:
461181ad6265SDimitry Andric case cxx_module_decl:
461281ad6265SDimitry Andric case cxx_import_decl:
461381ad6265SDimitry Andric case cxx_export_module_decl:
461481ad6265SDimitry Andric case cxx_export_import_decl:
4615bdd1243dSDimitry Andric case tokens_present_before_eof:
461681ad6265SDimitry Andric break;
461781ad6265SDimitry Andric case pp_if:
461881ad6265SDimitry Andric case pp_ifdef:
461981ad6265SDimitry Andric case pp_ifndef:
462081ad6265SDimitry Andric ++NestedIfs;
462181ad6265SDimitry Andric break;
462281ad6265SDimitry Andric case pp_elif:
462381ad6265SDimitry Andric case pp_elifdef:
462481ad6265SDimitry Andric case pp_elifndef:
462581ad6265SDimitry Andric case pp_else:
462681ad6265SDimitry Andric if (!NestedIfs) {
462781ad6265SDimitry Andric Stop = true;
462881ad6265SDimitry Andric }
462981ad6265SDimitry Andric break;
463081ad6265SDimitry Andric case pp_endif:
463181ad6265SDimitry Andric if (!NestedIfs) {
463281ad6265SDimitry Andric Stop = true;
463381ad6265SDimitry Andric } else {
463481ad6265SDimitry Andric --NestedIfs;
463581ad6265SDimitry Andric }
463681ad6265SDimitry Andric break;
463781ad6265SDimitry Andric case pp_eof:
463881ad6265SDimitry Andric NextDepDirectiveTokenIndex = 0;
463981ad6265SDimitry Andric return LexEndOfFile(Result, BufferEnd);
464081ad6265SDimitry Andric }
464181ad6265SDimitry Andric } while (!Stop);
464281ad6265SDimitry Andric
464381ad6265SDimitry Andric const dependency_directives_scan::Token &DDTok =
464481ad6265SDimitry Andric DepDirectives.front().Tokens.front();
464581ad6265SDimitry Andric assert(DDTok.is(tok::hash));
464681ad6265SDimitry Andric NextDepDirectiveTokenIndex = 1;
464781ad6265SDimitry Andric
464881ad6265SDimitry Andric convertDependencyDirectiveToken(DDTok, Result);
464981ad6265SDimitry Andric return false;
465081ad6265SDimitry Andric }
4651