Lines matching +full:non +full:- +full:comment in clang/lib/Lex/Lexer.cpp

1 //===- Lexer.cpp - C Language Family Lexer --------------------------------===//
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7 //===----------------------------------------------------------------------===//
11 //===----------------------------------------------------------------------===//
56 //===----------------------------------------------------------------------===//
58 //===----------------------------------------------------------------------===//
60 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
65 return II->getObjCKeywordID() == objcKey; in isObjCAtKeyword()
69 /// getObjCKeywordID - Return the ObjC keyword kind.
74 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; in getObjCKeywordID()
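For context, a brief usage sketch of these two helpers (the wrapper name isAtInterfaceKeyword is invented here for illustration; it is not part of Lexer.cpp):

    #include "clang/Basic/TokenKinds.h"
    #include "clang/Lex/Token.h"

    // After the '@' has been lexed, the following identifier token can be
    // classified with Token::isObjCAtKeyword, which consults the token's
    // IdentifierInfo as the comment above describes.
    static bool isAtInterfaceKeyword(const clang::Token &AfterAt) {
      return AfterAt.isObjCAtKeyword(clang::tok::objc_interface);
    }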
77 /// Determine whether the token kind starts a simple-type-specifier.
115 return getIdentifierInfo()->isKeyword(LangOpts); in isSimpleTypeSpecifier()
122 //===----------------------------------------------------------------------===//
124 //===----------------------------------------------------------------------===//
138 // Check whether we have a BOM in the beginning of the buffer. If yes - act in InitLexer()
139 // accordingly. Right now we support only UTF-8 with and without BOM, so, just in InitLexer()
140 // skip the UTF-8 BOM if it's present. in InitLexer()
143 StringRef Buf(BufferStart, BufferEnd - BufferStart); in InitLexer()
145 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM in InitLexer()
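A minimal standalone sketch of the BOM handling described here, using plain std::string_view instead of the StringRef/StringSwitch machinery visible above (the helper name utf8BOMLength is invented for illustration):

    #include <cstddef>
    #include <string_view>

    // Return how many bytes of UTF-8 BOM to skip at the start of the buffer
    // (3 if a BOM is present, 0 otherwise).
    static std::size_t utf8BOMLength(std::string_view Buf) {
      constexpr std::string_view BOM = "\xEF\xBB\xBF";
      return Buf.substr(0, BOM.size()) == BOM ? BOM.size() : 0;
    }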
180 /// Lexer constructor - Create a new lexer object for the specified buffer
196 /// Lexer constructor - Create a new raw lexer object. This object is only
210 /// Lexer constructor - Create a new raw lexer object. This object is only
225 SetCommentRetentionState(PP->getCommentRetentionState()); in resetExtendedTokenMode()
228 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
259 L->BufferPtr = StrData; in Create_PragmaLexer()
260 L->BufferEnd = StrData+TokLen; in Create_PragmaLexer()
261 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); in Create_PragmaLexer()
265 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), in Create_PragmaLexer()
271 L->ParsingPreprocessorDirective = true; in Create_PragmaLexer()
274 L->Is_PragmaLexer = true; in Create_PragmaLexer()
279 this->IsAtPhysicalStartOfLine = IsAtStartOfLine; in seek()
280 this->IsAtStartOfLine = IsAtStartOfLine; in seek()
294 if ((i < e - 1) && (Str[i + 1] == '\n' || Str[i + 1] == '\r') && in StringifyImpl()
319 //===----------------------------------------------------------------------===//
321 //===----------------------------------------------------------------------===//
333 // Munch the encoding-prefix and opening double-quote. in getSpellingSlow()
339 if (Spelling[Length - 1] == '"') in getSpellingSlow()
344 // splicing do not occur within their d-char-sequence nor within their in getSpellingSlow()
345 // r-char-sequence. in getSpellingSlow()
347 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { in getSpellingSlow()
351 do --RawEnd; while (*RawEnd != '"'); in getSpellingSlow()
352 size_t RawLength = RawEnd - BufPtr + 1; in getSpellingSlow()
374 /// getSpelling() - Return the 'spelling' of this token. The spelling of a
376 /// after trigraph expansion and escaped-newline folding. In particular, this
415 /// getSpelling() - Return the 'spelling' of this token. The spelling of a
417 /// after trigraph expansion and escaped-newline folding. In particular, this
442 /// getSpelling - This method is used to get the spelling of a token into a
464 Buffer = II->getNameStart(); in getSpelling()
465 return II->getLength(); in getSpelling()
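A hedged usage sketch of the static getSpelling overload documented above (the wrapper name spellingOrEmpty is invented; error handling is one reasonable choice, not the only one):

    #include <string>
    #include "clang/Basic/LangOptions.h"
    #include "clang/Basic/SourceManager.h"
    #include "clang/Lex/Lexer.h"

    // Fetch the post-cleaning spelling of a token; return "" if the token's
    // source range could not be read.
    static std::string spellingOrEmpty(const clang::Token &Tok,
                                       const clang::SourceManager &SM,
                                       const clang::LangOptions &LangOpts) {
      bool Invalid = false;
      std::string S = clang::Lexer::getSpelling(Tok, SM, LangOpts, &Invalid);
      return Invalid ? std::string() : S;
    }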
495 /// MeasureTokenLength - Relex the token at the specified location and return
516 // all obviously single-char tokens. This could use in getRawToken()
550 for (; LexStart != BufStart; --LexStart) { in findBeginningOfLine()
582 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); in getBeginningOfFileToken()
596 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) in getBeginningOfFileToken()
625 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); in GetBeginningOfToken()
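These two entry points are commonly combined; a hedged usage sketch (the helper name tokenRangeAt is invented for illustration):

    #include "clang/Basic/LangOptions.h"
    #include "clang/Basic/SourceLocation.h"
    #include "clang/Basic/SourceManager.h"
    #include "clang/Lex/Lexer.h"

    // Compute the source range of the token containing Loc by relexing it,
    // as the comments above describe.
    static clang::SourceRange tokenRangeAt(clang::SourceLocation Loc,
                                           const clang::SourceManager &SM,
                                           const clang::LangOptions &LO) {
      clang::SourceLocation Begin =
          clang::Lexer::GetBeginningOfToken(Loc, SM, LO);
      unsigned Len = clang::Lexer::MeasureTokenLength(Begin, SM, LO);
      return clang::SourceRange(Begin, Begin.getLocWithOffset(Len));
    }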
666 MaxLineOffset = CurPtr - Buffer.begin(); in ComputePreamble()
690 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; in ComputePreamble()
699 if (TheTok.getKind() == tok::comment) { in ComputePreamble()
766 } while (TheTok.getKind() == tok::comment); in ComputePreamble()
783 End = ActiveCommentLoc; // don't truncate a decl comment. in ComputePreamble()
787 return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(), in ComputePreamble()
813 --CharNo; in getTokenPrefixLength()
819 for (; CharNo; --CharNo) { in getTokenPrefixLength()
830 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; in getTokenPrefixLength()
863 Len = Len - Offset; in getLocForEndOfToken()
1057 return file.substr(beginInfo.second, EndOffs - beginInfo.second); in getSourceText()
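A hedged usage sketch of getSourceText as it is typically called from tools and diagnostics (the wrapper name textOfRange is invented for illustration):

    #include "clang/Basic/LangOptions.h"
    #include "clang/Basic/SourceLocation.h"
    #include "clang/Basic/SourceManager.h"
    #include "clang/Lex/Lexer.h"
    #include "llvm/ADT/StringRef.h"

    // Return the raw source text covered by a token-delimited range.
    static llvm::StringRef textOfRange(clang::SourceRange R,
                                       const clang::SourceManager &SM,
                                       const clang::LangOptions &LO) {
      return clang::Lexer::getSourceText(
          clang::CharSourceRange::getTokenRange(R), SM, LO);
    }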
1069 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); in getImmediateMacroName()
1094 // Find the spelling location of the start of the non-argument expansion in getImmediateMacroName()
1121 // Find the spelling location of the start of the non-argument expansion in getImmediateMacroNameForDiagnostics()
1140 if (Str - 1 < BufferStart) in isNewLineEscaped()
1143 if ((Str[0] == '\n' && Str[-1] == '\r') || in isNewLineEscaped()
1144 (Str[0] == '\r' && Str[-1] == '\n')) { in isNewLineEscaped()
1145 if (Str - 2 < BufferStart) in isNewLineEscaped()
1147 --Str; in isNewLineEscaped()
1149 --Str; in isNewLineEscaped()
1151 // Rewind to first non-space character: in isNewLineEscaped()
1153 --Str; in isNewLineEscaped()
1172 StringRef Rest = Buffer.substr(Line - Buffer.data()); in getIndentationForLine()
1179 //===----------------------------------------------------------------------===//
1181 //===----------------------------------------------------------------------===//
1183 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1211 /// getSourceLocation - Return a source location identifier for the specified
1220 unsigned CharNo = Loc-BufferStart; in getSourceLocation()
1230 /// Diag - Forwarding function for diagnostics. This translate a source
1233 return PP->Diag(getSourceLocation(Loc), DiagID); in Diag()
1236 //===----------------------------------------------------------------------===//
1238 //===----------------------------------------------------------------------===//
1240 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
1253 case '-': return '~'; in GetTrigraphCharForLetter()
1257 /// DecodeTrigraphChar - If the specified character is a legal trigraph when
1267 if (L && !L->isLexingRawMode()) in DecodeTrigraphChar()
1268 L->Diag(CP-2, diag::trigraph_ignored); in DecodeTrigraphChar()
1272 if (L && !L->isLexingRawMode()) in DecodeTrigraphChar()
1273 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); in DecodeTrigraphChar()
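For reference, a standalone sketch of the full '??X' to replacement-character mapping that trigraph decoding applies (this reproduces the standard trigraph set, not the exact Lexer.cpp code):

    // Map the letter after a "??" pair to the character it encodes, or 0 if
    // the pair does not form a trigraph.
    static char trigraphCharForLetter(char Letter) {
      switch (Letter) {
      default:   return 0;     // not a trigraph
      case '=':  return '#';
      case ')':  return ']';
      case '(':  return '[';
      case '!':  return '|';
      case '\'': return '^';
      case '>':  return '}';
      case '/':  return '\\';
      case '<':  return '{';
      case '-':  return '~';
      }
    }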
1277 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
1278 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1285 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') in getEscapedNewLineSize()
1290 Ptr[Size-1] != Ptr[Size]) in getEscapedNewLineSize()
1300 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1301 /// them), skip over them and return the first non-escaped-newline found,
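A compact sketch of the escaped-newline measurement described above, assuming a nul-terminated buffer like the lexer's; it only accepts spaces and tabs before the newline and emits no diagnostics (the name escapedNewLineSize mirrors the real function but this is not its code):

    // P points just past a '\' (or a '??/' trigraph). Return how many
    // characters the escaped newline spans, or 0 if it is not one.
    static unsigned escapedNewLineSize(const char *P) {
      unsigned Size = 0;
      while (P[Size] == ' ' || P[Size] == '\t')
        ++Size;                                   // whitespace before the EOL
      if (P[Size] != '\n' && P[Size] != '\r')
        return 0;                                 // not an escaped newline
      ++Size;
      // Treat \r\n and \n\r as a single two-character newline.
      if ((P[Size] == '\n' || P[Size] == '\r') && P[Size] != P[Size - 1])
        ++Size;
      return Size;
    }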
1362 if (!Tok || Tok->isNot(TKind)) in findLocationAfterToken()
1364 SourceLocation TokenLoc = Tok->getLocation(); in findLocationAfterToken()
1369 const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength(); in findLocationAfterToken()
1386 return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars); in findLocationAfterToken()
1389 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1411 // Common case, backslash-char where the char is not whitespace. in getCharAndSizeSlow()
1419 if (Tok) Tok->setFlag(Token::NeedsCleaning); in getCharAndSizeSlow()
1446 if (Tok) Tok->setFlag(Token::NeedsCleaning); in getCharAndSizeSlow()
1459 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1474 // Common case, backslash-char where the char is not whitespace. in getCharAndSizeSlowNoWarn()
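A very reduced sketch of the slow path these fragments describe: fold away backslash-newline splices before reporting the character and how many bytes it occupied. Trigraph decoding and the NeedsCleaning flag are omitted, and it reuses the escapedNewLineSize sketch shown earlier (the names PeekedChar and peekCharSkippingSplices are invented):

    struct PeekedChar { char Char; unsigned Size; };

    static PeekedChar peekCharSkippingSplices(const char *Ptr) {
      unsigned Size = 0;
      while (Ptr[0] == '\\') {
        unsigned Esc = escapedNewLineSize(Ptr + 1);
        if (Esc == 0)
          break;                       // a real backslash, not a splice
        Ptr += 1 + Esc;                // skip the '\' plus the newline
        Size += 1 + Esc;
      }
      return {Ptr[0], Size + 1};
    }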
1510 //===----------------------------------------------------------------------===//
1512 //===----------------------------------------------------------------------===//
1539 // To mitigate https://github.com/llvm/llvm-project/issues/54732,
1544 // https://www.unicode.org/L2/L2022/22230-math-profile.pdf
1566 // A non-leading codepoint must have the XID_Continue property. in isAllowedIDChar()
1660 /// After encountering UTF-8 character C and interpreting it as an identifier
1661 /// character, check whether it's a homoglyph for a common non-identifier
1677 {U'\u200c', 0}, // ZERO WIDTH NON-JOINER in maybeDiagnoseUTF8Homoglyph()
1684 {U'\u2212', '-'}, // MINUS SIGN in maybeDiagnoseUTF8Homoglyph()
1693 {U'\ufeff', 0}, // ZERO WIDTH NO-BREAK SPACE in maybeDiagnoseUTF8Homoglyph()
1704 {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS in maybeDiagnoseUTF8Homoglyph()
1709 {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN in maybeDiagnoseUTF8Homoglyph()
1711 {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN in maybeDiagnoseUTF8Homoglyph()
1726 std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'}); in maybeDiagnoseUTF8Homoglyph()
1727 if (Homoglyph->Character == C) { in maybeDiagnoseUTF8Homoglyph()
1728 if (Homoglyph->LooksLike) { in maybeDiagnoseUTF8Homoglyph()
1729 const char LooksLikeStr[] = {Homoglyph->LooksLike, 0}; in maybeDiagnoseUTF8Homoglyph()
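A standalone sketch of the lookup these fragments show: a table sorted by codepoint, searched with std::lower_bound. Only the entries visible in this listing are reproduced, and a result of 0 covers both "invisible character" and "no known lookalike" (the names HomoglyphPair here and asciiLookalikeFor are illustrative, not the real table):

    #include <algorithm>
    #include <iterator>

    struct HomoglyphPair {
      char32_t Character;
      char LooksLike;
      bool operator<(const HomoglyphPair &R) const {
        return Character < R.Character;
      }
    };

    static char asciiLookalikeFor(char32_t C) {
      static const HomoglyphPair Sorted[] = {
          {U'\u200c', 0},    // ZERO WIDTH NON-JOINER
          {U'\u2212', '-'},  // MINUS SIGN
          {U'\ufeff', 0},    // ZERO WIDTH NO-BREAK SPACE
          {U'\uff0d', '-'},  // FULLWIDTH HYPHEN-MINUS
          {U'\uff1c', '<'},  // FULLWIDTH LESS-THAN SIGN
          {U'\uff1e', '>'},  // FULLWIDTH GREATER-THAN SIGN
      };
      const HomoglyphPair *It = std::lower_bound(
          std::begin(Sorted), std::end(Sorted), HomoglyphPair{C, 0});
      return (It != std::end(Sorted) && It->Character == C) ? It->LooksLike : 0;
    }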
1778 !PP->isPreprocessedOutput()) in tryConsumeIdentifierUCN()
1780 PP->getDiagnostics(), LangOpts, CodePoint, in tryConsumeIdentifierUCN()
1789 diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint, in tryConsumeIdentifierUCN()
1792 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, in tryConsumeIdentifierUCN()
1798 if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') || in tryConsumeIdentifierUCN()
1799 (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U')) in tryConsumeIdentifierUCN()
1810 // If a UTF-8 codepoint appears immediately after an escaped new line, in tryConsumeIdentifierUTF8Char()
1815 const char *CharStart = CurPtr + FirstCodeUnitSize - 1; in tryConsumeIdentifierUTF8Char()
1831 !PP->isPreprocessedOutput()) in tryConsumeIdentifierUTF8Char()
1833 PP->getDiagnostics(), LangOpts, CodePoint, in tryConsumeIdentifierUTF8Char()
1841 PP->getDiagnostics(), CodePoint, in tryConsumeIdentifierUTF8Char()
1843 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint, in tryConsumeIdentifierUTF8Char()
1846 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint, in tryConsumeIdentifierUTF8Char()
1850 // Once we successfully parsed some UTF-8, in tryConsumeIdentifierUTF8Char()

1863 !PP->isPreprocessedOutput()) { in LexUnicodeIdentifierStart()
1865 diagnoseExtensionInIdentifier(PP->getDiagnostics(), C, in LexUnicodeIdentifierStart()
1867 maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C, in LexUnicodeIdentifierStart()
1870 maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C, in LexUnicodeIdentifierStart()
1879 !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) && in LexUnicodeIdentifierStart()
1881 // Non-ASCII characters tend to creep into source code unintentionally. in LexUnicodeIdentifierStart()
1884 // Note that we can /only/ do this when the non-ASCII character is actually in LexUnicodeIdentifierStart()
1891 PP->getDiagnostics(), LangOpts, C, in LexUnicodeIdentifierStart()
1916 while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) { in fastParseASCIIIdentifier()
1936 // Match [_A-Za-z0-9]*, we have already matched an identifier start. in LexIdentifierContinue()
1978 const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); in LexIdentifierContinue()
1979 // Note that we have to call PP->LookUpIdentifierInfo() even for code in LexIdentifierContinue()
1986 // Return the code-completion token. in LexIdentifierContinue()
1988 // Skip the code-completion char and all immediate identifier characters. in LexIdentifierContinue()
1991 // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code in LexIdentifierContinue()
2008 if (II->isHandleIdentifierCase()) in LexIdentifierContinue()
2009 return PP->HandleIdentifier(Result); in LexIdentifierContinue()
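A minimal sketch of the ASCII fast path mentioned above: advance over [_A-Za-z0-9] bytes one at a time (the real fastParseASCIIIdentifier additionally consumes the buffer a machine word at a time before falling back to a byte loop; the name scanAsciiIdentifierBody is invented):

    static const char *scanAsciiIdentifierBody(const char *CurPtr,
                                               const char *BufferEnd) {
      auto IsBodyChar = [](unsigned char C) {
        return C == '_' || (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
               (C >= '0' && C <= '9');
      };
      while (CurPtr != BufferEnd && IsBodyChar(*CurPtr))
        ++CurPtr;
      return CurPtr;
    }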
2014 /// isHexaLiteral - Return true if Start points to a hex constant.
2028 /// LexNumericConstant - Lex the remainder of an integer or floating point
2029 /// constant. From[-1] is the first character lexed. Return the end of the
2039 CurPtr -= Size; in LexNumericConstant()
2046 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { in LexNumericConstant()
2054 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) { in LexNumericConstant()
2056 // not-quite-conforming extension. Only do so if this looks like it's in LexNumericConstant()
2057 // actually meant to be a hexfloat, and not if it has a ud-suffix. in LexNumericConstant()
2084 // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue. in LexNumericConstant()
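A standalone sketch of the sign rule these fragments describe: a '+' or '-' continues a numeric constant only directly after a decimal exponent 'e'/'E', or after a hexfloat exponent 'p'/'P' when the literal began with 0x/0X. The MicrosoftExt, language-standard, and ud-suffix refinements the real lexer applies are omitted (the name signContinuesNumber is invented):

    // TokStart points at the first character of the numeric token; PrevCh is
    // the character before the sign, C is the sign candidate.
    static bool signContinuesNumber(const char *TokStart, char PrevCh, char C) {
      if (C != '+' && C != '-')
        return false;
      if (PrevCh == 'e' || PrevCh == 'E')
        return true;                               // 1e+12, 2.5E-3, ...
      bool IsHex =
          TokStart[0] == '0' && (TokStart[1] == 'x' || TokStart[1] == 'X');
      return (PrevCh == 'p' || PrevCh == 'P') && IsHex;   // 0x1.8p+3
    }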
2097 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
2098 /// in C++11, or warn on a ud-suffix in C++98.
2126 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix in LexUDSuffix()
2127 // that does not start with an underscore is ill-formed. As a conforming in LexUDSuffix()
2129 // them. We assume a suffix beginning with a UCN or UTF-8 character is more in LexUDSuffix()
2130 // likely to be a ud-suffix than a macro, however, and accept that. in LexUDSuffix()
2189 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
2212 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. in LexStringLiteral()
2215 FormTokenWithChars(Result, CurPtr-1, tok::unknown); in LexStringLiteral()
2220 if (isCodeCompletionPoint(CurPtr-1)) { in LexStringLiteral()
2222 codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false); in LexStringLiteral()
2224 PP->CodeCompleteNaturalLanguage(); in LexStringLiteral()
2225 FormTokenWithChars(Result, CurPtr - 1, tok::unknown); in LexStringLiteral()
2230 NulCharacter = CurPtr-1; in LexStringLiteral()
2235 // If we are in C++11, lex the optional ud-suffix. in LexStringLiteral()
2250 /// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2257 // universal-character-names, and line splicing) are reverted. in LexRawStringLiteral()
2298 if (C == 0 && CurPtr-1 == BufferEnd) { in LexRawStringLiteral()
2299 --CurPtr; in LexRawStringLiteral()
2321 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. in LexRawStringLiteral()
2325 FormTokenWithChars(Result, CurPtr-1, tok::unknown); in LexRawStringLiteral()
2330 // If we are in C++11, lex the optional ud-suffix. in LexRawStringLiteral()
2341 /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2355 (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file. in LexAngledStringLiteral()
2363 if (isCodeCompletionPoint(CurPtr - 1)) { in LexAngledStringLiteral()
2364 codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true); in LexAngledStringLiteral()
2366 FormTokenWithChars(Result, CurPtr - 1, tok::unknown); in LexAngledStringLiteral()
2369 NulCharacter = CurPtr-1; in LexAngledStringLiteral()
2389 StringRef PartialPath(PathStart, CompletionPoint - PathStart); in codeCompleteIncludedFile()
2397 PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get( in codeCompleteIncludedFile()
2398 StringRef(StartOfFilename, CompletionPoint - StartOfFilename))); in codeCompleteIncludedFile()
2412 PP->setCodeCompletionTokenRange( in codeCompleteIncludedFile()
2413 FileLoc.getLocWithOffset(StartOfFilename - BufferStart), in codeCompleteIncludedFile()
2414 FileLoc.getLocWithOffset(CompletionPoint - BufferStart)); in codeCompleteIncludedFile()
2415 PP->CodeCompleteIncludedFile(Dir, IsAngled); in codeCompleteIncludedFile()
2418 /// LexCharConstant - Lex the remainder of a character constant, after having
2448 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. in LexCharConstant()
2451 FormTokenWithChars(Result, CurPtr-1, tok::unknown); in LexCharConstant()
2456 if (isCodeCompletionPoint(CurPtr-1)) { in LexCharConstant()
2457 PP->CodeCompleteNaturalLanguage(); in LexCharConstant()
2458 FormTokenWithChars(Result, CurPtr-1, tok::unknown); in LexCharConstant()
2463 NulCharacter = CurPtr-1; in LexCharConstant()
2468 // If we are in C++11, lex the optional ud-suffix. in LexCharConstant()
2483 /// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2484 /// Update BufferPtr to point to the next non-whitespace character and return.
2489 // Whitespace - Skip it, then return the token after the whitespace. in SkipWhitespace()
2490 bool SawNewline = isVerticalWhitespace(CurPtr[-1]); in SkipWhitespace()
2501 setLastNewLine(CurPtr - 1); in SkipWhitespace()
2538 char PrevChar = CurPtr[-1]; in SkipWhitespace()
2547 if (auto *Handler = PP->getEmptylineHandler()) in SkipWhitespace()
2548 Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1), in SkipWhitespace()
2558 /// newline character that terminates the comment. Then update BufferPtr and
2576 // Scan over the body of the comment. The common case, when scanning, is that in SkipLineComment()
2577 // the comment contains normal ascii characters with nothing interesting in in SkipLineComment()
2581 // character that ends the line comment. in SkipLineComment()
2584 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a in SkipLineComment()
2585 // diagnostic only once per entire ill-formed subsequence to avoid in SkipLineComment()
2586 // emitting too many diagnostics (see http://unicode.org/review/pr-121.html). in SkipLineComment()
2594 C != '\n' && C != '\r') { // Newline or DOS-style newline. in SkipLineComment()
2617 const char *EscapePtr = CurPtr-1; in SkipLineComment()
2620 --EscapePtr; in SkipLineComment()
2627 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' && in SkipLineComment()
2628 EscapePtr[-2] == '?' && LangOpts.Trigraphs) in SkipLineComment()
2629 // Trigraph-escaped newline. in SkipLineComment()
2630 CurPtr = EscapePtr-2; in SkipLineComment()
2657 // \n, then we had an escaped newline within the comment. Emit diagnostic in SkipLineComment()
2658 // unless the next line is also a // comment. in SkipLineComment()
2663 // Okay, we found a // comment that ends in a newline, if the next in SkipLineComment()
2664 // line is also a // comment, but has spaces, don't emit a diagnostic. in SkipLineComment()
2674 Diag(OldPtr-1, diag::ext_multi_line_line_comment); in SkipLineComment()
2680 --CurPtr; in SkipLineComment()
2684 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { in SkipLineComment()
2685 PP->CodeCompleteNaturalLanguage(); in SkipLineComment()
2691 // Found but did not consume the newline. Notify comment handlers about the in SkipLineComment()
2692 // comment unless we're in a #if 0 block. in SkipLineComment()
2694 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), in SkipLineComment()
2700 // If we are returning comments as tokens, return this comment as a token. in SkipLineComment()
2715 // comment above in that mode. in SkipLineComment()
2727 /// If in save-comment mode, package up this Line comment in an appropriate
2730 // If we're not in a preprocessor directive, just return the // comment in SaveLineComment()
2732 FormTokenWithChars(Result, CurPtr, tok::comment); in SaveLineComment()
2737 // If this Line-style comment is in a macro definition, transmogrify it into in SaveLineComment()
2738 // a C-style block comment. in SaveLineComment()
2740 std::string Spelling = PP->getSpelling(Result, &Invalid); in SaveLineComment()
2744 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?"); in SaveLineComment()
2748 Result.setKind(tok::comment); in SaveLineComment()
2749 PP->CreateString(Spelling, Result, in SaveLineComment()
2754 /// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
2756 /// a diagnostic if so. We know that the newline is inside of a block comment.
2768 --CurPtr; in isEndOfBlockCommentWithEscapedNewLine()
2770 // If this is a two-character newline sequence, skip the other character. in isEndOfBlockCommentWithEscapedNewLine()
2772 // \n\n or \r\r -> not escaped newline. in isEndOfBlockCommentWithEscapedNewLine()
2775 // \n\r or \r\n -> skip the newline. in isEndOfBlockCommentWithEscapedNewLine()
2776 --CurPtr; in isEndOfBlockCommentWithEscapedNewLine()
2783 --CurPtr; in isEndOfBlockCommentWithEscapedNewLine()
2788 --CurPtr; in isEndOfBlockCommentWithEscapedNewLine()
2789 } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') { in isEndOfBlockCommentWithEscapedNewLine()
2791 TrigraphPos = CurPtr - 2; in isEndOfBlockCommentWithEscapedNewLine()
2792 CurPtr -= 3; in isEndOfBlockCommentWithEscapedNewLine()
2798 // splicing we have a '*/' ending the comment. in isEndOfBlockCommentWithEscapedNewLine()
2810 if (!L->isLexingRawMode()) in isEndOfBlockCommentWithEscapedNewLine()
2811 L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment); in isEndOfBlockCommentWithEscapedNewLine()
2814 if (!L->isLexingRawMode()) in isEndOfBlockCommentWithEscapedNewLine()
2815 L->Diag(TrigraphPos, diag::trigraph_ends_block_comment); in isEndOfBlockCommentWithEscapedNewLine()
2819 if (!L->isLexingRawMode()) in isEndOfBlockCommentWithEscapedNewLine()
2820 L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end); in isEndOfBlockCommentWithEscapedNewLine()
2823 if (SpacePos && !L->isLexingRawMode()) in isEndOfBlockCommentWithEscapedNewLine()
2824 L->Diag(SpacePos, diag::backslash_newline_space); in isEndOfBlockCommentWithEscapedNewLine()
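A rough sketch of the check performed above, handling only a single escaped newline (the real function loops over a series of them) and omitting all diagnostics; trigraphs are assumed enabled and the helper name endsBlockCommentViaSplice is invented:

    // CurPtr points at a '\n' or '\r' that sits immediately before a '/'.
    // The comment really ends here only if that newline is escaped by '\'
    // (or the '??/' trigraph) and the escape is preceded by '*', so that
    // line splicing glues "*" and "/" into "*/".
    static bool endsBlockCommentViaSplice(const char *CurPtr,
                                          const char *BufStart) {
      --CurPtr;                                   // back up off the newline
      if (CurPtr < BufStart)
        return false;
      // Skip the other half of a \r\n / \n\r pair; \n\n or \r\r is not escaped.
      if (*CurPtr == '\n' || *CurPtr == '\r') {
        if (*CurPtr == CurPtr[1])
          return false;
        --CurPtr;
      }
      // Whitespace is tolerated between the backslash and the newline.
      while (CurPtr >= BufStart && (*CurPtr == ' ' || *CurPtr == '\t'))
        --CurPtr;
      // The escape itself: a '\' or its '??/' trigraph spelling.
      if (CurPtr >= BufStart && *CurPtr == '\\')
        --CurPtr;
      else if (CurPtr - 2 >= BufStart && CurPtr[0] == '/' && CurPtr[-1] == '?' &&
               CurPtr[-2] == '?')
        CurPtr -= 3;
      else
        return false;
      // After splicing, a preceding '*' means the comment ends with "*/".
      return CurPtr >= BufStart && *CurPtr == '*';
    }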
2836 /// We have just read from input the / and * characters that started a comment.
2837 /// Read until we find the * and / characters that terminate the comment.
2839 /// comments, because they cannot cause the comment to end. The only thing
2840 /// that can happen is the comment could end with an escaped newline between
2861 --CurPtr; in SkipBlockComment()
2863 // KeepWhitespaceMode should return this broken comment as a token. Since in SkipBlockComment()
2864 // it isn't a well formed comment, just return it as an 'unknown' token. in SkipBlockComment()
2875 // then this slash does not end the block comment; it is part of it. in SkipBlockComment()
2880 // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a in SkipBlockComment()
2881 // diagnostic only once per entire ill-formed subsequence to avoid in SkipBlockComment()
2882 // emitting too many diagnostics (see http://unicode.org/review/pr-121.html). in SkipBlockComment()
2886 // Skip over all non-interesting characters until we find end of buffer or a in SkipBlockComment()
2889 // If there is a code-completion point avoid the fast scan because it in SkipBlockComment()
2891 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { in SkipBlockComment()
2892 // While not aligned to a 16-byte boundary. in SkipBlockComment()
2959 // Loop to scan the remainder, warning on invalid UTF-8 in SkipBlockComment()
2972 (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd); in SkipBlockComment()
2975 Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment); in SkipBlockComment()
2979 CurPtr += Length - 1; in SkipBlockComment()
2986 if (CurPtr[-2] == '*') // We found the final */. We're done! in SkipBlockComment()
2989 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { in SkipBlockComment()
2990 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this, in SkipBlockComment()
2998 // If this is a /* inside of the comment, emit a warning. Don't do this in SkipBlockComment()
2999 // if this is a /*/, which will end the comment. This misses cases with in SkipBlockComment()
3002 Diag(CurPtr-1, diag::warn_nested_block_comment); in SkipBlockComment()
3009 // comment, which surely would confuse the parser. in SkipBlockComment()
3010 --CurPtr; in SkipBlockComment()
3012 // KeepWhitespaceMode should return this broken comment as a token. Since in SkipBlockComment()
3013 // it isn't a well formed comment, just return it as an 'unknown' token. in SkipBlockComment()
3021 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { in SkipBlockComment()
3022 PP->CodeCompleteNaturalLanguage(); in SkipBlockComment()
3030 // Notify comment handlers about the comment unless we're in a #if 0 block. in SkipBlockComment()
3032 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), in SkipBlockComment()
3038 // If we are returning comments as tokens, return this comment as a token. in SkipBlockComment()
3040 FormTokenWithChars(Result, CurPtr, tok::comment); in SkipBlockComment()
3044 // It is common for the tokens immediately after a /**/ comment to be in SkipBlockComment()
3047 // have already returned above with the comment as a token. in SkipBlockComment()
3059 //===----------------------------------------------------------------------===//
3061 //===----------------------------------------------------------------------===//
3063 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
3071 // CurPtr - Cache BufferPtr in an automatic variable. in ReadToEndOfLine()
3078 Result->push_back(Char); in ReadToEndOfLine()
3082 if (CurPtr-1 != BufferEnd) { in ReadToEndOfLine()
3083 if (isCodeCompletionPoint(CurPtr-1)) { in ReadToEndOfLine()
3084 PP->CodeCompleteNaturalLanguage(); in ReadToEndOfLine()
3091 Result->push_back(Char); in ReadToEndOfLine()
3099 assert(CurPtr[-1] == Char && "Trigraphs for newline?"); in ReadToEndOfLine()
3100 BufferPtr = CurPtr-1; in ReadToEndOfLine()
3106 PP->CodeCompleteNaturalLanguage(); in ReadToEndOfLine()
3117 /// LexEndOfFile - CurPtr points to the end of this file. Handle this
3131 // Restore comment saving mode, in case it was disabled for directive. in LexEndOfFile()
3146 if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) { in LexEndOfFile()
3147 PP->setRecordedPreambleConditionalStack(ConditionalStack); in LexEndOfFile()
3160 if (PP->getCodeCompletionFileLoc() != FileLoc) in LexEndOfFile()
3161 PP->Diag(ConditionalStack.back().IfLoc, in LexEndOfFile()
3166 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue in LexEndOfFile()
3168 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) { in LexEndOfFile()
3169 DiagnosticsEngine &Diags = PP->getDiagnostics(); in LexEndOfFile()
3176 // non-extension, user-requested "missing newline at EOF" warning. in LexEndOfFile()
3193 return PP->HandleEndOfFile(Result, isPragmaLexer()); in LexEndOfFile()
3196 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3232 // Restore the lexer back to non-skipping mode. in isNextPPTokenLParen()
3245 auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen); in FindConflictEnd()
3250 (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) { in FindConflictEnd()
3260 /// IsStartOfConflictMarker - If the specified pointer is the start of a version
3267 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') in IsStartOfConflictMarker()
3271 if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") && in IsStartOfConflictMarker()
3272 !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> ")) in IsStartOfConflictMarker()
3291 // end-of-conflict marker starts with \r or \n. in IsStartOfConflictMarker()
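A hedged sketch of the start-of-conflict-marker test these fragments describe: the marker must sit at the start of a line and spell either the Git-style "<<<<<<<" or the Perforce-style ">>>> " prefix. The real check additionally requires FindConflictEnd to locate a matching terminator further down, which is omitted here (the name looksLikeConflictMarkerStart is invented):

    #include <string_view>

    static bool looksLikeConflictMarkerStart(const char *CurPtr,
                                             const char *BufferStart,
                                             const char *BufferEnd) {
      // Must be at the start of the buffer or of a line.
      if (CurPtr != BufferStart && CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
        return false;
      std::string_view Rest(CurPtr, static_cast<std::size_t>(BufferEnd - CurPtr));
      return Rest.substr(0, 7) == "<<<<<<<" || Rest.substr(0, 5) == ">>>> ";
    }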
3304 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3311 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') in HandleEndOfConflictMarker()
3349 BufferEnd -= 1; // Scan until the second-to-last character. in findPlaceholderEnd()
3358 assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!"); in lexEditorPlaceholder()
3359 if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode) in lexEditorPlaceholder()
3364 const char *Start = CurPtr - 1; in lexEditorPlaceholder()
3370 PP->LookUpIdentifierInfo(Result); in lexEditorPlaceholder()
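A sketch of the "<#...#>" end scan referenced above: walk forward looking for a "#>" pair, stopping one character short of the buffer end so that reading CurPtr[1] stays in bounds (the name placeholderEnd mirrors the real helper but this is an illustrative re-statement, not its code):

    static const char *placeholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
      if (CurPtr == BufferEnd)
        return nullptr;
      --BufferEnd;                       // so that CurPtr[1] stays in bounds
      for (; CurPtr != BufferEnd; ++CurPtr)
        if (CurPtr[0] == '#' && CurPtr[1] == '>')
          return CurPtr + 2;             // just past the terminating "#>"
      return nullptr;                    // unterminated placeholder
    }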
3377 if (PP && PP->isCodeCompletionEnabled()) { in isCodeCompletionPoint()
3378 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); in isCodeCompletionPoint()
3379 return Loc == PP->getCodeCompletionLoc(); in isCodeCompletionPoint()
3410 const char *KindLoc = &CurPtr[-1]; in tryReadNumericUCN()
3428 if (Value == -1U) { in tryReadNumericUCN()
3477 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23 in tryReadNumericUCN()
3480 << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0); in tryReadNumericUCN()
3484 Result->setFlag(Token::HasUCN); in tryReadNumericUCN()
3488 if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0))) in tryReadNumericUCN()
3509 const char *KindLoc = &CurPtr[-1]; in tryReadNamedUCN()
3551 << makeCharRange(*this, StartName, CurPtr - CharSize); in tryReadNamedUCN()
3555 makeCharRange(*this, StartName, CurPtr - CharSize), in tryReadNamedUCN()
3556 LooseMatch->Name); in tryReadNamedUCN()
3565 Diag(SlashLoc, PP->getLangOpts().CPlusPlus23 in tryReadNamedUCN()
3568 << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0); in tryReadNamedUCN()
3575 Match = LooseMatch->CodePoint; in tryReadNamedUCN()
3578 Result->setFlag(Token::HasUCN); in tryReadNamedUCN()
3582 if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3)) in tryReadNamedUCN()
3615 // - in the range D800 through DFFF inclusive; or in tryReadUCN()
3616 // - greater than 10FFFF. in tryReadUCN()
3617 // A universal-character-name outside the c-char-sequence of a character in tryReadUCN()
3618 // constant, or the s-char-sequence of a string-literal shall not designate in tryReadUCN()
3622 // universal-character-name corresponds to a surrogate code point (in the in tryReadUCN()
3623 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, in tryReadUCN()
3624 // if the hexadecimal value for a universal-character-name outside the in tryReadUCN()
3625 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or in tryReadUCN()
3627 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the in tryReadUCN()
3628 // basic source character set, the program is ill-formed. in tryReadUCN()
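A worked sketch of the range rules quoted above: surrogate code points and values past 0x10FFFF are never valid, and control characters (the C0 and C1 ranges) are rejected outside character and string literals. The basic-source-character-set check and the per-standard wording differences are deliberately left out, and the InLiteral flag is a simplification of the c-char/s-char/r-char conditions (the name ucnValueIsAllowed is invented):

    #include <cstdint>

    static bool ucnValueIsAllowed(std::uint32_t CodePoint, bool InLiteral) {
      if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF)
        return false;                                 // surrogate code point
      if (CodePoint > 0x10FFFF)
        return false;                                 // beyond Unicode's range
      if (!InLiteral &&
          (CodePoint <= 0x1F || (CodePoint >= 0x7F && CodePoint <= 0x9F)))
        return false;                                 // control character
      return true;
    }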
3660 if (!isLexingRawMode() && !PP->isPreprocessedOutput() && in CheckUnicodeWhitespace()
3710 /// LexTokenInternal - This implements a simple C family lexer. It is an
3720 // CurPtr - Cache BufferPtr in an automatic variable. in LexTokenInternal()
3754 if (CurPtr-1 == BufferEnd) in LexTokenInternal()
3755 return LexEndOfFile(Result, CurPtr-1); in LexTokenInternal()
3758 if (isCodeCompletionPoint(CurPtr-1)) { in LexTokenInternal()
3759 // Return the code-completion token. in LexTokenInternal()
3766 Diag(CurPtr-1, diag::null_in_file); in LexTokenInternal()
3779 Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft); in LexTokenInternal()
3780 return LexEndOfFile(Result, CurPtr-1); in LexTokenInternal()
3798 // Restore comment saving mode, in case it was disabled for directive. in LexTokenInternal()
3805 NewLinePtr = CurPtr - 1; in LexTokenInternal()
3832 // If the next token is obviously a // or /* */ comment, skip it efficiently in LexTokenInternal()
3854 // Notify MIOpt that we read a non-whitespace/non-comment token. in LexTokenInternal()
3859 // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or in LexTokenInternal()
3860 // UTF-8 or UTF-16 string literal (C11/C++11). in LexTokenInternal()
3862 // Notify MIOpt that we read a non-whitespace/non-comment token. in LexTokenInternal()
3868 // UTF-16 string literal in LexTokenInternal()
3873 // UTF-16 character constant in LexTokenInternal()
3878 // UTF-16 raw string literal in LexTokenInternal()
3889 // UTF-8 string literal in LexTokenInternal()
3904 // UTF-8 raw string literal in LexTokenInternal()
3919 case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal in LexTokenInternal()
3920 // Notify MIOpt that we read a non-whitespace/non-comment token. in LexTokenInternal()
3926 // UTF-32 string literal in LexTokenInternal()
3931 // UTF-32 character constant in LexTokenInternal()
3936 // UTF-32 raw string literal in LexTokenInternal()
3949 // Notify MIOpt that we read a non-whitespace/non-comment token. in LexTokenInternal()
3965 // Notify MIOpt that we read a non-whitespace/non-comment token. in LexTokenInternal()
3999 // Notify MIOpt that we read a non-whitespace/non-comment token. in LexTokenInternal()
4006 Diag(CurPtr-1, diag::ext_dollar_in_identifier); in LexTokenInternal()
4007 // Notify MIOpt that we read a non-whitespace/non-comment token. in LexTokenInternal()
4017 // Notify MIOpt that we read a non-whitespace/non-comment token. in LexTokenInternal()
4023 // Notify MIOpt that we read a non-whitespace/non-comment token. in LexTokenInternal()
4054 // Notify MIOpt that we read a non-whitespace/non-comment token. in LexTokenInternal()
4102 case '-': in LexTokenInternal()
4104 if (Char == '-') { // -- in LexTokenInternal()
4108 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* in LexTokenInternal()
4112 } else if (Char == '>') { // -> in LexTokenInternal()
4115 } else if (Char == '=') { // -= in LexTokenInternal()
4136 if (Char == '/') { // Line comment. in LexTokenInternal()
4138 // want to lex this as a comment. There is one problem with this though, in LexTokenInternal()
4143 // If so, we will lex that as a "/" instead of the start of a comment. in LexTokenInternal()
4148 if (!(PP && PP->isPreprocessedOutput())) in LexTokenInternal()
4156 // It is common for the tokens immediately after a // comment to be in LexTokenInternal()
4163 if (Char == '*') { // /**/ comment. in LexTokenInternal()
4186 Kind = tok::r_brace; // '%>' -> '}' in LexTokenInternal()
4192 Kind = tok::hashhash; // '%:%:' -> '##' in LexTokenInternal()
4195 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize in LexTokenInternal()
4200 } else { // '%:' -> '#' in LexTokenInternal()
4204 // TODO: -fpreprocessed mode?? in LexTokenInternal()
4224 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { in LexTokenInternal()
4228 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { in LexTokenInternal()
4229 // If this is '<<<<' and we're in a Perforce-style conflict marker, in LexTokenInternal()
4261 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' in LexTokenInternal()
4281 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' in LexTokenInternal()
4302 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { in LexTokenInternal()
4306 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { in LexTokenInternal()
4340 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) in LexTokenInternal()
4351 Kind = tok::r_square; // ':>' -> ']' in LexTokenInternal()
4367 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) in LexTokenInternal()
4384 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize in LexTokenInternal()
4393 // TODO: -fpreprocessed mode?? in LexTokenInternal()
4403 if (CurPtr[-1] == '@' && LangOpts.ObjC) in LexTokenInternal()
4439 --CurPtr; in LexTokenInternal()
4458 PP->isPreprocessedOutput()) { in LexTokenInternal()
4464 // Non-ASCII characters tend to creep into source code unintentionally. in LexTokenInternal()
4466 // just diagnose the invalid UTF-8, then drop the character. in LexTokenInternal()
4477 // Notify MIOpt that we read a non-whitespace/non-comment token. in LexTokenInternal()
4488 PP->HandleDirective(Result); in LexTokenInternal()
4490 if (PP->hadModuleLoaderFatalFailure()) in LexTokenInternal()
4554 PP->HandleDirective(Result); in LexDependencyDirectiveToken()
4560 const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); in LexDependencyDirectiveToken()
4561 if (II->isHandleIdentifierCase()) in LexDependencyDirectiveToken()
4562 return PP->HandleIdentifier(Result); in LexDependencyDirectiveToken()
4634 --NestedIfs; in LexDependencyDirectiveTokenWhileSkipping()