1 //===- DependencyDirectivesScanner.cpp ------------------------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 /// 9 /// \file 10 /// This is the interface for scanning header and source files to get the 11 /// minimum necessary preprocessor directives for evaluating includes. It 12 /// reduces the source down to #define, #include, #import, @import, and any 13 /// conditional preprocessor logic that contains one of those. 14 /// 15 //===----------------------------------------------------------------------===// 16 17 #include "clang/Lex/DependencyDirectivesScanner.h" 18 #include "clang/Basic/CharInfo.h" 19 #include "clang/Basic/Diagnostic.h" 20 #include "clang/Lex/LexDiagnostic.h" 21 #include "clang/Lex/Lexer.h" 22 #include "llvm/ADT/ScopeExit.h" 23 #include "llvm/ADT/SmallString.h" 24 #include "llvm/ADT/StringMap.h" 25 #include "llvm/ADT/StringSwitch.h" 26 #include <optional> 27 28 using namespace clang; 29 using namespace clang::dependency_directives_scan; 30 using namespace llvm; 31 32 namespace { 33 34 struct DirectiveWithTokens { 35 DirectiveKind Kind; 36 unsigned NumTokens; 37 38 DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens) 39 : Kind(Kind), NumTokens(NumTokens) {} 40 }; 41 42 /// Does an efficient "scan" of the sources to detect the presence of 43 /// preprocessor (or module import) directives and collects the raw lexed tokens 44 /// for those directives so that the \p Lexer can "replay" them when the file is 45 /// included. 46 /// 47 /// Note that the behavior of the raw lexer is affected by the language mode, 48 /// while at this point we want to do a scan and collect tokens once, 49 /// irrespective of the language mode that the file will get included in. To 50 /// compensate for that the \p Lexer, while "replaying", will adjust a token 51 /// where appropriate, when it could affect the preprocessor's state. 52 /// For example in a directive like 53 /// 54 /// \code 55 /// #if __has_cpp_attribute(clang::fallthrough) 56 /// \endcode 57 /// 58 /// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2 59 /// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon' 60 /// while in C++ mode. 61 struct Scanner { 62 Scanner(StringRef Input, 63 SmallVectorImpl<dependency_directives_scan::Token> &Tokens, 64 DiagnosticsEngine *Diags, SourceLocation InputSourceLoc) 65 : Input(Input), Tokens(Tokens), Diags(Diags), 66 InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()), 67 TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(), 68 Input.end()) {} 69 70 static LangOptions getLangOptsForDepScanning() { 71 LangOptions LangOpts; 72 // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'. 73 LangOpts.ObjC = true; 74 LangOpts.LineComment = true; 75 return LangOpts; 76 } 77 78 /// Lex the provided source and emit the directive tokens. 79 /// 80 /// \returns True on error. 81 bool scan(SmallVectorImpl<Directive> &Directives); 82 83 private: 84 /// Lexes next token and advances \p First and the \p Lexer. 85 [[nodiscard]] dependency_directives_scan::Token & 86 lexToken(const char *&First, const char *const End); 87 88 dependency_directives_scan::Token &lexIncludeFilename(const char *&First, 89 const char *const End); 90 91 void skipLine(const char *&First, const char *const End); 92 void skipDirective(StringRef Name, const char *&First, const char *const End); 93 94 /// Lexes next token and if it is identifier returns its string, otherwise 95 /// it skips the current line and returns \p std::nullopt. 96 /// 97 /// In any case (whatever the token kind) \p First and the \p Lexer will 98 /// advance beyond the token. 99 [[nodiscard]] std::optional<StringRef> 100 tryLexIdentifierOrSkipLine(const char *&First, const char *const End); 101 102 /// Used when it is certain that next token is an identifier. 103 [[nodiscard]] StringRef lexIdentifier(const char *&First, 104 const char *const End); 105 106 /// Lexes next token and returns true iff it is an identifier that matches \p 107 /// Id, otherwise it skips the current line and returns false. 108 /// 109 /// In any case (whatever the token kind) \p First and the \p Lexer will 110 /// advance beyond the token. 111 [[nodiscard]] bool isNextIdentifierOrSkipLine(StringRef Id, 112 const char *&First, 113 const char *const End); 114 115 [[nodiscard]] bool scanImpl(const char *First, const char *const End); 116 [[nodiscard]] bool lexPPLine(const char *&First, const char *const End); 117 [[nodiscard]] bool lexAt(const char *&First, const char *const End); 118 [[nodiscard]] bool lexModule(const char *&First, const char *const End); 119 [[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First, 120 const char *const End); 121 [[nodiscard]] bool lexPragma(const char *&First, const char *const End); 122 [[nodiscard]] bool lexEndif(const char *&First, const char *const End); 123 [[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First, 124 const char *const End); 125 [[nodiscard]] bool lexModuleDirectiveBody(DirectiveKind Kind, 126 const char *&First, 127 const char *const End); 128 void lexPPDirectiveBody(const char *&First, const char *const End); 129 130 DirectiveWithTokens &pushDirective(DirectiveKind Kind) { 131 Tokens.append(CurDirToks); 132 DirsWithToks.emplace_back(Kind, CurDirToks.size()); 133 CurDirToks.clear(); 134 return DirsWithToks.back(); 135 } 136 void popDirective() { 137 Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens); 138 } 139 DirectiveKind topDirective() const { 140 return DirsWithToks.empty() ? pp_none : DirsWithToks.back().Kind; 141 } 142 143 unsigned getOffsetAt(const char *CurPtr) const { 144 return CurPtr - Input.data(); 145 } 146 147 /// Reports a diagnostic if the diagnostic engine is provided. Always returns 148 /// true at the end. 149 bool reportError(const char *CurPtr, unsigned Err); 150 151 StringMap<char> SplitIds; 152 StringRef Input; 153 SmallVectorImpl<dependency_directives_scan::Token> &Tokens; 154 DiagnosticsEngine *Diags; 155 SourceLocation InputSourceLoc; 156 157 const char *LastTokenPtr = nullptr; 158 /// Keeps track of the tokens for the currently lexed directive. Once a 159 /// directive is fully lexed and "committed" then the tokens get appended to 160 /// \p Tokens and \p CurDirToks is cleared for the next directive. 161 SmallVector<dependency_directives_scan::Token, 32> CurDirToks; 162 /// The directives that were lexed along with the number of tokens that each 163 /// directive contains. The tokens of all the directives are kept in \p Tokens 164 /// vector, in the same order as the directives order in \p DirsWithToks. 165 SmallVector<DirectiveWithTokens, 64> DirsWithToks; 166 LangOptions LangOpts; 167 Lexer TheLexer; 168 }; 169 170 } // end anonymous namespace 171 172 bool Scanner::reportError(const char *CurPtr, unsigned Err) { 173 if (!Diags) 174 return true; 175 assert(CurPtr >= Input.data() && "invalid buffer ptr"); 176 Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err); 177 return true; 178 } 179 180 static void skipOverSpaces(const char *&First, const char *const End) { 181 while (First != End && isHorizontalWhitespace(*First)) 182 ++First; 183 } 184 185 [[nodiscard]] static bool isRawStringLiteral(const char *First, 186 const char *Current) { 187 assert(First <= Current); 188 189 // Check if we can even back up. 190 if (*Current != '"' || First == Current) 191 return false; 192 193 // Check for an "R". 194 --Current; 195 if (*Current != 'R') 196 return false; 197 if (First == Current || !isAsciiIdentifierContinue(*--Current)) 198 return true; 199 200 // Check for a prefix of "u", "U", or "L". 201 if (*Current == 'u' || *Current == 'U' || *Current == 'L') 202 return First == Current || !isAsciiIdentifierContinue(*--Current); 203 204 // Check for a prefix of "u8". 205 if (*Current != '8' || First == Current || *Current-- != 'u') 206 return false; 207 return First == Current || !isAsciiIdentifierContinue(*--Current); 208 } 209 210 static void skipRawString(const char *&First, const char *const End) { 211 assert(First[0] == '"'); 212 assert(First[-1] == 'R'); 213 214 const char *Last = ++First; 215 while (Last != End && *Last != '(') 216 ++Last; 217 if (Last == End) { 218 First = Last; // Hit the end... just give up. 219 return; 220 } 221 222 StringRef Terminator(First, Last - First); 223 for (;;) { 224 // Move First to just past the next ")". 225 First = Last; 226 while (First != End && *First != ')') 227 ++First; 228 if (First == End) 229 return; 230 ++First; 231 232 // Look ahead for the terminator sequence. 233 Last = First; 234 while (Last != End && size_t(Last - First) < Terminator.size() && 235 Terminator[Last - First] == *Last) 236 ++Last; 237 238 // Check if we hit it (or the end of the file). 239 if (Last == End) { 240 First = Last; 241 return; 242 } 243 if (size_t(Last - First) < Terminator.size()) 244 continue; 245 if (*Last != '"') 246 continue; 247 First = Last + 1; 248 return; 249 } 250 } 251 252 // Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n) 253 static unsigned isEOL(const char *First, const char *const End) { 254 if (First == End) 255 return 0; 256 if (End - First > 1 && isVerticalWhitespace(First[0]) && 257 isVerticalWhitespace(First[1]) && First[0] != First[1]) 258 return 2; 259 return !!isVerticalWhitespace(First[0]); 260 } 261 262 static void skipString(const char *&First, const char *const End) { 263 assert(*First == '\'' || *First == '"' || *First == '<'); 264 const char Terminator = *First == '<' ? '>' : *First; 265 for (++First; First != End && *First != Terminator; ++First) { 266 // String and character literals don't extend past the end of the line. 267 if (isVerticalWhitespace(*First)) 268 return; 269 if (*First != '\\') 270 continue; 271 // Skip past backslash to the next character. This ensures that the 272 // character right after it is skipped as well, which matters if it's 273 // the terminator. 274 if (++First == End) 275 return; 276 if (!isWhitespace(*First)) 277 continue; 278 // Whitespace after the backslash might indicate a line continuation. 279 const char *FirstAfterBackslashPastSpace = First; 280 skipOverSpaces(FirstAfterBackslashPastSpace, End); 281 if (unsigned NLSize = isEOL(FirstAfterBackslashPastSpace, End)) { 282 // Advance the character pointer to the next line for the next 283 // iteration. 284 First = FirstAfterBackslashPastSpace + NLSize - 1; 285 } 286 } 287 if (First != End) 288 ++First; // Finish off the string. 289 } 290 291 // Returns the length of the skipped newline 292 static unsigned skipNewline(const char *&First, const char *End) { 293 if (First == End) 294 return 0; 295 assert(isVerticalWhitespace(*First)); 296 unsigned Len = isEOL(First, End); 297 assert(Len && "expected newline"); 298 First += Len; 299 return Len; 300 } 301 302 static bool wasLineContinuation(const char *First, unsigned EOLLen) { 303 return *(First - (int)EOLLen - 1) == '\\'; 304 } 305 306 static void skipToNewlineRaw(const char *&First, const char *const End) { 307 for (;;) { 308 if (First == End) 309 return; 310 311 unsigned Len = isEOL(First, End); 312 if (Len) 313 return; 314 315 do { 316 if (++First == End) 317 return; 318 Len = isEOL(First, End); 319 } while (!Len); 320 321 if (First[-1] != '\\') 322 return; 323 324 First += Len; 325 // Keep skipping lines... 326 } 327 } 328 329 static void skipLineComment(const char *&First, const char *const End) { 330 assert(First[0] == '/' && First[1] == '/'); 331 First += 2; 332 skipToNewlineRaw(First, End); 333 } 334 335 static void skipBlockComment(const char *&First, const char *const End) { 336 assert(First[0] == '/' && First[1] == '*'); 337 if (End - First < 4) { 338 First = End; 339 return; 340 } 341 for (First += 3; First != End; ++First) 342 if (First[-1] == '*' && First[0] == '/') { 343 ++First; 344 return; 345 } 346 } 347 348 /// \returns True if the current single quotation mark character is a C++ 14 349 /// digit separator. 350 static bool isQuoteCppDigitSeparator(const char *const Start, 351 const char *const Cur, 352 const char *const End) { 353 assert(*Cur == '\'' && "expected quotation character"); 354 // skipLine called in places where we don't expect a valid number 355 // body before `start` on the same line, so always return false at the start. 356 if (Start == Cur) 357 return false; 358 // The previous character must be a valid PP number character. 359 // Make sure that the L, u, U, u8 prefixes don't get marked as a 360 // separator though. 361 char Prev = *(Cur - 1); 362 if (Prev == 'L' || Prev == 'U' || Prev == 'u') 363 return false; 364 if (Prev == '8' && (Cur - 1 != Start) && *(Cur - 2) == 'u') 365 return false; 366 if (!isPreprocessingNumberBody(Prev)) 367 return false; 368 // The next character should be a valid identifier body character. 369 return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1)); 370 } 371 372 void Scanner::skipLine(const char *&First, const char *const End) { 373 for (;;) { 374 assert(First <= End); 375 if (First == End) 376 return; 377 378 if (isVerticalWhitespace(*First)) { 379 skipNewline(First, End); 380 return; 381 } 382 const char *Start = First; 383 while (First != End && !isVerticalWhitespace(*First)) { 384 // Iterate over strings correctly to avoid comments and newlines. 385 if (*First == '"' || 386 (*First == '\'' && !isQuoteCppDigitSeparator(Start, First, End))) { 387 LastTokenPtr = First; 388 if (isRawStringLiteral(Start, First)) 389 skipRawString(First, End); 390 else 391 skipString(First, End); 392 continue; 393 } 394 395 // Iterate over comments correctly. 396 if (*First != '/' || End - First < 2) { 397 LastTokenPtr = First; 398 ++First; 399 continue; 400 } 401 402 if (First[1] == '/') { 403 // "//...". 404 skipLineComment(First, End); 405 continue; 406 } 407 408 if (First[1] != '*') { 409 LastTokenPtr = First; 410 ++First; 411 continue; 412 } 413 414 // "/*...*/". 415 skipBlockComment(First, End); 416 } 417 if (First == End) 418 return; 419 420 // Skip over the newline. 421 unsigned Len = skipNewline(First, End); 422 if (!wasLineContinuation(First, Len)) // Continue past line-continuations. 423 break; 424 } 425 } 426 427 void Scanner::skipDirective(StringRef Name, const char *&First, 428 const char *const End) { 429 if (llvm::StringSwitch<bool>(Name) 430 .Case("warning", true) 431 .Case("error", true) 432 .Default(false)) 433 // Do not process quotes or comments. 434 skipToNewlineRaw(First, End); 435 else 436 skipLine(First, End); 437 } 438 439 static void skipWhitespace(const char *&First, const char *const End) { 440 for (;;) { 441 assert(First <= End); 442 skipOverSpaces(First, End); 443 444 if (End - First < 2) 445 return; 446 447 if (First[0] == '\\' && isVerticalWhitespace(First[1])) { 448 skipNewline(++First, End); 449 continue; 450 } 451 452 // Check for a non-comment character. 453 if (First[0] != '/') 454 return; 455 456 // "// ...". 457 if (First[1] == '/') { 458 skipLineComment(First, End); 459 return; 460 } 461 462 // Cannot be a comment. 463 if (First[1] != '*') 464 return; 465 466 // "/*...*/". 467 skipBlockComment(First, End); 468 } 469 } 470 471 bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First, 472 const char *const End) { 473 const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset; 474 for (;;) { 475 const dependency_directives_scan::Token &Tok = lexToken(First, End); 476 if (Tok.is(tok::eof)) 477 return reportError( 478 DirectiveLoc, 479 diag::err_dep_source_scanner_missing_semi_after_at_import); 480 if (Tok.is(tok::semi)) 481 break; 482 } 483 pushDirective(Kind); 484 skipWhitespace(First, End); 485 if (First == End) 486 return false; 487 if (!isVerticalWhitespace(*First)) 488 return reportError( 489 DirectiveLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import); 490 skipNewline(First, End); 491 return false; 492 } 493 494 dependency_directives_scan::Token &Scanner::lexToken(const char *&First, 495 const char *const End) { 496 clang::Token Tok; 497 TheLexer.LexFromRawLexer(Tok); 498 First = Input.data() + TheLexer.getCurrentBufferOffset(); 499 assert(First <= End); 500 501 unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength(); 502 CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(), 503 Tok.getFlags()); 504 return CurDirToks.back(); 505 } 506 507 dependency_directives_scan::Token & 508 Scanner::lexIncludeFilename(const char *&First, const char *const End) { 509 clang::Token Tok; 510 TheLexer.LexIncludeFilename(Tok); 511 First = Input.data() + TheLexer.getCurrentBufferOffset(); 512 assert(First <= End); 513 514 unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength(); 515 CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(), 516 Tok.getFlags()); 517 return CurDirToks.back(); 518 } 519 520 void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) { 521 while (true) { 522 const dependency_directives_scan::Token &Tok = lexToken(First, End); 523 if (Tok.is(tok::eod)) 524 break; 525 } 526 } 527 528 [[nodiscard]] std::optional<StringRef> 529 Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) { 530 const dependency_directives_scan::Token &Tok = lexToken(First, End); 531 if (Tok.isNot(tok::raw_identifier)) { 532 if (!Tok.is(tok::eod)) 533 skipLine(First, End); 534 return std::nullopt; 535 } 536 537 bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning; 538 if (LLVM_LIKELY(!NeedsCleaning)) 539 return Input.slice(Tok.Offset, Tok.getEnd()); 540 541 SmallString<64> Spelling; 542 Spelling.resize(Tok.Length); 543 544 unsigned SpellingLength = 0; 545 const char *BufPtr = Input.begin() + Tok.Offset; 546 const char *AfterIdent = Input.begin() + Tok.getEnd(); 547 while (BufPtr < AfterIdent) { 548 unsigned Size; 549 Spelling[SpellingLength++] = 550 Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 551 BufPtr += Size; 552 } 553 554 return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0) 555 .first->first(); 556 } 557 558 StringRef Scanner::lexIdentifier(const char *&First, const char *const End) { 559 std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End); 560 assert(Id && "expected identifier token"); 561 return *Id; 562 } 563 564 bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First, 565 const char *const End) { 566 if (std::optional<StringRef> FoundId = 567 tryLexIdentifierOrSkipLine(First, End)) { 568 if (*FoundId == Id) 569 return true; 570 skipLine(First, End); 571 } 572 return false; 573 } 574 575 bool Scanner::lexAt(const char *&First, const char *const End) { 576 // Handle "@import". 577 578 // Lex '@'. 579 const dependency_directives_scan::Token &AtTok = lexToken(First, End); 580 assert(AtTok.is(tok::at)); 581 (void)AtTok; 582 583 if (!isNextIdentifierOrSkipLine("import", First, End)) 584 return false; 585 return lexModuleDirectiveBody(decl_at_import, First, End); 586 } 587 588 bool Scanner::lexModule(const char *&First, const char *const End) { 589 StringRef Id = lexIdentifier(First, End); 590 bool Export = false; 591 if (Id == "export") { 592 Export = true; 593 std::optional<StringRef> NextId = tryLexIdentifierOrSkipLine(First, End); 594 if (!NextId) 595 return false; 596 Id = *NextId; 597 } 598 599 if (Id != "module" && Id != "import") { 600 skipLine(First, End); 601 return false; 602 } 603 604 skipWhitespace(First, End); 605 606 // Ignore this as a module directive if the next character can't be part of 607 // an import. 608 609 switch (*First) { 610 case ':': 611 case '<': 612 case '"': 613 break; 614 default: 615 if (!isAsciiIdentifierContinue(*First)) { 616 skipLine(First, End); 617 return false; 618 } 619 } 620 621 TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ false); 622 623 DirectiveKind Kind; 624 if (Id == "module") 625 Kind = Export ? cxx_export_module_decl : cxx_module_decl; 626 else 627 Kind = Export ? cxx_export_import_decl : cxx_import_decl; 628 629 return lexModuleDirectiveBody(Kind, First, End); 630 } 631 632 bool Scanner::lexPragma(const char *&First, const char *const End) { 633 std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End); 634 if (!FoundId) 635 return false; 636 637 StringRef Id = *FoundId; 638 auto Kind = llvm::StringSwitch<DirectiveKind>(Id) 639 .Case("once", pp_pragma_once) 640 .Case("push_macro", pp_pragma_push_macro) 641 .Case("pop_macro", pp_pragma_pop_macro) 642 .Case("include_alias", pp_pragma_include_alias) 643 .Default(pp_none); 644 if (Kind != pp_none) { 645 lexPPDirectiveBody(First, End); 646 pushDirective(Kind); 647 return false; 648 } 649 650 if (Id != "clang") { 651 skipLine(First, End); 652 return false; 653 } 654 655 // #pragma clang. 656 if (!isNextIdentifierOrSkipLine("module", First, End)) 657 return false; 658 659 // #pragma clang module. 660 if (!isNextIdentifierOrSkipLine("import", First, End)) 661 return false; 662 663 // #pragma clang module import. 664 lexPPDirectiveBody(First, End); 665 pushDirective(pp_pragma_import); 666 return false; 667 } 668 669 bool Scanner::lexEndif(const char *&First, const char *const End) { 670 // Strip out "#else" if it's empty. 671 if (topDirective() == pp_else) 672 popDirective(); 673 674 // If "#ifdef" is empty, strip it and skip the "#endif". 675 // 676 // FIXME: Once/if Clang starts disallowing __has_include in macro expansions, 677 // we can skip empty `#if` and `#elif` blocks as well after scanning for a 678 // literal __has_include in the condition. Even without that rule we could 679 // drop the tokens if we scan for identifiers in the condition and find none. 680 if (topDirective() == pp_ifdef || topDirective() == pp_ifndef) { 681 popDirective(); 682 skipLine(First, End); 683 return false; 684 } 685 686 return lexDefault(pp_endif, First, End); 687 } 688 689 bool Scanner::lexDefault(DirectiveKind Kind, const char *&First, 690 const char *const End) { 691 lexPPDirectiveBody(First, End); 692 pushDirective(Kind); 693 return false; 694 } 695 696 static bool isStartOfRelevantLine(char First) { 697 switch (First) { 698 case '#': 699 case '@': 700 case 'i': 701 case 'e': 702 case 'm': 703 return true; 704 } 705 return false; 706 } 707 708 bool Scanner::lexPPLine(const char *&First, const char *const End) { 709 assert(First != End); 710 711 skipWhitespace(First, End); 712 assert(First <= End); 713 if (First == End) 714 return false; 715 716 if (!isStartOfRelevantLine(*First)) { 717 skipLine(First, End); 718 assert(First <= End); 719 return false; 720 } 721 722 LastTokenPtr = First; 723 724 TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ true); 725 726 auto ScEx1 = make_scope_exit([&]() { 727 /// Clear Scanner's CurDirToks before returning, in case we didn't push a 728 /// new directive. 729 CurDirToks.clear(); 730 }); 731 732 // Handle "@import". 733 if (*First == '@') 734 return lexAt(First, End); 735 736 if (*First == 'i' || *First == 'e' || *First == 'm') 737 return lexModule(First, End); 738 739 // Handle preprocessing directives. 740 741 TheLexer.setParsingPreprocessorDirective(true); 742 auto ScEx2 = make_scope_exit( 743 [&]() { TheLexer.setParsingPreprocessorDirective(false); }); 744 745 // Lex '#'. 746 const dependency_directives_scan::Token &HashTok = lexToken(First, End); 747 if (HashTok.is(tok::hashhash)) { 748 // A \p tok::hashhash at this location is passed by the preprocessor to the 749 // parser to interpret, like any other token. So for dependency scanning 750 // skip it like a normal token not affecting the preprocessor. 751 skipLine(First, End); 752 assert(First <= End); 753 return false; 754 } 755 assert(HashTok.is(tok::hash)); 756 (void)HashTok; 757 758 std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End); 759 if (!FoundId) 760 return false; 761 762 StringRef Id = *FoundId; 763 764 if (Id == "pragma") 765 return lexPragma(First, End); 766 767 auto Kind = llvm::StringSwitch<DirectiveKind>(Id) 768 .Case("include", pp_include) 769 .Case("__include_macros", pp___include_macros) 770 .Case("define", pp_define) 771 .Case("undef", pp_undef) 772 .Case("import", pp_import) 773 .Case("include_next", pp_include_next) 774 .Case("if", pp_if) 775 .Case("ifdef", pp_ifdef) 776 .Case("ifndef", pp_ifndef) 777 .Case("elif", pp_elif) 778 .Case("elifdef", pp_elifdef) 779 .Case("elifndef", pp_elifndef) 780 .Case("else", pp_else) 781 .Case("endif", pp_endif) 782 .Default(pp_none); 783 if (Kind == pp_none) { 784 skipDirective(Id, First, End); 785 return false; 786 } 787 788 if (Kind == pp_endif) 789 return lexEndif(First, End); 790 791 switch (Kind) { 792 case pp_include: 793 case pp___include_macros: 794 case pp_include_next: 795 case pp_import: 796 lexIncludeFilename(First, End); 797 break; 798 default: 799 break; 800 } 801 802 // Everything else. 803 return lexDefault(Kind, First, End); 804 } 805 806 static void skipUTF8ByteOrderMark(const char *&First, const char *const End) { 807 if ((End - First) >= 3 && First[0] == '\xef' && First[1] == '\xbb' && 808 First[2] == '\xbf') 809 First += 3; 810 } 811 812 bool Scanner::scanImpl(const char *First, const char *const End) { 813 skipUTF8ByteOrderMark(First, End); 814 while (First != End) 815 if (lexPPLine(First, End)) 816 return true; 817 return false; 818 } 819 820 bool Scanner::scan(SmallVectorImpl<Directive> &Directives) { 821 bool Error = scanImpl(Input.begin(), Input.end()); 822 823 if (!Error) { 824 // Add an EOF on success. 825 if (LastTokenPtr && 826 (Tokens.empty() || LastTokenPtr > Input.begin() + Tokens.back().Offset)) 827 pushDirective(tokens_present_before_eof); 828 pushDirective(pp_eof); 829 } 830 831 ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens; 832 for (const DirectiveWithTokens &DirWithToks : DirsWithToks) { 833 assert(RemainingTokens.size() >= DirWithToks.NumTokens); 834 Directives.emplace_back(DirWithToks.Kind, 835 RemainingTokens.take_front(DirWithToks.NumTokens)); 836 RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens); 837 } 838 assert(RemainingTokens.empty()); 839 840 return Error; 841 } 842 843 bool clang::scanSourceForDependencyDirectives( 844 StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens, 845 SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags, 846 SourceLocation InputSourceLoc) { 847 return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives); 848 } 849 850 void clang::printDependencyDirectivesAsSource( 851 StringRef Source, 852 ArrayRef<dependency_directives_scan::Directive> Directives, 853 llvm::raw_ostream &OS) { 854 // Add a space separator where it is convenient for testing purposes. 855 auto needsSpaceSeparator = 856 [](tok::TokenKind Prev, 857 const dependency_directives_scan::Token &Tok) -> bool { 858 if (Prev == Tok.Kind) 859 return !Tok.isOneOf(tok::l_paren, tok::r_paren, tok::l_square, 860 tok::r_square); 861 if (Prev == tok::raw_identifier && 862 Tok.isOneOf(tok::hash, tok::numeric_constant, tok::string_literal, 863 tok::char_constant, tok::header_name)) 864 return true; 865 if (Prev == tok::r_paren && 866 Tok.isOneOf(tok::raw_identifier, tok::hash, tok::string_literal, 867 tok::char_constant, tok::unknown)) 868 return true; 869 if (Prev == tok::comma && 870 Tok.isOneOf(tok::l_paren, tok::string_literal, tok::less)) 871 return true; 872 return false; 873 }; 874 875 for (const dependency_directives_scan::Directive &Directive : Directives) { 876 if (Directive.Kind == tokens_present_before_eof) 877 OS << "<TokBeforeEOF>"; 878 std::optional<tok::TokenKind> PrevTokenKind; 879 for (const dependency_directives_scan::Token &Tok : Directive.Tokens) { 880 if (PrevTokenKind && needsSpaceSeparator(*PrevTokenKind, Tok)) 881 OS << ' '; 882 PrevTokenKind = Tok.Kind; 883 OS << Source.slice(Tok.Offset, Tok.getEnd()); 884 } 885 } 886 } 887