1 //===- Tokens.cpp - collect tokens from preprocessing ---------------------===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 #include "clang/Tooling/Syntax/Tokens.h" 9 10 #include "clang/Basic/Diagnostic.h" 11 #include "clang/Basic/IdentifierTable.h" 12 #include "clang/Basic/LLVM.h" 13 #include "clang/Basic/LangOptions.h" 14 #include "clang/Basic/SourceLocation.h" 15 #include "clang/Basic/SourceManager.h" 16 #include "clang/Basic/TokenKinds.h" 17 #include "clang/Lex/PPCallbacks.h" 18 #include "clang/Lex/Preprocessor.h" 19 #include "clang/Lex/Token.h" 20 #include "llvm/ADT/ArrayRef.h" 21 #include "llvm/ADT/STLExtras.h" 22 #include "llvm/Support/Debug.h" 23 #include "llvm/Support/ErrorHandling.h" 24 #include "llvm/Support/FormatVariadic.h" 25 #include "llvm/Support/raw_ostream.h" 26 #include <cassert> 27 #include <optional> 28 #include <string> 29 #include <utility> 30 #include <vector> 31 32 using namespace clang; 33 using namespace clang::syntax; 34 35 namespace { 36 // Finds the smallest consecutive subsuquence of Toks that covers R. 37 llvm::ArrayRef<syntax::Token> 38 getTokensCovering(llvm::ArrayRef<syntax::Token> Toks, SourceRange R, 39 const SourceManager &SM) { 40 if (R.isInvalid()) 41 return {}; 42 const syntax::Token *Begin = 43 llvm::partition_point(Toks, [&](const syntax::Token &T) { 44 return SM.isBeforeInTranslationUnit(T.location(), R.getBegin()); 45 }); 46 const syntax::Token *End = 47 llvm::partition_point(Toks, [&](const syntax::Token &T) { 48 return !SM.isBeforeInTranslationUnit(R.getEnd(), T.location()); 49 }); 50 if (Begin > End) 51 return {}; 52 return {Begin, End}; 53 } 54 55 // Finds the range within FID corresponding to expanded tokens [First, Last]. 56 // Prev precedes First and Next follows Last, these must *not* be included. 57 // If no range satisfies the criteria, returns an invalid range. 58 // 59 // #define ID(x) x 60 // ID(ID(ID(a1) a2)) 61 // ~~ -> a1 62 // ~~ -> a2 63 // ~~~~~~~~~ -> a1 a2 64 SourceRange spelledForExpandedSlow(SourceLocation First, SourceLocation Last, 65 SourceLocation Prev, SourceLocation Next, 66 FileID TargetFile, 67 const SourceManager &SM) { 68 // There are two main parts to this algorithm: 69 // - identifying which spelled range covers the expanded tokens 70 // - validating that this range doesn't cover any extra tokens (First/Last) 71 // 72 // We do these in order. However as we transform the expanded range into the 73 // spelled one, we adjust First/Last so the validation remains simple. 74 75 assert(SM.getSLocEntry(TargetFile).isFile()); 76 // In most cases, to select First and Last we must return their expansion 77 // range, i.e. the whole of any macros they are included in. 78 // 79 // When First and Last are part of the *same macro arg* of a macro written 80 // in TargetFile, we that slice of the arg, i.e. their spelling range. 81 // 82 // Unwrap such macro calls. If the target file has A(B(C)), the 83 // SourceLocation stack of a token inside C shows us the expansion of A first, 84 // then B, then any macros inside C's body, then C itself. 85 // (This is the reverse of the order the PP applies the expansions in). 86 while (First.isMacroID() && Last.isMacroID()) { 87 auto DecFirst = SM.getDecomposedLoc(First); 88 auto DecLast = SM.getDecomposedLoc(Last); 89 auto &ExpFirst = SM.getSLocEntry(DecFirst.first).getExpansion(); 90 auto &ExpLast = SM.getSLocEntry(DecLast.first).getExpansion(); 91 92 if (!ExpFirst.isMacroArgExpansion() || !ExpLast.isMacroArgExpansion()) 93 break; 94 // Locations are in the same macro arg if they expand to the same place. 95 // (They may still have different FileIDs - an arg can have >1 chunks!) 96 if (ExpFirst.getExpansionLocStart() != ExpLast.getExpansionLocStart()) 97 break; 98 // Careful, given: 99 // #define HIDE ID(ID(a)) 100 // ID(ID(HIDE)) 101 // The token `a` is wrapped in 4 arg-expansions, we only want to unwrap 2. 102 // We distinguish them by whether the macro expands into the target file. 103 // Fortunately, the target file ones will always appear first. 104 auto ExpFileID = SM.getFileID(ExpFirst.getExpansionLocStart()); 105 if (ExpFileID == TargetFile) 106 break; 107 // Replace each endpoint with its spelling inside the macro arg. 108 // (This is getImmediateSpellingLoc without repeating lookups). 109 First = ExpFirst.getSpellingLoc().getLocWithOffset(DecFirst.second); 110 Last = ExpLast.getSpellingLoc().getLocWithOffset(DecLast.second); 111 } 112 113 // In all remaining cases we need the full containing macros. 114 // If this overlaps Prev or Next, then no range is possible. 115 SourceRange Candidate = 116 SM.getExpansionRange(SourceRange(First, Last)).getAsRange(); 117 auto DecFirst = SM.getDecomposedExpansionLoc(Candidate.getBegin()); 118 auto DecLast = SM.getDecomposedExpansionLoc(Candidate.getEnd()); 119 // Can end up in the wrong file due to bad input or token-pasting shenanigans. 120 if (Candidate.isInvalid() || DecFirst.first != TargetFile || 121 DecLast.first != TargetFile) 122 return SourceRange(); 123 // Check bounds, which may still be inside macros. 124 if (Prev.isValid()) { 125 auto Dec = SM.getDecomposedLoc(SM.getExpansionRange(Prev).getBegin()); 126 if (Dec.first != DecFirst.first || Dec.second >= DecFirst.second) 127 return SourceRange(); 128 } 129 if (Next.isValid()) { 130 auto Dec = SM.getDecomposedLoc(SM.getExpansionRange(Next).getEnd()); 131 if (Dec.first != DecLast.first || Dec.second <= DecLast.second) 132 return SourceRange(); 133 } 134 // Now we know that Candidate is a file range that covers [First, Last] 135 // without encroaching on {Prev, Next}. Ship it! 136 return Candidate; 137 } 138 139 } // namespace 140 141 syntax::Token::Token(SourceLocation Location, unsigned Length, 142 tok::TokenKind Kind) 143 : Location(Location), Length(Length), Kind(Kind) { 144 assert(Location.isValid()); 145 } 146 147 syntax::Token::Token(const clang::Token &T) 148 : Token(T.getLocation(), T.getLength(), T.getKind()) { 149 assert(!T.isAnnotation()); 150 } 151 152 llvm::StringRef syntax::Token::text(const SourceManager &SM) const { 153 bool Invalid = false; 154 const char *Start = SM.getCharacterData(location(), &Invalid); 155 assert(!Invalid); 156 return llvm::StringRef(Start, length()); 157 } 158 159 FileRange syntax::Token::range(const SourceManager &SM) const { 160 assert(location().isFileID() && "must be a spelled token"); 161 FileID File; 162 unsigned StartOffset; 163 std::tie(File, StartOffset) = SM.getDecomposedLoc(location()); 164 return FileRange(File, StartOffset, StartOffset + length()); 165 } 166 167 FileRange syntax::Token::range(const SourceManager &SM, 168 const syntax::Token &First, 169 const syntax::Token &Last) { 170 auto F = First.range(SM); 171 auto L = Last.range(SM); 172 assert(F.file() == L.file() && "tokens from different files"); 173 assert((F == L || F.endOffset() <= L.beginOffset()) && 174 "wrong order of tokens"); 175 return FileRange(F.file(), F.beginOffset(), L.endOffset()); 176 } 177 178 llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, const Token &T) { 179 return OS << T.str(); 180 } 181 182 FileRange::FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset) 183 : File(File), Begin(BeginOffset), End(EndOffset) { 184 assert(File.isValid()); 185 assert(BeginOffset <= EndOffset); 186 } 187 188 FileRange::FileRange(const SourceManager &SM, SourceLocation BeginLoc, 189 unsigned Length) { 190 assert(BeginLoc.isValid()); 191 assert(BeginLoc.isFileID()); 192 193 std::tie(File, Begin) = SM.getDecomposedLoc(BeginLoc); 194 End = Begin + Length; 195 } 196 FileRange::FileRange(const SourceManager &SM, SourceLocation BeginLoc, 197 SourceLocation EndLoc) { 198 assert(BeginLoc.isValid()); 199 assert(BeginLoc.isFileID()); 200 assert(EndLoc.isValid()); 201 assert(EndLoc.isFileID()); 202 assert(SM.getFileID(BeginLoc) == SM.getFileID(EndLoc)); 203 assert(SM.getFileOffset(BeginLoc) <= SM.getFileOffset(EndLoc)); 204 205 std::tie(File, Begin) = SM.getDecomposedLoc(BeginLoc); 206 End = SM.getFileOffset(EndLoc); 207 } 208 209 llvm::raw_ostream &syntax::operator<<(llvm::raw_ostream &OS, 210 const FileRange &R) { 211 return OS << llvm::formatv("FileRange(file = {0}, offsets = {1}-{2})", 212 R.file().getHashValue(), R.beginOffset(), 213 R.endOffset()); 214 } 215 216 llvm::StringRef FileRange::text(const SourceManager &SM) const { 217 bool Invalid = false; 218 StringRef Text = SM.getBufferData(File, &Invalid); 219 if (Invalid) 220 return ""; 221 assert(Begin <= Text.size()); 222 assert(End <= Text.size()); 223 return Text.substr(Begin, length()); 224 } 225 226 void TokenBuffer::indexExpandedTokens() { 227 // No-op if the index is already created. 228 if (!ExpandedTokIndex.empty()) 229 return; 230 ExpandedTokIndex.reserve(ExpandedTokens.size()); 231 // Index ExpandedTokens for faster lookups by SourceLocation. 232 for (size_t I = 0, E = ExpandedTokens.size(); I != E; ++I) { 233 SourceLocation Loc = ExpandedTokens[I].location(); 234 if (Loc.isValid()) 235 ExpandedTokIndex[Loc] = I; 236 } 237 } 238 239 llvm::ArrayRef<syntax::Token> TokenBuffer::expandedTokens(SourceRange R) const { 240 if (R.isInvalid()) 241 return {}; 242 if (!ExpandedTokIndex.empty()) { 243 // Quick lookup if `R` is a token range. 244 // This is a huge win since majority of the users use ranges provided by an 245 // AST. Ranges in AST are token ranges from expanded token stream. 246 const auto B = ExpandedTokIndex.find(R.getBegin()); 247 const auto E = ExpandedTokIndex.find(R.getEnd()); 248 if (B != ExpandedTokIndex.end() && E != ExpandedTokIndex.end()) { 249 const Token *L = ExpandedTokens.data() + B->getSecond(); 250 // Add 1 to End to make a half-open range. 251 const Token *R = ExpandedTokens.data() + E->getSecond() + 1; 252 if (L > R) 253 return {}; 254 return {L, R}; 255 } 256 } 257 // Slow case. Use `isBeforeInTranslationUnit` to binary search for the 258 // required range. 259 return getTokensCovering(expandedTokens(), R, *SourceMgr); 260 } 261 262 CharSourceRange FileRange::toCharRange(const SourceManager &SM) const { 263 return CharSourceRange( 264 SourceRange(SM.getComposedLoc(File, Begin), SM.getComposedLoc(File, End)), 265 /*IsTokenRange=*/false); 266 } 267 268 std::pair<const syntax::Token *, const TokenBuffer::Mapping *> 269 TokenBuffer::spelledForExpandedToken(const syntax::Token *Expanded) const { 270 assert(Expanded); 271 assert(ExpandedTokens.data() <= Expanded && 272 Expanded < ExpandedTokens.data() + ExpandedTokens.size()); 273 274 auto FileIt = Files.find( 275 SourceMgr->getFileID(SourceMgr->getExpansionLoc(Expanded->location()))); 276 assert(FileIt != Files.end() && "no file for an expanded token"); 277 278 const MarkedFile &File = FileIt->second; 279 280 unsigned ExpandedIndex = Expanded - ExpandedTokens.data(); 281 // Find the first mapping that produced tokens after \p Expanded. 282 auto It = llvm::partition_point(File.Mappings, [&](const Mapping &M) { 283 return M.BeginExpanded <= ExpandedIndex; 284 }); 285 // Our token could only be produced by the previous mapping. 286 if (It == File.Mappings.begin()) { 287 // No previous mapping, no need to modify offsets. 288 return {&File.SpelledTokens[ExpandedIndex - File.BeginExpanded], 289 /*Mapping=*/nullptr}; 290 } 291 --It; // 'It' now points to last mapping that started before our token. 292 293 // Check if the token is part of the mapping. 294 if (ExpandedIndex < It->EndExpanded) 295 return {&File.SpelledTokens[It->BeginSpelled], /*Mapping=*/&*It}; 296 297 // Not part of the mapping, use the index from previous mapping to compute the 298 // corresponding spelled token. 299 return { 300 &File.SpelledTokens[It->EndSpelled + (ExpandedIndex - It->EndExpanded)], 301 /*Mapping=*/nullptr}; 302 } 303 304 const TokenBuffer::Mapping * 305 TokenBuffer::mappingStartingBeforeSpelled(const MarkedFile &F, 306 const syntax::Token *Spelled) { 307 assert(F.SpelledTokens.data() <= Spelled); 308 unsigned SpelledI = Spelled - F.SpelledTokens.data(); 309 assert(SpelledI < F.SpelledTokens.size()); 310 311 auto It = llvm::partition_point(F.Mappings, [SpelledI](const Mapping &M) { 312 return M.BeginSpelled <= SpelledI; 313 }); 314 if (It == F.Mappings.begin()) 315 return nullptr; 316 --It; 317 return &*It; 318 } 319 320 llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1> 321 TokenBuffer::expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const { 322 if (Spelled.empty()) 323 return {}; 324 const auto &File = fileForSpelled(Spelled); 325 326 auto *FrontMapping = mappingStartingBeforeSpelled(File, &Spelled.front()); 327 unsigned SpelledFrontI = &Spelled.front() - File.SpelledTokens.data(); 328 assert(SpelledFrontI < File.SpelledTokens.size()); 329 unsigned ExpandedBegin; 330 if (!FrontMapping) { 331 // No mapping that starts before the first token of Spelled, we don't have 332 // to modify offsets. 333 ExpandedBegin = File.BeginExpanded + SpelledFrontI; 334 } else if (SpelledFrontI < FrontMapping->EndSpelled) { 335 // This mapping applies to Spelled tokens. 336 if (SpelledFrontI != FrontMapping->BeginSpelled) { 337 // Spelled tokens don't cover the entire mapping, returning empty result. 338 return {}; // FIXME: support macro arguments. 339 } 340 // Spelled tokens start at the beginning of this mapping. 341 ExpandedBegin = FrontMapping->BeginExpanded; 342 } else { 343 // Spelled tokens start after the mapping ends (they start in the hole 344 // between 2 mappings, or between a mapping and end of the file). 345 ExpandedBegin = 346 FrontMapping->EndExpanded + (SpelledFrontI - FrontMapping->EndSpelled); 347 } 348 349 auto *BackMapping = mappingStartingBeforeSpelled(File, &Spelled.back()); 350 unsigned SpelledBackI = &Spelled.back() - File.SpelledTokens.data(); 351 unsigned ExpandedEnd; 352 if (!BackMapping) { 353 // No mapping that starts before the last token of Spelled, we don't have to 354 // modify offsets. 355 ExpandedEnd = File.BeginExpanded + SpelledBackI + 1; 356 } else if (SpelledBackI < BackMapping->EndSpelled) { 357 // This mapping applies to Spelled tokens. 358 if (SpelledBackI + 1 != BackMapping->EndSpelled) { 359 // Spelled tokens don't cover the entire mapping, returning empty result. 360 return {}; // FIXME: support macro arguments. 361 } 362 ExpandedEnd = BackMapping->EndExpanded; 363 } else { 364 // Spelled tokens end after the mapping ends. 365 ExpandedEnd = 366 BackMapping->EndExpanded + (SpelledBackI - BackMapping->EndSpelled) + 1; 367 } 368 369 assert(ExpandedBegin < ExpandedTokens.size()); 370 assert(ExpandedEnd < ExpandedTokens.size()); 371 // Avoid returning empty ranges. 372 if (ExpandedBegin == ExpandedEnd) 373 return {}; 374 return {llvm::ArrayRef(ExpandedTokens.data() + ExpandedBegin, 375 ExpandedTokens.data() + ExpandedEnd)}; 376 } 377 378 llvm::ArrayRef<syntax::Token> TokenBuffer::spelledTokens(FileID FID) const { 379 auto It = Files.find(FID); 380 assert(It != Files.end()); 381 return It->second.SpelledTokens; 382 } 383 384 const syntax::Token * 385 TokenBuffer::spelledTokenContaining(SourceLocation Loc) const { 386 assert(Loc.isFileID()); 387 const auto *Tok = llvm::partition_point( 388 spelledTokens(SourceMgr->getFileID(Loc)), 389 [&](const syntax::Token &Tok) { return Tok.endLocation() <= Loc; }); 390 if (!Tok || Loc < Tok->location()) 391 return nullptr; 392 return Tok; 393 } 394 395 std::string TokenBuffer::Mapping::str() const { 396 return std::string( 397 llvm::formatv("spelled tokens: [{0},{1}), expanded tokens: [{2},{3})", 398 BeginSpelled, EndSpelled, BeginExpanded, EndExpanded)); 399 } 400 401 std::optional<llvm::ArrayRef<syntax::Token>> 402 TokenBuffer::spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const { 403 // In cases of invalid code, AST nodes can have source ranges that include 404 // the `eof` token. As there's no spelling for this token, exclude it from 405 // the range. 406 if (!Expanded.empty() && Expanded.back().kind() == tok::eof) { 407 Expanded = Expanded.drop_back(); 408 } 409 // Mapping an empty range is ambiguous in case of empty mappings at either end 410 // of the range, bail out in that case. 411 if (Expanded.empty()) 412 return std::nullopt; 413 const syntax::Token *First = &Expanded.front(); 414 const syntax::Token *Last = &Expanded.back(); 415 auto [FirstSpelled, FirstMapping] = spelledForExpandedToken(First); 416 auto [LastSpelled, LastMapping] = spelledForExpandedToken(Last); 417 418 FileID FID = SourceMgr->getFileID(FirstSpelled->location()); 419 // FIXME: Handle multi-file changes by trying to map onto a common root. 420 if (FID != SourceMgr->getFileID(LastSpelled->location())) 421 return std::nullopt; 422 423 const MarkedFile &File = Files.find(FID)->second; 424 425 // If the range is within one macro argument, the result may be only part of a 426 // Mapping. We must use the general (SourceManager-based) algorithm. 427 if (FirstMapping && FirstMapping == LastMapping && 428 SourceMgr->isMacroArgExpansion(First->location()) && 429 SourceMgr->isMacroArgExpansion(Last->location())) { 430 // We use excluded Prev/Next token for bounds checking. 431 SourceLocation Prev = (First == &ExpandedTokens.front()) 432 ? SourceLocation() 433 : (First - 1)->location(); 434 SourceLocation Next = (Last == &ExpandedTokens.back()) 435 ? SourceLocation() 436 : (Last + 1)->location(); 437 SourceRange Range = spelledForExpandedSlow( 438 First->location(), Last->location(), Prev, Next, FID, *SourceMgr); 439 if (Range.isInvalid()) 440 return std::nullopt; 441 return getTokensCovering(File.SpelledTokens, Range, *SourceMgr); 442 } 443 444 // Otherwise, use the fast version based on Mappings. 445 // Do not allow changes that doesn't cover full expansion. 446 unsigned FirstExpanded = Expanded.begin() - ExpandedTokens.data(); 447 unsigned LastExpanded = Expanded.end() - ExpandedTokens.data(); 448 if (FirstMapping && FirstExpanded != FirstMapping->BeginExpanded) 449 return std::nullopt; 450 if (LastMapping && LastMapping->EndExpanded != LastExpanded) 451 return std::nullopt; 452 return llvm::ArrayRef( 453 FirstMapping ? File.SpelledTokens.data() + FirstMapping->BeginSpelled 454 : FirstSpelled, 455 LastMapping ? File.SpelledTokens.data() + LastMapping->EndSpelled 456 : LastSpelled + 1); 457 } 458 459 TokenBuffer::Expansion TokenBuffer::makeExpansion(const MarkedFile &F, 460 const Mapping &M) const { 461 Expansion E; 462 E.Spelled = llvm::ArrayRef(F.SpelledTokens.data() + M.BeginSpelled, 463 F.SpelledTokens.data() + M.EndSpelled); 464 E.Expanded = llvm::ArrayRef(ExpandedTokens.data() + M.BeginExpanded, 465 ExpandedTokens.data() + M.EndExpanded); 466 return E; 467 } 468 469 const TokenBuffer::MarkedFile & 470 TokenBuffer::fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const { 471 assert(!Spelled.empty()); 472 assert(Spelled.front().location().isFileID() && "not a spelled token"); 473 auto FileIt = Files.find(SourceMgr->getFileID(Spelled.front().location())); 474 assert(FileIt != Files.end() && "file not tracked by token buffer"); 475 const auto &File = FileIt->second; 476 assert(File.SpelledTokens.data() <= Spelled.data() && 477 Spelled.end() <= 478 (File.SpelledTokens.data() + File.SpelledTokens.size()) && 479 "Tokens not in spelled range"); 480 #ifndef NDEBUG 481 auto T1 = Spelled.back().location(); 482 auto T2 = File.SpelledTokens.back().location(); 483 assert(T1 == T2 || sourceManager().isBeforeInTranslationUnit(T1, T2)); 484 #endif 485 return File; 486 } 487 488 std::optional<TokenBuffer::Expansion> 489 TokenBuffer::expansionStartingAt(const syntax::Token *Spelled) const { 490 assert(Spelled); 491 const auto &File = fileForSpelled(*Spelled); 492 493 unsigned SpelledIndex = Spelled - File.SpelledTokens.data(); 494 auto M = llvm::partition_point(File.Mappings, [&](const Mapping &M) { 495 return M.BeginSpelled < SpelledIndex; 496 }); 497 if (M == File.Mappings.end() || M->BeginSpelled != SpelledIndex) 498 return std::nullopt; 499 return makeExpansion(File, *M); 500 } 501 502 std::vector<TokenBuffer::Expansion> TokenBuffer::expansionsOverlapping( 503 llvm::ArrayRef<syntax::Token> Spelled) const { 504 if (Spelled.empty()) 505 return {}; 506 const auto &File = fileForSpelled(Spelled); 507 508 // Find the first overlapping range, and then copy until we stop overlapping. 509 unsigned SpelledBeginIndex = Spelled.begin() - File.SpelledTokens.data(); 510 unsigned SpelledEndIndex = Spelled.end() - File.SpelledTokens.data(); 511 auto M = llvm::partition_point(File.Mappings, [&](const Mapping &M) { 512 return M.EndSpelled <= SpelledBeginIndex; 513 }); 514 std::vector<TokenBuffer::Expansion> Expansions; 515 for (; M != File.Mappings.end() && M->BeginSpelled < SpelledEndIndex; ++M) 516 Expansions.push_back(makeExpansion(File, *M)); 517 return Expansions; 518 } 519 520 llvm::ArrayRef<syntax::Token> 521 syntax::spelledTokensTouching(SourceLocation Loc, 522 llvm::ArrayRef<syntax::Token> Tokens) { 523 assert(Loc.isFileID()); 524 525 auto *Right = llvm::partition_point( 526 Tokens, [&](const syntax::Token &Tok) { return Tok.location() < Loc; }); 527 bool AcceptRight = Right != Tokens.end() && Right->location() <= Loc; 528 bool AcceptLeft = 529 Right != Tokens.begin() && (Right - 1)->endLocation() >= Loc; 530 return llvm::ArrayRef(Right - (AcceptLeft ? 1 : 0), 531 Right + (AcceptRight ? 1 : 0)); 532 } 533 534 llvm::ArrayRef<syntax::Token> 535 syntax::spelledTokensTouching(SourceLocation Loc, 536 const syntax::TokenBuffer &Tokens) { 537 return spelledTokensTouching( 538 Loc, Tokens.spelledTokens(Tokens.sourceManager().getFileID(Loc))); 539 } 540 541 const syntax::Token * 542 syntax::spelledIdentifierTouching(SourceLocation Loc, 543 llvm::ArrayRef<syntax::Token> Tokens) { 544 for (const syntax::Token &Tok : spelledTokensTouching(Loc, Tokens)) { 545 if (Tok.kind() == tok::identifier) 546 return &Tok; 547 } 548 return nullptr; 549 } 550 551 const syntax::Token * 552 syntax::spelledIdentifierTouching(SourceLocation Loc, 553 const syntax::TokenBuffer &Tokens) { 554 return spelledIdentifierTouching( 555 Loc, Tokens.spelledTokens(Tokens.sourceManager().getFileID(Loc))); 556 } 557 558 std::vector<const syntax::Token *> 559 TokenBuffer::macroExpansions(FileID FID) const { 560 auto FileIt = Files.find(FID); 561 assert(FileIt != Files.end() && "file not tracked by token buffer"); 562 auto &File = FileIt->second; 563 std::vector<const syntax::Token *> Expansions; 564 auto &Spelled = File.SpelledTokens; 565 for (auto Mapping : File.Mappings) { 566 const syntax::Token *Token = &Spelled[Mapping.BeginSpelled]; 567 if (Token->kind() == tok::TokenKind::identifier) 568 Expansions.push_back(Token); 569 } 570 return Expansions; 571 } 572 573 std::vector<syntax::Token> syntax::tokenize(const FileRange &FR, 574 const SourceManager &SM, 575 const LangOptions &LO) { 576 std::vector<syntax::Token> Tokens; 577 IdentifierTable Identifiers(LO); 578 auto AddToken = [&](clang::Token T) { 579 // Fill the proper token kind for keywords, etc. 580 if (T.getKind() == tok::raw_identifier && !T.needsCleaning() && 581 !T.hasUCN()) { // FIXME: support needsCleaning and hasUCN cases. 582 clang::IdentifierInfo &II = Identifiers.get(T.getRawIdentifier()); 583 T.setIdentifierInfo(&II); 584 T.setKind(II.getTokenID()); 585 } 586 Tokens.push_back(syntax::Token(T)); 587 }; 588 589 auto SrcBuffer = SM.getBufferData(FR.file()); 590 Lexer L(SM.getLocForStartOfFile(FR.file()), LO, SrcBuffer.data(), 591 SrcBuffer.data() + FR.beginOffset(), 592 // We can't make BufEnd point to FR.endOffset, as Lexer requires a 593 // null terminated buffer. 594 SrcBuffer.data() + SrcBuffer.size()); 595 596 clang::Token T; 597 while (!L.LexFromRawLexer(T) && L.getCurrentBufferOffset() < FR.endOffset()) 598 AddToken(T); 599 // LexFromRawLexer returns true when it parses the last token of the file, add 600 // it iff it starts within the range we are interested in. 601 if (SM.getFileOffset(T.getLocation()) < FR.endOffset()) 602 AddToken(T); 603 return Tokens; 604 } 605 606 std::vector<syntax::Token> syntax::tokenize(FileID FID, const SourceManager &SM, 607 const LangOptions &LO) { 608 return tokenize(syntax::FileRange(FID, 0, SM.getFileIDSize(FID)), SM, LO); 609 } 610 611 /// Records information reqired to construct mappings for the token buffer that 612 /// we are collecting. 613 class TokenCollector::CollectPPExpansions : public PPCallbacks { 614 public: 615 CollectPPExpansions(TokenCollector &C) : Collector(&C) {} 616 617 /// Disabled instance will stop reporting anything to TokenCollector. 618 /// This ensures that uses of the preprocessor after TokenCollector::consume() 619 /// is called do not access the (possibly invalid) collector instance. 620 void disable() { Collector = nullptr; } 621 622 void MacroExpands(const clang::Token &MacroNameTok, const MacroDefinition &MD, 623 SourceRange Range, const MacroArgs *Args) override { 624 if (!Collector) 625 return; 626 const auto &SM = Collector->PP.getSourceManager(); 627 // Only record top-level expansions that directly produce expanded tokens. 628 // This excludes those where: 629 // - the macro use is inside a macro body, 630 // - the macro appears in an argument to another macro. 631 // However macro expansion isn't really a tree, it's token rewrite rules, 632 // so there are other cases, e.g. 633 // #define B(X) X 634 // #define A 1 + B 635 // A(2) 636 // Both A and B produce expanded tokens, though the macro name 'B' comes 637 // from an expansion. The best we can do is merge the mappings for both. 638 639 // The *last* token of any top-level macro expansion must be in a file. 640 // (In the example above, see the closing paren of the expansion of B). 641 if (!Range.getEnd().isFileID()) 642 return; 643 // If there's a current expansion that encloses this one, this one can't be 644 // top-level. 645 if (LastExpansionEnd.isValid() && 646 !SM.isBeforeInTranslationUnit(LastExpansionEnd, Range.getEnd())) 647 return; 648 649 // If the macro invocation (B) starts in a macro (A) but ends in a file, 650 // we'll create a merged mapping for A + B by overwriting the endpoint for 651 // A's startpoint. 652 if (!Range.getBegin().isFileID()) { 653 Range.setBegin(SM.getExpansionLoc(Range.getBegin())); 654 assert(Collector->Expansions.count(Range.getBegin()) && 655 "Overlapping macros should have same expansion location"); 656 } 657 658 Collector->Expansions[Range.getBegin()] = Range.getEnd(); 659 LastExpansionEnd = Range.getEnd(); 660 } 661 // FIXME: handle directives like #pragma, #include, etc. 662 private: 663 TokenCollector *Collector; 664 /// Used to detect recursive macro expansions. 665 SourceLocation LastExpansionEnd; 666 }; 667 668 /// Fills in the TokenBuffer by tracing the run of a preprocessor. The 669 /// implementation tracks the tokens, macro expansions and directives coming 670 /// from the preprocessor and: 671 /// - for each token, figures out if it is a part of an expanded token stream, 672 /// spelled token stream or both. Stores the tokens appropriately. 673 /// - records mappings from the spelled to expanded token ranges, e.g. for macro 674 /// expansions. 675 /// FIXME: also properly record: 676 /// - #include directives, 677 /// - #pragma, #line and other PP directives, 678 /// - skipped pp regions, 679 /// - ... 680 681 TokenCollector::TokenCollector(Preprocessor &PP) : PP(PP) { 682 // Collect the expanded token stream during preprocessing. 683 PP.setTokenWatcher([this](const clang::Token &T) { 684 if (T.isAnnotation()) 685 return; 686 DEBUG_WITH_TYPE("collect-tokens", llvm::dbgs() 687 << "Token: " 688 << syntax::Token(T).dumpForTests( 689 this->PP.getSourceManager()) 690 << "\n" 691 692 ); 693 Expanded.push_back(syntax::Token(T)); 694 }); 695 // And locations of macro calls, to properly recover boundaries of those in 696 // case of empty expansions. 697 auto CB = std::make_unique<CollectPPExpansions>(*this); 698 this->Collector = CB.get(); 699 PP.addPPCallbacks(std::move(CB)); 700 } 701 702 /// Builds mappings and spelled tokens in the TokenBuffer based on the expanded 703 /// token stream. 704 class TokenCollector::Builder { 705 public: 706 Builder(std::vector<syntax::Token> Expanded, PPExpansions CollectedExpansions, 707 const SourceManager &SM, const LangOptions &LangOpts) 708 : Result(SM), CollectedExpansions(std::move(CollectedExpansions)), SM(SM), 709 LangOpts(LangOpts) { 710 Result.ExpandedTokens = std::move(Expanded); 711 } 712 713 TokenBuffer build() && { 714 assert(!Result.ExpandedTokens.empty()); 715 assert(Result.ExpandedTokens.back().kind() == tok::eof); 716 717 // Tokenize every file that contributed tokens to the expanded stream. 718 buildSpelledTokens(); 719 720 // The expanded token stream consists of runs of tokens that came from 721 // the same source (a macro expansion, part of a file etc). 722 // Between these runs are the logical positions of spelled tokens that 723 // didn't expand to anything. 724 while (NextExpanded < Result.ExpandedTokens.size() - 1 /* eof */) { 725 // Create empty mappings for spelled tokens that expanded to nothing here. 726 // May advance NextSpelled, but NextExpanded is unchanged. 727 discard(); 728 // Create mapping for a contiguous run of expanded tokens. 729 // Advances NextExpanded past the run, and NextSpelled accordingly. 730 unsigned OldPosition = NextExpanded; 731 advance(); 732 if (NextExpanded == OldPosition) 733 diagnoseAdvanceFailure(); 734 } 735 // If any tokens remain in any of the files, they didn't expand to anything. 736 // Create empty mappings up until the end of the file. 737 for (const auto &File : Result.Files) 738 discard(File.first); 739 740 #ifndef NDEBUG 741 for (auto &pair : Result.Files) { 742 auto &mappings = pair.second.Mappings; 743 assert(llvm::is_sorted(mappings, [](const TokenBuffer::Mapping &M1, 744 const TokenBuffer::Mapping &M2) { 745 return M1.BeginSpelled < M2.BeginSpelled && 746 M1.EndSpelled < M2.EndSpelled && 747 M1.BeginExpanded < M2.BeginExpanded && 748 M1.EndExpanded < M2.EndExpanded; 749 })); 750 } 751 #endif 752 753 return std::move(Result); 754 } 755 756 private: 757 // Consume a sequence of spelled tokens that didn't expand to anything. 758 // In the simplest case, skips spelled tokens until finding one that produced 759 // the NextExpanded token, and creates an empty mapping for them. 760 // If Drain is provided, skips remaining tokens from that file instead. 761 void discard(std::optional<FileID> Drain = std::nullopt) { 762 SourceLocation Target = 763 Drain ? SM.getLocForEndOfFile(*Drain) 764 : SM.getExpansionLoc( 765 Result.ExpandedTokens[NextExpanded].location()); 766 FileID File = SM.getFileID(Target); 767 const auto &SpelledTokens = Result.Files[File].SpelledTokens; 768 auto &NextSpelled = this->NextSpelled[File]; 769 770 TokenBuffer::Mapping Mapping; 771 Mapping.BeginSpelled = NextSpelled; 772 // When dropping trailing tokens from a file, the empty mapping should 773 // be positioned within the file's expanded-token range (at the end). 774 Mapping.BeginExpanded = Mapping.EndExpanded = 775 Drain ? Result.Files[*Drain].EndExpanded : NextExpanded; 776 // We may want to split into several adjacent empty mappings. 777 // FlushMapping() emits the current mapping and starts a new one. 778 auto FlushMapping = [&, this] { 779 Mapping.EndSpelled = NextSpelled; 780 if (Mapping.BeginSpelled != Mapping.EndSpelled) 781 Result.Files[File].Mappings.push_back(Mapping); 782 Mapping.BeginSpelled = NextSpelled; 783 }; 784 785 while (NextSpelled < SpelledTokens.size() && 786 SpelledTokens[NextSpelled].location() < Target) { 787 // If we know mapping bounds at [NextSpelled, KnownEnd] (macro expansion) 788 // then we want to partition our (empty) mapping. 789 // [Start, NextSpelled) [NextSpelled, KnownEnd] (KnownEnd, Target) 790 SourceLocation KnownEnd = 791 CollectedExpansions.lookup(SpelledTokens[NextSpelled].location()); 792 if (KnownEnd.isValid()) { 793 FlushMapping(); // Emits [Start, NextSpelled) 794 while (NextSpelled < SpelledTokens.size() && 795 SpelledTokens[NextSpelled].location() <= KnownEnd) 796 ++NextSpelled; 797 FlushMapping(); // Emits [NextSpelled, KnownEnd] 798 // Now the loop continues and will emit (KnownEnd, Target). 799 } else { 800 ++NextSpelled; 801 } 802 } 803 FlushMapping(); 804 } 805 806 // Consumes the NextExpanded token and others that are part of the same run. 807 // Increases NextExpanded and NextSpelled by at least one, and adds a mapping 808 // (unless this is a run of file tokens, which we represent with no mapping). 809 void advance() { 810 const syntax::Token &Tok = Result.ExpandedTokens[NextExpanded]; 811 SourceLocation Expansion = SM.getExpansionLoc(Tok.location()); 812 FileID File = SM.getFileID(Expansion); 813 const auto &SpelledTokens = Result.Files[File].SpelledTokens; 814 auto &NextSpelled = this->NextSpelled[File]; 815 816 if (Tok.location().isFileID()) { 817 // A run of file tokens continues while the expanded/spelled tokens match. 818 while (NextSpelled < SpelledTokens.size() && 819 NextExpanded < Result.ExpandedTokens.size() && 820 SpelledTokens[NextSpelled].location() == 821 Result.ExpandedTokens[NextExpanded].location()) { 822 ++NextSpelled; 823 ++NextExpanded; 824 } 825 // We need no mapping for file tokens copied to the expanded stream. 826 } else { 827 // We found a new macro expansion. We should have its spelling bounds. 828 auto End = CollectedExpansions.lookup(Expansion); 829 assert(End.isValid() && "Macro expansion wasn't captured?"); 830 831 // Mapping starts here... 832 TokenBuffer::Mapping Mapping; 833 Mapping.BeginExpanded = NextExpanded; 834 Mapping.BeginSpelled = NextSpelled; 835 // ... consumes spelled tokens within bounds we captured ... 836 while (NextSpelled < SpelledTokens.size() && 837 SpelledTokens[NextSpelled].location() <= End) 838 ++NextSpelled; 839 // ... consumes expanded tokens rooted at the same expansion ... 840 while (NextExpanded < Result.ExpandedTokens.size() && 841 SM.getExpansionLoc( 842 Result.ExpandedTokens[NextExpanded].location()) == Expansion) 843 ++NextExpanded; 844 // ... and ends here. 845 Mapping.EndExpanded = NextExpanded; 846 Mapping.EndSpelled = NextSpelled; 847 Result.Files[File].Mappings.push_back(Mapping); 848 } 849 } 850 851 // advance() is supposed to consume at least one token - if not, we crash. 852 void diagnoseAdvanceFailure() { 853 #ifndef NDEBUG 854 // Show the failed-to-map token in context. 855 for (unsigned I = (NextExpanded < 10) ? 0 : NextExpanded - 10; 856 I < NextExpanded + 5 && I < Result.ExpandedTokens.size(); ++I) { 857 const char *L = 858 (I == NextExpanded) ? "!! " : (I < NextExpanded) ? "ok " : " "; 859 llvm::errs() << L << Result.ExpandedTokens[I].dumpForTests(SM) << "\n"; 860 } 861 #endif 862 llvm_unreachable("Couldn't map expanded token to spelled tokens!"); 863 } 864 865 /// Initializes TokenBuffer::Files and fills spelled tokens and expanded 866 /// ranges for each of the files. 867 void buildSpelledTokens() { 868 for (unsigned I = 0; I < Result.ExpandedTokens.size(); ++I) { 869 const auto &Tok = Result.ExpandedTokens[I]; 870 auto FID = SM.getFileID(SM.getExpansionLoc(Tok.location())); 871 auto It = Result.Files.try_emplace(FID); 872 TokenBuffer::MarkedFile &File = It.first->second; 873 874 // The eof token should not be considered part of the main-file's range. 875 File.EndExpanded = Tok.kind() == tok::eof ? I : I + 1; 876 877 if (!It.second) 878 continue; // we have seen this file before. 879 // This is the first time we see this file. 880 File.BeginExpanded = I; 881 File.SpelledTokens = tokenize(FID, SM, LangOpts); 882 } 883 } 884 885 TokenBuffer Result; 886 unsigned NextExpanded = 0; // cursor in ExpandedTokens 887 llvm::DenseMap<FileID, unsigned> NextSpelled; // cursor in SpelledTokens 888 PPExpansions CollectedExpansions; 889 const SourceManager &SM; 890 const LangOptions &LangOpts; 891 }; 892 893 TokenBuffer TokenCollector::consume() && { 894 PP.setTokenWatcher(nullptr); 895 Collector->disable(); 896 return Builder(std::move(Expanded), std::move(Expansions), 897 PP.getSourceManager(), PP.getLangOpts()) 898 .build(); 899 } 900 901 std::string syntax::Token::str() const { 902 return std::string(llvm::formatv("Token({0}, length = {1})", 903 tok::getTokenName(kind()), length())); 904 } 905 906 std::string syntax::Token::dumpForTests(const SourceManager &SM) const { 907 return std::string(llvm::formatv("Token(`{0}`, {1}, length = {2})", text(SM), 908 tok::getTokenName(kind()), length())); 909 } 910 911 std::string TokenBuffer::dumpForTests() const { 912 auto PrintToken = [this](const syntax::Token &T) -> std::string { 913 if (T.kind() == tok::eof) 914 return "<eof>"; 915 return std::string(T.text(*SourceMgr)); 916 }; 917 918 auto DumpTokens = [this, &PrintToken](llvm::raw_ostream &OS, 919 llvm::ArrayRef<syntax::Token> Tokens) { 920 if (Tokens.empty()) { 921 OS << "<empty>"; 922 return; 923 } 924 OS << Tokens[0].text(*SourceMgr); 925 for (unsigned I = 1; I < Tokens.size(); ++I) { 926 if (Tokens[I].kind() == tok::eof) 927 continue; 928 OS << " " << PrintToken(Tokens[I]); 929 } 930 }; 931 932 std::string Dump; 933 llvm::raw_string_ostream OS(Dump); 934 935 OS << "expanded tokens:\n" 936 << " "; 937 // (!) we do not show '<eof>'. 938 DumpTokens(OS, llvm::ArrayRef(ExpandedTokens).drop_back()); 939 OS << "\n"; 940 941 std::vector<FileID> Keys; 942 for (const auto &F : Files) 943 Keys.push_back(F.first); 944 llvm::sort(Keys); 945 946 for (FileID ID : Keys) { 947 const MarkedFile &File = Files.find(ID)->second; 948 auto Entry = SourceMgr->getFileEntryRefForID(ID); 949 if (!Entry) 950 continue; // Skip builtin files. 951 std::string Path = llvm::sys::path::convert_to_slash(Entry->getName()); 952 OS << llvm::formatv("file '{0}'\n", Path) << " spelled tokens:\n" 953 << " "; 954 DumpTokens(OS, File.SpelledTokens); 955 OS << "\n"; 956 957 if (File.Mappings.empty()) { 958 OS << " no mappings.\n"; 959 continue; 960 } 961 OS << " mappings:\n"; 962 for (auto &M : File.Mappings) { 963 OS << llvm::formatv( 964 " ['{0}'_{1}, '{2}'_{3}) => ['{4}'_{5}, '{6}'_{7})\n", 965 PrintToken(File.SpelledTokens[M.BeginSpelled]), M.BeginSpelled, 966 M.EndSpelled == File.SpelledTokens.size() 967 ? "<eof>" 968 : PrintToken(File.SpelledTokens[M.EndSpelled]), 969 M.EndSpelled, PrintToken(ExpandedTokens[M.BeginExpanded]), 970 M.BeginExpanded, PrintToken(ExpandedTokens[M.EndExpanded]), 971 M.EndExpanded); 972 } 973 } 974 return Dump; 975 } 976