xref: /freebsd/contrib/llvm-project/clang/lib/Tooling/Transformer/SourceCode.cpp (revision f7f4bd06a8d4e5d1e92d0d2905a68a2a03ed9c0c)
1  //===--- SourceCode.cpp - Source code manipulation routines -----*- C++ -*-===//
2  //
3  // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4  // See https://llvm.org/LICENSE.txt for license information.
5  // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6  //
7  //===----------------------------------------------------------------------===//
8  //
9  //  This file provides functions that simplify extraction of source code.
10  //
11  //===----------------------------------------------------------------------===//
12  #include "clang/Tooling/Transformer/SourceCode.h"
13  #include "clang/AST/ASTContext.h"
14  #include "clang/AST/Attr.h"
15  #include "clang/AST/Comment.h"
16  #include "clang/AST/Decl.h"
17  #include "clang/AST/DeclCXX.h"
18  #include "clang/AST/DeclTemplate.h"
19  #include "clang/AST/Expr.h"
20  #include "clang/Basic/SourceManager.h"
21  #include "clang/Lex/Lexer.h"
22  #include "llvm/Support/Errc.h"
23  #include "llvm/Support/Error.h"
24  #include <set>
25  
26  using namespace clang;
27  
28  using llvm::errc;
29  using llvm::StringError;
30  
31  StringRef clang::tooling::getText(CharSourceRange Range,
32                                    const ASTContext &Context) {
33    return Lexer::getSourceText(Range, Context.getSourceManager(),
34                                Context.getLangOpts());
35  }
36  
37  CharSourceRange clang::tooling::maybeExtendRange(CharSourceRange Range,
38                                                   tok::TokenKind Next,
39                                                   ASTContext &Context) {
40    CharSourceRange R = Lexer::getAsCharRange(Range, Context.getSourceManager(),
41                                              Context.getLangOpts());
42    if (R.isInvalid())
43      return Range;
44    Token Tok;
45    bool Err =
46        Lexer::getRawToken(R.getEnd(), Tok, Context.getSourceManager(),
47                           Context.getLangOpts(), /*IgnoreWhiteSpace=*/true);
48    if (Err || !Tok.is(Next))
49      return Range;
50    return CharSourceRange::getTokenRange(Range.getBegin(), Tok.getLocation());
51  }
52  
53  static llvm::Error validateRange(const CharSourceRange &Range,
54                                   const SourceManager &SM,
55                                   bool AllowSystemHeaders) {
56    if (Range.isInvalid())
57      return llvm::make_error<StringError>(errc::invalid_argument,
58                                           "Invalid range");
59  
60    if (Range.getBegin().isMacroID() || Range.getEnd().isMacroID())
61      return llvm::make_error<StringError>(
62          errc::invalid_argument, "Range starts or ends in a macro expansion");
63  
64    if (!AllowSystemHeaders) {
65      if (SM.isInSystemHeader(Range.getBegin()) ||
66          SM.isInSystemHeader(Range.getEnd()))
67        return llvm::make_error<StringError>(errc::invalid_argument,
68                                             "Range is in system header");
69    }
70  
71    std::pair<FileID, unsigned> BeginInfo = SM.getDecomposedLoc(Range.getBegin());
72    std::pair<FileID, unsigned> EndInfo = SM.getDecomposedLoc(Range.getEnd());
73    if (BeginInfo.first != EndInfo.first)
74      return llvm::make_error<StringError>(
75          errc::invalid_argument, "Range begins and ends in different files");
76  
77    if (BeginInfo.second > EndInfo.second)
78      return llvm::make_error<StringError>(errc::invalid_argument,
79                                           "Range's begin is past its end");
80  
81    return llvm::Error::success();
82  }
83  
84  llvm::Error clang::tooling::validateEditRange(const CharSourceRange &Range,
85                                                const SourceManager &SM) {
86    return validateRange(Range, SM, /*AllowSystemHeaders=*/false);
87  }
88  
89  static bool spelledInMacroDefinition(SourceLocation Loc,
90                                       const SourceManager &SM) {
91    while (Loc.isMacroID()) {
92      const auto &Expansion = SM.getSLocEntry(SM.getFileID(Loc)).getExpansion();
93      if (Expansion.isMacroArgExpansion()) {
94        // Check the spelling location of the macro arg, in case the arg itself is
95        // in a macro expansion.
96        Loc = Expansion.getSpellingLoc();
97      } else {
98        return true;
99      }
100    }
101    return false;
102  }
103  
104  static CharSourceRange getRange(const CharSourceRange &EditRange,
105                                  const SourceManager &SM,
106                                  const LangOptions &LangOpts,
107                                  bool IncludeMacroExpansion) {
108    CharSourceRange Range;
109    if (IncludeMacroExpansion) {
110      Range = Lexer::makeFileCharRange(EditRange, SM, LangOpts);
111    } else {
112      if (spelledInMacroDefinition(EditRange.getBegin(), SM) ||
113          spelledInMacroDefinition(EditRange.getEnd(), SM))
114        return {};
115  
116      auto B = SM.getSpellingLoc(EditRange.getBegin());
117      auto E = SM.getSpellingLoc(EditRange.getEnd());
118      if (EditRange.isTokenRange())
119        E = Lexer::getLocForEndOfToken(E, 0, SM, LangOpts);
120      Range = CharSourceRange::getCharRange(B, E);
121    }
122    return Range;
123  }
124  
125  std::optional<CharSourceRange> clang::tooling::getFileRangeForEdit(
126      const CharSourceRange &EditRange, const SourceManager &SM,
127      const LangOptions &LangOpts, bool IncludeMacroExpansion) {
128    CharSourceRange Range =
129        getRange(EditRange, SM, LangOpts, IncludeMacroExpansion);
130    bool IsInvalid = llvm::errorToBool(validateEditRange(Range, SM));
131    if (IsInvalid)
132      return std::nullopt;
133    return Range;
134  }
135  
136  std::optional<CharSourceRange> clang::tooling::getFileRange(
137      const CharSourceRange &EditRange, const SourceManager &SM,
138      const LangOptions &LangOpts, bool IncludeMacroExpansion) {
139    CharSourceRange Range =
140        getRange(EditRange, SM, LangOpts, IncludeMacroExpansion);
141    bool IsInvalid =
142        llvm::errorToBool(validateRange(Range, SM, /*AllowSystemHeaders=*/true));
143    if (IsInvalid)
144      return std::nullopt;
145    return Range;
146  }
147  
148  static bool startsWithNewline(const SourceManager &SM, const Token &Tok) {
149    return isVerticalWhitespace(SM.getCharacterData(Tok.getLocation())[0]);
150  }
151  
152  static bool contains(const std::set<tok::TokenKind> &Terminators,
153                       const Token &Tok) {
154    return Terminators.count(Tok.getKind()) > 0;
155  }
156  
157  // Returns the exclusive, *file* end location of the entity whose last token is
158  // at location 'EntityLast'. That is, it returns the location one past the last
159  // relevant character.
160  //
161  // Associated tokens include comments, horizontal whitespace and 'Terminators'
162  // -- optional tokens, which, if any are found, will be included; if
163  // 'Terminators' is empty, we will not include any extra tokens beyond comments
164  // and horizontal whitespace.
165  static SourceLocation
166  getEntityEndLoc(const SourceManager &SM, SourceLocation EntityLast,
167                  const std::set<tok::TokenKind> &Terminators,
168                  const LangOptions &LangOpts) {
169    assert(EntityLast.isValid() && "Invalid end location found.");
170  
171    // We remember the last location of a non-horizontal-whitespace token we have
172    // lexed; this is the location up to which we will want to delete.
173    // FIXME: Support using the spelling loc here for cases where we want to
174    // analyze the macro text.
175  
176    CharSourceRange ExpansionRange = SM.getExpansionRange(EntityLast);
177    // FIXME: Should check isTokenRange(), for the (rare) case that
178    // `ExpansionRange` is a character range.
179    std::unique_ptr<Lexer> Lexer = [&]() {
180      bool Invalid = false;
181      auto FileOffset = SM.getDecomposedLoc(ExpansionRange.getEnd());
182      llvm::StringRef File = SM.getBufferData(FileOffset.first, &Invalid);
183      assert(!Invalid && "Cannot get file/offset");
184      return std::make_unique<clang::Lexer>(
185          SM.getLocForStartOfFile(FileOffset.first), LangOpts, File.begin(),
186          File.data() + FileOffset.second, File.end());
187    }();
188  
189    // Tell Lexer to return whitespace as pseudo-tokens (kind is tok::unknown).
190    Lexer->SetKeepWhitespaceMode(true);
191  
192    // Generally, the code we want to include looks like this ([] are optional),
193    // If Terminators is empty:
194    //   [ <comment> ] [ <newline> ]
195    // Otherwise:
196    //   ... <terminator> [ <comment> ] [ <newline> ]
197  
198    Token Tok;
199    bool Terminated = false;
200  
201    // First, lex to the current token (which is the last token of the range that
202    // is definitely associated with the decl). Then, we process the first token
203    // separately from the rest based on conditions that hold specifically for
204    // that first token.
205    //
206    // We do not search for a terminator if none is required or we've already
207    // encountered it. Otherwise, if the original `EntityLast` location was in a
208    // macro expansion, we don't have visibility into the text, so we assume we've
209    // already terminated. However, we note this assumption with
210    // `TerminatedByMacro`, because we'll want to handle it somewhat differently
211    // for the terminators semicolon and comma. These terminators can be safely
212    // associated with the entity when they appear after the macro -- extra
213    // semicolons have no effect on the program and a well-formed program won't
214    // have multiple commas in a row, so we're guaranteed that there is only one.
215    //
216    // FIXME: This handling of macros is more conservative than necessary. When
217    // the end of the expansion coincides with the end of the node, we can still
218    // safely analyze the code. But, it is more complicated, because we need to
219    // start by lexing the spelling loc for the first token and then switch to the
220    // expansion loc.
221    bool TerminatedByMacro = false;
222    Lexer->LexFromRawLexer(Tok);
223    if (Terminators.empty() || contains(Terminators, Tok))
224      Terminated = true;
225    else if (EntityLast.isMacroID()) {
226      Terminated = true;
227      TerminatedByMacro = true;
228    }
229  
230    // We save the most recent candidate for the exclusive end location.
231    SourceLocation End = Tok.getEndLoc();
232  
233    while (!Terminated) {
234      // Lex the next token we want to possibly expand the range with.
235      Lexer->LexFromRawLexer(Tok);
236  
237      switch (Tok.getKind()) {
238      case tok::eof:
239      // Unexpected separators.
240      case tok::l_brace:
241      case tok::r_brace:
242      case tok::comma:
243        return End;
244      // Whitespace pseudo-tokens.
245      case tok::unknown:
246        if (startsWithNewline(SM, Tok))
247          // Include at least until the end of the line.
248          End = Tok.getEndLoc();
249        break;
250      default:
251        if (contains(Terminators, Tok))
252          Terminated = true;
253        End = Tok.getEndLoc();
254        break;
255      }
256    }
257  
258    do {
259      // Lex the next token we want to possibly expand the range with.
260      Lexer->LexFromRawLexer(Tok);
261  
262      switch (Tok.getKind()) {
263      case tok::unknown:
264        if (startsWithNewline(SM, Tok))
265          // We're done, but include this newline.
266          return Tok.getEndLoc();
267        break;
268      case tok::comment:
269        // Include any comments we find on the way.
270        End = Tok.getEndLoc();
271        break;
272      case tok::semi:
273      case tok::comma:
274        if (TerminatedByMacro && contains(Terminators, Tok)) {
275          End = Tok.getEndLoc();
276          // We've found a real terminator.
277          TerminatedByMacro = false;
278          break;
279        }
280        // Found an unrelated token; stop and don't include it.
281        return End;
282      default:
283        // Found an unrelated token; stop and don't include it.
284        return End;
285      }
286    } while (true);
287  }
288  
289  // Returns the expected terminator tokens for the given declaration.
290  //
291  // If we do not know the correct terminator token, returns an empty set.
292  //
293  // There are cases where we have more than one possible terminator (for example,
294  // we find either a comma or a semicolon after a VarDecl).
295  static std::set<tok::TokenKind> getTerminators(const Decl &D) {
296    if (llvm::isa<RecordDecl>(D) || llvm::isa<UsingDecl>(D))
297      return {tok::semi};
298  
299    if (llvm::isa<FunctionDecl>(D) || llvm::isa<LinkageSpecDecl>(D))
300      return {tok::r_brace, tok::semi};
301  
302    if (llvm::isa<VarDecl>(D) || llvm::isa<FieldDecl>(D))
303      return {tok::comma, tok::semi};
304  
305    return {};
306  }
307  
308  // Starting from `Loc`, skips whitespace up to, and including, a single
309  // newline. Returns the (exclusive) end of any skipped whitespace (that is, the
310  // location immediately after the whitespace).
311  static SourceLocation skipWhitespaceAndNewline(const SourceManager &SM,
312                                                 SourceLocation Loc,
313                                                 const LangOptions &LangOpts) {
314    const char *LocChars = SM.getCharacterData(Loc);
315    int i = 0;
316    while (isHorizontalWhitespace(LocChars[i]))
317      ++i;
318    if (isVerticalWhitespace(LocChars[i]))
319      ++i;
320    return Loc.getLocWithOffset(i);
321  }
322  
323  // Is `Loc` separated from any following decl by something meaningful (e.g. an
324  // empty line, a comment), ignoring horizontal whitespace?  Since this is a
325  // heuristic, we return false when in doubt.  `Loc` cannot be the first location
326  // in the file.
327  static bool atOrBeforeSeparation(const SourceManager &SM, SourceLocation Loc,
328                                   const LangOptions &LangOpts) {
329    // If the preceding character is a newline, we'll check for an empty line as a
330    // separator. However, we can't identify an empty line using tokens, so we
331    // analyse the characters. If we try to use tokens, we'll just end up with a
332    // whitespace token, whose characters we'd have to analyse anyhow.
333    bool Invalid = false;
334    const char *LocChars =
335        SM.getCharacterData(Loc.getLocWithOffset(-1), &Invalid);
336    assert(!Invalid &&
337           "Loc must be a valid character and not the first of the source file.");
338    if (isVerticalWhitespace(LocChars[0])) {
339      for (int i = 1; isWhitespace(LocChars[i]); ++i)
340        if (isVerticalWhitespace(LocChars[i]))
341          return true;
342    }
343    // We didn't find an empty line, so lex the next token, skipping past any
344    // whitespace we just scanned.
345    Token Tok;
346    bool Failed = Lexer::getRawToken(Loc, Tok, SM, LangOpts,
347                                     /*IgnoreWhiteSpace=*/true);
348    if (Failed)
349      // Any text that confuses the lexer seems fair to consider a separation.
350      return true;
351  
352    switch (Tok.getKind()) {
353    case tok::comment:
354    case tok::l_brace:
355    case tok::r_brace:
356    case tok::eof:
357      return true;
358    default:
359      return false;
360    }
361  }
362  
363  CharSourceRange tooling::getAssociatedRange(const Decl &Decl,
364                                              ASTContext &Context) {
365    const SourceManager &SM = Context.getSourceManager();
366    const LangOptions &LangOpts = Context.getLangOpts();
367    CharSourceRange Range = CharSourceRange::getTokenRange(Decl.getSourceRange());
368  
369    // First, expand to the start of the template<> declaration if necessary.
370    if (const auto *Record = llvm::dyn_cast<CXXRecordDecl>(&Decl)) {
371      if (const auto *T = Record->getDescribedClassTemplate())
372        if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
373          Range.setBegin(T->getBeginLoc());
374    } else if (const auto *F = llvm::dyn_cast<FunctionDecl>(&Decl)) {
375      if (const auto *T = F->getDescribedFunctionTemplate())
376        if (SM.isBeforeInTranslationUnit(T->getBeginLoc(), Range.getBegin()))
377          Range.setBegin(T->getBeginLoc());
378    }
379  
380    // Next, expand the end location past trailing comments to include a potential
381    // newline at the end of the decl's line.
382    Range.setEnd(
383        getEntityEndLoc(SM, Decl.getEndLoc(), getTerminators(Decl), LangOpts));
384    Range.setTokenRange(false);
385  
386    // Expand to include preceeding associated comments. We ignore any comments
387    // that are not preceeding the decl, since we've already skipped trailing
388    // comments with getEntityEndLoc.
389    if (const RawComment *Comment =
390            Decl.getASTContext().getRawCommentForDeclNoCache(&Decl))
391      // Only include a preceding comment if:
392      // * it is *not* separate from the declaration (not including any newline
393      //   that immediately follows the comment),
394      // * the decl *is* separate from any following entity (so, there are no
395      //   other entities the comment could refer to), and
396      // * it is not a IfThisThenThat lint check.
397      if (SM.isBeforeInTranslationUnit(Comment->getBeginLoc(),
398                                       Range.getBegin()) &&
399          !atOrBeforeSeparation(
400              SM, skipWhitespaceAndNewline(SM, Comment->getEndLoc(), LangOpts),
401              LangOpts) &&
402          atOrBeforeSeparation(SM, Range.getEnd(), LangOpts)) {
403        const StringRef CommentText = Comment->getRawText(SM);
404        if (!CommentText.contains("LINT.IfChange") &&
405            !CommentText.contains("LINT.ThenChange"))
406          Range.setBegin(Comment->getBeginLoc());
407      }
408    // Add leading attributes.
409    for (auto *Attr : Decl.attrs()) {
410      if (Attr->getLocation().isInvalid() ||
411          !SM.isBeforeInTranslationUnit(Attr->getLocation(), Range.getBegin()))
412        continue;
413      Range.setBegin(Attr->getLocation());
414  
415      // Extend to the left '[[' or '__attribute((' if we saw the attribute,
416      // unless it is not a valid location.
417      bool Invalid;
418      StringRef Source =
419          SM.getBufferData(SM.getFileID(Range.getBegin()), &Invalid);
420      if (Invalid)
421        continue;
422      llvm::StringRef BeforeAttr =
423          Source.substr(0, SM.getFileOffset(Range.getBegin()));
424      llvm::StringRef BeforeAttrStripped = BeforeAttr.rtrim();
425  
426      for (llvm::StringRef Prefix : {"[[", "__attribute__(("}) {
427        // Handle whitespace between attribute prefix and attribute value.
428        if (BeforeAttrStripped.endswith(Prefix)) {
429          // Move start to start position of prefix, which is
430          // length(BeforeAttr) - length(BeforeAttrStripped) + length(Prefix)
431          // positions to the left.
432          Range.setBegin(Range.getBegin().getLocWithOffset(static_cast<int>(
433              -BeforeAttr.size() + BeforeAttrStripped.size() - Prefix.size())));
434          break;
435          // If we didn't see '[[' or '__attribute' it's probably coming from a
436          // macro expansion which is already handled by makeFileCharRange(),
437          // below.
438        }
439      }
440    }
441  
442    // Range.getEnd() is already fully un-expanded by getEntityEndLoc. But,
443    // Range.getBegin() may be inside an expansion.
444    return Lexer::makeFileCharRange(Range, SM, LangOpts);
445  }
446