xref: /freebsd/contrib/llvm-project/clang/lib/AST/RawCommentList.cpp (revision e64bea71c21eb42e97aa615188ba91f6cce0d36d)
1 //===--- RawCommentList.cpp - Processing raw comments -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "clang/AST/RawCommentList.h"
10 #include "clang/AST/ASTContext.h"
11 #include "clang/AST/Comment.h"
12 #include "clang/AST/CommentBriefParser.h"
13 #include "clang/AST/CommentCommandTraits.h"
14 #include "clang/AST/CommentLexer.h"
15 #include "clang/AST/CommentParser.h"
16 #include "clang/AST/CommentSema.h"
17 #include "clang/Basic/CharInfo.h"
18 #include "llvm/Support/Allocator.h"
19 
20 using namespace clang;
21 
22 namespace {
23 /// Get comment kind and bool describing if it is a trailing comment.
24 std::pair<RawComment::CommentKind, bool> getCommentKind(StringRef Comment,
25                                                         bool ParseAllComments) {
26   const size_t MinCommentLength = ParseAllComments ? 2 : 3;
27   if ((Comment.size() < MinCommentLength) || Comment[0] != '/')
28     return std::make_pair(RawComment::RCK_Invalid, false);
29 
30   RawComment::CommentKind K;
31   if (Comment[1] == '/') {
32     if (Comment.size() < 3)
33       return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
34 
35     if (Comment[2] == '/')
36       K = RawComment::RCK_BCPLSlash;
37     else if (Comment[2] == '!')
38       K = RawComment::RCK_BCPLExcl;
39     else
40       return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
41   } else {
42     assert(Comment.size() >= 4);
43 
44     // Comment lexer does not understand escapes in comment markers, so pretend
45     // that this is not a comment.
46     if (Comment[1] != '*' ||
47         Comment[Comment.size() - 2] != '*' ||
48         Comment[Comment.size() - 1] != '/')
49       return std::make_pair(RawComment::RCK_Invalid, false);
50 
51     if (Comment[2] == '*')
52       K = RawComment::RCK_JavaDoc;
53     else if (Comment[2] == '!')
54       K = RawComment::RCK_Qt;
55     else
56       return std::make_pair(RawComment::RCK_OrdinaryC, false);
57   }
58   const bool TrailingComment = (Comment.size() > 3) && (Comment[3] == '<');
59   return std::make_pair(K, TrailingComment);
60 }
61 
62 bool mergedCommentIsTrailingComment(StringRef Comment) {
63   return (Comment.size() > 3) && (Comment[3] == '<');
64 }
65 
66 /// Returns true if R1 and R2 both have valid locations that start on the same
67 /// column.
68 bool commentsStartOnSameColumn(const SourceManager &SM, const RawComment &R1,
69                                const RawComment &R2) {
70   SourceLocation L1 = R1.getBeginLoc();
71   SourceLocation L2 = R2.getBeginLoc();
72   bool Invalid = false;
73   unsigned C1 = SM.getPresumedColumnNumber(L1, &Invalid);
74   if (!Invalid) {
75     unsigned C2 = SM.getPresumedColumnNumber(L2, &Invalid);
76     return !Invalid && (C1 == C2);
77   }
78   return false;
79 }
80 } // unnamed namespace
81 
82 /// Determines whether there is only whitespace in `Buffer` between `P`
83 /// and the previous line.
84 /// \param Buffer The buffer to search in.
85 /// \param P The offset from the beginning of `Buffer` to start from.
86 /// \return true if all of the characters in `Buffer` ranging from the closest
87 /// line-ending character before `P` (or the beginning of `Buffer`) to `P - 1`
88 /// are whitespace.
89 static bool onlyWhitespaceOnLineBefore(const char *Buffer, unsigned P) {
90   // Search backwards until we see linefeed or carriage return.
91   for (unsigned I = P; I != 0; --I) {
92     char C = Buffer[I - 1];
93     if (isVerticalWhitespace(C))
94       return true;
95     if (!isHorizontalWhitespace(C))
96       return false;
97   }
98   // We hit the beginning of the buffer.
99   return true;
100 }
101 
102 /// Returns whether `K` is an ordinary comment kind.
103 static bool isOrdinaryKind(RawComment::CommentKind K) {
104   return (K == RawComment::RCK_OrdinaryBCPL) ||
105          (K == RawComment::RCK_OrdinaryC);
106 }
107 
108 RawComment::RawComment(const SourceManager &SourceMgr, SourceRange SR,
109                        const CommentOptions &CommentOpts, bool Merged) :
110     Range(SR), RawTextValid(false), BriefTextValid(false),
111     IsAttached(false), IsTrailingComment(false),
112     IsAlmostTrailingComment(false) {
113   // Extract raw comment text, if possible.
114   if (SR.getBegin() == SR.getEnd() || getRawText(SourceMgr).empty()) {
115     Kind = RCK_Invalid;
116     return;
117   }
118 
119   // Guess comment kind.
120   std::pair<CommentKind, bool> K =
121       getCommentKind(RawText, CommentOpts.ParseAllComments);
122 
123   // Guess whether an ordinary comment is trailing.
124   if (CommentOpts.ParseAllComments && isOrdinaryKind(K.first)) {
125     FileID BeginFileID;
126     unsigned BeginOffset;
127     std::tie(BeginFileID, BeginOffset) =
128         SourceMgr.getDecomposedLoc(Range.getBegin());
129     if (BeginOffset != 0) {
130       bool Invalid = false;
131       const char *Buffer =
132           SourceMgr.getBufferData(BeginFileID, &Invalid).data();
133       IsTrailingComment |=
134           (!Invalid && !onlyWhitespaceOnLineBefore(Buffer, BeginOffset));
135     }
136   }
137 
138   if (!Merged) {
139     Kind = K.first;
140     IsTrailingComment |= K.second;
141 
142     IsAlmostTrailingComment =
143         RawText.starts_with("//<") || RawText.starts_with("/*<");
144   } else {
145     Kind = RCK_Merged;
146     IsTrailingComment =
147         IsTrailingComment || mergedCommentIsTrailingComment(RawText);
148   }
149 }
150 
151 StringRef RawComment::getRawTextSlow(const SourceManager &SourceMgr) const {
152   FileID BeginFileID;
153   FileID EndFileID;
154   unsigned BeginOffset;
155   unsigned EndOffset;
156 
157   std::tie(BeginFileID, BeginOffset) =
158       SourceMgr.getDecomposedLoc(Range.getBegin());
159   std::tie(EndFileID, EndOffset) = SourceMgr.getDecomposedLoc(Range.getEnd());
160 
161   const unsigned Length = EndOffset - BeginOffset;
162   if (Length < 2)
163     return StringRef();
164 
165   // The comment can't begin in one file and end in another.
166   assert(BeginFileID == EndFileID);
167 
168   bool Invalid = false;
169   const char *BufferStart = SourceMgr.getBufferData(BeginFileID,
170                                                     &Invalid).data();
171   if (Invalid)
172     return StringRef();
173 
174   return StringRef(BufferStart + BeginOffset, Length);
175 }
176 
177 const char *RawComment::extractBriefText(const ASTContext &Context) const {
178   // Lazily initialize RawText using the accessor before using it.
179   (void)getRawText(Context.getSourceManager());
180 
181   // Since we will be copying the resulting text, all allocations made during
182   // parsing are garbage after resulting string is formed.  Thus we can use
183   // a separate allocator for all temporary stuff.
184   llvm::BumpPtrAllocator Allocator;
185 
186   comments::Lexer L(Allocator, Context.getDiagnostics(),
187                     Context.getCommentCommandTraits(),
188                     Range.getBegin(),
189                     RawText.begin(), RawText.end());
190   comments::BriefParser P(L, Context.getCommentCommandTraits());
191 
192   const std::string Result = P.Parse();
193   const unsigned BriefTextLength = Result.size();
194   char *BriefTextPtr = new (Context) char[BriefTextLength + 1];
195   memcpy(BriefTextPtr, Result.c_str(), BriefTextLength + 1);
196   BriefText = BriefTextPtr;
197   BriefTextValid = true;
198 
199   return BriefTextPtr;
200 }
201 
202 comments::FullComment *RawComment::parse(const ASTContext &Context,
203                                          const Preprocessor *PP,
204                                          const Decl *D) const {
205   // Lazily initialize RawText using the accessor before using it.
206   (void)getRawText(Context.getSourceManager());
207 
208   comments::Lexer L(Context.getAllocator(), Context.getDiagnostics(),
209                     Context.getCommentCommandTraits(),
210                     getSourceRange().getBegin(),
211                     RawText.begin(), RawText.end());
212   comments::Sema S(Context.getAllocator(), Context.getSourceManager(),
213                    Context.getDiagnostics(),
214                    Context.getCommentCommandTraits(),
215                    PP);
216   S.setDecl(D);
217   comments::Parser P(L, S, Context.getAllocator(), Context.getSourceManager(),
218                      Context.getDiagnostics(),
219                      Context.getCommentCommandTraits());
220 
221   return P.parseFullComment();
222 }
223 
224 static bool onlyWhitespaceBetween(SourceManager &SM,
225                                   SourceLocation Loc1, SourceLocation Loc2,
226                                   unsigned MaxNewlinesAllowed) {
227   FileIDAndOffset Loc1Info = SM.getDecomposedLoc(Loc1);
228   FileIDAndOffset Loc2Info = SM.getDecomposedLoc(Loc2);
229 
230   // Question does not make sense if locations are in different files.
231   if (Loc1Info.first != Loc2Info.first)
232     return false;
233 
234   bool Invalid = false;
235   const char *Buffer = SM.getBufferData(Loc1Info.first, &Invalid).data();
236   if (Invalid)
237     return false;
238 
239   unsigned NumNewlines = 0;
240   assert(Loc1Info.second <= Loc2Info.second && "Loc1 after Loc2!");
241   // Look for non-whitespace characters and remember any newlines seen.
242   for (unsigned I = Loc1Info.second; I != Loc2Info.second; ++I) {
243     switch (Buffer[I]) {
244     default:
245       return false;
246     case ' ':
247     case '\t':
248     case '\f':
249     case '\v':
250       break;
251     case '\r':
252     case '\n':
253       ++NumNewlines;
254 
255       // Check if we have found more than the maximum allowed number of
256       // newlines.
257       if (NumNewlines > MaxNewlinesAllowed)
258         return false;
259 
260       // Collapse \r\n and \n\r into a single newline.
261       if (I + 1 != Loc2Info.second &&
262           (Buffer[I + 1] == '\n' || Buffer[I + 1] == '\r') &&
263           Buffer[I] != Buffer[I + 1])
264         ++I;
265       break;
266     }
267   }
268 
269   return true;
270 }
271 
272 void RawCommentList::addComment(const RawComment &RC,
273                                 const CommentOptions &CommentOpts,
274                                 llvm::BumpPtrAllocator &Allocator) {
275   if (RC.isInvalid())
276     return;
277 
278   // Ordinary comments are not interesting for us.
279   if (RC.isOrdinary() && !CommentOpts.ParseAllComments)
280     return;
281 
282   FileIDAndOffset Loc = SourceMgr.getDecomposedLoc(RC.getBeginLoc());
283 
284   const FileID CommentFile = Loc.first;
285   const unsigned CommentOffset = Loc.second;
286 
287   // If this is the first Doxygen comment, save it (because there isn't
288   // anything to merge it with).
289   auto &OC = OrderedComments[CommentFile];
290   if (OC.empty()) {
291     OC[CommentOffset] = new (Allocator) RawComment(RC);
292     return;
293   }
294 
295   const RawComment &C1 = *OC.rbegin()->second;
296   const RawComment &C2 = RC;
297 
298   // Merge comments only if there is only whitespace between them.
299   // Can't merge trailing and non-trailing comments unless the second is
300   // non-trailing ordinary in the same column, as in the case:
301   //   int x; // documents x
302   //          // more text
303   // versus:
304   //   int x; // documents x
305   //   int y; // documents y
306   // or:
307   //   int x; // documents x
308   //   // documents y
309   //   int y;
310   // Merge comments if they are on same or consecutive lines.
311   if ((C1.isTrailingComment() == C2.isTrailingComment() ||
312        (C1.isTrailingComment() && !C2.isTrailingComment() &&
313         isOrdinaryKind(C2.getKind()) &&
314         commentsStartOnSameColumn(SourceMgr, C1, C2))) &&
315       onlyWhitespaceBetween(SourceMgr, C1.getEndLoc(), C2.getBeginLoc(),
316                             /*MaxNewlinesAllowed=*/1)) {
317     SourceRange MergedRange(C1.getBeginLoc(), C2.getEndLoc());
318     *OrderedComments[CommentFile].rbegin()->second =
319         RawComment(SourceMgr, MergedRange, CommentOpts, true);
320   } else {
321     OrderedComments[CommentFile][CommentOffset] =
322         new (Allocator) RawComment(RC);
323   }
324 }
325 
326 const std::map<unsigned, RawComment *> *
327 RawCommentList::getCommentsInFile(FileID File) const {
328   auto CommentsInFile = OrderedComments.find(File);
329   if (CommentsInFile == OrderedComments.end())
330     return nullptr;
331 
332   return &CommentsInFile->second;
333 }
334 
335 bool RawCommentList::empty() const { return OrderedComments.empty(); }
336 
337 unsigned RawCommentList::getCommentBeginLine(RawComment *C, FileID File,
338                                              unsigned Offset) const {
339   auto Cached = CommentBeginLine.find(C);
340   if (Cached != CommentBeginLine.end())
341     return Cached->second;
342   const unsigned Line = SourceMgr.getLineNumber(File, Offset);
343   CommentBeginLine[C] = Line;
344   return Line;
345 }
346 
347 unsigned RawCommentList::getCommentEndOffset(RawComment *C) const {
348   auto Cached = CommentEndOffset.find(C);
349   if (Cached != CommentEndOffset.end())
350     return Cached->second;
351   const unsigned Offset =
352       SourceMgr.getDecomposedLoc(C->getSourceRange().getEnd()).second;
353   CommentEndOffset[C] = Offset;
354   return Offset;
355 }
356 
357 std::string RawComment::getFormattedText(const SourceManager &SourceMgr,
358                                          DiagnosticsEngine &Diags) const {
359   llvm::StringRef CommentText = getRawText(SourceMgr);
360   if (CommentText.empty())
361     return "";
362 
363   std::string Result;
364   for (const RawComment::CommentLine &Line :
365        getFormattedLines(SourceMgr, Diags))
366     Result += Line.Text + "\n";
367 
368   auto LastChar = Result.find_last_not_of('\n');
369   Result.erase(LastChar + 1, Result.size());
370 
371   return Result;
372 }
373 
374 std::vector<RawComment::CommentLine>
375 RawComment::getFormattedLines(const SourceManager &SourceMgr,
376                               DiagnosticsEngine &Diags) const {
377   llvm::StringRef CommentText = getRawText(SourceMgr);
378   if (CommentText.empty())
379     return {};
380 
381   llvm::BumpPtrAllocator Allocator;
382   // We do not parse any commands, so CommentOptions are ignored by
383   // comments::Lexer. Therefore, we just use default-constructed options.
384   CommentOptions DefOpts;
385   comments::CommandTraits EmptyTraits(Allocator, DefOpts);
386   comments::Lexer L(Allocator, Diags, EmptyTraits, getSourceRange().getBegin(),
387                     CommentText.begin(), CommentText.end(),
388                     /*ParseCommands=*/false);
389 
390   std::vector<RawComment::CommentLine> Result;
391   // A column number of the first non-whitespace token in the comment text.
392   // We skip whitespace up to this column, but keep the whitespace after this
393   // column. IndentColumn is calculated when lexing the first line and reused
394   // for the rest of lines.
395   unsigned IndentColumn = 0;
396 
397   // Record the line number of the last processed comment line.
398   // For block-style comments, an extra newline token will be produced after
399   // the end-comment marker, e.g.:
400   //   /** This is a multi-line comment block.
401   //       The lexer will produce two newline tokens here > */
402   // previousLine will record the line number when we previously saw a newline
403   // token and recorded a comment line. If we see another newline token on the
404   // same line, don't record anything in between.
405   unsigned PreviousLine = 0;
406 
407   // Processes one line of the comment and adds it to the result.
408   // Handles skipping the indent at the start of the line.
409   // Returns false when eof is reached and true otherwise.
410   auto LexLine = [&](bool IsFirstLine) -> bool {
411     comments::Token Tok;
412     // Lex the first token on the line. We handle it separately, because we to
413     // fix up its indentation.
414     L.lex(Tok);
415     if (Tok.is(comments::tok::eof))
416       return false;
417     if (Tok.is(comments::tok::newline)) {
418       PresumedLoc Loc = SourceMgr.getPresumedLoc(Tok.getLocation());
419       if (Loc.getLine() != PreviousLine) {
420         Result.emplace_back("", Loc, Loc);
421         PreviousLine = Loc.getLine();
422       }
423       return true;
424     }
425     SmallString<124> Line;
426     llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
427     bool LocInvalid = false;
428     unsigned TokColumn =
429         SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
430     assert(!LocInvalid && "getFormattedText for invalid location");
431 
432     // Amount of leading whitespace in TokText.
433     size_t WhitespaceLen = TokText.find_first_not_of(" \t");
434     if (WhitespaceLen == StringRef::npos)
435       WhitespaceLen = TokText.size();
436     // Remember the amount of whitespace we skipped in the first line to remove
437     // indent up to that column in the following lines.
438     if (IsFirstLine)
439       IndentColumn = TokColumn + WhitespaceLen;
440 
441     // Amount of leading whitespace we actually want to skip.
442     // For the first line we skip all the whitespace.
443     // For the rest of the lines, we skip whitespace up to IndentColumn.
444     unsigned SkipLen =
445         IsFirstLine
446             ? WhitespaceLen
447             : std::min<size_t>(
448                   WhitespaceLen,
449                   std::max<int>(static_cast<int>(IndentColumn) - TokColumn, 0));
450     llvm::StringRef Trimmed = TokText.drop_front(SkipLen);
451     Line += Trimmed;
452     // Get the beginning location of the adjusted comment line.
453     PresumedLoc Begin =
454         SourceMgr.getPresumedLoc(Tok.getLocation().getLocWithOffset(SkipLen));
455 
456     // Lex all tokens in the rest of the line.
457     for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
458       if (Tok.is(comments::tok::newline)) {
459         // Get the ending location of the comment line.
460         PresumedLoc End = SourceMgr.getPresumedLoc(Tok.getLocation());
461         if (End.getLine() != PreviousLine) {
462           Result.emplace_back(Line, Begin, End);
463           PreviousLine = End.getLine();
464         }
465         return true;
466       }
467       Line += L.getSpelling(Tok, SourceMgr);
468     }
469     PresumedLoc End = SourceMgr.getPresumedLoc(Tok.getLocation());
470     Result.emplace_back(Line, Begin, End);
471     // We've reached the end of file token.
472     return false;
473   };
474 
475   // Process first line separately to remember indent for the following lines.
476   if (!LexLine(/*IsFirstLine=*/true))
477     return Result;
478   // Process the rest of the lines.
479   while (LexLine(/*IsFirstLine=*/false))
480     ;
481   return Result;
482 }
483