xref: /freebsd/contrib/llvm-project/clang/lib/Format/BreakableToken.cpp (revision 1165fc9a526630487a1feb63daef65c5aee1a583)
1 //===--- BreakableToken.cpp - Format C++ code -----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// Contains implementation of BreakableToken class and classes derived
11 /// from it.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "BreakableToken.h"
16 #include "ContinuationIndenter.h"
17 #include "clang/Basic/CharInfo.h"
18 #include "clang/Format/Format.h"
19 #include "llvm/ADT/STLExtras.h"
20 #include "llvm/Support/Debug.h"
21 #include <algorithm>
22 
23 #define DEBUG_TYPE "format-token-breaker"
24 
25 namespace clang {
26 namespace format {
27 
// The set of characters this file treats as blanks when trimming, searching
// and splitting token text: space, horizontal tab, vertical tab, form feed and
// carriage return. Note that '\n' is not part of the set.
static constexpr StringRef Blanks = " \t\v\f\r";
// Returns true if C is a space, horizontal tab, vertical tab, form feed or
// carriage return; returns false for every other character.
static bool IsBlank(char C) {
  return C == ' ' || C == '\t' || C == '\v' || C == '\f' || C == '\r';
}
41 
42 static StringRef getLineCommentIndentPrefix(StringRef Comment,
43                                             const FormatStyle &Style) {
44   static constexpr StringRef KnownCStylePrefixes[] = {"///<", "//!<", "///",
45                                                       "//!",  "//:",  "//"};
46   static constexpr StringRef KnownTextProtoPrefixes[] = {"####", "###", "##",
47                                                          "//", "#"};
48   ArrayRef<StringRef> KnownPrefixes(KnownCStylePrefixes);
49   if (Style.Language == FormatStyle::LK_TextProto)
50     KnownPrefixes = KnownTextProtoPrefixes;
51 
52   assert(std::is_sorted(KnownPrefixes.begin(), KnownPrefixes.end(),
53                         [](StringRef Lhs, StringRef Rhs) noexcept {
54                           return Lhs.size() > Rhs.size();
55                         }));
56 
57   for (StringRef KnownPrefix : KnownPrefixes) {
58     if (Comment.startswith(KnownPrefix)) {
59       const auto PrefixLength =
60           Comment.find_first_not_of(' ', KnownPrefix.size());
61       return Comment.substr(0, PrefixLength);
62     }
63   }
64   return {};
65 }
66 
/// Computes a position at which the comment text \p Text can be broken.
///
/// \p ContentStartColumn is the column at which \p Text starts and
/// \p ColumnLimit the column limit to respect. The function prefers the last
/// acceptable blank before the limit; if there is none (or only leading
/// whitespace precedes it), it falls back to the first blank found after the
/// limit and after the leading whitespace.
///
/// \returns a Split whose first member is the length of the text kept before
/// the break (with trailing blanks removed) and whose second member is the
/// number of bytes between that point and the start of the text after the
/// break; or (StringRef::npos, 0) if no break is possible.
///
/// If \p DecorationEndsWithStar is true, the leading blanks after the break
/// are not trimmed when doing so would make the next line start with "/".
static BreakableToken::Split
getCommentSplit(StringRef Text, unsigned ContentStartColumn,
                unsigned ColumnLimit, unsigned TabWidth,
                encoding::Encoding Encoding, const FormatStyle &Style,
                bool DecorationEndsWithStar = false) {
  LLVM_DEBUG(llvm::dbgs() << "Comment split: \"" << Text
                          << "\", Column limit: " << ColumnLimit
                          << ", Content start: " << ContentStartColumn << "\n");
  if (ColumnLimit <= ContentStartColumn + 1)
    return BreakableToken::Split(StringRef::npos, 0);

  unsigned MaxSplit = ColumnLimit - ContentStartColumn + 1;
  unsigned MaxSplitBytes = 0;

  // Convert the column budget (MaxSplit) into a byte offset (MaxSplitBytes),
  // advancing one code point at a time and accumulating its column width.
  for (unsigned NumChars = 0;
       NumChars < MaxSplit && MaxSplitBytes < Text.size();) {
    unsigned BytesInChar =
        encoding::getCodePointNumBytes(Text[MaxSplitBytes], Encoding);
    NumChars +=
        encoding::columnWidthWithTabs(Text.substr(MaxSplitBytes, BytesInChar),
                                      ContentStartColumn, TabWidth, Encoding);
    MaxSplitBytes += BytesInChar;
  }

  // In JavaScript, some @tags can be followed by {, and machinery that parses
  // these comments will fail to understand the comment if followed by a line
  // break. So avoid ever breaking before a {.
  if (Style.isJavaScript()) {
    StringRef::size_type SpaceOffset =
        Text.find_first_of(Blanks, MaxSplitBytes);
    if (SpaceOffset != StringRef::npos && SpaceOffset + 1 < Text.size() &&
        Text[SpaceOffset + 1] == '{') {
      MaxSplitBytes = SpaceOffset + 1;
    }
  }

  // Start from the last blank located before MaxSplitBytes.
  StringRef::size_type SpaceOffset = Text.find_last_of(Blanks, MaxSplitBytes);

  static const auto kNumberedListRegexp = llvm::Regex("^[1-9][0-9]?\\.");
  // Some spaces are unacceptable to break on, rewind past them.
  while (SpaceOffset != StringRef::npos) {
    // If a line-comment ends with `\`, the next line continues the comment,
    // whether or not it starts with `//`. This is confusing and triggers
    // -Wcomment.
    // Avoid introducing multiline comments by not allowing a break right
    // after '\'.
    if (Style.isCpp()) {
      StringRef::size_type LastNonBlank =
          Text.find_last_not_of(Blanks, SpaceOffset);
      if (LastNonBlank != StringRef::npos && Text[LastNonBlank] == '\\') {
        SpaceOffset = Text.find_last_of(Blanks, LastNonBlank);
        continue;
      }
    }

    // Do not split before a number followed by a dot: this would be interpreted
    // as a numbered list, which would prevent re-flowing in subsequent passes.
    if (kNumberedListRegexp.match(Text.substr(SpaceOffset).ltrim(Blanks))) {
      SpaceOffset = Text.find_last_of(Blanks, SpaceOffset);
      continue;
    }

    // Avoid ever breaking before a @tag or a { in JavaScript.
    if (Style.isJavaScript() && SpaceOffset + 1 < Text.size() &&
        (Text[SpaceOffset + 1] == '{' || Text[SpaceOffset + 1] == '@')) {
      SpaceOffset = Text.find_last_of(Blanks, SpaceOffset);
      continue;
    }

    break;
  }

  if (SpaceOffset == StringRef::npos ||
      // Don't break at leading whitespace.
      Text.find_last_not_of(Blanks, SpaceOffset) == StringRef::npos) {
    // Make sure that we don't break at leading whitespace that
    // reaches past MaxSplit.
    StringRef::size_type FirstNonWhitespace = Text.find_first_not_of(Blanks);
    if (FirstNonWhitespace == StringRef::npos)
      // If the comment is only whitespace, we cannot split.
      return BreakableToken::Split(StringRef::npos, 0);
    // Fall back to the first blank after both the limit and the leading
    // whitespace.
    SpaceOffset = Text.find_first_of(
        Blanks, std::max<unsigned>(MaxSplitBytes, FirstNonWhitespace));
  }
  if (SpaceOffset != StringRef::npos && SpaceOffset != 0) {
    // adaptStartOfLine will break after lines starting with /** if the comment
    // is broken anywhere. Avoid emitting this break twice here.
    // Example: in /** longtextcomesherethatbreaks */ (with ColumnLimit 20) will
    // insert a break after /**, so this code must not insert the same break.
    if (SpaceOffset == 1 && Text[SpaceOffset - 1] == '*')
      return BreakableToken::Split(StringRef::npos, 0);
    StringRef BeforeCut = Text.substr(0, SpaceOffset).rtrim(Blanks);
    StringRef AfterCut = Text.substr(SpaceOffset);
    // Don't trim the leading blanks if it would create a */ after the break.
    if (!DecorationEndsWithStar || AfterCut.size() <= 1 || AfterCut[1] != '/')
      AfterCut = AfterCut.ltrim(Blanks);
    // The split covers everything between the end of the trimmed text before
    // the cut and the start of the text after it.
    return BreakableToken::Split(BeforeCut.size(),
                                 AfterCut.begin() - BeforeCut.end());
  }
  return BreakableToken::Split(StringRef::npos, 0);
}
168 
/// Computes a position at which the string literal content \p Text can be
/// broken, given that \p UsedColumns columns are already taken and at most
/// \p ColumnLimit columns may be used.
///
/// While scanning the part of \p Text that fits, the function remembers the
/// last blank, the last '/', and the last single-byte non-alphanumeric
/// character. The break is placed, in order of preference, right after the
/// last blank, right after the last '/', right after the last
/// non-alphanumeric character, or otherwise at the last code point / escape
/// sequence boundary that still fits. A candidate at offset 0 is treated as
/// "not found", because 0 doubles as the initial value of the offsets.
///
/// \returns a Split with the break offset and a length of 0, or
/// (StringRef::npos, 0) if no break is possible.
static BreakableToken::Split
getStringSplit(StringRef Text, unsigned UsedColumns, unsigned ColumnLimit,
               unsigned TabWidth, encoding::Encoding Encoding) {
  // FIXME: Reduce unit test case.
  if (Text.empty())
    return BreakableToken::Split(StringRef::npos, 0);
  if (ColumnLimit <= UsedColumns)
    return BreakableToken::Split(StringRef::npos, 0);
  unsigned MaxSplit = ColumnLimit - UsedColumns;
  StringRef::size_type SpaceOffset = 0;
  StringRef::size_type SlashOffset = 0;
  StringRef::size_type WordStartOffset = 0;
  StringRef::size_type SplitPoint = 0;
  // Each iteration consumes one code point or one escape sequence from the
  // front of Text; SplitPoint tracks how many bytes have been consumed so far.
  for (unsigned Chars = 0;;) {
    unsigned Advance;
    if (Text[0] == '\\') {
      // An escape sequence is never split; it counts one column per byte.
      Advance = encoding::getEscapeSequenceLength(Text);
      Chars += Advance;
    } else {
      Advance = encoding::getCodePointNumBytes(Text[0], Encoding);
      Chars += encoding::columnWidthWithTabs(
          Text.substr(0, Advance), UsedColumns + Chars, TabWidth, Encoding);
    }

    // Stop once the current element no longer fits or it is the last one.
    if (Chars > MaxSplit || Text.size() <= Advance)
      break;

    if (IsBlank(Text[0]))
      SpaceOffset = SplitPoint;
    if (Text[0] == '/')
      SlashOffset = SplitPoint;
    if (Advance == 1 && !isAlphanumeric(Text[0]))
      WordStartOffset = SplitPoint;

    SplitPoint += Advance;
    Text = Text.substr(Advance);
  }

  // The recorded offsets point at the candidate character itself; the break
  // goes right after it, hence the "+ 1".
  if (SpaceOffset != 0)
    return BreakableToken::Split(SpaceOffset + 1, 0);
  if (SlashOffset != 0)
    return BreakableToken::Split(SlashOffset + 1, 0);
  if (WordStartOffset != 0)
    return BreakableToken::Split(WordStartOffset + 1, 0);
  if (SplitPoint != 0)
    return BreakableToken::Split(SplitPoint, 0);
  return BreakableToken::Split(StringRef::npos, 0);
}
217 
218 bool switchesFormatting(const FormatToken &Token) {
219   assert((Token.is(TT_BlockComment) || Token.is(TT_LineComment)) &&
220          "formatting regions are switched by comment tokens");
221   StringRef Content = Token.TokenText.substr(2).ltrim();
222   return Content.startswith("clang-format on") ||
223          Content.startswith("clang-format off");
224 }
225 
226 unsigned
227 BreakableToken::getLengthAfterCompression(unsigned RemainingTokenColumns,
228                                           Split Split) const {
229   // Example: consider the content
230   // lala  lala
231   // - RemainingTokenColumns is the original number of columns, 10;
232   // - Split is (4, 2), denoting the two spaces between the two words;
233   //
234   // We compute the number of columns when the split is compressed into a single
235   // space, like:
236   // lala lala
237   //
238   // FIXME: Correctly measure the length of whitespace in Split.second so it
239   // works with tabs.
240   return RemainingTokenColumns + 1 - Split.second;
241 }
242 
243 unsigned BreakableStringLiteral::getLineCount() const { return 1; }
244 
245 unsigned BreakableStringLiteral::getRangeLength(unsigned LineIndex,
246                                                 unsigned Offset,
247                                                 StringRef::size_type Length,
248                                                 unsigned StartColumn) const {
249   llvm_unreachable("Getting the length of a part of the string literal "
250                    "indicates that the code tries to reflow it.");
251 }
252 
253 unsigned
254 BreakableStringLiteral::getRemainingLength(unsigned LineIndex, unsigned Offset,
255                                            unsigned StartColumn) const {
256   return UnbreakableTailLength + Postfix.size() +
257          encoding::columnWidthWithTabs(Line.substr(Offset), StartColumn,
258                                        Style.TabWidth, Encoding);
259 }
260 
261 unsigned BreakableStringLiteral::getContentStartColumn(unsigned LineIndex,
262                                                        bool Break) const {
263   return StartColumn + Prefix.size();
264 }
265 
266 BreakableStringLiteral::BreakableStringLiteral(
267     const FormatToken &Tok, unsigned StartColumn, StringRef Prefix,
268     StringRef Postfix, unsigned UnbreakableTailLength, bool InPPDirective,
269     encoding::Encoding Encoding, const FormatStyle &Style)
270     : BreakableToken(Tok, InPPDirective, Encoding, Style),
271       StartColumn(StartColumn), Prefix(Prefix), Postfix(Postfix),
272       UnbreakableTailLength(UnbreakableTailLength) {
273   assert(Tok.TokenText.startswith(Prefix) && Tok.TokenText.endswith(Postfix));
274   Line = Tok.TokenText.substr(
275       Prefix.size(), Tok.TokenText.size() - Prefix.size() - Postfix.size());
276 }
277 
278 BreakableToken::Split BreakableStringLiteral::getSplit(
279     unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
280     unsigned ContentStartColumn, const llvm::Regex &CommentPragmasRegex) const {
281   return getStringSplit(Line.substr(TailOffset), ContentStartColumn,
282                         ColumnLimit - Postfix.size(), Style.TabWidth, Encoding);
283 }
284 
285 void BreakableStringLiteral::insertBreak(unsigned LineIndex,
286                                          unsigned TailOffset, Split Split,
287                                          unsigned ContentIndent,
288                                          WhitespaceManager &Whitespaces) const {
289   Whitespaces.replaceWhitespaceInToken(
290       Tok, Prefix.size() + TailOffset + Split.first, Split.second, Postfix,
291       Prefix, InPPDirective, 1, StartColumn);
292 }
293 
/// Forwards \p Token, \p InPPDirective, \p Encoding and \p Style to the
/// BreakableToken base class and records \p StartColumn. The constructor body
/// is empty; nothing else is initialized here.
BreakableComment::BreakableComment(const FormatToken &Token,
                                   unsigned StartColumn, bool InPPDirective,
                                   encoding::Encoding Encoding,
                                   const FormatStyle &Style)
    : BreakableToken(Token, InPPDirective, Encoding, Style),
      StartColumn(StartColumn) {}
300 
301 unsigned BreakableComment::getLineCount() const { return Lines.size(); }
302 
303 BreakableToken::Split
304 BreakableComment::getSplit(unsigned LineIndex, unsigned TailOffset,
305                            unsigned ColumnLimit, unsigned ContentStartColumn,
306                            const llvm::Regex &CommentPragmasRegex) const {
307   // Don't break lines matching the comment pragmas regex.
308   if (CommentPragmasRegex.match(Content[LineIndex]))
309     return Split(StringRef::npos, 0);
310   return getCommentSplit(Content[LineIndex].substr(TailOffset),
311                          ContentStartColumn, ColumnLimit, Style.TabWidth,
312                          Encoding, Style);
313 }
314 
315 void BreakableComment::compressWhitespace(
316     unsigned LineIndex, unsigned TailOffset, Split Split,
317     WhitespaceManager &Whitespaces) const {
318   StringRef Text = Content[LineIndex].substr(TailOffset);
319   // Text is relative to the content line, but Whitespaces operates relative to
320   // the start of the corresponding token, so compute the start of the Split
321   // that needs to be compressed into a single space relative to the start of
322   // its token.
323   unsigned BreakOffsetInToken =
324       Text.data() - tokenAt(LineIndex).TokenText.data() + Split.first;
325   unsigned CharsToRemove = Split.second;
326   Whitespaces.replaceWhitespaceInToken(
327       tokenAt(LineIndex), BreakOffsetInToken, CharsToRemove, "", "",
328       /*InPPDirective=*/false, /*Newlines=*/0, /*Spaces=*/1);
329 }
330 
331 const FormatToken &BreakableComment::tokenAt(unsigned LineIndex) const {
332   return Tokens[LineIndex] ? *Tokens[LineIndex] : Tok;
333 }
334 
335 static bool mayReflowContent(StringRef Content) {
336   Content = Content.trim(Blanks);
337   // Lines starting with '@' commonly have special meaning.
338   // Lines starting with '-', '-#', '+' or '*' are bulleted/numbered lists.
339   bool hasSpecialMeaningPrefix = false;
340   for (StringRef Prefix :
341        {"@", "TODO", "FIXME", "XXX", "-# ", "- ", "+ ", "* "}) {
342     if (Content.startswith(Prefix)) {
343       hasSpecialMeaningPrefix = true;
344       break;
345     }
346   }
347 
348   // Numbered lists may also start with a number followed by '.'
349   // To avoid issues if a line starts with a number which is actually the end
350   // of a previous line, we only consider numbers with up to 2 digits.
351   static const auto kNumberedListRegexp = llvm::Regex("^[1-9][0-9]?\\. ");
352   hasSpecialMeaningPrefix =
353       hasSpecialMeaningPrefix || kNumberedListRegexp.match(Content);
354 
355   // Simple heuristic for what to reflow: content should contain at least two
356   // characters and either the first or second character must be
357   // non-punctuation.
358   return Content.size() >= 2 && !hasSpecialMeaningPrefix &&
359          !Content.endswith("\\") &&
360          // Note that this is UTF-8 safe, since if isPunctuation(Content[0]) is
361          // true, then the first code point must be 1 byte long.
362          (!isPunctuation(Content[0]) || !isPunctuation(Content[1]));
363 }
364 
/// Builds the line-based view of the block comment \p Token.
///
/// The token text without the leading "/*" and trailing "*/" is split into
/// Lines (at "\r\n" if \p UseCRLF is set, at "\n" otherwise). For every line
/// the constructor computes Content (the text without surrounding blanks and
/// decoration) and ContentColumn (the column at which that text starts,
/// shifted by the difference between \p StartColumn and
/// \p OriginalStartColumn). It also determines:
///  - Decoration: the longest prefix of "* " shared by the relevant lines
///    (empty for a single-line comment that is not first in its line),
///  - DecorationColumn: the column of the decoration,
///  - LastLineNeedsDecoration: false when the last line is empty,
///  - IndentAtLineBreak: the indent used for lines created by breaking,
///  - DelimitersOnNewline: for JavaScript and Java, whether the comment is a
///    multiline jsdoc-style comment or a single-line one exceeding the column
///    limit.
BreakableBlockComment::BreakableBlockComment(
    const FormatToken &Token, unsigned StartColumn,
    unsigned OriginalStartColumn, bool FirstInLine, bool InPPDirective,
    encoding::Encoding Encoding, const FormatStyle &Style, bool UseCRLF)
    : BreakableComment(Token, StartColumn, InPPDirective, Encoding, Style),
      DelimitersOnNewline(false),
      UnbreakableTailLength(Token.UnbreakableTailLength) {
  assert(Tok.is(TT_BlockComment) &&
         "block comment section must start with a block comment");

  StringRef TokenText(Tok.TokenText);
  assert(TokenText.startswith("/*") && TokenText.endswith("*/"));
  // Drop the two-character delimiters on both ends before splitting.
  TokenText.substr(2, TokenText.size() - 4)
      .split(Lines, UseCRLF ? "\r\n" : "\n");

  int IndentDelta = StartColumn - OriginalStartColumn;
  Content.resize(Lines.size());
  Content[0] = Lines[0];
  ContentColumn.resize(Lines.size());
  // Account for the initial '/*'.
  ContentColumn[0] = StartColumn + 2;
  Tokens.resize(Lines.size());
  for (size_t i = 1; i < Lines.size(); ++i)
    adjustWhitespace(i, IndentDelta);

  // Align decorations with the column of the star on the first line,
  // that is one column after the start "/*".
  DecorationColumn = StartColumn + 1;

  // Account for comment decoration patterns like this:
  //
  // /*
  // ** blah blah blah
  // */
  if (Lines.size() >= 2 && Content[1].startswith("**") &&
      static_cast<unsigned>(ContentColumn[1]) == StartColumn) {
    DecorationColumn = StartColumn;
  }

  Decoration = "* ";
  if (Lines.size() == 1 && !FirstInLine) {
    // Comments for which FirstInLine is false can start on arbitrary column,
    // and available horizontal space can be too small to align consecutive
    // lines with the first one.
    // FIXME: We could, probably, align them to current indentation level, but
    // now we just wrap them without stars.
    Decoration = "";
  }
  // Shrink Decoration until every relevant line starts with it.
  for (size_t i = 1, e = Lines.size(); i < e && !Decoration.empty(); ++i) {
    // If the last line is empty, the closing "*/" will have a star.
    if (i + 1 == e && Content[i].empty())
      break;
    // A non-last line that is itself a non-empty prefix of the decoration
    // does not constrain it.
    if (!Content[i].empty() && i + 1 != e && Decoration.startswith(Content[i]))
      continue;
    while (!Content[i].startswith(Decoration))
      Decoration = Decoration.substr(0, Decoration.size() - 1);
  }

  LastLineNeedsDecoration = true;
  IndentAtLineBreak = ContentColumn[0] + 1;
  for (size_t i = 1, e = Lines.size(); i < e; ++i) {
    if (Content[i].empty()) {
      if (i + 1 == e) {
        // Empty last line means that we already have a star as a part of the
        // trailing */. We also need to preserve whitespace, so that */ is
        // correctly indented.
        LastLineNeedsDecoration = false;
        // Align the star in the last '*/' with the stars on the previous lines.
        if (e >= 2 && !Decoration.empty()) {
          ContentColumn[i] = DecorationColumn;
        }
      } else if (Decoration.empty()) {
        // For all other lines, set the start column to 0 if they're empty, so
        // we do not insert trailing whitespace anywhere.
        ContentColumn[i] = 0;
      }
      continue;
    }

    // The first line already excludes the star.
    // The last line excludes the star if LastLineNeedsDecoration is false.
    // For all other lines, adjust the line to exclude the star and
    // (optionally) the first whitespace.
    unsigned DecorationSize = Decoration.startswith(Content[i])
                                  ? Content[i].size()
                                  : Decoration.size();
    if (DecorationSize) {
      ContentColumn[i] = DecorationColumn + DecorationSize;
    }
    Content[i] = Content[i].substr(DecorationSize);
    // Lines with text beyond the decoration lower the break indent to the
    // smallest content column seen.
    if (!Decoration.startswith(Content[i]))
      IndentAtLineBreak =
          std::min<int>(IndentAtLineBreak, std::max(0, ContentColumn[i]));
  }
  // The break indent is never smaller than the size of the decoration.
  IndentAtLineBreak = std::max<unsigned>(IndentAtLineBreak, Decoration.size());

  // Detect a multiline jsdoc comment and set DelimitersOnNewline in that case.
  if (Style.isJavaScript() || Style.Language == FormatStyle::LK_Java) {
    if ((Lines[0] == "*" || Lines[0].startswith("* ")) && Lines.size() > 1) {
      // This is a multiline jsdoc comment.
      DelimitersOnNewline = true;
    } else if (Lines[0].startswith("* ") && Lines.size() == 1) {
      // Detect a long single-line comment, like:
      // /** long long long */
      // Below, '2' is the width of '*/'.
      unsigned EndColumn =
          ContentColumn[0] +
          encoding::columnWidthWithTabs(Lines[0], ContentColumn[0],
                                        Style.TabWidth, Encoding) +
          2;
      DelimitersOnNewline = EndColumn > Style.ColumnLimit;
    }
  }

  LLVM_DEBUG({
    llvm::dbgs() << "IndentAtLineBreak " << IndentAtLineBreak << "\n";
    llvm::dbgs() << "DelimitersOnNewline " << DelimitersOnNewline << "\n";
    for (size_t i = 0; i < Lines.size(); ++i) {
      llvm::dbgs() << i << " |" << Content[i] << "| "
                   << "CC=" << ContentColumn[i] << "| "
                   << "IN=" << (Content[i].data() - Lines[i].data()) << "\n";
    }
  });
}
489 
490 BreakableToken::Split BreakableBlockComment::getSplit(
491     unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
492     unsigned ContentStartColumn, const llvm::Regex &CommentPragmasRegex) const {
493   // Don't break lines matching the comment pragmas regex.
494   if (CommentPragmasRegex.match(Content[LineIndex]))
495     return Split(StringRef::npos, 0);
496   return getCommentSplit(Content[LineIndex].substr(TailOffset),
497                          ContentStartColumn, ColumnLimit, Style.TabWidth,
498                          Encoding, Style, Decoration.endswith("*"));
499 }
500 
/// Computes Content and ContentColumn for line \p LineIndex and finalizes the
/// Content of the preceding line.
///
/// Trailing blanks (and, inside a preprocessor directive, a trailing
/// backslash) are cut off Content[LineIndex - 1]; leading blanks are cut off
/// Content[LineIndex]. ContentColumn[LineIndex] is set to the column width of
/// the removed leading whitespace plus \p IndentDelta.
///
/// \p LineIndex must be at least 1, since the previous line is accessed.
void BreakableBlockComment::adjustWhitespace(unsigned LineIndex,
                                             int IndentDelta) {
  // When in a preprocessor directive, the trailing backslash in a block comment
  // is not needed, but can serve a purpose of uniformity with necessary escaped
  // newlines outside the comment. In this case we remove it here before
  // trimming the trailing whitespace. The backslash will be re-added later when
  // inserting a line break.
  size_t EndOfPreviousLine = Lines[LineIndex - 1].size();
  if (InPPDirective && Lines[LineIndex - 1].endswith("\\"))
    --EndOfPreviousLine;

  // Calculate the end of the non-whitespace text in the previous line.
  EndOfPreviousLine =
      Lines[LineIndex - 1].find_last_not_of(Blanks, EndOfPreviousLine);
  if (EndOfPreviousLine == StringRef::npos)
    EndOfPreviousLine = 0;
  else
    ++EndOfPreviousLine;
  // Calculate the start of the non-whitespace text in the current line.
  size_t StartOfLine = Lines[LineIndex].find_first_not_of(Blanks);
  if (StartOfLine == StringRef::npos)
    StartOfLine = Lines[LineIndex].size();

  StringRef Whitespace = Lines[LineIndex].substr(0, StartOfLine);
  // Adjust Content to only contain relevant text. The previous line keeps its
  // current start offset within Lines and only loses its trailing part.
  size_t PreviousContentOffset =
      Content[LineIndex - 1].data() - Lines[LineIndex - 1].data();
  Content[LineIndex - 1] = Lines[LineIndex - 1].substr(
      PreviousContentOffset, EndOfPreviousLine - PreviousContentOffset);
  Content[LineIndex] = Lines[LineIndex].substr(StartOfLine);

  // Adjust the start column uniformly across all lines.
  ContentColumn[LineIndex] =
      encoding::columnWidthWithTabs(Whitespace, 0, Style.TabWidth, Encoding) +
      IndentDelta;
}
537 
538 unsigned BreakableBlockComment::getRangeLength(unsigned LineIndex,
539                                                unsigned Offset,
540                                                StringRef::size_type Length,
541                                                unsigned StartColumn) const {
542   return encoding::columnWidthWithTabs(
543       Content[LineIndex].substr(Offset, Length), StartColumn, Style.TabWidth,
544       Encoding);
545 }
546 
547 unsigned BreakableBlockComment::getRemainingLength(unsigned LineIndex,
548                                                    unsigned Offset,
549                                                    unsigned StartColumn) const {
550   unsigned LineLength =
551       UnbreakableTailLength +
552       getRangeLength(LineIndex, Offset, StringRef::npos, StartColumn);
553   if (LineIndex + 1 == Lines.size()) {
554     LineLength += 2;
555     // We never need a decoration when breaking just the trailing "*/" postfix.
556     bool HasRemainingText = Offset < Content[LineIndex].size();
557     if (!HasRemainingText) {
558       bool HasDecoration = Lines[LineIndex].ltrim().startswith(Decoration);
559       if (HasDecoration)
560         LineLength -= Decoration.size();
561     }
562   }
563   return LineLength;
564 }
565 
566 unsigned BreakableBlockComment::getContentStartColumn(unsigned LineIndex,
567                                                       bool Break) const {
568   if (Break)
569     return IndentAtLineBreak;
570   return std::max(0, ContentColumn[LineIndex]);
571 }
572 
// Annotations that getContentIndent() looks up: when the first word of a
// comment line is one of these, that function returns
// Style.ContinuationIndentWidth as the content indent.
const llvm::StringSet<>
    BreakableBlockComment::ContentIndentingJavadocAnnotations = {
        "@param", "@return",     "@returns", "@throws",  "@type", "@template",
        "@see",   "@deprecated", "@define",  "@exports", "@mods", "@private",
};
578 
579 unsigned BreakableBlockComment::getContentIndent(unsigned LineIndex) const {
580   if (Style.Language != FormatStyle::LK_Java && !Style.isJavaScript())
581     return 0;
582   // The content at LineIndex 0 of a comment like:
583   // /** line 0 */
584   // is "* line 0", so we need to skip over the decoration in that case.
585   StringRef ContentWithNoDecoration = Content[LineIndex];
586   if (LineIndex == 0 && ContentWithNoDecoration.startswith("*")) {
587     ContentWithNoDecoration = ContentWithNoDecoration.substr(1).ltrim(Blanks);
588   }
589   StringRef FirstWord = ContentWithNoDecoration.substr(
590       0, ContentWithNoDecoration.find_first_of(Blanks));
591   if (ContentIndentingJavadocAnnotations.find(FirstWord) !=
592       ContentIndentingJavadocAnnotations.end())
593     return Style.ContinuationIndentWidth;
594   return 0;
595 }
596 
597 void BreakableBlockComment::insertBreak(unsigned LineIndex, unsigned TailOffset,
598                                         Split Split, unsigned ContentIndent,
599                                         WhitespaceManager &Whitespaces) const {
600   StringRef Text = Content[LineIndex].substr(TailOffset);
601   StringRef Prefix = Decoration;
602   // We need this to account for the case when we have a decoration "* " for all
603   // the lines except for the last one, where the star in "*/" acts as a
604   // decoration.
605   unsigned LocalIndentAtLineBreak = IndentAtLineBreak;
606   if (LineIndex + 1 == Lines.size() &&
607       Text.size() == Split.first + Split.second) {
608     // For the last line we need to break before "*/", but not to add "* ".
609     Prefix = "";
610     if (LocalIndentAtLineBreak >= 2)
611       LocalIndentAtLineBreak -= 2;
612   }
613   // The split offset is from the beginning of the line. Convert it to an offset
614   // from the beginning of the token text.
615   unsigned BreakOffsetInToken =
616       Text.data() - tokenAt(LineIndex).TokenText.data() + Split.first;
617   unsigned CharsToRemove = Split.second;
618   assert(LocalIndentAtLineBreak >= Prefix.size());
619   std::string PrefixWithTrailingIndent = std::string(Prefix);
620   PrefixWithTrailingIndent.append(ContentIndent, ' ');
621   Whitespaces.replaceWhitespaceInToken(
622       tokenAt(LineIndex), BreakOffsetInToken, CharsToRemove, "",
623       PrefixWithTrailingIndent, InPPDirective, /*Newlines=*/1,
624       /*Spaces=*/LocalIndentAtLineBreak + ContentIndent -
625           PrefixWithTrailingIndent.size());
626 }
627 
628 BreakableToken::Split BreakableBlockComment::getReflowSplit(
629     unsigned LineIndex, const llvm::Regex &CommentPragmasRegex) const {
630   if (!mayReflow(LineIndex, CommentPragmasRegex))
631     return Split(StringRef::npos, 0);
632 
633   // If we're reflowing into a line with content indent, only reflow the next
634   // line if its starting whitespace matches the content indent.
635   size_t Trimmed = Content[LineIndex].find_first_not_of(Blanks);
636   if (LineIndex) {
637     unsigned PreviousContentIndent = getContentIndent(LineIndex - 1);
638     if (PreviousContentIndent && Trimmed != StringRef::npos &&
639         Trimmed != PreviousContentIndent)
640       return Split(StringRef::npos, 0);
641   }
642 
643   return Split(0, Trimmed != StringRef::npos ? Trimmed : 0);
644 }
645 
646 bool BreakableBlockComment::introducesBreakBeforeToken() const {
647   // A break is introduced when we want delimiters on newline.
648   return DelimitersOnNewline &&
649          Lines[0].substr(1).find_first_not_of(Blanks) != StringRef::npos;
650 }
651 
652 void BreakableBlockComment::reflow(unsigned LineIndex,
653                                    WhitespaceManager &Whitespaces) const {
654   StringRef TrimmedContent = Content[LineIndex].ltrim(Blanks);
655   // Here we need to reflow.
656   assert(Tokens[LineIndex - 1] == Tokens[LineIndex] &&
657          "Reflowing whitespace within a token");
658   // This is the offset of the end of the last line relative to the start of
659   // the token text in the token.
660   unsigned WhitespaceOffsetInToken = Content[LineIndex - 1].data() +
661                                      Content[LineIndex - 1].size() -
662                                      tokenAt(LineIndex).TokenText.data();
663   unsigned WhitespaceLength = TrimmedContent.data() -
664                               tokenAt(LineIndex).TokenText.data() -
665                               WhitespaceOffsetInToken;
666   Whitespaces.replaceWhitespaceInToken(
667       tokenAt(LineIndex), WhitespaceOffsetInToken,
668       /*ReplaceChars=*/WhitespaceLength, /*PreviousPostfix=*/"",
669       /*CurrentPrefix=*/ReflowPrefix, InPPDirective, /*Newlines=*/0,
670       /*Spaces=*/0);
671 }
672 
673 void BreakableBlockComment::adaptStartOfLine(
674     unsigned LineIndex, WhitespaceManager &Whitespaces) const {
675   if (LineIndex == 0) {
676     if (DelimitersOnNewline) {
677       // Since we're breaking at index 1 below, the break position and the
678       // break length are the same.
679       // Note: this works because getCommentSplit is careful never to split at
680       // the beginning of a line.
681       size_t BreakLength = Lines[0].substr(1).find_first_not_of(Blanks);
682       if (BreakLength != StringRef::npos)
683         insertBreak(LineIndex, 0, Split(1, BreakLength), /*ContentIndent=*/0,
684                     Whitespaces);
685     }
686     return;
687   }
688   // Here no reflow with the previous line will happen.
689   // Fix the decoration of the line at LineIndex.
690   StringRef Prefix = Decoration;
691   if (Content[LineIndex].empty()) {
692     if (LineIndex + 1 == Lines.size()) {
693       if (!LastLineNeedsDecoration) {
694         // If the last line was empty, we don't need a prefix, as the */ will
695         // line up with the decoration (if it exists).
696         Prefix = "";
697       }
698     } else if (!Decoration.empty()) {
699       // For other empty lines, if we do have a decoration, adapt it to not
700       // contain a trailing whitespace.
701       Prefix = Prefix.substr(0, 1);
702     }
703   } else {
704     if (ContentColumn[LineIndex] == 1) {
705       // This line starts immediately after the decorating *.
706       Prefix = Prefix.substr(0, 1);
707     }
708   }
709   // This is the offset of the end of the last line relative to the start of the
710   // token text in the token.
711   unsigned WhitespaceOffsetInToken = Content[LineIndex - 1].data() +
712                                      Content[LineIndex - 1].size() -
713                                      tokenAt(LineIndex).TokenText.data();
714   unsigned WhitespaceLength = Content[LineIndex].data() -
715                               tokenAt(LineIndex).TokenText.data() -
716                               WhitespaceOffsetInToken;
717   Whitespaces.replaceWhitespaceInToken(
718       tokenAt(LineIndex), WhitespaceOffsetInToken, WhitespaceLength, "", Prefix,
719       InPPDirective, /*Newlines=*/1, ContentColumn[LineIndex] - Prefix.size());
720 }
721 
722 BreakableToken::Split
723 BreakableBlockComment::getSplitAfterLastLine(unsigned TailOffset) const {
724   if (DelimitersOnNewline) {
725     // Replace the trailing whitespace of the last line with a newline.
726     // In case the last line is empty, the ending '*/' is already on its own
727     // line.
728     StringRef Line = Content.back().substr(TailOffset);
729     StringRef TrimmedLine = Line.rtrim(Blanks);
730     if (!TrimmedLine.empty())
731       return Split(TrimmedLine.size(), Line.size() - TrimmedLine.size());
732   }
733   return Split(StringRef::npos, 0);
734 }
735 
736 bool BreakableBlockComment::mayReflow(
737     unsigned LineIndex, const llvm::Regex &CommentPragmasRegex) const {
738   // Content[LineIndex] may exclude the indent after the '*' decoration. In that
739   // case, we compute the start of the comment pragma manually.
740   StringRef IndentContent = Content[LineIndex];
741   if (Lines[LineIndex].ltrim(Blanks).startswith("*")) {
742     IndentContent = Lines[LineIndex].ltrim(Blanks).substr(1);
743   }
744   return LineIndex > 0 && !CommentPragmasRegex.match(IndentContent) &&
745          mayReflowContent(Content[LineIndex]) && !Tok.Finalized &&
746          !switchesFormatting(tokenAt(LineIndex));
747 }
748 
749 BreakableLineCommentSection::BreakableLineCommentSection(
750     const FormatToken &Token, unsigned StartColumn, bool InPPDirective,
751     encoding::Encoding Encoding, const FormatStyle &Style)
752     : BreakableComment(Token, StartColumn, InPPDirective, Encoding, Style) {
753   assert(Tok.is(TT_LineComment) &&
754          "line comment section must start with a line comment");
755   FormatToken *LineTok = nullptr;
756   const int Minimum = Style.SpacesInLineCommentPrefix.Minimum;
757   // How many spaces we changed in the first line of the section, this will be
758   // applied in all following lines
759   int FirstLineSpaceChange = 0;
760   for (const FormatToken *CurrentTok = &Tok;
761        CurrentTok && CurrentTok->is(TT_LineComment);
762        CurrentTok = CurrentTok->Next) {
763     LastLineTok = LineTok;
764     StringRef TokenText(CurrentTok->TokenText);
765     assert((TokenText.startswith("//") || TokenText.startswith("#")) &&
766            "unsupported line comment prefix, '//' and '#' are supported");
767     size_t FirstLineIndex = Lines.size();
768     TokenText.split(Lines, "\n");
769     Content.resize(Lines.size());
770     ContentColumn.resize(Lines.size());
771     PrefixSpaceChange.resize(Lines.size());
772     Tokens.resize(Lines.size());
773     Prefix.resize(Lines.size());
774     OriginalPrefix.resize(Lines.size());
775     for (size_t i = FirstLineIndex, e = Lines.size(); i < e; ++i) {
776       Lines[i] = Lines[i].ltrim(Blanks);
777       StringRef IndentPrefix = getLineCommentIndentPrefix(Lines[i], Style);
778       OriginalPrefix[i] = IndentPrefix;
779       const int SpacesInPrefix = llvm::count(IndentPrefix, ' ');
780 
781       // On the first line of the comment section we calculate how many spaces
782       // are to be added or removed, all lines after that just get only the
783       // change and we will not look at the maximum anymore. Additionally to the
784       // actual first line, we calculate that when the non space Prefix changes,
785       // e.g. from "///" to "//".
786       if (i == 0 || OriginalPrefix[i].rtrim(Blanks) !=
787                         OriginalPrefix[i - 1].rtrim(Blanks)) {
788         if (SpacesInPrefix < Minimum && Lines[i].size() > IndentPrefix.size() &&
789             isAlphanumeric(Lines[i][IndentPrefix.size()])) {
790           FirstLineSpaceChange = Minimum - SpacesInPrefix;
791         } else if (static_cast<unsigned>(SpacesInPrefix) >
792                    Style.SpacesInLineCommentPrefix.Maximum) {
793           FirstLineSpaceChange =
794               Style.SpacesInLineCommentPrefix.Maximum - SpacesInPrefix;
795         } else {
796           FirstLineSpaceChange = 0;
797         }
798       }
799 
800       if (Lines[i].size() != IndentPrefix.size()) {
801         PrefixSpaceChange[i] = FirstLineSpaceChange;
802 
803         if (SpacesInPrefix + PrefixSpaceChange[i] < Minimum) {
804           PrefixSpaceChange[i] +=
805               Minimum - (SpacesInPrefix + PrefixSpaceChange[i]);
806         }
807 
808         assert(Lines[i].size() > IndentPrefix.size());
809         const auto FirstNonSpace = Lines[i][IndentPrefix.size()];
810         const auto AllowsSpaceChange =
811             SpacesInPrefix != 0 ||
812             (isAlphanumeric(FirstNonSpace) ||
813              (FirstNonSpace == '}' && FirstLineSpaceChange != 0));
814 
815         if (PrefixSpaceChange[i] > 0 && AllowsSpaceChange) {
816           Prefix[i] = IndentPrefix.str();
817           Prefix[i].append(PrefixSpaceChange[i], ' ');
818         } else if (PrefixSpaceChange[i] < 0 && AllowsSpaceChange) {
819           Prefix[i] = IndentPrefix
820                           .drop_back(std::min<std::size_t>(
821                               -PrefixSpaceChange[i], SpacesInPrefix))
822                           .str();
823         } else {
824           Prefix[i] = IndentPrefix.str();
825         }
826       } else {
827         // If the IndentPrefix is the whole line, there is no content and we
828         // drop just all space
829         Prefix[i] = IndentPrefix.drop_back(SpacesInPrefix).str();
830       }
831 
832       Tokens[i] = LineTok;
833       Content[i] = Lines[i].substr(IndentPrefix.size());
834       ContentColumn[i] =
835           StartColumn + encoding::columnWidthWithTabs(Prefix[i], StartColumn,
836                                                       Style.TabWidth, Encoding);
837 
838       // Calculate the end of the non-whitespace text in this line.
839       size_t EndOfLine = Content[i].find_last_not_of(Blanks);
840       if (EndOfLine == StringRef::npos)
841         EndOfLine = Content[i].size();
842       else
843         ++EndOfLine;
844       Content[i] = Content[i].substr(0, EndOfLine);
845     }
846     LineTok = CurrentTok->Next;
847     if (CurrentTok->Next && !CurrentTok->Next->ContinuesLineCommentSection) {
848       // A line comment section needs to broken by a line comment that is
849       // preceded by at least two newlines. Note that we put this break here
850       // instead of breaking at a previous stage during parsing, since that
851       // would split the contents of the enum into two unwrapped lines in this
852       // example, which is undesirable:
853       // enum A {
854       //   a, // comment about a
855       //
856       //   // comment about b
857       //   b
858       // };
859       //
860       // FIXME: Consider putting separate line comment sections as children to
861       // the unwrapped line instead.
862       break;
863     }
864   }
865 }
866 
867 unsigned
868 BreakableLineCommentSection::getRangeLength(unsigned LineIndex, unsigned Offset,
869                                             StringRef::size_type Length,
870                                             unsigned StartColumn) const {
871   return encoding::columnWidthWithTabs(
872       Content[LineIndex].substr(Offset, Length), StartColumn, Style.TabWidth,
873       Encoding);
874 }
875 
876 unsigned
877 BreakableLineCommentSection::getContentStartColumn(unsigned LineIndex,
878                                                    bool /*Break*/) const {
879   return ContentColumn[LineIndex];
880 }
881 
882 void BreakableLineCommentSection::insertBreak(
883     unsigned LineIndex, unsigned TailOffset, Split Split,
884     unsigned ContentIndent, WhitespaceManager &Whitespaces) const {
885   StringRef Text = Content[LineIndex].substr(TailOffset);
886   // Compute the offset of the split relative to the beginning of the token
887   // text.
888   unsigned BreakOffsetInToken =
889       Text.data() - tokenAt(LineIndex).TokenText.data() + Split.first;
890   unsigned CharsToRemove = Split.second;
891   Whitespaces.replaceWhitespaceInToken(
892       tokenAt(LineIndex), BreakOffsetInToken, CharsToRemove, "",
893       Prefix[LineIndex], InPPDirective, /*Newlines=*/1,
894       /*Spaces=*/ContentColumn[LineIndex] - Prefix[LineIndex].size());
895 }
896 
897 BreakableComment::Split BreakableLineCommentSection::getReflowSplit(
898     unsigned LineIndex, const llvm::Regex &CommentPragmasRegex) const {
899   if (!mayReflow(LineIndex, CommentPragmasRegex))
900     return Split(StringRef::npos, 0);
901 
902   size_t Trimmed = Content[LineIndex].find_first_not_of(Blanks);
903 
904   // In a line comment section each line is a separate token; thus, after a
905   // split we replace all whitespace before the current line comment token
906   // (which does not need to be included in the split), plus the start of the
907   // line up to where the content starts.
908   return Split(0, Trimmed != StringRef::npos ? Trimmed : 0);
909 }
910 
911 void BreakableLineCommentSection::reflow(unsigned LineIndex,
912                                          WhitespaceManager &Whitespaces) const {
913   if (LineIndex > 0 && Tokens[LineIndex] != Tokens[LineIndex - 1]) {
914     // Reflow happens between tokens. Replace the whitespace between the
915     // tokens by the empty string.
916     Whitespaces.replaceWhitespace(
917         *Tokens[LineIndex], /*Newlines=*/0, /*Spaces=*/0,
918         /*StartOfTokenColumn=*/StartColumn, /*IsAligned=*/true,
919         /*InPPDirective=*/false);
920   } else if (LineIndex > 0) {
921     // In case we're reflowing after the '\' in:
922     //
923     //   // line comment \
924     //   // line 2
925     //
926     // the reflow happens inside the single comment token (it is a single line
927     // comment with an unescaped newline).
928     // Replace the whitespace between the '\' and '//' with the empty string.
929     //
930     // Offset points to after the '\' relative to start of the token.
931     unsigned Offset = Lines[LineIndex - 1].data() +
932                       Lines[LineIndex - 1].size() -
933                       tokenAt(LineIndex - 1).TokenText.data();
934     // WhitespaceLength is the number of chars between the '\' and the '//' on
935     // the next line.
936     unsigned WhitespaceLength =
937         Lines[LineIndex].data() - tokenAt(LineIndex).TokenText.data() - Offset;
938     Whitespaces.replaceWhitespaceInToken(*Tokens[LineIndex], Offset,
939                                          /*ReplaceChars=*/WhitespaceLength,
940                                          /*PreviousPostfix=*/"",
941                                          /*CurrentPrefix=*/"",
942                                          /*InPPDirective=*/false,
943                                          /*Newlines=*/0,
944                                          /*Spaces=*/0);
945   }
946   // Replace the indent and prefix of the token with the reflow prefix.
947   unsigned Offset =
948       Lines[LineIndex].data() - tokenAt(LineIndex).TokenText.data();
949   unsigned WhitespaceLength =
950       Content[LineIndex].data() - Lines[LineIndex].data();
951   Whitespaces.replaceWhitespaceInToken(*Tokens[LineIndex], Offset,
952                                        /*ReplaceChars=*/WhitespaceLength,
953                                        /*PreviousPostfix=*/"",
954                                        /*CurrentPrefix=*/ReflowPrefix,
955                                        /*InPPDirective=*/false,
956                                        /*Newlines=*/0,
957                                        /*Spaces=*/0);
958 }
959 
960 void BreakableLineCommentSection::adaptStartOfLine(
961     unsigned LineIndex, WhitespaceManager &Whitespaces) const {
962   // If this is the first line of a token, we need to inform Whitespace Manager
963   // about it: either adapt the whitespace range preceding it, or mark it as an
964   // untouchable token.
965   // This happens for instance here:
966   // // line 1 \
967   // // line 2
968   if (LineIndex > 0 && Tokens[LineIndex] != Tokens[LineIndex - 1]) {
969     // This is the first line for the current token, but no reflow with the
970     // previous token is necessary. However, we still may need to adjust the
971     // start column. Note that ContentColumn[LineIndex] is the expected
972     // content column after a possible update to the prefix, hence the prefix
973     // length change is included.
974     unsigned LineColumn =
975         ContentColumn[LineIndex] -
976         (Content[LineIndex].data() - Lines[LineIndex].data()) +
977         (OriginalPrefix[LineIndex].size() - Prefix[LineIndex].size());
978 
979     // We always want to create a replacement instead of adding an untouchable
980     // token, even if LineColumn is the same as the original column of the
981     // token. This is because WhitespaceManager doesn't align trailing
982     // comments if they are untouchable.
983     Whitespaces.replaceWhitespace(*Tokens[LineIndex],
984                                   /*Newlines=*/1,
985                                   /*Spaces=*/LineColumn,
986                                   /*StartOfTokenColumn=*/LineColumn,
987                                   /*IsAligned=*/true,
988                                   /*InPPDirective=*/false);
989   }
990   if (OriginalPrefix[LineIndex] != Prefix[LineIndex]) {
991     // Adjust the prefix if necessary.
992     const auto SpacesToRemove = -std::min(PrefixSpaceChange[LineIndex], 0);
993     const auto SpacesToAdd = std::max(PrefixSpaceChange[LineIndex], 0);
994     Whitespaces.replaceWhitespaceInToken(
995         tokenAt(LineIndex), OriginalPrefix[LineIndex].size() - SpacesToRemove,
996         /*ReplaceChars=*/SpacesToRemove, "", "", /*InPPDirective=*/false,
997         /*Newlines=*/0, /*Spaces=*/SpacesToAdd);
998   }
999 }
1000 
1001 void BreakableLineCommentSection::updateNextToken(LineState &State) const {
1002   if (LastLineTok) {
1003     State.NextToken = LastLineTok->Next;
1004   }
1005 }
1006 
1007 bool BreakableLineCommentSection::mayReflow(
1008     unsigned LineIndex, const llvm::Regex &CommentPragmasRegex) const {
1009   // Line comments have the indent as part of the prefix, so we need to
1010   // recompute the start of the line.
1011   StringRef IndentContent = Content[LineIndex];
1012   if (Lines[LineIndex].startswith("//")) {
1013     IndentContent = Lines[LineIndex].substr(2);
1014   }
1015   // FIXME: Decide whether we want to reflow non-regular indents:
1016   // Currently, we only reflow when the OriginalPrefix[LineIndex] matches the
1017   // OriginalPrefix[LineIndex-1]. That means we don't reflow
1018   // // text that protrudes
1019   // //    into text with different indent
1020   // We do reflow in that case in block comments.
1021   return LineIndex > 0 && !CommentPragmasRegex.match(IndentContent) &&
1022          mayReflowContent(Content[LineIndex]) && !Tok.Finalized &&
1023          !switchesFormatting(tokenAt(LineIndex)) &&
1024          OriginalPrefix[LineIndex] == OriginalPrefix[LineIndex - 1];
1025 }
1026 
1027 } // namespace format
1028 } // namespace clang
1029