xref: /freebsd/contrib/llvm-project/clang/lib/Format/BreakableToken.h (revision 35c0a8c449fd2b7f75029ebed5e10852240f0865)
1 //===--- BreakableToken.h - Format C++ code ---------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// Declares BreakableToken, BreakableStringLiteral, BreakableComment,
11 /// BreakableBlockComment and BreakableLineCommentSection classes, that contain
12 /// token type-specific logic to break long lines in tokens and reflow content
13 /// between tokens.
14 ///
15 //===----------------------------------------------------------------------===//
16 
17 #ifndef LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
18 #define LLVM_CLANG_LIB_FORMAT_BREAKABLETOKEN_H
19 
20 #include "Encoding.h"
21 #include "WhitespaceManager.h"
22 #include "llvm/ADT/StringSet.h"
23 
24 namespace clang {
25 namespace format {
26 
27 /// Checks if \p Token switches formatting, like /* clang-format off */.
28 /// \p Token must be a comment.
29 bool switchesFormatting(const FormatToken &Token);
30 
31 struct FormatStyle; // Forward declaration; this header only uses it by reference.
32 
33 /// Base class for tokens / ranges of tokens that can allow breaking
34 /// within the tokens - for example, to avoid whitespace beyond the column
35 /// limit, or to reflow text.
36 ///
37 /// Generally, a breakable token consists of logical lines, addressed by a line
38 /// index. For example, in a sequence of line comments, each line comment is its
39 /// own logical line; similarly, for a block comment, each line in the block
40 /// comment is on its own logical line.
41 ///
42 /// There are two methods to compute the layout of the token:
43 /// - getRangeLength measures the number of columns needed for a range of text
44 ///   within a logical line, and
45 /// - getContentStartColumn returns the start column at which we want the
46 ///   content of a logical line to start (potentially after introducing a line
47 ///   break).
48 ///
49 /// The mechanism to adapt the layout of the breakable token is organised
50 /// around the concept of a \c Split, which is a whitespace range that signifies
51 /// a position of the content of a token where a reformatting might be done.
52 ///
53 /// Operating with splits is divided into two operations:
54 /// - getSplit, for finding a split starting at a position,
55 /// - insertBreak, for executing the split using a whitespace manager.
56 ///
57 /// There is a pair of operations that are used to compress a long whitespace
58 /// range with a single space if that will bring the line length under the
59 /// column limit:
60 /// - getLengthAfterCompression, for calculating the size in columns of the
61 ///   line after a whitespace range has been compressed, and
62 /// - compressWhitespace, for executing the whitespace compression using a
63 ///   whitespace manager; note that the compressed whitespace may be in the
64 ///   middle of the original line and of the reformatted line.
65 ///
66 /// For tokens where the whitespace before each line needs to be also
67 /// reformatted, for example for tokens supporting reflow, there are analogous
68 /// operations that might be executed before the main line breaking occurs:
69 /// - getReflowSplit, for finding a split such that the content preceding it
70 ///   needs to be specially reflown,
71 /// - reflow, for executing the split using a whitespace manager,
72 /// - introducesBreakBeforeToken, for checking if reformatting the beginning
73 ///   of the content introduces a line break before it,
74 /// - adaptStartOfLine, for executing the reflow using a whitespace
75 ///   manager.
76 ///
77 /// For tokens that require the whitespace after the last line to be
78 /// reformatted, for example in multiline jsdoc comments that require the
79 /// trailing '*/' to be on a line of itself, there are analogous operations
80 /// that might be executed after the last line has been reformatted:
81 /// - getSplitAfterLastLine, for finding a split after the last line that needs
82 ///   to be reflown,
83 /// - replaceWhitespaceAfterLastLine, for executing the reflow using a
84 ///   whitespace manager.
85 ///
86 class BreakableToken {
87 public:
88   /// Contains starting character index and length of split.
89   typedef std::pair<StringRef::size_type, unsigned> Split;
90 
91   virtual ~BreakableToken() {}
92 
93   /// Returns the number of lines in this token in the original code.
94   virtual unsigned getLineCount() const = 0;
95 
96   /// Returns the number of columns required to format the text in the
97   /// byte range [\p Offset, \p Offset \c + \p Length).
98   ///
99   /// \p Offset is the byte offset from the start of the content of the line
100   ///    at \p LineIndex.
101   ///
102   /// \p StartColumn is the column at which the text starts in the formatted
103   ///    file, needed to compute tab stops correctly.
104   virtual unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
105                                   StringRef::size_type Length,
106                                   unsigned StartColumn) const = 0;
107 
108   /// Returns the number of columns required to format the text following
109   /// the byte \p Offset in the line \p LineIndex, including potentially
110   /// unbreakable sequences of tokens following after the end of the token.
111   ///
112   /// \p Offset is the byte offset from the start of the content of the line
113   ///    at \p LineIndex.
114   ///
115   /// \p StartColumn is the column at which the text starts in the formatted
116   ///    file, needed to compute tab stops correctly.
117   ///
118   /// For breakable tokens that never use extra space at the end of a line, this
119   /// is equivalent to getRangeLength with a Length of StringRef::npos.
120   virtual unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
121                                       unsigned StartColumn) const {
122     return getRangeLength(LineIndex, Offset, StringRef::npos, StartColumn);
123   }
124 
125   /// Returns the column at which content in line \p LineIndex starts,
126   /// assuming no reflow.
127   ///
128   /// If \p Break is true, returns the column at which the line should start
129   /// after the line break.
130   /// If \p Break is false, returns the column at which the line itself will
131   /// start.
132   virtual unsigned getContentStartColumn(unsigned LineIndex,
133                                          bool Break) const = 0;
134 
135   /// Returns additional content indent required for the second line after the
136   /// content at line \p LineIndex is broken.
137   ///
138   // (Next lines do not start with `///` since otherwise -Wdocumentation picks
139   // up the example annotations and generates warnings for them)
140   // For example, Javadoc @param annotations require an indent of 4 spaces and
141   // in this example getContentIndent(1) returns 4.
142   // /**
143   //  * @param loooooooooooooong line
144   //  *     continuation
145   //  */
146   virtual unsigned getContentIndent(unsigned LineIndex) const { return 0; }
147 
148   /// Returns a range (offset, length) at which to break the line at
149   /// \p LineIndex, if previously broken at \p TailOffset. If possible, do not
150   /// violate \p ColumnLimit, assuming the text starting at \p TailOffset in
151   /// the token is formatted starting at ContentStartColumn in the reformatted
152   /// file.
153   virtual Split getSplit(unsigned LineIndex, unsigned TailOffset,
154                          unsigned ColumnLimit, unsigned ContentStartColumn,
155                          const llvm::Regex &CommentPragmasRegex) const = 0;
156 
157   /// Emits the previously retrieved \p Split via \p Whitespaces.
158   virtual void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
159                            unsigned ContentIndent,
160                            WhitespaceManager &Whitespaces) const = 0;
161 
162   /// Returns the number of columns needed to format
163   /// \p RemainingTokenColumns, assuming that Split is within the range measured
164   /// by \p RemainingTokenColumns, and that the whitespace in Split is reduced
165   /// to a single space.
166   unsigned getLengthAfterCompression(unsigned RemainingTokenColumns,
167                                      Split Split) const;
168 
169   /// Replaces the whitespace range described by \p Split with a single
170   /// space.
171   virtual void compressWhitespace(unsigned LineIndex, unsigned TailOffset,
172                                   Split Split,
173                                   WhitespaceManager &Whitespaces) const = 0;
174 
175   /// Returns whether the token supports reflowing text.
176   virtual bool supportsReflow() const { return false; }
177 
178   /// Returns a whitespace range (offset, length) of the content at \p
179   /// LineIndex such that the content of that line is reflown to the end of the
180   /// previous one.
181   ///
182   /// Returning (StringRef::npos, 0) indicates reflowing is not possible.
183   ///
184   /// The range will include any whitespace preceding the specified line's
185   /// content.
186   ///
187   /// If the split is not contained within one token, for example when reflowing
188   /// line comments, returns (0, <length>).
189   virtual Split getReflowSplit(unsigned LineIndex,
190                                const llvm::Regex &CommentPragmasRegex) const {
191     return Split(StringRef::npos, 0);
192   }
193 
194   /// Reflows the current line into the end of the previous one.
195   virtual void reflow(unsigned LineIndex,
196                       WhitespaceManager &Whitespaces) const {}
197 
198   /// Returns whether there will be a line break at the start of the
199   /// token.
200   virtual bool introducesBreakBeforeToken() const { return false; }
201 
202   /// Replaces the whitespace between \p LineIndex-1 and \p LineIndex.
203   virtual void adaptStartOfLine(unsigned LineIndex,
204                                 WhitespaceManager &Whitespaces) const {}
205 
206   /// Returns a whitespace range (offset, length) of the content at
207   /// the last line that needs to be reformatted after the last line has been
208   /// reformatted.
209   ///
210   /// A result having offset == StringRef::npos means that no reformat is
211   /// necessary.
212   virtual Split getSplitAfterLastLine(unsigned TailOffset) const {
213     return Split(StringRef::npos, 0);
214   }
215 
216   /// Replaces the whitespace from \p SplitAfterLastLine on the last line
217   /// after the last line has been formatted by performing a reformatting.
218   void replaceWhitespaceAfterLastLine(unsigned TailOffset,
219                                       Split SplitAfterLastLine,
220                                       WhitespaceManager &Whitespaces) const {
221     insertBreak(getLineCount() - 1, TailOffset, SplitAfterLastLine,
222                 /*ContentIndent=*/0, Whitespaces);
223   }
224 
225   /// Updates the next token of \p State to the next token after this
226   /// one. This can be used when this token manages a set of underlying tokens
227   /// as a unit and is responsible for the formatting of them.
228   virtual void updateNextToken(LineState &State) const {}
229 
230   /// Adds replacements that are needed when the token is broken. Such as
231   /// wrapping a JavaScript string in parentheses after it gets broken with plus
232   /// signs.
233   virtual void updateAfterBroken(WhitespaceManager &Whitespaces) const {}
234 
235 protected:
236   BreakableToken(const FormatToken &Tok, bool InPPDirective,
237                  encoding::Encoding Encoding, const FormatStyle &Style)
238       : Tok(Tok), InPPDirective(InPPDirective), Encoding(Encoding),
239         Style(Style) {}
240 
241   const FormatToken &Tok;
242   const bool InPPDirective;
243   const encoding::Encoding Encoding;
244   const FormatStyle &Style;
245 };
246 
247 class BreakableStringLiteral : public BreakableToken {
248 public:
249   /// Creates a breakable token for a single line string literal.
250   ///
251   /// \p StartColumn specifies the column in which the token will start
252   /// after formatting.
253   BreakableStringLiteral(const FormatToken &Tok, unsigned StartColumn,
254                          StringRef Prefix, StringRef Postfix,
255                          unsigned UnbreakableTailLength, bool InPPDirective,
256                          encoding::Encoding Encoding, const FormatStyle &Style);
257 
258   Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
259                  unsigned ContentStartColumn,
260                  const llvm::Regex &CommentPragmasRegex) const override;
261   void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
262                    unsigned ContentIndent,
263                    WhitespaceManager &Whitespaces) const override;
264   void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
265                           WhitespaceManager &Whitespaces) const override {} // No-op.
266   unsigned getLineCount() const override;
267   unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
268                           StringRef::size_type Length,
269                           unsigned StartColumn) const override;
270   unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
271                               unsigned StartColumn) const override;
272   unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
273 
274 protected:
275   // The column in which the token starts.
276   unsigned StartColumn;
277   // The prefix a line needs after a break in the token.
278   StringRef Prefix;
279   // The postfix a line needs before introducing a break.
280   StringRef Postfix;
281   // The token text excluding the prefix and postfix.
282   StringRef Line;
283   // Length of the sequence of tokens after this string literal that cannot
284   // contain line breaks.
285   unsigned UnbreakableTailLength;
286 };
287 
288 class BreakableStringLiteralUsingOperators : public BreakableStringLiteral {
289 public:
290   enum QuoteStyleType { // Which quoting the string literal uses.
291     DoubleQuotes,   // The string is quoted with double quotes.
292     SingleQuotes,   // The JavaScript string is quoted with single quotes.
293     AtDoubleQuotes, // The C# verbatim string is quoted with the at sign and
294                     // double quotes.
295   };
296   /// Creates a breakable token for a single line string literal for C#, Java,
297   /// JavaScript, or Verilog.
298   ///
299   /// \p StartColumn specifies the column in which the token will start
300   /// after formatting.
301   BreakableStringLiteralUsingOperators(
302       const FormatToken &Tok, QuoteStyleType QuoteStyle, bool UnindentPlus,
303       unsigned StartColumn, unsigned UnbreakableTailLength, bool InPPDirective,
304       encoding::Encoding Encoding, const FormatStyle &Style);
305   unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
306                               unsigned StartColumn) const override;
307   unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
308   void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
309                    unsigned ContentIndent,
310                    WhitespaceManager &Whitespaces) const override;
311   void updateAfterBroken(WhitespaceManager &Whitespaces) const override;
312 
313 protected:
314   // Whether braces or parentheses should be inserted around the string to form
315   // a concatenation.
316   bool BracesNeeded;
317   QuoteStyleType QuoteStyle;
318   // The braces or parentheses along with the first character which they
319   // replace, either a quote or at sign.
320   StringRef LeftBraceQuote;
321   StringRef RightBraceQuote;
322   // Width added to the left due to the added brace or parenthesis. Does not
323   // apply to the first line.
324   int ContinuationIndent;
325 };
326 
327 class BreakableComment : public BreakableToken {
328 protected:
329   /// Creates a breakable token for a comment.
330   ///
331   /// \p StartColumn specifies the column in which the comment will start after
332   /// formatting.
333   BreakableComment(const FormatToken &Token, unsigned StartColumn,
334                    bool InPPDirective, encoding::Encoding Encoding,
335                    const FormatStyle &Style);
336 
337 public:
338   bool supportsReflow() const override { return true; }
339   unsigned getLineCount() const override;
340   Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
341                  unsigned ContentStartColumn,
342                  const llvm::Regex &CommentPragmasRegex) const override;
343   void compressWhitespace(unsigned LineIndex, unsigned TailOffset, Split Split,
344                           WhitespaceManager &Whitespaces) const override;
345 
346 protected:
347   // Returns the token containing the line at LineIndex.
348   const FormatToken &tokenAt(unsigned LineIndex) const;
349 
350   // Checks if the content of line LineIndex may be reflown with the previous
351   // line.
352   virtual bool mayReflow(unsigned LineIndex,
353                          const llvm::Regex &CommentPragmasRegex) const = 0;
354 
355   // Contains the original text of the lines of the comment.
356   //
357   // In case of block comments, excludes the leading /* in the first line and
358   // trailing */ in the last line. In case of line comments, excludes the
359   // leading // and spaces.
360   SmallVector<StringRef, 16> Lines;
361 
362   // Contains the text of the lines excluding all leading and trailing
363   // whitespace between the lines. Note that the decoration (if present) is also
364   // not considered part of the text.
365   SmallVector<StringRef, 16> Content;
366 
367   // Tokens[i] contains a pointer to the token containing Lines[i] if the
368   // whitespace range before that token is managed by this block.
369   // Otherwise, Tokens[i] is a null pointer.
370   SmallVector<FormatToken *, 16> Tokens;
371 
372   // ContentColumn[i] is the target column at which Content[i] should be.
373   // Note that this excludes a leading "* " or "*" in case of block comments
374   // where all lines have a "*" prefix, or the leading "// " or "//" in case of
375   // line comments.
376   //
377   // In block comments, the first line's target column is always positive. The
378   // remaining lines' target columns are relative to the first line to allow
379   // correct indentation of comments in \c WhitespaceManager. Thus they can be
380   // negative as well (in case the first line needs to be unindented more than
381   // there's actual whitespace in another line).
382   SmallVector<int, 16> ContentColumn;
383 
384   // The intended start column of the first line of text from this section.
385   unsigned StartColumn;
386 
387   // The prefix to use in front of a line that has been reflown up.
388   // For example, when reflowing the second line after the first here:
389   // // comment 1
390   // // comment 2
391   // we expect:
392   // // comment 1 comment 2
393   // and not:
394   // // comment 1comment 2
395   StringRef ReflowPrefix = " ";
396 };
397 
398 class BreakableBlockComment : public BreakableComment {
399 public:
400   BreakableBlockComment(const FormatToken &Token, unsigned StartColumn,
401                         unsigned OriginalStartColumn, bool FirstInLine,
402                         bool InPPDirective, encoding::Encoding Encoding,
403                         const FormatStyle &Style, bool UseCRLF);
404 
405   Split getSplit(unsigned LineIndex, unsigned TailOffset, unsigned ColumnLimit,
406                  unsigned ContentStartColumn,
407                  const llvm::Regex &CommentPragmasRegex) const override;
408   unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
409                           StringRef::size_type Length,
410                           unsigned StartColumn) const override;
411   unsigned getRemainingLength(unsigned LineIndex, unsigned Offset,
412                               unsigned StartColumn) const override;
413   unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
414   unsigned getContentIndent(unsigned LineIndex) const override;
415   void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
416                    unsigned ContentIndent,
417                    WhitespaceManager &Whitespaces) const override;
418   Split getReflowSplit(unsigned LineIndex,
419                        const llvm::Regex &CommentPragmasRegex) const override;
420   void reflow(unsigned LineIndex,
421               WhitespaceManager &Whitespaces) const override;
422   bool introducesBreakBeforeToken() const override;
423   void adaptStartOfLine(unsigned LineIndex,
424                         WhitespaceManager &Whitespaces) const override;
425   Split getSplitAfterLastLine(unsigned TailOffset) const override;
426 
427   bool mayReflow(unsigned LineIndex,
428                  const llvm::Regex &CommentPragmasRegex) const override;
429 
430   // Contains Javadoc annotations that require additional indent when continued
431   // on multiple lines.
432   static const llvm::StringSet<> ContentIndentingJavadocAnnotations;
433 
434 private:
435   // Rearranges the whitespace between Lines[LineIndex-1] and Lines[LineIndex].
436   //
437   // Updates Content[LineIndex-1] and Content[LineIndex] by stripping off
438   // leading and trailing whitespace.
439   //
440   // Sets ContentColumn to the intended column in which the text at
441   // Lines[LineIndex] starts (note that the decoration, if present, is not
442   // considered part of the text).
443   void adjustWhitespace(unsigned LineIndex, int IndentDelta);
444 
445   // The column at which the text of a broken line should start.
446   // Note that an optional decoration would go before that column.
447   // IndentAtLineBreak is a uniform position for all lines in a block comment,
448   // regardless of their relative position.
449   // FIXME: Revisit the decision to do this; the main reason was to support
450   // patterns like
451   // /**************//**
452   //  * Comment
453   // We could also support such patterns by special casing the first line
454   // instead.
455   unsigned IndentAtLineBreak;
456 
457   // This is to distinguish between the case when the last line was empty and
458   // the case when it started with a decoration ("*" or "* ").
459   bool LastLineNeedsDecoration;
460 
461   // Either "* " if all lines begin with a "*", or empty.
462   StringRef Decoration;
463 
464   // If this block comment has decorations, this is the column of the start of
465   // the decorations.
466   unsigned DecorationColumn;
467 
468   // If true, make sure that the opening '/**' and the closing '*/' each end on
469   // a line of its own. Styles like jsdoc require this for multiline comments.
470   bool DelimitersOnNewline;
471 
472   // Length of the sequence of tokens after this block comment that cannot
473   // contain line breaks.
474   unsigned UnbreakableTailLength;
475 };
476 
477 class BreakableLineCommentSection : public BreakableComment {
478 public:
479   BreakableLineCommentSection(const FormatToken &Token, unsigned StartColumn,
480                               bool InPPDirective, encoding::Encoding Encoding,
481                               const FormatStyle &Style);
482 
483   unsigned getRangeLength(unsigned LineIndex, unsigned Offset,
484                           StringRef::size_type Length,
485                           unsigned StartColumn) const override;
486   unsigned getContentStartColumn(unsigned LineIndex, bool Break) const override;
487   void insertBreak(unsigned LineIndex, unsigned TailOffset, Split Split,
488                    unsigned ContentIndent,
489                    WhitespaceManager &Whitespaces) const override;
490   Split getReflowSplit(unsigned LineIndex,
491                        const llvm::Regex &CommentPragmasRegex) const override;
492   void reflow(unsigned LineIndex,
493               WhitespaceManager &Whitespaces) const override;
494   void adaptStartOfLine(unsigned LineIndex,
495                         WhitespaceManager &Whitespaces) const override;
496   void updateNextToken(LineState &State) const override;
497   bool mayReflow(unsigned LineIndex,
498                  const llvm::Regex &CommentPragmasRegex) const override;
499 
500 private:
501   // OriginalPrefix[i] contains the original prefix of line i, including
502   // trailing whitespace before the start of the content. The indentation
503   // preceding the prefix is not included.
504   // For example, if the line is:
505   // // content
506   // then the original prefix is "// ".
507   SmallVector<StringRef, 16> OriginalPrefix;
508 
509   /// Prefix[i] + SpacesToAdd[i] contains the intended leading "//" with
510   /// trailing spaces to account for the indentation of content within the
511   /// comment at line i after formatting. It can be different than the original
512   /// prefix. NOTE(review): SpacesToAdd is not declared in this class; verify.
513   /// When the original line starts like this:
514   /// //content
515   /// Then the OriginalPrefix[i] is "//", but the Prefix[i] is "// " in the LLVM
516   /// style.
517   /// When the line starts like:
518   /// // content
519   /// And we want to remove the spaces the OriginalPrefix[i] is "// " and
520   /// Prefix[i] is "//".
521   SmallVector<std::string, 16> Prefix;
522 
523   /// How many spaces are added or removed from the OriginalPrefix to form
524   /// Prefix.
525   SmallVector<int, 16> PrefixSpaceChange;
526 
527   /// The token to which the last line of this breakable token belongs;
528   /// nullptr if that token is the initial token.
529   ///
530   /// The distinction is because if the token of the last line of this breakable
531   /// token is distinct from the initial token, this breakable token owns the
532   /// whitespace before the token of the last line, and the whitespace manager
533   /// must be able to modify it.
534   FormatToken *LastLineTok = nullptr;
535 };
536 } // namespace format
537 } // namespace clang
538 
539 #endif
540