xref: /freebsd/contrib/llvm-project/clang/lib/Format/FormatTokenLexer.cpp (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 //===--- FormatTokenLexer.cpp - Lex FormatTokens -------------*- C++ ----*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file implements FormatTokenLexer, which tokenizes a source file
11 /// into a FormatToken stream suitable for ClangFormat.
12 ///
13 //===----------------------------------------------------------------------===//
14 
15 #include "FormatTokenLexer.h"
16 #include "FormatToken.h"
17 #include "clang/Basic/SourceLocation.h"
18 #include "clang/Basic/SourceManager.h"
19 #include "clang/Format/Format.h"
20 #include "llvm/Support/Regex.h"
21 
22 namespace clang {
23 namespace format {
24 
25 FormatTokenLexer::FormatTokenLexer(
26     const SourceManager &SourceMgr, FileID ID, unsigned Column,
27     const FormatStyle &Style, encoding::Encoding Encoding,
28     llvm::SpecificBumpPtrAllocator<FormatToken> &Allocator,
29     IdentifierTable &IdentTable)
30     : FormatTok(nullptr), IsFirstToken(true), StateStack({LexerState::NORMAL}),
31       Column(Column), TrailingWhitespace(0), SourceMgr(SourceMgr), ID(ID),
32       Style(Style), IdentTable(IdentTable), Keywords(IdentTable),
33       Encoding(Encoding), Allocator(Allocator), FirstInLineIndex(0),
34       FormattingDisabled(false), MacroBlockBeginRegex(Style.MacroBlockBegin),
35       MacroBlockEndRegex(Style.MacroBlockEnd) {
36   Lex.reset(new Lexer(ID, SourceMgr.getBufferOrFake(ID), SourceMgr,
37                       getFormattingLangOpts(Style)));
38   Lex->SetKeepWhitespaceMode(true);
39 
40   for (const std::string &ForEachMacro : Style.ForEachMacros)
41     Macros.insert({&IdentTable.get(ForEachMacro), TT_ForEachMacro});
42   for (const std::string &IfMacro : Style.IfMacros)
43     Macros.insert({&IdentTable.get(IfMacro), TT_IfMacro});
44   for (const std::string &AttributeMacro : Style.AttributeMacros)
45     Macros.insert({&IdentTable.get(AttributeMacro), TT_AttributeMacro});
46   for (const std::string &StatementMacro : Style.StatementMacros)
47     Macros.insert({&IdentTable.get(StatementMacro), TT_StatementMacro});
48   for (const std::string &TypenameMacro : Style.TypenameMacros)
49     Macros.insert({&IdentTable.get(TypenameMacro), TT_TypenameMacro});
50   for (const std::string &NamespaceMacro : Style.NamespaceMacros)
51     Macros.insert({&IdentTable.get(NamespaceMacro), TT_NamespaceMacro});
52   for (const std::string &WhitespaceSensitiveMacro :
53        Style.WhitespaceSensitiveMacros) {
54     Macros.insert(
55         {&IdentTable.get(WhitespaceSensitiveMacro), TT_UntouchableMacroFunc});
56   }
57   for (const std::string &StatementAttributeLikeMacro :
58        Style.StatementAttributeLikeMacros)
59     Macros.insert({&IdentTable.get(StatementAttributeLikeMacro),
60                    TT_StatementAttributeLikeMacro});
61 }
62 
63 ArrayRef<FormatToken *> FormatTokenLexer::lex() {
64   assert(Tokens.empty());
65   assert(FirstInLineIndex == 0);
66   do {
67     Tokens.push_back(getNextToken());
68     if (Style.Language == FormatStyle::LK_JavaScript) {
69       tryParseJSRegexLiteral();
70       handleTemplateStrings();
71     }
72     if (Style.Language == FormatStyle::LK_TextProto)
73       tryParsePythonComment();
74     tryMergePreviousTokens();
75     if (Style.isCSharp())
76       // This needs to come after tokens have been merged so that C#
77       // string literals are correctly identified.
78       handleCSharpVerbatimAndInterpolatedStrings();
79     if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline)
80       FirstInLineIndex = Tokens.size() - 1;
81   } while (Tokens.back()->Tok.isNot(tok::eof));
82   return Tokens;
83 }
84 
85 void FormatTokenLexer::tryMergePreviousTokens() {
86   if (tryMerge_TMacro())
87     return;
88   if (tryMergeConflictMarkers())
89     return;
90   if (tryMergeLessLess())
91     return;
92   if (tryMergeForEach())
93     return;
94   if (Style.isCpp() && tryTransformTryUsageForC())
95     return;
96 
97   if (Style.Language == FormatStyle::LK_JavaScript || Style.isCSharp()) {
98     static const tok::TokenKind NullishCoalescingOperator[] = {tok::question,
99                                                                tok::question};
100     static const tok::TokenKind NullPropagatingOperator[] = {tok::question,
101                                                              tok::period};
102     static const tok::TokenKind FatArrow[] = {tok::equal, tok::greater};
103 
104     if (tryMergeTokens(FatArrow, TT_FatArrow))
105       return;
106     if (tryMergeTokens(NullishCoalescingOperator, TT_NullCoalescingOperator)) {
107       // Treat like the "||" operator (as opposed to the ternary ?).
108       Tokens.back()->Tok.setKind(tok::pipepipe);
109       return;
110     }
111     if (tryMergeTokens(NullPropagatingOperator, TT_NullPropagatingOperator)) {
112       // Treat like a regular "." access.
113       Tokens.back()->Tok.setKind(tok::period);
114       return;
115     }
116     if (tryMergeNullishCoalescingEqual()) {
117       return;
118     }
119   }
120 
121   if (Style.isCSharp()) {
122     static const tok::TokenKind CSharpNullConditionalLSquare[] = {
123         tok::question, tok::l_square};
124 
125     if (tryMergeCSharpKeywordVariables())
126       return;
127     if (tryMergeCSharpStringLiteral())
128       return;
129     if (tryTransformCSharpForEach())
130       return;
131     if (tryMergeTokens(CSharpNullConditionalLSquare,
132                        TT_CSharpNullConditionalLSquare)) {
133       // Treat like a regular "[" operator.
134       Tokens.back()->Tok.setKind(tok::l_square);
135       return;
136     }
137   }
138 
139   if (tryMergeNSStringLiteral())
140     return;
141 
142   if (Style.Language == FormatStyle::LK_JavaScript) {
143     static const tok::TokenKind JSIdentity[] = {tok::equalequal, tok::equal};
144     static const tok::TokenKind JSNotIdentity[] = {tok::exclaimequal,
145                                                    tok::equal};
146     static const tok::TokenKind JSShiftEqual[] = {tok::greater, tok::greater,
147                                                   tok::greaterequal};
148     static const tok::TokenKind JSExponentiation[] = {tok::star, tok::star};
149     static const tok::TokenKind JSExponentiationEqual[] = {tok::star,
150                                                            tok::starequal};
151     static const tok::TokenKind JSPipePipeEqual[] = {tok::pipepipe, tok::equal};
152     static const tok::TokenKind JSAndAndEqual[] = {tok::ampamp, tok::equal};
153 
154     // FIXME: Investigate what token type gives the correct operator priority.
155     if (tryMergeTokens(JSIdentity, TT_BinaryOperator))
156       return;
157     if (tryMergeTokens(JSNotIdentity, TT_BinaryOperator))
158       return;
159     if (tryMergeTokens(JSShiftEqual, TT_BinaryOperator))
160       return;
161     if (tryMergeTokens(JSExponentiation, TT_JsExponentiation))
162       return;
163     if (tryMergeTokens(JSExponentiationEqual, TT_JsExponentiationEqual)) {
164       Tokens.back()->Tok.setKind(tok::starequal);
165       return;
166     }
167     if (tryMergeTokens(JSAndAndEqual, TT_JsAndAndEqual) ||
168         tryMergeTokens(JSPipePipeEqual, TT_JsPipePipeEqual)) {
169       // Treat like the "=" assignment operator.
170       Tokens.back()->Tok.setKind(tok::equal);
171       return;
172     }
173     if (tryMergeJSPrivateIdentifier())
174       return;
175   }
176 
177   if (Style.Language == FormatStyle::LK_Java) {
178     static const tok::TokenKind JavaRightLogicalShiftAssign[] = {
179         tok::greater, tok::greater, tok::greaterequal};
180     if (tryMergeTokens(JavaRightLogicalShiftAssign, TT_BinaryOperator))
181       return;
182   }
183 }
184 
185 bool FormatTokenLexer::tryMergeNSStringLiteral() {
186   if (Tokens.size() < 2)
187     return false;
188   auto &At = *(Tokens.end() - 2);
189   auto &String = *(Tokens.end() - 1);
190   if (!At->is(tok::at) || !String->is(tok::string_literal))
191     return false;
192   At->Tok.setKind(tok::string_literal);
193   At->TokenText = StringRef(At->TokenText.begin(),
194                             String->TokenText.end() - At->TokenText.begin());
195   At->ColumnWidth += String->ColumnWidth;
196   At->setType(TT_ObjCStringLiteral);
197   Tokens.erase(Tokens.end() - 1);
198   return true;
199 }
200 
201 bool FormatTokenLexer::tryMergeJSPrivateIdentifier() {
202   // Merges #idenfier into a single identifier with the text #identifier
203   // but the token tok::identifier.
204   if (Tokens.size() < 2)
205     return false;
206   auto &Hash = *(Tokens.end() - 2);
207   auto &Identifier = *(Tokens.end() - 1);
208   if (!Hash->is(tok::hash) || !Identifier->is(tok::identifier))
209     return false;
210   Hash->Tok.setKind(tok::identifier);
211   Hash->TokenText =
212       StringRef(Hash->TokenText.begin(),
213                 Identifier->TokenText.end() - Hash->TokenText.begin());
214   Hash->ColumnWidth += Identifier->ColumnWidth;
215   Hash->setType(TT_JsPrivateIdentifier);
216   Tokens.erase(Tokens.end() - 1);
217   return true;
218 }
219 
220 // Search for verbatim or interpolated string literals @"ABC" or
221 // $"aaaaa{abc}aaaaa" i and mark the token as TT_CSharpStringLiteral, and to
222 // prevent splitting of @, $ and ".
223 // Merging of multiline verbatim strings with embedded '"' is handled in
224 // handleCSharpVerbatimAndInterpolatedStrings with lower-level lexing.
225 bool FormatTokenLexer::tryMergeCSharpStringLiteral() {
226   if (Tokens.size() < 2)
227     return false;
228 
229   // Interpolated strings could contain { } with " characters inside.
230   // $"{x ?? "null"}"
231   // should not be split into $"{x ?? ", null, "}" but should treated as a
232   // single string-literal.
233   //
234   // We opt not to try and format expressions inside {} within a C#
235   // interpolated string. Formatting expressions within an interpolated string
236   // would require similar work as that done for JavaScript template strings
237   // in `handleTemplateStrings()`.
238   auto &CSharpInterpolatedString = *(Tokens.end() - 2);
239   if (CSharpInterpolatedString->getType() == TT_CSharpStringLiteral &&
240       (CSharpInterpolatedString->TokenText.startswith(R"($")") ||
241        CSharpInterpolatedString->TokenText.startswith(R"($@")"))) {
242     int UnmatchedOpeningBraceCount = 0;
243 
244     auto TokenTextSize = CSharpInterpolatedString->TokenText.size();
245     for (size_t Index = 0; Index < TokenTextSize; ++Index) {
246       char C = CSharpInterpolatedString->TokenText[Index];
247       if (C == '{') {
248         // "{{"  inside an interpolated string is an escaped '{' so skip it.
249         if (Index + 1 < TokenTextSize &&
250             CSharpInterpolatedString->TokenText[Index + 1] == '{') {
251           ++Index;
252           continue;
253         }
254         ++UnmatchedOpeningBraceCount;
255       } else if (C == '}') {
256         // "}}"  inside an interpolated string is an escaped '}' so skip it.
257         if (Index + 1 < TokenTextSize &&
258             CSharpInterpolatedString->TokenText[Index + 1] == '}') {
259           ++Index;
260           continue;
261         }
262         --UnmatchedOpeningBraceCount;
263       }
264     }
265 
266     if (UnmatchedOpeningBraceCount > 0) {
267       auto &NextToken = *(Tokens.end() - 1);
268       CSharpInterpolatedString->TokenText =
269           StringRef(CSharpInterpolatedString->TokenText.begin(),
270                     NextToken->TokenText.end() -
271                         CSharpInterpolatedString->TokenText.begin());
272       CSharpInterpolatedString->ColumnWidth += NextToken->ColumnWidth;
273       Tokens.erase(Tokens.end() - 1);
274       return true;
275     }
276   }
277 
278   // Look for @"aaaaaa" or $"aaaaaa".
279   auto &String = *(Tokens.end() - 1);
280   if (!String->is(tok::string_literal))
281     return false;
282 
283   auto &At = *(Tokens.end() - 2);
284   if (!(At->is(tok::at) || At->TokenText == "$"))
285     return false;
286 
287   if (Tokens.size() > 2 && At->is(tok::at)) {
288     auto &Dollar = *(Tokens.end() - 3);
289     if (Dollar->TokenText == "$") {
290       // This looks like $@"aaaaa" so we need to combine all 3 tokens.
291       Dollar->Tok.setKind(tok::string_literal);
292       Dollar->TokenText =
293           StringRef(Dollar->TokenText.begin(),
294                     String->TokenText.end() - Dollar->TokenText.begin());
295       Dollar->ColumnWidth += (At->ColumnWidth + String->ColumnWidth);
296       Dollar->setType(TT_CSharpStringLiteral);
297       Tokens.erase(Tokens.end() - 2);
298       Tokens.erase(Tokens.end() - 1);
299       return true;
300     }
301   }
302 
303   // Convert back into just a string_literal.
304   At->Tok.setKind(tok::string_literal);
305   At->TokenText = StringRef(At->TokenText.begin(),
306                             String->TokenText.end() - At->TokenText.begin());
307   At->ColumnWidth += String->ColumnWidth;
308   At->setType(TT_CSharpStringLiteral);
309   Tokens.erase(Tokens.end() - 1);
310   return true;
311 }
312 
313 // Valid C# attribute targets:
314 // https://docs.microsoft.com/en-us/dotnet/csharp/programming-guide/concepts/attributes/#attribute-targets
315 const llvm::StringSet<> FormatTokenLexer::CSharpAttributeTargets = {
316     "assembly", "module",   "field",  "event", "method",
317     "param",    "property", "return", "type",
318 };
319 
320 bool FormatTokenLexer::tryMergeNullishCoalescingEqual() {
321   if (Tokens.size() < 2)
322     return false;
323   auto &NullishCoalescing = *(Tokens.end() - 2);
324   auto &Equal = *(Tokens.end() - 1);
325   if (NullishCoalescing->getType() != TT_NullCoalescingOperator ||
326       !Equal->is(tok::equal))
327     return false;
328   NullishCoalescing->Tok.setKind(tok::equal); // no '??=' in clang tokens.
329   NullishCoalescing->TokenText =
330       StringRef(NullishCoalescing->TokenText.begin(),
331                 Equal->TokenText.end() - NullishCoalescing->TokenText.begin());
332   NullishCoalescing->ColumnWidth += Equal->ColumnWidth;
333   NullishCoalescing->setType(TT_NullCoalescingEqual);
334   Tokens.erase(Tokens.end() - 1);
335   return true;
336 }
337 
338 bool FormatTokenLexer::tryMergeCSharpKeywordVariables() {
339   if (Tokens.size() < 2)
340     return false;
341   auto &At = *(Tokens.end() - 2);
342   auto &Keyword = *(Tokens.end() - 1);
343   if (!At->is(tok::at))
344     return false;
345   if (!Keywords.isCSharpKeyword(*Keyword))
346     return false;
347 
348   At->Tok.setKind(tok::identifier);
349   At->TokenText = StringRef(At->TokenText.begin(),
350                             Keyword->TokenText.end() - At->TokenText.begin());
351   At->ColumnWidth += Keyword->ColumnWidth;
352   At->setType(Keyword->getType());
353   Tokens.erase(Tokens.end() - 1);
354   return true;
355 }
356 
357 // In C# transform identifier foreach into kw_foreach
358 bool FormatTokenLexer::tryTransformCSharpForEach() {
359   if (Tokens.size() < 1)
360     return false;
361   auto &Identifier = *(Tokens.end() - 1);
362   if (!Identifier->is(tok::identifier))
363     return false;
364   if (Identifier->TokenText != "foreach")
365     return false;
366 
367   Identifier->setType(TT_ForEachMacro);
368   Identifier->Tok.setKind(tok::kw_for);
369   return true;
370 }
371 
372 bool FormatTokenLexer::tryMergeForEach() {
373   if (Tokens.size() < 2)
374     return false;
375   auto &For = *(Tokens.end() - 2);
376   auto &Each = *(Tokens.end() - 1);
377   if (!For->is(tok::kw_for))
378     return false;
379   if (!Each->is(tok::identifier))
380     return false;
381   if (Each->TokenText != "each")
382     return false;
383 
384   For->setType(TT_ForEachMacro);
385   For->Tok.setKind(tok::kw_for);
386 
387   For->TokenText = StringRef(For->TokenText.begin(),
388                              Each->TokenText.end() - For->TokenText.begin());
389   For->ColumnWidth += Each->ColumnWidth;
390   Tokens.erase(Tokens.end() - 1);
391   return true;
392 }
393 
394 bool FormatTokenLexer::tryTransformTryUsageForC() {
395   if (Tokens.size() < 2)
396     return false;
397   auto &Try = *(Tokens.end() - 2);
398   if (!Try->is(tok::kw_try))
399     return false;
400   auto &Next = *(Tokens.end() - 1);
401   if (Next->isOneOf(tok::l_brace, tok::colon, tok::hash, tok::comment))
402     return false;
403 
404   if (Tokens.size() > 2) {
405     auto &At = *(Tokens.end() - 3);
406     if (At->is(tok::at))
407       return false;
408   }
409 
410   Try->Tok.setKind(tok::identifier);
411   return true;
412 }
413 
414 bool FormatTokenLexer::tryMergeLessLess() {
415   // Merge X,less,less,Y into X,lessless,Y unless X or Y is less.
416   if (Tokens.size() < 3)
417     return false;
418 
419   bool FourthTokenIsLess = false;
420   if (Tokens.size() > 3)
421     FourthTokenIsLess = (Tokens.end() - 4)[0]->is(tok::less);
422 
423   auto First = Tokens.end() - 3;
424   if (First[2]->is(tok::less) || First[1]->isNot(tok::less) ||
425       First[0]->isNot(tok::less) || FourthTokenIsLess)
426     return false;
427 
428   // Only merge if there currently is no whitespace between the two "<".
429   if (First[1]->WhitespaceRange.getBegin() !=
430       First[1]->WhitespaceRange.getEnd())
431     return false;
432 
433   First[0]->Tok.setKind(tok::lessless);
434   First[0]->TokenText = "<<";
435   First[0]->ColumnWidth += 1;
436   Tokens.erase(Tokens.end() - 2);
437   return true;
438 }
439 
440 bool FormatTokenLexer::tryMergeTokens(ArrayRef<tok::TokenKind> Kinds,
441                                       TokenType NewType) {
442   if (Tokens.size() < Kinds.size())
443     return false;
444 
445   SmallVectorImpl<FormatToken *>::const_iterator First =
446       Tokens.end() - Kinds.size();
447   if (!First[0]->is(Kinds[0]))
448     return false;
449   unsigned AddLength = 0;
450   for (unsigned i = 1; i < Kinds.size(); ++i) {
451     if (!First[i]->is(Kinds[i]) || First[i]->WhitespaceRange.getBegin() !=
452                                        First[i]->WhitespaceRange.getEnd())
453       return false;
454     AddLength += First[i]->TokenText.size();
455   }
456   Tokens.resize(Tokens.size() - Kinds.size() + 1);
457   First[0]->TokenText = StringRef(First[0]->TokenText.data(),
458                                   First[0]->TokenText.size() + AddLength);
459   First[0]->ColumnWidth += AddLength;
460   First[0]->setType(NewType);
461   return true;
462 }
463 
464 // Returns \c true if \p Tok can only be followed by an operand in JavaScript.
465 bool FormatTokenLexer::precedesOperand(FormatToken *Tok) {
466   // NB: This is not entirely correct, as an r_paren can introduce an operand
467   // location in e.g. `if (foo) /bar/.exec(...);`. That is a rare enough
468   // corner case to not matter in practice, though.
469   return Tok->isOneOf(tok::period, tok::l_paren, tok::comma, tok::l_brace,
470                       tok::r_brace, tok::l_square, tok::semi, tok::exclaim,
471                       tok::colon, tok::question, tok::tilde) ||
472          Tok->isOneOf(tok::kw_return, tok::kw_do, tok::kw_case, tok::kw_throw,
473                       tok::kw_else, tok::kw_new, tok::kw_delete, tok::kw_void,
474                       tok::kw_typeof, Keywords.kw_instanceof, Keywords.kw_in) ||
475          Tok->isBinaryOperator();
476 }
477 
478 bool FormatTokenLexer::canPrecedeRegexLiteral(FormatToken *Prev) {
479   if (!Prev)
480     return true;
481 
482   // Regex literals can only follow after prefix unary operators, not after
483   // postfix unary operators. If the '++' is followed by a non-operand
484   // introducing token, the slash here is the operand and not the start of a
485   // regex.
486   // `!` is an unary prefix operator, but also a post-fix operator that casts
487   // away nullability, so the same check applies.
488   if (Prev->isOneOf(tok::plusplus, tok::minusminus, tok::exclaim))
489     return (Tokens.size() < 3 || precedesOperand(Tokens[Tokens.size() - 3]));
490 
491   // The previous token must introduce an operand location where regex
492   // literals can occur.
493   if (!precedesOperand(Prev))
494     return false;
495 
496   return true;
497 }
498 
499 // Tries to parse a JavaScript Regex literal starting at the current token,
500 // if that begins with a slash and is in a location where JavaScript allows
501 // regex literals. Changes the current token to a regex literal and updates
502 // its text if successful.
503 void FormatTokenLexer::tryParseJSRegexLiteral() {
504   FormatToken *RegexToken = Tokens.back();
505   if (!RegexToken->isOneOf(tok::slash, tok::slashequal))
506     return;
507 
508   FormatToken *Prev = nullptr;
509   for (auto I = Tokens.rbegin() + 1, E = Tokens.rend(); I != E; ++I) {
510     // NB: Because previous pointers are not initialized yet, this cannot use
511     // Token.getPreviousNonComment.
512     if ((*I)->isNot(tok::comment)) {
513       Prev = *I;
514       break;
515     }
516   }
517 
518   if (!canPrecedeRegexLiteral(Prev))
519     return;
520 
521   // 'Manually' lex ahead in the current file buffer.
522   const char *Offset = Lex->getBufferLocation();
523   const char *RegexBegin = Offset - RegexToken->TokenText.size();
524   StringRef Buffer = Lex->getBuffer();
525   bool InCharacterClass = false;
526   bool HaveClosingSlash = false;
527   for (; !HaveClosingSlash && Offset != Buffer.end(); ++Offset) {
528     // Regular expressions are terminated with a '/', which can only be
529     // escaped using '\' or a character class between '[' and ']'.
530     // See http://www.ecma-international.org/ecma-262/5.1/#sec-7.8.5.
531     switch (*Offset) {
532     case '\\':
533       // Skip the escaped character.
534       ++Offset;
535       break;
536     case '[':
537       InCharacterClass = true;
538       break;
539     case ']':
540       InCharacterClass = false;
541       break;
542     case '/':
543       if (!InCharacterClass)
544         HaveClosingSlash = true;
545       break;
546     }
547   }
548 
549   RegexToken->setType(TT_RegexLiteral);
550   // Treat regex literals like other string_literals.
551   RegexToken->Tok.setKind(tok::string_literal);
552   RegexToken->TokenText = StringRef(RegexBegin, Offset - RegexBegin);
553   RegexToken->ColumnWidth = RegexToken->TokenText.size();
554 
555   resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
556 }
557 
558 void FormatTokenLexer::handleCSharpVerbatimAndInterpolatedStrings() {
559   FormatToken *CSharpStringLiteral = Tokens.back();
560 
561   if (CSharpStringLiteral->getType() != TT_CSharpStringLiteral)
562     return;
563 
564   // Deal with multiline strings.
565   if (!(CSharpStringLiteral->TokenText.startswith(R"(@")") ||
566         CSharpStringLiteral->TokenText.startswith(R"($@")")))
567     return;
568 
569   const char *StrBegin =
570       Lex->getBufferLocation() - CSharpStringLiteral->TokenText.size();
571   const char *Offset = StrBegin;
572   if (CSharpStringLiteral->TokenText.startswith(R"(@")"))
573     Offset += 2;
574   else // CSharpStringLiteral->TokenText.startswith(R"($@")")
575     Offset += 3;
576 
577   // Look for a terminating '"' in the current file buffer.
578   // Make no effort to format code within an interpolated or verbatim string.
579   for (; Offset != Lex->getBuffer().end(); ++Offset) {
580     if (Offset[0] == '"') {
581       // "" within a verbatim string is an escaped double quote: skip it.
582       if (Offset + 1 < Lex->getBuffer().end() && Offset[1] == '"')
583         ++Offset;
584       else
585         break;
586     }
587   }
588 
589   // Make no attempt to format code properly if a verbatim string is
590   // unterminated.
591   if (Offset == Lex->getBuffer().end())
592     return;
593 
594   StringRef LiteralText(StrBegin, Offset - StrBegin + 1);
595   CSharpStringLiteral->TokenText = LiteralText;
596 
597   // Adjust width for potentially multiline string literals.
598   size_t FirstBreak = LiteralText.find('\n');
599   StringRef FirstLineText = FirstBreak == StringRef::npos
600                                 ? LiteralText
601                                 : LiteralText.substr(0, FirstBreak);
602   CSharpStringLiteral->ColumnWidth = encoding::columnWidthWithTabs(
603       FirstLineText, CSharpStringLiteral->OriginalColumn, Style.TabWidth,
604       Encoding);
605   size_t LastBreak = LiteralText.rfind('\n');
606   if (LastBreak != StringRef::npos) {
607     CSharpStringLiteral->IsMultiline = true;
608     unsigned StartColumn = 0;
609     CSharpStringLiteral->LastLineColumnWidth = encoding::columnWidthWithTabs(
610         LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
611         Style.TabWidth, Encoding);
612   }
613 
614   SourceLocation loc = Offset < Lex->getBuffer().end()
615                            ? Lex->getSourceLocation(Offset + 1)
616                            : SourceMgr.getLocForEndOfFile(ID);
617   resetLexer(SourceMgr.getFileOffset(loc));
618 }
619 
620 void FormatTokenLexer::handleTemplateStrings() {
621   FormatToken *BacktickToken = Tokens.back();
622 
623   if (BacktickToken->is(tok::l_brace)) {
624     StateStack.push(LexerState::NORMAL);
625     return;
626   }
627   if (BacktickToken->is(tok::r_brace)) {
628     if (StateStack.size() == 1)
629       return;
630     StateStack.pop();
631     if (StateStack.top() != LexerState::TEMPLATE_STRING)
632       return;
633     // If back in TEMPLATE_STRING, fallthrough and continue parsing the
634   } else if (BacktickToken->is(tok::unknown) &&
635              BacktickToken->TokenText == "`") {
636     StateStack.push(LexerState::TEMPLATE_STRING);
637   } else {
638     return; // Not actually a template
639   }
640 
641   // 'Manually' lex ahead in the current file buffer.
642   const char *Offset = Lex->getBufferLocation();
643   const char *TmplBegin = Offset - BacktickToken->TokenText.size(); // at "`"
644   for (; Offset != Lex->getBuffer().end(); ++Offset) {
645     if (Offset[0] == '`') {
646       StateStack.pop();
647       break;
648     }
649     if (Offset[0] == '\\') {
650       ++Offset; // Skip the escaped character.
651     } else if (Offset + 1 < Lex->getBuffer().end() && Offset[0] == '$' &&
652                Offset[1] == '{') {
653       // '${' introduces an expression interpolation in the template string.
654       StateStack.push(LexerState::NORMAL);
655       ++Offset;
656       break;
657     }
658   }
659 
660   StringRef LiteralText(TmplBegin, Offset - TmplBegin + 1);
661   BacktickToken->setType(TT_TemplateString);
662   BacktickToken->Tok.setKind(tok::string_literal);
663   BacktickToken->TokenText = LiteralText;
664 
665   // Adjust width for potentially multiline string literals.
666   size_t FirstBreak = LiteralText.find('\n');
667   StringRef FirstLineText = FirstBreak == StringRef::npos
668                                 ? LiteralText
669                                 : LiteralText.substr(0, FirstBreak);
670   BacktickToken->ColumnWidth = encoding::columnWidthWithTabs(
671       FirstLineText, BacktickToken->OriginalColumn, Style.TabWidth, Encoding);
672   size_t LastBreak = LiteralText.rfind('\n');
673   if (LastBreak != StringRef::npos) {
674     BacktickToken->IsMultiline = true;
675     unsigned StartColumn = 0; // The template tail spans the entire line.
676     BacktickToken->LastLineColumnWidth = encoding::columnWidthWithTabs(
677         LiteralText.substr(LastBreak + 1, LiteralText.size()), StartColumn,
678         Style.TabWidth, Encoding);
679   }
680 
681   SourceLocation loc = Offset < Lex->getBuffer().end()
682                            ? Lex->getSourceLocation(Offset + 1)
683                            : SourceMgr.getLocForEndOfFile(ID);
684   resetLexer(SourceMgr.getFileOffset(loc));
685 }
686 
687 void FormatTokenLexer::tryParsePythonComment() {
688   FormatToken *HashToken = Tokens.back();
689   if (!HashToken->isOneOf(tok::hash, tok::hashhash))
690     return;
691   // Turn the remainder of this line into a comment.
692   const char *CommentBegin =
693       Lex->getBufferLocation() - HashToken->TokenText.size(); // at "#"
694   size_t From = CommentBegin - Lex->getBuffer().begin();
695   size_t To = Lex->getBuffer().find_first_of('\n', From);
696   if (To == StringRef::npos)
697     To = Lex->getBuffer().size();
698   size_t Len = To - From;
699   HashToken->setType(TT_LineComment);
700   HashToken->Tok.setKind(tok::comment);
701   HashToken->TokenText = Lex->getBuffer().substr(From, Len);
702   SourceLocation Loc = To < Lex->getBuffer().size()
703                            ? Lex->getSourceLocation(CommentBegin + Len)
704                            : SourceMgr.getLocForEndOfFile(ID);
705   resetLexer(SourceMgr.getFileOffset(Loc));
706 }
707 
708 bool FormatTokenLexer::tryMerge_TMacro() {
709   if (Tokens.size() < 4)
710     return false;
711   FormatToken *Last = Tokens.back();
712   if (!Last->is(tok::r_paren))
713     return false;
714 
715   FormatToken *String = Tokens[Tokens.size() - 2];
716   if (!String->is(tok::string_literal) || String->IsMultiline)
717     return false;
718 
719   if (!Tokens[Tokens.size() - 3]->is(tok::l_paren))
720     return false;
721 
722   FormatToken *Macro = Tokens[Tokens.size() - 4];
723   if (Macro->TokenText != "_T")
724     return false;
725 
726   const char *Start = Macro->TokenText.data();
727   const char *End = Last->TokenText.data() + Last->TokenText.size();
728   String->TokenText = StringRef(Start, End - Start);
729   String->IsFirst = Macro->IsFirst;
730   String->LastNewlineOffset = Macro->LastNewlineOffset;
731   String->WhitespaceRange = Macro->WhitespaceRange;
732   String->OriginalColumn = Macro->OriginalColumn;
733   String->ColumnWidth = encoding::columnWidthWithTabs(
734       String->TokenText, String->OriginalColumn, Style.TabWidth, Encoding);
735   String->NewlinesBefore = Macro->NewlinesBefore;
736   String->HasUnescapedNewline = Macro->HasUnescapedNewline;
737 
738   Tokens.pop_back();
739   Tokens.pop_back();
740   Tokens.pop_back();
741   Tokens.back() = String;
742   return true;
743 }
744 
745 bool FormatTokenLexer::tryMergeConflictMarkers() {
746   if (Tokens.back()->NewlinesBefore == 0 && Tokens.back()->isNot(tok::eof))
747     return false;
748 
749   // Conflict lines look like:
750   // <marker> <text from the vcs>
751   // For example:
752   // >>>>>>> /file/in/file/system at revision 1234
753   //
754   // We merge all tokens in a line that starts with a conflict marker
755   // into a single token with a special token type that the unwrapped line
756   // parser will use to correctly rebuild the underlying code.
757 
758   FileID ID;
759   // Get the position of the first token in the line.
760   unsigned FirstInLineOffset;
761   std::tie(ID, FirstInLineOffset) = SourceMgr.getDecomposedLoc(
762       Tokens[FirstInLineIndex]->getStartOfNonWhitespace());
763   StringRef Buffer = SourceMgr.getBufferOrFake(ID).getBuffer();
764   // Calculate the offset of the start of the current line.
765   auto LineOffset = Buffer.rfind('\n', FirstInLineOffset);
766   if (LineOffset == StringRef::npos) {
767     LineOffset = 0;
768   } else {
769     ++LineOffset;
770   }
771 
772   auto FirstSpace = Buffer.find_first_of(" \n", LineOffset);
773   StringRef LineStart;
774   if (FirstSpace == StringRef::npos) {
775     LineStart = Buffer.substr(LineOffset);
776   } else {
777     LineStart = Buffer.substr(LineOffset, FirstSpace - LineOffset);
778   }
779 
780   TokenType Type = TT_Unknown;
781   if (LineStart == "<<<<<<<" || LineStart == ">>>>") {
782     Type = TT_ConflictStart;
783   } else if (LineStart == "|||||||" || LineStart == "=======" ||
784              LineStart == "====") {
785     Type = TT_ConflictAlternative;
786   } else if (LineStart == ">>>>>>>" || LineStart == "<<<<") {
787     Type = TT_ConflictEnd;
788   }
789 
790   if (Type != TT_Unknown) {
791     FormatToken *Next = Tokens.back();
792 
793     Tokens.resize(FirstInLineIndex + 1);
794     // We do not need to build a complete token here, as we will skip it
795     // during parsing anyway (as we must not touch whitespace around conflict
796     // markers).
797     Tokens.back()->setType(Type);
798     Tokens.back()->Tok.setKind(tok::kw___unknown_anytype);
799 
800     Tokens.push_back(Next);
801     return true;
802   }
803 
804   return false;
805 }
806 
807 FormatToken *FormatTokenLexer::getStashedToken() {
808   // Create a synthesized second '>' or '<' token.
809   Token Tok = FormatTok->Tok;
810   StringRef TokenText = FormatTok->TokenText;
811 
812   unsigned OriginalColumn = FormatTok->OriginalColumn;
813   FormatTok = new (Allocator.Allocate()) FormatToken;
814   FormatTok->Tok = Tok;
815   SourceLocation TokLocation =
816       FormatTok->Tok.getLocation().getLocWithOffset(Tok.getLength() - 1);
817   FormatTok->Tok.setLocation(TokLocation);
818   FormatTok->WhitespaceRange = SourceRange(TokLocation, TokLocation);
819   FormatTok->TokenText = TokenText;
820   FormatTok->ColumnWidth = 1;
821   FormatTok->OriginalColumn = OriginalColumn + 1;
822 
823   return FormatTok;
824 }
825 
826 FormatToken *FormatTokenLexer::getNextToken() {
827   if (StateStack.top() == LexerState::TOKEN_STASHED) {
828     StateStack.pop();
829     return getStashedToken();
830   }
831 
832   FormatTok = new (Allocator.Allocate()) FormatToken;
833   readRawToken(*FormatTok);
834   SourceLocation WhitespaceStart =
835       FormatTok->Tok.getLocation().getLocWithOffset(-TrailingWhitespace);
836   FormatTok->IsFirst = IsFirstToken;
837   IsFirstToken = false;
838 
839   // Consume and record whitespace until we find a significant token.
840   unsigned WhitespaceLength = TrailingWhitespace;
841   while (FormatTok->Tok.is(tok::unknown)) {
842     StringRef Text = FormatTok->TokenText;
843     auto EscapesNewline = [&](int pos) {
844       // A '\r' here is just part of '\r\n'. Skip it.
845       if (pos >= 0 && Text[pos] == '\r')
846         --pos;
847       // See whether there is an odd number of '\' before this.
848       // FIXME: This is wrong. A '\' followed by a newline is always removed,
849       // regardless of whether there is another '\' before it.
850       // FIXME: Newlines can also be escaped by a '?' '?' '/' trigraph.
851       unsigned count = 0;
852       for (; pos >= 0; --pos, ++count)
853         if (Text[pos] != '\\')
854           break;
855       return count & 1;
856     };
857     // FIXME: This miscounts tok:unknown tokens that are not just
858     // whitespace, e.g. a '`' character.
859     for (int i = 0, e = Text.size(); i != e; ++i) {
860       switch (Text[i]) {
861       case '\n':
862         ++FormatTok->NewlinesBefore;
863         FormatTok->HasUnescapedNewline = !EscapesNewline(i - 1);
864         FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
865         Column = 0;
866         break;
867       case '\r':
868         FormatTok->LastNewlineOffset = WhitespaceLength + i + 1;
869         Column = 0;
870         break;
871       case '\f':
872       case '\v':
873         Column = 0;
874         break;
875       case ' ':
876         ++Column;
877         break;
878       case '\t':
879         Column +=
880             Style.TabWidth - (Style.TabWidth ? Column % Style.TabWidth : 0);
881         break;
882       case '\\':
883         if (i + 1 == e || (Text[i + 1] != '\r' && Text[i + 1] != '\n'))
884           FormatTok->setType(TT_ImplicitStringLiteral);
885         break;
886       default:
887         FormatTok->setType(TT_ImplicitStringLiteral);
888         break;
889       }
890       if (FormatTok->getType() == TT_ImplicitStringLiteral)
891         break;
892     }
893 
894     if (FormatTok->is(TT_ImplicitStringLiteral))
895       break;
896     WhitespaceLength += FormatTok->Tok.getLength();
897 
898     readRawToken(*FormatTok);
899   }
900 
901   // JavaScript and Java do not allow to escape the end of the line with a
902   // backslash. Backslashes are syntax errors in plain source, but can occur in
903   // comments. When a single line comment ends with a \, it'll cause the next
904   // line of code to be lexed as a comment, breaking formatting. The code below
905   // finds comments that contain a backslash followed by a line break, truncates
906   // the comment token at the backslash, and resets the lexer to restart behind
907   // the backslash.
908   if ((Style.Language == FormatStyle::LK_JavaScript ||
909        Style.Language == FormatStyle::LK_Java) &&
910       FormatTok->is(tok::comment) && FormatTok->TokenText.startswith("//")) {
911     size_t BackslashPos = FormatTok->TokenText.find('\\');
912     while (BackslashPos != StringRef::npos) {
913       if (BackslashPos + 1 < FormatTok->TokenText.size() &&
914           FormatTok->TokenText[BackslashPos + 1] == '\n') {
915         const char *Offset = Lex->getBufferLocation();
916         Offset -= FormatTok->TokenText.size();
917         Offset += BackslashPos + 1;
918         resetLexer(SourceMgr.getFileOffset(Lex->getSourceLocation(Offset)));
919         FormatTok->TokenText = FormatTok->TokenText.substr(0, BackslashPos + 1);
920         FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
921             FormatTok->TokenText, FormatTok->OriginalColumn, Style.TabWidth,
922             Encoding);
923         break;
924       }
925       BackslashPos = FormatTok->TokenText.find('\\', BackslashPos + 1);
926     }
927   }
928 
929   // In case the token starts with escaped newlines, we want to
930   // take them into account as whitespace - this pattern is quite frequent
931   // in macro definitions.
932   // FIXME: Add a more explicit test.
933   while (FormatTok->TokenText.size() > 1 && FormatTok->TokenText[0] == '\\') {
934     unsigned SkippedWhitespace = 0;
935     if (FormatTok->TokenText.size() > 2 &&
936         (FormatTok->TokenText[1] == '\r' && FormatTok->TokenText[2] == '\n'))
937       SkippedWhitespace = 3;
938     else if (FormatTok->TokenText[1] == '\n')
939       SkippedWhitespace = 2;
940     else
941       break;
942 
943     ++FormatTok->NewlinesBefore;
944     WhitespaceLength += SkippedWhitespace;
945     FormatTok->LastNewlineOffset = SkippedWhitespace;
946     Column = 0;
947     FormatTok->TokenText = FormatTok->TokenText.substr(SkippedWhitespace);
948   }
949 
950   FormatTok->WhitespaceRange = SourceRange(
951       WhitespaceStart, WhitespaceStart.getLocWithOffset(WhitespaceLength));
952 
953   FormatTok->OriginalColumn = Column;
954 
955   TrailingWhitespace = 0;
956   if (FormatTok->Tok.is(tok::comment)) {
957     // FIXME: Add the trimmed whitespace to Column.
958     StringRef UntrimmedText = FormatTok->TokenText;
959     FormatTok->TokenText = FormatTok->TokenText.rtrim(" \t\v\f");
960     TrailingWhitespace = UntrimmedText.size() - FormatTok->TokenText.size();
961   } else if (FormatTok->Tok.is(tok::raw_identifier)) {
962     IdentifierInfo &Info = IdentTable.get(FormatTok->TokenText);
963     FormatTok->Tok.setIdentifierInfo(&Info);
964     FormatTok->Tok.setKind(Info.getTokenID());
965     if (Style.Language == FormatStyle::LK_Java &&
966         FormatTok->isOneOf(tok::kw_struct, tok::kw_union, tok::kw_delete,
967                            tok::kw_operator)) {
968       FormatTok->Tok.setKind(tok::identifier);
969       FormatTok->Tok.setIdentifierInfo(nullptr);
970     } else if (Style.Language == FormatStyle::LK_JavaScript &&
971                FormatTok->isOneOf(tok::kw_struct, tok::kw_union,
972                                   tok::kw_operator)) {
973       FormatTok->Tok.setKind(tok::identifier);
974       FormatTok->Tok.setIdentifierInfo(nullptr);
975     }
976   } else if (FormatTok->Tok.is(tok::greatergreater)) {
977     FormatTok->Tok.setKind(tok::greater);
978     FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
979     ++Column;
980     StateStack.push(LexerState::TOKEN_STASHED);
981   } else if (FormatTok->Tok.is(tok::lessless)) {
982     FormatTok->Tok.setKind(tok::less);
983     FormatTok->TokenText = FormatTok->TokenText.substr(0, 1);
984     ++Column;
985     StateStack.push(LexerState::TOKEN_STASHED);
986   }
987 
988   // Now FormatTok is the next non-whitespace token.
989 
990   StringRef Text = FormatTok->TokenText;
991   size_t FirstNewlinePos = Text.find('\n');
992   if (FirstNewlinePos == StringRef::npos) {
993     // FIXME: ColumnWidth actually depends on the start column, we need to
994     // take this into account when the token is moved.
995     FormatTok->ColumnWidth =
996         encoding::columnWidthWithTabs(Text, Column, Style.TabWidth, Encoding);
997     Column += FormatTok->ColumnWidth;
998   } else {
999     FormatTok->IsMultiline = true;
1000     // FIXME: ColumnWidth actually depends on the start column, we need to
1001     // take this into account when the token is moved.
1002     FormatTok->ColumnWidth = encoding::columnWidthWithTabs(
1003         Text.substr(0, FirstNewlinePos), Column, Style.TabWidth, Encoding);
1004 
1005     // The last line of the token always starts in column 0.
1006     // Thus, the length can be precomputed even in the presence of tabs.
1007     FormatTok->LastLineColumnWidth = encoding::columnWidthWithTabs(
1008         Text.substr(Text.find_last_of('\n') + 1), 0, Style.TabWidth, Encoding);
1009     Column = FormatTok->LastLineColumnWidth;
1010   }
1011 
1012   if (Style.isCpp()) {
1013     auto it = Macros.find(FormatTok->Tok.getIdentifierInfo());
1014     if (!(Tokens.size() > 0 && Tokens.back()->Tok.getIdentifierInfo() &&
1015           Tokens.back()->Tok.getIdentifierInfo()->getPPKeywordID() ==
1016               tok::pp_define) &&
1017         it != Macros.end()) {
1018       FormatTok->setType(it->second);
1019       if (it->second == TT_IfMacro) {
1020         // The lexer token currently has type tok::kw_unknown. However, for this
1021         // substitution to be treated correctly in the TokenAnnotator, faking
1022         // the tok value seems to be needed. Not sure if there's a more elegant
1023         // way.
1024         FormatTok->Tok.setKind(tok::kw_if);
1025       }
1026     } else if (FormatTok->is(tok::identifier)) {
1027       if (MacroBlockBeginRegex.match(Text)) {
1028         FormatTok->setType(TT_MacroBlockBegin);
1029       } else if (MacroBlockEndRegex.match(Text)) {
1030         FormatTok->setType(TT_MacroBlockEnd);
1031       }
1032     }
1033   }
1034 
1035   return FormatTok;
1036 }
1037 
1038 void FormatTokenLexer::readRawToken(FormatToken &Tok) {
1039   Lex->LexFromRawLexer(Tok.Tok);
1040   Tok.TokenText = StringRef(SourceMgr.getCharacterData(Tok.Tok.getLocation()),
1041                             Tok.Tok.getLength());
1042   // For formatting, treat unterminated string literals like normal string
1043   // literals.
1044   if (Tok.is(tok::unknown)) {
1045     if (!Tok.TokenText.empty() && Tok.TokenText[0] == '"') {
1046       Tok.Tok.setKind(tok::string_literal);
1047       Tok.IsUnterminatedLiteral = true;
1048     } else if (Style.Language == FormatStyle::LK_JavaScript &&
1049                Tok.TokenText == "''") {
1050       Tok.Tok.setKind(tok::string_literal);
1051     }
1052   }
1053 
1054   if ((Style.Language == FormatStyle::LK_JavaScript ||
1055        Style.Language == FormatStyle::LK_Proto ||
1056        Style.Language == FormatStyle::LK_TextProto) &&
1057       Tok.is(tok::char_constant)) {
1058     Tok.Tok.setKind(tok::string_literal);
1059   }
1060 
1061   if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format on" ||
1062                                Tok.TokenText == "/* clang-format on */")) {
1063     FormattingDisabled = false;
1064   }
1065 
1066   Tok.Finalized = FormattingDisabled;
1067 
1068   if (Tok.is(tok::comment) && (Tok.TokenText == "// clang-format off" ||
1069                                Tok.TokenText == "/* clang-format off */")) {
1070     FormattingDisabled = true;
1071   }
1072 }
1073 
1074 void FormatTokenLexer::resetLexer(unsigned Offset) {
1075   StringRef Buffer = SourceMgr.getBufferData(ID);
1076   Lex.reset(new Lexer(SourceMgr.getLocForStartOfFile(ID),
1077                       getFormattingLangOpts(Style), Buffer.begin(),
1078                       Buffer.begin() + Offset, Buffer.end()));
1079   Lex->SetKeepWhitespaceMode(true);
1080   TrailingWhitespace = 0;
1081 }
1082 
1083 } // namespace format
1084 } // namespace clang
1085